From 2ceba0d2189230a099b692ee2acff75dc3fac71b Mon Sep 17 00:00:00 2001 From: Jenny Chen Date: Mon, 3 Nov 2025 20:52:38 -0500 Subject: [PATCH 01/15] add EP in PTQ (#15015) Signed-off-by: jenchen13 Signed-off-by: Pablo Garay --- nemo/collections/llm/api.py | 2 ++ scripts/llm/ptq.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 3cbf6583acf4..29ad1f6deabb 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -471,6 +471,7 @@ def ptq( export_config: ExportConfig, calibration_tp: int = 1, calibration_pp: int = 1, + calibration_ep: int = 1, num_layers_in_first_pipeline_stage: int | None = None, num_layers_in_last_pipeline_stage: int | None = None, devices: int | None = None, @@ -558,6 +559,7 @@ def ptq( pipeline_model_parallel_size=calibration_pp, num_layers_in_first_pipeline_stage=num_layers_in_first_pipeline_stage, num_layers_in_last_pipeline_stage=num_layers_in_last_pipeline_stage, + expert_model_parallel_size=calibration_ep, devices=devices, num_nodes=num_nodes, inference_only=True, diff --git a/scripts/llm/ptq.py b/scripts/llm/ptq.py index f7ee5f37d4a4..1b8bf84b16ab 100644 --- a/scripts/llm/ptq.py +++ b/scripts/llm/ptq.py @@ -34,6 +34,7 @@ def get_args(): ) parser.add_argument("--decoder_type", type=str, help="Decoder type for TensorRT-Model-Optimizer") parser.add_argument("-ctp", "--calibration_tp", "--calib_tp", type=int, default=1) + parser.add_argument("-cep", "--calibration_ep", "--calib_ep", type=int, default=1) parser.add_argument("-cpp", "--calibration_pp", "--calib_pp", type=int, default=1) parser.add_argument( "--num_layers_in_first_pipeline_stage", @@ -167,6 +168,7 @@ def main(): export_config=export_config, calibration_tp=args.calibration_tp, calibration_pp=args.calibration_pp, + calibration_ep=args.calibration_ep, num_layers_in_first_pipeline_stage=args.num_layers_in_first_pipeline_stage, num_layers_in_last_pipeline_stage=args.num_layers_in_last_pipeline_stage, 
devices=args.devices, From 5457b93724044de2b2c37783660b45ce968a9037 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 4 Nov 2025 15:36:04 -0800 Subject: [PATCH 02/15] remove ExportDeploy Signed-off-by: Pablo Garay --- docker/Dockerfile.ci.export_deploy | 98 - nemo/deploy/__init__.py | 27 - nemo/deploy/deploy_base.py | 116 -- nemo/deploy/deploy_pytriton.py | 184 -- nemo/deploy/multimodal/__init__.py | 16 - nemo/deploy/multimodal/query_multimodal.py | 164 -- nemo/deploy/nlp/__init__.py | 22 - nemo/deploy/nlp/hf_deployable.py | 319 --- nemo/deploy/nlp/megatronllm_deployable.py | 414 ---- nemo/deploy/nlp/query_llm.py | 544 ----- nemo/deploy/service/__init__.py | 14 - .../service/fastapi_interface_to_pytriton.py | 326 --- nemo/deploy/service/rest_model_api.py | 132 -- nemo/deploy/triton_deployable.py | 31 - nemo/deploy/utils.py | 204 -- nemo/export/__init__.py | 30 - nemo/export/multimodal/__init__.py | 13 - nemo/export/multimodal/build.py | 728 ------- nemo/export/multimodal/converter.py | 412 ---- nemo/export/multimodal/run.py | 1168 ----------- nemo/export/onnx_llm_exporter.py | 475 ----- nemo/export/quantize/__init__.py | 15 - nemo/export/quantize/quantizer.py | 277 --- nemo/export/sentencepiece_tokenizer.py | 280 --- nemo/export/tarutils.py | 265 --- nemo/export/tensorrt_lazy_compiler.py | 714 ------- nemo/export/tensorrt_llm.py | 1804 ----------------- nemo/export/tensorrt_mm_exporter.py | 365 ---- nemo/export/tiktoken_tokenizer.py | 123 -- nemo/export/trt_llm/__init__.py | 13 - nemo/export/trt_llm/converter/__init__.py | 13 - .../trt_llm/converter/model_converter.py | 307 --- .../converter/model_to_trt_llm_ckpt.py | 496 ----- nemo/export/trt_llm/converter/utils.py | 598 ------ .../trt_llm/nemo_ckpt_loader/__init__.py | 13 - .../trt_llm/nemo_ckpt_loader/nemo_file.py | 706 ------- nemo/export/trt_llm/qnemo/__init__.py | 15 - .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py | 118 -- nemo/export/trt_llm/qnemo/tokenizer_utils.py | 55 - nemo/export/trt_llm/qnemo/utils.py 
| 32 - nemo/export/trt_llm/tensorrt_llm_build.py | 133 -- nemo/export/trt_llm/tensorrt_llm_run.py | 931 --------- nemo/export/trt_llm/utils.py | 35 - nemo/export/utils/__init__.py | 45 - nemo/export/utils/_mock_import.py | 79 - nemo/export/utils/constants.py | 16 - nemo/export/utils/lora_converter.py | 223 -- nemo/export/utils/model_loader.py | 209 -- nemo/export/utils/utils.py | 155 -- nemo/export/vllm/__init__.py | 13 - nemo/export/vllm/model_config.py | 252 --- nemo/export/vllm/model_converters.py | 421 ---- nemo/export/vllm/model_loader.py | 101 - nemo/export/vllm_exporter.py | 537 ----- nemo/export/vllm_hf_exporter.py | 132 -- scripts/deploy/multimodal/deploy_triton.py | 237 --- scripts/deploy/multimodal/query.py | 68 - .../deploy/nlp/benchmark_llm_inframework.py | 205 -- .../nlp/deploy_in_fw_oai_server_eval.py | 101 - .../nlp/deploy_inframework_hf_triton.py | 232 --- .../deploy/nlp/deploy_inframework_triton.py | 140 -- scripts/deploy/nlp/deploy_triton.py | 500 ----- scripts/deploy/nlp/deploy_vllm_triton.py | 182 -- scripts/deploy/nlp/query.py | 247 --- scripts/deploy/nlp/query_inframework.py | 95 - scripts/deploy/nlp/query_inframework_hf.py | 168 -- scripts/export.py | 191 -- scripts/export/convert_nemo2_for_export.py | 123 -- scripts/export/export_mm_to_trtllm.py | 139 -- scripts/export/export_to_trt_llm.py | 187 -- scripts/export/setup_vllm_venv.sh | 15 - tests/deploy/__init__.py | 0 tests/deploy/nemo_deploy.py | 591 ------ tests/deploy/test_deploy_base.py | 101 - tests/deploy/test_deploy_pytriton.py | 103 - tests/deploy/test_deploy_query.py | 78 - tests/deploy/test_deploy_utils.py | 221 -- tests/deploy/test_deployment_service.py | 319 --- tests/deploy/test_hf_deployable.py | 198 -- tests/deploy/test_hf_import.py | 149 -- tests/deploy/test_megatronllm_deployable.py | 142 -- tests/deploy/test_query_llm.py | 200 -- tests/deploy/test_query_multimodal.py | 142 -- tests/deploy/test_triton_deployable.py | 78 - tests/export/__init__.py | 13 - 
tests/export/multimodal/test_build.py | 109 - tests/export/multimodal/test_converter.py | 101 - tests/export/nemo_export.py | 910 --------- tests/export/test_export_onnx.py | 128 -- tests/export/test_mock_import.py | 36 - tests/export/test_model_loading.py | 60 - tests/export/test_onnx_llm_exporter.py | 60 - tests/export/test_quantizer.py | 110 - tests/export/test_sentencepiece_tokenizer.py | 192 -- tests/export/test_tarutils.py | 125 -- tests/export/test_tensorrt_lazy_compiler.py | 166 -- tests/export/test_tensorrt_llm.py | 233 --- tests/export/test_tensorrt_mm_exporter.py | 224 -- tests/export/test_tiktoken_tokenizer.py | 109 - tests/export/test_trt_compile.py | 142 -- tests/export/test_vllm_hf_exporter.py | 180 -- tests/export/trt_llm/__init__.py | 0 tests/export/trt_llm/converter/__init__.py | 0 .../trt_llm/converter/test_converter_utils.py | 119 -- .../trt_llm/converter/test_model_converter.py | 77 - .../converter/test_model_to_trt_llm_ckpt.py | 117 -- .../trt_llm/test_tensorrt_llm_export.py | 127 -- tests/export/utils/test_exp_utils.py | 144 -- tests/export/utils/test_lora_converter.py | 111 - tests/export/utils/test_model_loader.py | 127 -- .../L0_Unit_Tests_CPU_Export_Deploy.sh | 14 - .../L0_Unit_Tests_GPU_Export_Deploy.sh | 14 - ...NeMo_2_Export_Deploy_Query_In_Framework.sh | 27 - 113 files changed, 23855 deletions(-) delete mode 100644 docker/Dockerfile.ci.export_deploy delete mode 100644 nemo/deploy/__init__.py delete mode 100644 nemo/deploy/deploy_base.py delete mode 100644 nemo/deploy/deploy_pytriton.py delete mode 100644 nemo/deploy/multimodal/__init__.py delete mode 100644 nemo/deploy/multimodal/query_multimodal.py delete mode 100755 nemo/deploy/nlp/__init__.py delete mode 100755 nemo/deploy/nlp/hf_deployable.py delete mode 100755 nemo/deploy/nlp/megatronllm_deployable.py delete mode 100755 nemo/deploy/nlp/query_llm.py delete mode 100644 nemo/deploy/service/__init__.py delete mode 100644 nemo/deploy/service/fastapi_interface_to_pytriton.py delete 
mode 100644 nemo/deploy/service/rest_model_api.py delete mode 100644 nemo/deploy/triton_deployable.py delete mode 100644 nemo/deploy/utils.py delete mode 100644 nemo/export/__init__.py delete mode 100644 nemo/export/multimodal/__init__.py delete mode 100644 nemo/export/multimodal/build.py delete mode 100644 nemo/export/multimodal/converter.py delete mode 100644 nemo/export/multimodal/run.py delete mode 100755 nemo/export/onnx_llm_exporter.py delete mode 100644 nemo/export/quantize/__init__.py delete mode 100644 nemo/export/quantize/quantizer.py delete mode 100644 nemo/export/sentencepiece_tokenizer.py delete mode 100644 nemo/export/tarutils.py delete mode 100644 nemo/export/tensorrt_lazy_compiler.py delete mode 100644 nemo/export/tensorrt_llm.py delete mode 100644 nemo/export/tensorrt_mm_exporter.py delete mode 100644 nemo/export/tiktoken_tokenizer.py delete mode 100644 nemo/export/trt_llm/__init__.py delete mode 100644 nemo/export/trt_llm/converter/__init__.py delete mode 100755 nemo/export/trt_llm/converter/model_converter.py delete mode 100644 nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py delete mode 100755 nemo/export/trt_llm/converter/utils.py delete mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/__init__.py delete mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py delete mode 100644 nemo/export/trt_llm/qnemo/__init__.py delete mode 100644 nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py delete mode 100644 nemo/export/trt_llm/qnemo/tokenizer_utils.py delete mode 100644 nemo/export/trt_llm/qnemo/utils.py delete mode 100755 nemo/export/trt_llm/tensorrt_llm_build.py delete mode 100644 nemo/export/trt_llm/tensorrt_llm_run.py delete mode 100644 nemo/export/trt_llm/utils.py delete mode 100644 nemo/export/utils/__init__.py delete mode 100644 nemo/export/utils/_mock_import.py delete mode 100644 nemo/export/utils/constants.py delete mode 100644 nemo/export/utils/lora_converter.py delete mode 100644 nemo/export/utils/model_loader.py delete mode 
100755 nemo/export/utils/utils.py delete mode 100644 nemo/export/vllm/__init__.py delete mode 100644 nemo/export/vllm/model_config.py delete mode 100644 nemo/export/vllm/model_converters.py delete mode 100644 nemo/export/vllm/model_loader.py delete mode 100644 nemo/export/vllm_exporter.py delete mode 100755 nemo/export/vllm_hf_exporter.py delete mode 100755 scripts/deploy/multimodal/deploy_triton.py delete mode 100644 scripts/deploy/multimodal/query.py delete mode 100644 scripts/deploy/nlp/benchmark_llm_inframework.py delete mode 100644 scripts/deploy/nlp/deploy_in_fw_oai_server_eval.py delete mode 100755 scripts/deploy/nlp/deploy_inframework_hf_triton.py delete mode 100755 scripts/deploy/nlp/deploy_inframework_triton.py delete mode 100755 scripts/deploy/nlp/deploy_triton.py delete mode 100755 scripts/deploy/nlp/deploy_vllm_triton.py delete mode 100644 scripts/deploy/nlp/query.py delete mode 100644 scripts/deploy/nlp/query_inframework.py delete mode 100644 scripts/deploy/nlp/query_inframework_hf.py delete mode 100644 scripts/export.py delete mode 100644 scripts/export/convert_nemo2_for_export.py delete mode 100644 scripts/export/export_mm_to_trtllm.py delete mode 100644 scripts/export/export_to_trt_llm.py delete mode 100755 scripts/export/setup_vllm_venv.sh delete mode 100644 tests/deploy/__init__.py delete mode 100644 tests/deploy/nemo_deploy.py delete mode 100755 tests/deploy/test_deploy_base.py delete mode 100755 tests/deploy/test_deploy_pytriton.py delete mode 100755 tests/deploy/test_deploy_query.py delete mode 100644 tests/deploy/test_deploy_utils.py delete mode 100644 tests/deploy/test_deployment_service.py delete mode 100755 tests/deploy/test_hf_deployable.py delete mode 100644 tests/deploy/test_hf_import.py delete mode 100644 tests/deploy/test_megatronllm_deployable.py delete mode 100755 tests/deploy/test_query_llm.py delete mode 100644 tests/deploy/test_query_multimodal.py delete mode 100644 tests/deploy/test_triton_deployable.py delete mode 100644 
tests/export/__init__.py delete mode 100644 tests/export/multimodal/test_build.py delete mode 100755 tests/export/multimodal/test_converter.py delete mode 100644 tests/export/nemo_export.py delete mode 100644 tests/export/test_export_onnx.py delete mode 100644 tests/export/test_mock_import.py delete mode 100644 tests/export/test_model_loading.py delete mode 100644 tests/export/test_onnx_llm_exporter.py delete mode 100644 tests/export/test_quantizer.py delete mode 100644 tests/export/test_sentencepiece_tokenizer.py delete mode 100644 tests/export/test_tarutils.py delete mode 100755 tests/export/test_tensorrt_lazy_compiler.py delete mode 100644 tests/export/test_tensorrt_llm.py delete mode 100644 tests/export/test_tensorrt_mm_exporter.py delete mode 100644 tests/export/test_tiktoken_tokenizer.py delete mode 100644 tests/export/test_trt_compile.py delete mode 100644 tests/export/test_vllm_hf_exporter.py delete mode 100644 tests/export/trt_llm/__init__.py delete mode 100644 tests/export/trt_llm/converter/__init__.py delete mode 100755 tests/export/trt_llm/converter/test_converter_utils.py delete mode 100644 tests/export/trt_llm/converter/test_model_converter.py delete mode 100644 tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py delete mode 100755 tests/export/trt_llm/test_tensorrt_llm_export.py delete mode 100644 tests/export/utils/test_exp_utils.py delete mode 100644 tests/export/utils/test_lora_converter.py delete mode 100644 tests/export/utils/test_model_loader.py delete mode 100644 tests/functional_tests/L0_Unit_Tests_CPU_Export_Deploy.sh delete mode 100644 tests/functional_tests/L0_Unit_Tests_GPU_Export_Deploy.sh delete mode 100644 tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_In_Framework.sh diff --git a/docker/Dockerfile.ci.export_deploy b/docker/Dockerfile.ci.export_deploy deleted file mode 100644 index 68a3210bab89..000000000000 --- a/docker/Dockerfile.ci.export_deploy +++ /dev/null @@ -1,98 +0,0 @@ -# syntax=docker/dockerfile:1-labs - -# 
Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3 - -FROM ${BASE_IMAGE} AS base-image -ENV PIP_CONSTRAINT="" -ARG IMAGE_LABEL -LABEL "nemo.library"=${IMAGE_LABEL} - -ENV TRANSFORMERS_OFFLINE=0 -ENV HYDRA_FULL_ERROR=1 -ENV PYTHONUNBUFFERED=1 - -# APT packages -RUN bash -ex <<"EOF" -apt-get update -apt-get install -y bc -apt-get clean -EOF - -WORKDIR /opt/NeMo -ARG TRTLLM_REPO -ARG TRTLLM_TAG -RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \ - --mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF" - - bash /opt/NeMo/install_dep.sh --library trt --mode install -EOF - -FROM base-image AS trt-llm-wheel -WORKDIR /opt/NeMo -ARG TRTLLM_REPO -ARG TRTLLM_TAG -RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \ - --mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF" - - bash /opt/NeMo/install_dep.sh --library trtllm --mode build -EOF - -FROM base-image as te-wheel -WORKDIR /opt/NeMo -ARG TE_REPO -ARG TE_TAG -RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \ - --mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF" - - bash /opt/NeMo/install_dep.sh --library te --mode build - ls -al /opt/Megatron-LM || true -EOF - -FROM base-image as 
mcore-wheel -WORKDIR /opt/NeMo -ARG MLM_REPO -ARG MLM_TAG -RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh bash -ex <<"EOF" - - bash /opt/NeMo/install_dep.sh --library mcore --mode build - ls -al /opt/Megatron-LM || true -EOF - -FROM base-image -WORKDIR /opt/NeMo -ENV INSTALL_DIR="/opt" -RUN \ - --mount=type=bind,from=trt-llm-wheel,source=/opt/wheels/trtllm,target=/opt/wheels/trtllm \ - --mount=type=bind,from=te-wheel,source=/opt/wheels/te,target=/opt/wheels/te \ - --mount=type=bind,from=mcore-wheel,source=/opt/wheels/mcore,target=/opt/wheels/mcore \ - --mount=type=bind,source=requirements,target=/opt/NeMo/requirements \ - --mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/opt/NeMo/tools/ctc_segmentation/requirements.txt \ - --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \ - --mount=type=bind,source=setup.py,target=/opt/NeMo/setup.py \ - --mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches \ - --mount=type=bind,source=README.md,target=/opt/NeMo/README.md \ - --mount=type=bind,source=nemo/package_info.py,target=/opt/NeMo/nemo/package_info.py \ - --mount=type=bind,source=nemo/__init__.py,target=/opt/NeMo/nemo/__init__.py bash -ex <<"EOF" - - bash /opt/NeMo/install_dep.sh --library all --mode install - pip install --no-cache-dir ".[deploy,test]" - -EOF - -WORKDIR /workspace -ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" -ENV NEMO_HOME="/home/TestData/nemo_home" diff --git a/nemo/deploy/__init__.py b/nemo/deploy/__init__.py deleted file mode 100644 index 2859fd065711..000000000000 --- a/nemo/deploy/__init__.py +++ /dev/null @@ -1,27 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import warnings - -from nemo.deploy.deploy_base import DeployBase # noqa: F401 -from nemo.deploy.deploy_pytriton import DeployPyTriton # noqa: F401 -from nemo.deploy.triton_deployable import ITritonDeployable # noqa: F401 - -warnings.warn( - "The 'nemo.deploy' is deprecated and will be removed in NeMo FW 25.09 container release. " - "For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Export-Deploy", - DeprecationWarning, - stacklevel=2, -) diff --git a/nemo/deploy/deploy_base.py b/nemo/deploy/deploy_base.py deleted file mode 100644 index aeb94255a273..000000000000 --- a/nemo/deploy/deploy_base.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import importlib -import logging -from abc import ABC, abstractmethod - -use_pytorch_lightning = True -try: - from lightning.pytorch import Trainer -except Exception: - use_pytorch_lightning = False - -from nemo.deploy.triton_deployable import ITritonDeployable - -use_nemo = True -try: - from nemo.core.classes.modelPT import ModelPT -except Exception: - use_nemo = False - - -LOGGER = logging.getLogger("NeMo") - - -class DeployBase(ABC): - def __init__( - self, - triton_model_name: str, - triton_model_version: int = 1, - checkpoint_path: str = None, - model=None, - max_batch_size: int = 128, - http_port: int = 8000, - grpc_port: int = 8001, - address="0.0.0.0", - allow_grpc=True, - allow_http=True, - streaming=False, - pytriton_log_verbose=0, - ): - self.checkpoint_path = checkpoint_path - self.triton_model_name = triton_model_name - self.triton_model_version = triton_model_version - self.max_batch_size = max_batch_size - self.model = model - self.http_port = http_port - self.grpc_port = grpc_port - self.address = address - self.triton = None - self.allow_grpc = allow_grpc - self.allow_http = allow_http - self.streaming = streaming - self.pytriton_log_verbose = pytriton_log_verbose - - if checkpoint_path is None and model is None: - raise Exception("Either checkpoint_path or model should be provided.") - - @abstractmethod - def deploy(self): - pass - - @abstractmethod - def serve(self): - pass - - @abstractmethod - def run(self): - pass - - @abstractmethod - def stop(self): - pass - - def _init_nemo_model(self): - if self.checkpoint_path is not None: - model_config = ModelPT.restore_from(self.checkpoint_path, return_config=True) - module_path, class_name = DeployBase.get_module_and_class(model_config.target) - cls = getattr(importlib.import_module(module_path), class_name) - self.model = cls.restore_from(restore_path=self.checkpoint_path, trainer=Trainer()) - self.model.freeze() - - # has to turn off activations_checkpoint_method for inference - try: - 
self.model.model.language_model.encoder.activations_checkpoint_method = None - except AttributeError as e: - LOGGER.warning(e) - - if self.model is None: - raise Exception("There is no model to deploy.") - - self._is_model_deployable() - - def _is_model_deployable(self): - if not issubclass(type(self.model), ITritonDeployable): - raise Exception( - "This model is not deployable to Triton." "nemo.deploy.ITritonDeployable class should be inherited" - ) - else: - return True - - @staticmethod - def get_module_and_class(target: str): - ln = target.rindex(".") - return target[0:ln], target[ln + 1 : len(target)] diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py deleted file mode 100644 index 9963f5422232..000000000000 --- a/nemo/deploy/deploy_pytriton.py +++ /dev/null @@ -1,184 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -use_pytriton = True -try: - from pytriton.model_config import ModelConfig - from pytriton.triton import Triton, TritonConfig -except Exception: - use_pytriton = False - -from nemo.deploy.deploy_base import DeployBase - - -class DeployPyTriton(DeployBase): - """ - Deploys any models to Triton Inference Server that implements ITritonDeployable interface in nemo.deploy. 
- - Example: - from nemo.deploy import DeployPyTriton, NemoQueryLLM - from nemo.export.tensorrt_llm import TensorRTLLM - - trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files") - trt_llm_exporter.export( - nemo_checkpoint_path="/path/for/nemo/checkpoint", - model_type="llama", - tensor_parallelism_size=1, - ) - - nm = DeployPyTriton(model=trt_llm_exporter, triton_model_name="model_name", http_port=8000) - nm.deploy() - nm.run() - nq = NemoQueryLLM(url="localhost", model_name="model_name") - - prompts = ["hello, testing GPT inference", "another GPT inference test?"] - output = nq.query_llm(prompts=prompts, max_output_len=100) - print("prompts: ", prompts) - print("") - print("output: ", output) - print("") - - prompts = ["Give me some info about Paris", "Do you think Londan is a good city to visit?", "What do you think about Rome?"] - output = nq.query_llm(prompts=prompts, max_output_len=250) - print("prompts: ", prompts) - print("") - print("output: ", output) - print("") - - """ - - def __init__( - self, - triton_model_name: str, - triton_model_version: int = 1, - checkpoint_path: str = None, - model=None, - max_batch_size: int = 128, - http_port: int = 8000, - grpc_port: int = 8001, - address="0.0.0.0", - allow_grpc=True, - allow_http=True, - streaming=False, - pytriton_log_verbose=0, - ): - """ - A nemo checkpoint or model is expected for serving on Triton Inference Server. - - Args: - triton_model_name (str): Name for the service - triton_model_version(int): Version for the service - checkpoint_path (str): path of the nemo file - model (ITritonDeployable): A model that implements the ITritonDeployable from nemo.deploy import ITritonDeployable - max_batch_size (int): max batch size - port (int) : port for the Triton server - address (str): http address for Triton server to bind. 
- """ - - super().__init__( - triton_model_name=triton_model_name, - triton_model_version=triton_model_version, - checkpoint_path=checkpoint_path, - model=model, - max_batch_size=max_batch_size, - http_port=http_port, - grpc_port=grpc_port, - address=address, - allow_grpc=allow_grpc, - allow_http=allow_http, - streaming=streaming, - pytriton_log_verbose=pytriton_log_verbose, - ) - - def deploy(self): - """ - Deploys any models to Triton Inference Server. - """ - - self._init_nemo_model() - - try: - if self.streaming: - # TODO: can't set allow_http=True due to a bug in pytriton, will fix in latest pytriton - triton_config = TritonConfig( - log_verbose=self.pytriton_log_verbose, - allow_grpc=self.allow_grpc, - allow_http=self.allow_http, - grpc_address=self.address, - ) - self.triton = Triton(config=triton_config) - self.triton.bind( - model_name=self.triton_model_name, - model_version=self.triton_model_version, - infer_func=self.model.triton_infer_fn_streaming, - inputs=self.model.get_triton_input, - outputs=self.model.get_triton_output, - config=ModelConfig(decoupled=True), - ) - else: - triton_config = TritonConfig( - http_address=self.address, - http_port=self.http_port, - grpc_address=self.address, - grpc_port=self.grpc_port, - allow_grpc=self.allow_grpc, - allow_http=self.allow_http, - ) - self.triton = Triton(config=triton_config) - self.triton.bind( - model_name=self.triton_model_name, - model_version=self.triton_model_version, - infer_func=self.model.triton_infer_fn, - inputs=self.model.get_triton_input, - outputs=self.model.get_triton_output, - config=ModelConfig(max_batch_size=self.max_batch_size), - ) - except Exception as e: - self.triton = None - print(e) - - def serve(self): - """ - Starts serving the model and waits for the requests - """ - - if self.triton is None: - raise Exception("deploy should be called first.") - - try: - self.triton.serve() - except Exception as e: - self.triton = None - print(e) - - def run(self): - """ - Starts serving the 
model asynchronously. - """ - - if self.triton is None: - raise Exception("deploy should be called first.") - - self.triton.run() - - def stop(self): - """ - Stops serving the model. - """ - - if self.triton is None: - raise Exception("deploy should be called first.") - - self.triton.stop() diff --git a/nemo/deploy/multimodal/__init__.py b/nemo/deploy/multimodal/__init__.py deleted file mode 100644 index ead9c7084837..000000000000 --- a/nemo/deploy/multimodal/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.deploy.multimodal.query_multimodal import NemoQueryMultimodal diff --git a/nemo/deploy/multimodal/query_multimodal.py b/nemo/deploy/multimodal/query_multimodal.py deleted file mode 100644 index fb8b3f9c6f5b..000000000000 --- a/nemo/deploy/multimodal/query_multimodal.py +++ /dev/null @@ -1,164 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -from io import BytesIO - -import numpy as np -import requests -import soundfile as sf -from PIL import Image - -from nemo.deploy.utils import str_list2numpy - -use_pytriton = True -try: - from pytriton.client import ModelClient -except Exception: - use_pytriton = False - -try: - from decord import VideoReader -except Exception: - import logging - - logging.warning("The package `decord` was not installed in this environment.") - - -class NemoQueryMultimodal: - """ - Sends a query to Triton for Multimodal inference - - Example: - from nemo.deploy.multimodal import NemoQueryMultimodal - - nq = NemoQueryMultimodal(url="localhost", model_name="neva", model_type="neva") - - input_text = "Hi! What is in this image?" - output = nq.query( - input_text=input_text, - input_media="/path/to/image.jpg", - max_output_len=30, - top_k=1, - top_p=0.0, - temperature=1.0, - ) - print("prompts: ", prompts) - """ - - def __init__(self, url, model_name, model_type): - self.url = url - self.model_name = model_name - self.model_type = model_type - - def setup_media(self, input_media): - """Setup input media""" - if self.model_type == "video-neva": - vr = VideoReader(input_media) - frames = [f.asnumpy() for f in vr] - return np.array(frames) - elif self.model_type == "lita" or self.model_type == "vita": - vr = VideoReader(input_media) - frames = [f.asnumpy() for f in vr] - subsample_len = self.frame_len(frames) - sub_frames = self.get_subsampled_frames(frames, subsample_len) - return np.array(sub_frames) - elif self.model_type in ["neva", "vila", "mllama"]: - if input_media.startswith("http") or input_media.startswith("https"): - response = requests.get(input_media, timeout=5) - media = Image.open(BytesIO(response.content)).convert("RGB") - else: - media = Image.open(input_media).convert('RGB') - return np.expand_dims(np.array(media), axis=0) - elif self.model_type == "salm": - 
waveform, sample_rate = sf.read(input_media, dtype=np.float32) - input_signal = np.array([waveform], dtype=np.float32) - input_signal_length = np.array([[len(waveform)]], dtype=np.int32) - return {"input_signal": input_signal, "input_signal_length": input_signal_length} - else: - raise RuntimeError(f"Invalid model type {self.model_type}") - - def frame_len(self, frames): - """Get frame len""" - max_frames = 256 - if len(frames) <= max_frames: - return len(frames) - else: - subsample = int(np.ceil(float(len(frames)) / max_frames)) - return int(np.round(float(len(frames)) / subsample)) - - def get_subsampled_frames(self, frames, subsample_len): - """Get subsampled frames""" - idx = np.round(np.linspace(0, len(frames) - 1, subsample_len)).astype(int) - sub_frames = [frames[i] for i in idx] - return sub_frames - - def query( - self, - input_text, - input_media, - batch_size=1, - max_output_len=30, - top_k=1, - top_p=0.0, - temperature=1.0, - repetition_penalty=1.0, - num_beams=1, - init_timeout=60.0, - lora_uids=None, - ): - """Run query""" - - prompts = str_list2numpy([input_text]) - inputs = {"input_text": prompts} - - media = self.setup_media(input_media) - if isinstance(media, dict): - inputs.update(media) - else: - inputs["input_media"] = np.repeat(media[np.newaxis, :, :, :, :], prompts.shape[0], axis=0) - - if batch_size is not None: - inputs["batch_size"] = np.full(prompts.shape, batch_size, dtype=np.int_) - - if max_output_len is not None: - inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_) - - if top_k is not None: - inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) - - if top_p is not None: - inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) - - if temperature is not None: - inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) - - if repetition_penalty is not None: - inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single) - - if num_beams is 
not None: - inputs["num_beams"] = np.full(prompts.shape, num_beams, dtype=np.int_) - - if lora_uids is not None: - lora_uids = np.char.encode(lora_uids, "utf-8") - inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) - - with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: - result_dict = client.infer_batch(**inputs) - output_type = client.model_config.outputs[0].dtype - - if output_type == np.bytes_: - sentences = np.char.decode(result_dict["outputs"].astype("bytes"), "utf-8") - return sentences - else: - return result_dict["outputs"] diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py deleted file mode 100755 index 8680e795a399..000000000000 --- a/nemo/deploy/nlp/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMHF, NemoQueryLLMPyTorch - -__all__ = [ - "NemoQueryLLM", - "NemoQueryLLMHF", - "NemoQueryLLMPyTorch", -] diff --git a/nemo/deploy/nlp/hf_deployable.py b/nemo/deploy/nlp/hf_deployable.py deleted file mode 100755 index 0a2bb59117bd..000000000000 --- a/nemo/deploy/nlp/hf_deployable.py +++ /dev/null @@ -1,319 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import logging -from typing import Any, List, Optional - -import numpy as np -import torch -from peft import PeftModel -from pytriton.decorators import batch -from pytriton.model_config import Tensor -from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer - -from nemo.deploy import ITritonDeployable -from nemo.deploy.utils import broadcast_list, cast_output, str_ndarray2list - -LOGGER = logging.getLogger("NeMo") - -SUPPORTED_TASKS = ["text-generation"] - - -class HuggingFaceLLMDeploy(ITritonDeployable): - """A Triton inference server compatible wrapper for HuggingFace models. - - This class provides a standardized interface for deploying HuggingFace models - in Triton inference server. It supports various NLP tasks and handles model - loading, inference, and deployment configurations. - - Args: - hf_model_id_path (Optional[str]): Path to the HuggingFace model or model identifier. - Can be a local path or a model ID from HuggingFace Hub. - hf_peft_model_id_path (Optional[str]): Path to the PEFT model or model identifier. - Can be a local path or a model ID from HuggingFace Hub. - tokenizer_id_path (Optional[str]): Path to the tokenizer or tokenizer identifier. - If None, will use the same path as hf_model_id_path. - model (Optional[AutoModel]): Pre-loaded HuggingFace model. - tokenizer (Optional[AutoTokenizer]): Pre-loaded HuggingFace tokenizer. - tokenizer_padding (bool): Whether to enable padding in tokenizer. Defaults to True. - tokenizer_truncation (bool): Whether to enable truncation in tokenizer. Defaults to True. 
- tokenizer_padding_side (str): Which side to pad on ('left' or 'right'). Defaults to 'left'. - task (str): HuggingFace task type (e.g., "text-generation"). Defaults to "text-generation". - **hf_kwargs: Additional keyword arguments to pass to HuggingFace model loading. - """ - - def __init__( - self, - hf_model_id_path: Optional[str] = None, - hf_peft_model_id_path: Optional[str] = None, - tokenizer_id_path: Optional[str] = None, - model: Optional[AutoModel] = None, - tokenizer: Optional[AutoTokenizer] = None, - tokenizer_padding=True, - tokenizer_truncation=True, - tokenizer_padding_side="left", - task: Optional[str] = "text-generation", - **hf_kwargs, - ): - if hf_model_id_path is None and model is None: - raise ValueError("hf_model_id_path or model parameters has to be passed.") - elif hf_model_id_path is not None and model is not None: - LOGGER.warning( - "hf_model_id_path will be ignored and the HuggingFace model " "set with model parameter will be used." - ) - - assert task in SUPPORTED_TASKS, "Task {0} is not a support task.".format(task) - - self.hf_model_id_path = hf_model_id_path - self.hf_peft_model_id_path = hf_peft_model_id_path - self.task = task - self.model = model - self.tokenizer = tokenizer - self.tokenizer_padding = tokenizer_padding - self.tokenizer_truncation = tokenizer_truncation - self.tokenizer_padding_side = tokenizer_padding_side - - if tokenizer_id_path is None: - self.tokenizer_id_path = hf_model_id_path - else: - self.tokenizer_id_path = tokenizer_id_path - - if model is None: - self._load(**hf_kwargs) - - def _load(self, **hf_kwargs) -> None: - """ - Load the HuggingFace pipeline with the specified model and task. - - This method initializes the HuggingFace AutoModel classes using the provided model - configuration and task type. It handles the model and tokenizer loading - process. - - Raises: - AssertionError: If task is not specified. - """ - assert self.task is not None, "A task has to be given for the generation task." 
- - if self.task == "text-generation": - self.model = AutoModelForCausalLM.from_pretrained(self.hf_model_id_path, **hf_kwargs) - - if self.hf_peft_model_id_path is not None: - self.model = PeftModel.from_pretrained(self.model, self.hf_peft_model_id_path) - else: - raise ValueError("Task {0} is not supported.".format(self.task)) - - self.model.cuda() - self.tokenizer = AutoTokenizer.from_pretrained( - self.tokenizer_id_path, - trust_remote_code=hf_kwargs.pop("trust_remote_code", False), - padding=self.tokenizer_padding, - truncation=self.tokenizer_truncation, - padding_side=self.tokenizer_padding_side, - ) - - if self.tokenizer.pad_token is None: - self.tokenizer.pad_token = self.tokenizer.eos_token - - def generate( - self, - **kwargs: Any, - ) -> List[str]: - """Generate text based on the provided input prompts. - - This method processes input prompts through the loaded pipeline and - generates text according to the specified parameters. - - Args: - **kwargs: Generation parameters including: - - text_inputs: List of input prompts - - max_length: Maximum number of tokens to generate - - num_return_sequences: Number of sequences to generate per prompt - - temperature: Sampling temperature - - top_k: Number of highest probability tokens to consider - - top_p: Cumulative probability threshold for token sampling - - do_sample: Whether to use sampling - - return_full_text: Whether to return full text or only generated part - - Returns: - If output logits and output scores are False: - List[str]: A list of generated texts, one for each input prompt. - If output logits and output scores are True: - Dict: A dictionary containing: - - sentences: List of generated texts - - logits: List of logits - - scores: List of scores - - Raises: - RuntimeError: If the pipeline is not initialized. 
- """ - - if not self.model: - raise RuntimeError("Model is not initialized") - - inputs = self.tokenizer( - kwargs["text_inputs"], - return_tensors="pt", - padding=self.tokenizer_padding, - truncation=self.tokenizer_truncation, - ) - kwargs = {**inputs, **kwargs} - kwargs.pop("text_inputs") - for key, val in kwargs.items(): - if torch.is_tensor(val): - kwargs[key] = val.cuda() - - with torch.no_grad(): - generated_ids = self.model.generate(**kwargs) - return_dict_in_generate = kwargs.get("return_dict_in_generate", False) - if return_dict_in_generate: - output = {"sentences": self.tokenizer.batch_decode(generated_ids["sequences"], skip_special_tokens=True)} - if kwargs.get("output_logits", False): - output["logits"] = generated_ids["logits"] - if kwargs.get("output_scores", False): - output["scores"] = generated_ids["scores"] - else: - output = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - return output - - def generate_other_ranks(self): - """ - Generate function for ranks other than the rank 0. 
- """ - - while True: - message = torch.empty(1, dtype=torch.long, device="cuda") - torch.distributed.broadcast(message, src=0) - if message == 0: - prompts = broadcast_list(data=[None], src=0) - temperature, top_k, top_p, num_tokens_to_generate, output_logits, output_scores = broadcast_list( - data=[None], src=0 - ) - - return_dict_in_generate = False - if output_logits or output_scores: - return_dict_in_generate = True - - self.generate( - text_inputs=prompts, - do_sample=True, - top_k=top_k, - top_p=top_p, - temperature=temperature, - max_new_tokens=num_tokens_to_generate, - output_logits=output_logits, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - ) - else: - return - - @property - def get_triton_input(self): - inputs = ( - Tensor(name="prompts", shape=(-1,), dtype=bytes), - Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="max_batch_size", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="random_seed", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="output_logits", shape=(-1,), dtype=np.bool_, optional=True), - Tensor(name="output_scores", shape=(-1,), dtype=np.bool_, optional=True), - ) - return inputs - - @property - def get_triton_output(self): - return ( - Tensor(name="sentences", shape=(-1,), dtype=bytes), - Tensor(name="logits", shape=(-1,), dtype=np.single), - Tensor(name="scores", shape=(-1,), dtype=np.single), - ) - - @batch - def triton_infer_fn(self, **inputs: np.ndarray): - output_infer = {} - - try: - prompts = str_ndarray2list(inputs.pop("prompts")) - temperature = inputs.pop("temperature")[0][0] if "temperature" in inputs else 1.0 - top_k = int(inputs.pop("top_k")[0][0] if 
"top_k" in inputs else 1) - top_p = inputs.pop("top_p")[0][0] if "top_k" in inputs else 0.0 - num_tokens_to_generate = inputs.pop("max_length")[0][0] if "max_length" in inputs else 256 - output_logits = inputs.pop("output_logits")[0][0] if "output_logits" in inputs else False - output_scores = inputs.pop("output_scores")[0][0] if "output_scores" in inputs else False - return_dict_in_generate = False - if output_logits or output_scores: - return_dict_in_generate = True - - if torch.distributed.is_initialized(): - if torch.distributed.get_world_size() > 1: - torch.distributed.broadcast(torch.tensor([0], dtype=torch.long, device="cuda"), src=0) - broadcast_list(prompts, src=0) - broadcast_list( - data=[ - temperature, - top_k, - top_p, - num_tokens_to_generate, - output_logits, - output_scores, - ], - src=0, - ) - - output = self.generate( - text_inputs=prompts, - do_sample=True, - top_k=top_k, - top_p=top_p, - temperature=temperature, - max_new_tokens=num_tokens_to_generate, - output_logits=output_logits, - output_scores=output_scores, - return_dict_in_generate=return_dict_in_generate, - ) - - if isinstance(output, dict): - output_infer = {"sentences": cast_output(output["sentences"], np.bytes_)} - - if "scores" in output.keys(): - output_scores = [] - for r in output["scores"]: - lp = torch.tensor(r).cpu().detach().numpy() - if len(lp) == 0: - output_scores.append([0]) - else: - output_scores.append(lp) - output_infer["scores"] = np.array(output_scores).transpose(1, 0, 2) - - if "logits" in output.keys(): - output_logits = [] - for r in output["logits"]: - lp = torch.tensor(r).cpu().detach().numpy() - if len(lp) == 0: - output_logits.append([0]) - else: - output_logits.append(lp) - output_infer["logits"] = np.array(output_logits).transpose(1, 0, 2) - else: - output_infer = {"sentences": cast_output(output, np.bytes_)} - - except Exception as error: - err_msg = "An error occurred: {0}".format(str(error)) - output_infer["sentences"] = cast_output([err_msg], np.bytes_) 
- - return output_infer diff --git a/nemo/deploy/nlp/megatronllm_deployable.py b/nemo/deploy/nlp/megatronllm_deployable.py deleted file mode 100755 index 476dca598c51..000000000000 --- a/nemo/deploy/nlp/megatronllm_deployable.py +++ /dev/null @@ -1,414 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import logging -from pathlib import Path -from typing import List, Optional - -import numpy as np -import torch -import torch.distributed -import wrapt -from jinja2 import Template -from megatron.core.dist_checkpointing.validation import StrictHandling -from megatron.core.inference.common_inference_params import CommonInferenceParams -from megatron.core.inference.inference_request import InferenceRequest - -import nemo.lightning as nl -from nemo.collections.llm import inference -from nemo.deploy import ITritonDeployable -from nemo.deploy.utils import NEMO2, broadcast_list, cast_output, nemo_checkpoint_version, str_ndarray2list - - -@wrapt.decorator -def noop_decorator(func): - """A no-op decorator that returns the original function unchanged. - - Used as a fallback when pytriton's batch decorator is not available. - - Args: - func: The function to decorate - - Returns: - The original function without any modifications - """ - - def wrapper(*args, **kwargs): - """ - Wrapper method returning the func. 
- """ - return func(*args, **kwargs) - - return wrapper - - -use_pytriton = True -batch = noop_decorator -try: - from pytriton.decorators import batch, first_value - from pytriton.model_config import Tensor -except Exception: - use_pytriton = False - -LOGGER = logging.getLogger("NeMo") - - -class MegatronLLMDeploy: - """ - A factory class for creating deployable instances of Megatron LLM models. - This class provides a method to get the appropriate deployable instance - based on the version of the NeMo checkpoint model used. - """ - - @staticmethod - def get_deployable( - nemo_checkpoint_filepath: str, - num_devices: int = 1, - num_nodes: int = 1, - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - context_parallel_size: int = 1, - max_batch_size: int = 32, - random_seed: Optional[int] = None, - enable_flash_decode: bool = False, - legacy_ckpt: bool = False, - ): - """ - Returns the appropriate deployable instance for the given NeMo checkpoint. - - Args: - nemo_checkpoint_filepath (str): Path to the .nemo checkpoint file. - num_devices (int): Number of devices to use for deployment. - num_nodes (int): Number of nodes to use for deployment. - tensor_model_parallel_size (int): Size of the tensor model parallelism. - pipeline_model_parallel_size (int): Size of the pipeline model parallelism. - context_parallel_size (int): Size of the context parallelism. - enable_flash_decode (bool): Whether to enable flash decode for inference. - - Returns: - ITritonDeployable: An instance of a deployable class compatible with Triton inference server. 
- """ - if nemo_checkpoint_version(nemo_checkpoint_filepath) == NEMO2: - return MegatronLLMDeployableNemo2( - nemo_checkpoint_filepath=nemo_checkpoint_filepath, - num_devices=num_devices, - num_nodes=num_nodes, - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - max_batch_size=max_batch_size, - random_seed=random_seed, - enable_flash_decode=enable_flash_decode, - legacy_ckpt=legacy_ckpt, - ) - else: - raise Exception("Only NeMo 2.0 checkpoint is supported.") - - -def dict_to_str(messages): - """ - Serializes dict to str - """ - return json.dumps(messages) - - -class MegatronLLMDeployableNemo2(ITritonDeployable): - """ - Triton inference server compatible deploy class for a .nemo model file - - Args: - nemo_checkpoint_filepath (str): path for the nemo checkpoint. - num_devices (int): number of GPUs. - num_nodes (int): number of nodes. - tensor_model_parallel_size (int): tensor parallelism. - pipeline_parallelism_size (int): pipeline parallelism. - context_parallel_size (int): context parallelism. - params_dtype (torch.dtype): max input length. - inference_batch_times_seqlen_threshold (int): squence threshold. - inference_max_seq_length (int): max_seq_length for inference. Required by MCoreEngine (>=0.12). Defaults to - 4096. - max_batch_size (int): max batch size for inference. Defaults to 32. - random_seed (Optional[int]): random seed for inference. Defaults to None. - enable_flash_decode (bool): enable flash decode for inference. Defaults to False. 
- """ - - def __init__( - self, - nemo_checkpoint_filepath: str = None, - num_devices: int = 1, - num_nodes: int = 1, - tensor_model_parallel_size: int = 1, - pipeline_model_parallel_size: int = 1, - context_parallel_size: int = 1, - expert_model_parallel_size: int = 1, - expert_tensor_parallel_size: int = 1, - params_dtype: torch.dtype = torch.bfloat16, - inference_batch_times_seqlen_threshold: int = 1000, - inference_max_seq_length: int = 4096, - max_batch_size: int = 32, - random_seed: Optional[int] = None, - enable_flash_decode: bool = True, - legacy_ckpt: bool = False, - ): - self.nemo_checkpoint_filepath = nemo_checkpoint_filepath - - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=tensor_model_parallel_size, - pipeline_model_parallel_size=pipeline_model_parallel_size, - context_parallel_size=context_parallel_size, - expert_model_parallel_size=expert_model_parallel_size, - expert_tensor_parallel_size=expert_tensor_parallel_size, - sequence_parallel=False, - setup_optimizers=False, - store_optimizer_states=False, - ckpt_load_strictness=StrictHandling.LOG_ALL if legacy_ckpt else None, - ) - - trainer = nl.Trainer( - accelerator="gpu", - devices=num_devices, - num_nodes=num_nodes, - strategy=strategy, - plugins=nl.MegatronMixedPrecision( - precision="bf16-mixed", - params_dtype=torch.bfloat16, - pipeline_dtype=torch.bfloat16, - autocast_enabled=False, - grad_reduce_in_fp32=False, - ), - ) - - self.mcore_engine, self.inference_wrapped_model, self.mcore_tokenizer = inference.setup_mcore_engine( - path=Path(nemo_checkpoint_filepath), - trainer=trainer, - params_dtype=params_dtype, - inference_batch_times_seqlen_threshold=inference_batch_times_seqlen_threshold, - inference_max_seq_length=inference_max_seq_length, - max_batch_size=max_batch_size, - random_seed=random_seed, - enable_flash_decode=enable_flash_decode, - ) - - def generate( - self, prompts: List[str], inference_params: Optional[CommonInferenceParams] = None - ) -> List[InferenceRequest]: - 
""" - Generates text based on the provided input prompts. - - Args: - prompts (List[str]): A list of input strings. - inference_params (Optional[CommonInferenceParams]): Parameters for controlling the inference process. - Returns: - List[InferenceRequest]: A list containing the generated results. - """ - - inference_params = inference_params or CommonInferenceParams() - results = self.mcore_engine.generate( - prompts=prompts, - add_BOS=False, - common_inference_params=inference_params, - ) - return list(results) - - def generate_other_ranks(self): - """ - Generate function for ranks other than the rank 0. - """ - - while True: - message = torch.empty(1, dtype=torch.long, device="cuda") - torch.distributed.broadcast(message, src=0) - if message == 0: - prompts = broadcast_list(data=[None], src=0) - temperature, top_k, top_p, num_tokens_to_generate, log_probs = broadcast_list(data=[None], src=0) - - inference_params = CommonInferenceParams( - temperature=temperature, - top_k=int(top_k), - top_p=float(top_p), - num_tokens_to_generate=num_tokens_to_generate, - return_log_probs=log_probs, - ) - - self.generate(prompts, inference_params) - else: - return - - def apply_chat_template(self, messages, add_generation_prompt=True): - """ - Load the chat template. - Works when model's tokenizer has chat template (typically chat models). 
- """ - try: - tokenizer_chat_template = self.mcore_tokenizer.tokenizer.tokenizer.chat_template - bos_token = self.mcore_tokenizer.tokenizer.tokenizer.bos_token - template = Template(tokenizer_chat_template) - except AttributeError: - # If the tokenizer does not have chat_template - raise ValueError( - "The tokenizer does not have chat template, if you would like to evaluate chat model \ - ensure your model's tokenizer has a chat template" - ) - # Render the template with the provided messages - rendered_output = template.render( - messages=messages, bos_token=bos_token, add_generation_prompt=add_generation_prompt - ) - - return rendered_output - - def remove_eos_token(self, text): - """ - Removes eos token if it exists in the output, otherwise does nothing - """ - eos_token = self.mcore_tokenizer.tokenizer.tokenizer.eos_token - output = [] - for t in text: - if eos_token in t: - output.append(t.rsplit(eos_token, 1)[0]) - else: - output.append(t) - return output - - def str_to_dict(self, json_str): - """ - Convert str to dict. 
- """ - return json.loads(json_str) - - @property - def get_triton_input(self): - inputs = ( - Tensor(name="prompts", shape=(-1,), dtype=bytes), - Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="max_batch_size", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True), - Tensor(name="random_seed", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="compute_logprob", shape=(-1,), dtype=np.bool_, optional=True), - Tensor(name="apply_chat_template", shape=(-1,), dtype=np.bool_, optional=True), - Tensor(name="n_top_logprobs", shape=(-1,), dtype=np.int_, optional=True), - Tensor(name="echo", shape=(-1,), dtype=np.bool_, optional=True), - ) - return inputs - - @property - def get_triton_output(self): - return ( - Tensor(name="sentences", shape=(-1,), dtype=bytes), - Tensor(name="log_probs", shape=(-1,), dtype=np.single), - Tensor(name="top_logprobs", shape=(-1,), dtype=bytes), - ) - - @batch - @first_value( - "max_length", - "max_batch_size", - "top_k", - "top_p", - "temperature", - "random_seed", - "compute_logprob", - "apply_chat_template", - "n_top_logprobs", - "echo", - ) - def triton_infer_fn(self, **inputs: np.ndarray): - output_infer = {} - prompts = str_ndarray2list(inputs.pop("prompts")) - temperature = inputs.pop("temperature", 1.0) - top_k = inputs.pop("top_k", 1) - top_p = inputs.pop("top_p", 0.0) - num_tokens_to_generate = inputs.pop("max_length", 256) - log_probs = inputs.pop("compute_logprob", False) - apply_chat_template = inputs.pop("apply_chat_template", False) - top_logprobs = inputs.pop("n_top_logprobs", 0) - echo = inputs.pop("echo", False) - text_only = True - - if apply_chat_template: - # Deserialize the JSON string back to a dictionary - prompts = [self.str_to_dict(prompt) for prompt in prompts] - prompts = 
[self.apply_chat_template(prompt) for prompt in prompts] - # Input to generate should be list of string, otherwise if its string directly TE raises an error: - # The provided qkv memory layout is not supported! - if torch.distributed.is_initialized(): - if torch.distributed.get_world_size() > 1: - torch.distributed.broadcast(torch.tensor([0], dtype=torch.long, device="cuda"), src=0) - broadcast_list(prompts, src=0) - broadcast_list( - data=[ - temperature, - top_k, - top_p, - num_tokens_to_generate, - log_probs, - ], - src=0, - ) - - inference_params = CommonInferenceParams( - temperature=temperature, - top_k=int(top_k), - top_p=float(top_p), - num_tokens_to_generate=num_tokens_to_generate, - return_log_probs=log_probs, - top_n_logprobs=top_logprobs, - ) - - results = self.generate(prompts, inference_params) - if echo: - output_texts = [r.prompt + r.generated_text if text_only else r for r in results] - else: - output_texts = [r.generated_text if text_only else r for r in results] - output_texts = self.remove_eos_token(output_texts) - output_infer = {"sentences": cast_output(output_texts, np.bytes_)} - if log_probs: - output_log_probs = [] ## will have 2 np arrays if 2 prompts are sent - for r in results: - # Convert to torch tensor and then move to cpu as generated_log_probs is a list and cant be moved - # to cpu otherwise - if echo: - lp = torch.tensor(r.prompt_log_probs + r.generated_log_probs).cpu().detach().numpy() - else: - lp = torch.tensor(r.generated_log_probs).cpu().detach().numpy() - if len(lp) == 0: - output_log_probs.append([0]) - else: - output_log_probs.append(lp) - if echo: - # if echo, arrays in output_log_probs can have diff len due to diff num of prompt tokens. Pad the - # tokens in that case - # Find the maximum length - max_len = max(len(arr) for arr in output_log_probs) - # Pad each array to the maximum length. Pads 0. 
- padded = np.array( - [np.pad(arr, (0, max_len - len(arr)), constant_values=0) for arr in output_log_probs] - ) - - output_infer["log_probs"] = padded - else: - output_infer["log_probs"] = np.array(output_log_probs) - if top_logprobs: - output_top_n_log_probs = [] - for r in results: - # Convert to torch tensor and then move to cpu as generated_log_probs is a list and cant be moved - # to cpu otherwise - # TODO: if echo=True add top_logprobs for input tokens once supported - top_n_lp = dict_to_str(r.generated_top_n_logprobs) - output_top_n_log_probs.append(top_n_lp) - output_infer["top_logprobs"] = cast_output(output_top_n_log_probs, np.bytes_) - - return output_infer diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py deleted file mode 100755 index 258e11543f51..000000000000 --- a/nemo/deploy/nlp/query_llm.py +++ /dev/null @@ -1,544 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import time -from abc import ABC -from typing import List, Optional - -import numpy as np - -from nemo.deploy.utils import str_list2numpy - -use_pytriton = True -try: - from pytriton.client import DecoupledModelClient, ModelClient -except Exception: - use_pytriton = False - - -class NemoQueryLLMBase(ABC): - """ - Abstract base class for querying a Large Language Model (LLM). - - Args: - url (str): The URL of the inference server. 
- model_name (str): The name of the model to be queried. - """ - - def __init__(self, url, model_name): - self.url = url - self.model_name = model_name - - -class NemoQueryLLMPyTorch(NemoQueryLLMBase): - """ - Sends a query to Triton for LLM inference - - Example: - from nemo.deploy import NemoTritonQueryLLMPyTorch - - nq = NemoTritonQueryLLMPyTorch(url="localhost", model_name="GPT-2B") - - prompts = ["hello, testing GPT inference", "another GPT inference test?"] - output = nq.query_llm( - prompts=prompts, - max_length=100, - top_k=1, - top_p=0.0, - temperature=0.0, - ) - print("prompts: ", prompts) - """ - - def __init__(self, url, model_name): - super().__init__( - url=url, - model_name=model_name, - ) - - # these arguments are explicitly defined in order to make it clear to user what they can pass - # names and optionality should exactly match the get_triton_input() results for MegatronGPTDeployable - def query_llm( - self, - prompts: List[str], - use_greedy: Optional[bool] = None, - temperature: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, - repetition_penalty: Optional[float] = None, - add_BOS: Optional[bool] = None, - all_probs: Optional[bool] = None, - compute_logprob: Optional[bool] = None, - end_strings: Optional[List[str]] = None, - min_length: Optional[int] = None, - max_length: Optional[int] = None, - apply_chat_template: bool = False, - n_top_logprobs: Optional[int] = None, - init_timeout: float = 60.0, - echo: Optional[bool] = None, - ): - """ - Query the Triton server synchronously and return a list of responses. - - Args: - prompts (List(str)): list of sentences. - use_greedy (bool): use greedy sampling, effectively the same as top_k=1 - temperature (float): A parameter of the softmax function, which is the last layer in the network. - top_k (int): limits us to a certain number (K) of the top tokens to consider. - top_p (float): limits us to the top tokens within a certain probability mass (p). 
- repetition_penalty (float): penalty applied to repeated sequences, 1.0 means no penalty. - add_BOS (bool): whether or not to add a BOS (beginning of sentence) token. - all_probs (bool): when using compute_logprob, returns probabilities for all tokens in vocabulary. - compute_logprob (bool): get back probabilities of all tokens in the sequence. - end_strings (List(str)): list of strings which will terminate generation when they appear in the output. - min_length (int): min generated tokens. - max_length (int): max generated tokens. - apply_chat_template (bool): applies chat template if its a chat model. Default: False - init_timeout (flat): timeout for the connection. - """ - prompts = str_list2numpy(prompts) - inputs = { - "prompts": prompts, - } - if use_greedy is not None: - inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_) - if temperature is not None: - inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) - if top_k is not None: - inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) - if top_p is not None: - inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) - if repetition_penalty is not None: - inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single) - if add_BOS is not None: - inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_) - if all_probs is not None: - inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_) - if compute_logprob is not None: - inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_) - if end_strings is not None: - inputs["end_strings"] = str_list2numpy(end_strings) - if min_length is not None: - inputs["min_length"] = np.full(prompts.shape, min_length, dtype=np.int_) - if max_length is not None: - inputs["max_length"] = np.full(prompts.shape, max_length, dtype=np.int_) - if apply_chat_template is not None: - inputs["apply_chat_template"] = np.full(prompts.shape, 
apply_chat_template, dtype=np.bool_) - if n_top_logprobs is not None: - inputs["n_top_logprobs"] = np.full(prompts.shape, n_top_logprobs, dtype=np.int_) - if echo is not None: - inputs["echo"] = np.full(prompts.shape, echo, dtype=np.bool_) - - with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout, inference_timeout_s=600) as client: - result_dict = client.infer_batch(**inputs) - output_type = client.model_config.outputs[0].dtype - - log_probs_output = None - if "log_probs" in result_dict.keys(): - log_probs_output = result_dict["log_probs"] - - top_log_probs_output = None - if "top_logprobs" in result_dict.keys(): - top_log_probs_output = result_dict["top_logprobs"] - - if output_type == np.bytes_: - if "sentences" in result_dict.keys(): - output = result_dict["sentences"] - else: - return "Unknown output keyword." - - sentences = np.char.decode(output.astype("bytes"), "utf-8") - openai_response = { - "id": f"cmpl-{int(time.time())}", - "object": "text_completion", - "created": int(time.time()), - "model": self.model_name, - "choices": [{"text": sentences}], - } - - if log_probs_output is not None: - # logprobs are stored under choices in openai format. 
- openai_response["choices"][0]["logprobs"] = {} - openai_response["choices"][0]["logprobs"]["token_logprobs"] = log_probs_output - # TODO athitten: get top_n_logprobs from mcore once available - if top_log_probs_output is not None: - # we take 1st element because cast_output adds an extra dimension - n_log_probs_output = [json.loads(top_log_prob[0]) for top_log_prob in top_log_probs_output] - openai_response["choices"][0]["logprobs"]["top_logprobs"] = n_log_probs_output - return openai_response - else: - return result_dict["sentences"] - - -class NemoQueryLLMHF(NemoQueryLLMBase): - """ - Sends a query to Triton for LLM inference - - Example: - from nemo.deploy import NemoQueryLLMHF - - nq = NemoQueryLLMHF(url="localhost", model_name="GPT-2B") - - prompts = ["hello, testing GPT inference", "another GPT inference test?"] - output = nq.query_llm( - prompts=prompts, - max_length=100, - top_k=1, - top_p=0.0, - temperature=0.0, - ) - print("prompts: ", prompts) - """ - - def __init__(self, url, model_name): - super().__init__( - url=url, - model_name=model_name, - ) - - # these arguments are explicitly defined in order to make it clear to user what they can pass - # names and optionality should exactly match the get_triton_input() results for HuggingFaceLLMDeploy - def query_llm( - self, - prompts: List[str], - use_greedy: Optional[bool] = None, - temperature: Optional[float] = None, - top_k: Optional[int] = None, - top_p: Optional[float] = None, - repetition_penalty: Optional[float] = None, - add_BOS: Optional[bool] = None, - all_probs: Optional[bool] = None, - output_logits: Optional[bool] = None, - output_scores: Optional[bool] = None, - end_strings: Optional[List[str]] = None, - min_length: Optional[int] = None, - max_length: Optional[int] = None, - init_timeout: float = 60.0, - ): - """ - Query the Triton server synchronously and return a list of responses. - - Args: - prompts (List[str]): list of sentences. 
- use_greedy (Optional[bool]): use greedy sampling, effectively the same as top_k=1 - temperature (Optional[float]): A parameter of the softmax function, which is the last layer in the network. - top_k (Optional[int]): limits us to a certain number (K) of the top tokens to consider. - top_p (Optional[float]): limits us to the top tokens within a certain probability mass (p). - repetition_penalty (Optional[float]): penalty applied to repeated sequences, 1.0 means no penalty. - add_BOS (Optional[bool]): whether or not to add a BOS (beginning of sentence) token. - all_probs (Optional[bool]): when using compute_logprob, returns probabilities for all tokens in vocabulary. - output_logits (Optional[bool]): whether to return logits for each token - output_scores (Optional[bool]): whether to return scores for each token - end_strings (Optional[List[str]]): list of strs which will stop generation when they appear in the output. - min_length (Optional[int]): min generated tokens. - max_length (Optional[int]): max generated tokens. - init_timeout (float): timeout for the connection. 
- """ - prompts = str_list2numpy(prompts) - inputs = { - "prompts": prompts, - } - if use_greedy is not None: - inputs["use_greedy"] = np.full(prompts.shape, use_greedy, dtype=np.bool_) - if temperature is not None: - inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) - if top_k is not None: - inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) - if top_p is not None: - inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) - if repetition_penalty is not None: - inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single) - if add_BOS is not None: - inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_) - if all_probs is not None: - inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_) - if output_logits is not None: - inputs["output_logits"] = np.full(prompts.shape, output_logits, dtype=np.bool_) - if output_scores is not None: - inputs["output_scores"] = np.full(prompts.shape, output_scores, dtype=np.bool_) - if end_strings is not None: - inputs["end_strings"] = str_list2numpy(end_strings) - if min_length is not None: - inputs["min_length"] = np.full(prompts.shape, min_length, dtype=np.int_) - if max_length is not None: - inputs["max_length"] = np.full(prompts.shape, max_length, dtype=np.int_) - - with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: - result_dict = client.infer_batch(**inputs) - output_type = client.model_config.outputs[0].dtype - - if output_type == np.bytes_: - if "sentences" in result_dict.keys(): - output = result_dict["sentences"] - else: - return "Unknown output keyword." 
- - sentences = np.char.decode(output.astype("bytes"), "utf-8") - openai_response = { - "id": f"cmpl-{int(time.time())}", - "object": "text_completion", - "created": int(time.time()), - "model": self.model_name, - "choices": [{"text": sentences}], - } - if output_logits and "logits" in result_dict: - openai_response["logits"] = result_dict["logits"] - if output_scores and "scores" in result_dict: - openai_response["scores"] = result_dict["scores"] - return openai_response - else: - return result_dict["sentences"] - - -class NemoQueryLLM(NemoQueryLLMBase): - """ - Sends a query to Triton for LLM inference - - Example: - from nemo.deploy import NemoQueryLLM - - nq = NemoQueryLLM(url="localhost", model_name="GPT-2B") - - prompts = ["hello, testing GPT inference", "another GPT inference test?"] - output = nq.query_llm( - prompts=prompts, - max_output_len=100, - top_k=1, - top_p=0.0, - temperature=0.0, - ) - print("prompts: ", prompts) - """ - - def __init__(self, url, model_name): - super().__init__( - url=url, - model_name=model_name, - ) - - def query_llm( - self, - prompts, - stop_words_list=None, - bad_words_list=None, - no_repeat_ngram_size=None, - min_output_len=None, - max_output_len=None, - top_k=None, - top_p=None, - temperature=None, - random_seed=None, - task_id=None, - lora_uids=None, - use_greedy: bool = None, - repetition_penalty: float = None, - add_BOS: bool = None, - all_probs: bool = None, - compute_logprob: bool = None, - end_strings=None, - init_timeout=60.0, - openai_format_response: bool = False, - output_context_logits: bool = False, - output_generation_logits: bool = False, - ): - """ - Query the Triton server synchronously and return a list of responses. - - Args: - prompts (List(str)): list of sentences. - max_output_len (int): max generated tokens. - top_k (int): limits us to a certain number (K) of the top tokens to consider. - top_p (float): limits us to the top tokens within a certain probability mass (p). 
- temperature (float): A parameter of the softmax function, which is the last layer in the network. - random_seed (int): Seed to condition sampling. - stop_words_list (List(str)): list of stop words. - bad_words_list (List(str)): list of bad words. - no_repeat_ngram_size (int): no repeat ngram size. - task_id (str): downstream task id if virtual tokens are used. - init_timeout (flat): timeout for the connection. - openai_format_response: return response similar to OpenAI API format - output_generation_logits: return generation logits from model on PyTriton - """ - - prompts = str_list2numpy(prompts) - inputs = {"prompts": prompts} - - if min_output_len is not None: - inputs["min_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_) - - if max_output_len is not None: - inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_) - - if top_k is not None: - inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) - - if top_p is not None: - inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) - - if temperature is not None: - inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) - - if random_seed is not None: - inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.int_) - - if stop_words_list is not None: - inputs["stop_words_list"] = str_list2numpy(stop_words_list) - - if bad_words_list is not None: - inputs["bad_words_list"] = str_list2numpy(bad_words_list) - - if no_repeat_ngram_size is not None: - inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single) - - if task_id is not None: - task_id = np.char.encode(task_id, "utf-8") - inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id) - - if lora_uids is not None: - lora_uids = np.char.encode(lora_uids, "utf-8") - inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) - - if use_greedy is not None: - inputs["use_greedy"] = np.full(prompts.shape, 
use_greedy, dtype=np.bool_) - - if repetition_penalty is not None: - inputs["repetition_penalty"] = np.full(prompts.shape, repetition_penalty, dtype=np.single) - - if add_BOS is not None: - inputs["add_BOS"] = np.full(prompts.shape, add_BOS, dtype=np.bool_) - - if all_probs is not None: - inputs["all_probs"] = np.full(prompts.shape, all_probs, dtype=np.bool_) - - if compute_logprob is not None: - inputs["compute_logprob"] = np.full(prompts.shape, compute_logprob, dtype=np.bool_) - - if end_strings is not None: - inputs["end_strings"] = str_list2numpy(end_strings) - - if output_context_logits is not None: - inputs["output_context_logits"] = np.full(prompts.shape, output_context_logits, dtype=np.bool_) - - if output_generation_logits is not None: - inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_) - - with ModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: - result_dict = client.infer_batch(**inputs) - output_type = client.model_config.outputs[0].dtype - - if output_type == np.bytes_: - if "outputs" in result_dict.keys(): - output = result_dict["outputs"] - elif "sentences" in result_dict.keys(): - output = result_dict["sentences"] - else: - return "Unknown output keyword." 
- - sentences = np.char.decode(output.astype("bytes"), "utf-8") - if openai_format_response: - openai_response = { - "id": f"cmpl-{int(time.time())}", - "object": "text_completion", - "created": int(time.time()), - "model": self.model_name, - "choices": [{"text": sentences}], - } - if output_generation_logits: - openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"] - if output_context_logits: - openai_response["choices"][0]["context_logits"] = result_dict["context_logits"] - return openai_response - else: - return sentences - else: - return result_dict["outputs"] - - def query_llm_streaming( - self, - prompts, - stop_words_list=None, - bad_words_list=None, - no_repeat_ngram_size=None, - max_output_len=512, - top_k=1, - top_p=0.0, - temperature=1.0, - random_seed=None, - task_id=None, - lora_uids=None, - init_timeout=60.0, - ): - """ - Query the Triton server using streaming. - - Args: - prompts (List(str)): list of sentences. - max_output_len (int): max generated tokens. - top_k (int): limits us to a certain number (K) of the top tokens to consider. - top_p (float): limits us to the top tokens within a certain probability mass (p). - temperature (float): A parameter of the softmax function, which is the last layer in the network. - random_seed (int): Seed to condition sampling. - stop_words_list (List(str)): list of stop words. - bad_words_list (List(str)): list of bad words. - no_repeat_ngram_size (int): no repeat ngram size. - task_id (str): downstream task id if virtual tokens are used. - init_timeout (flat): timeout for the connection. 
- """ - - prompts = str_list2numpy(prompts) - inputs = {"prompts": prompts} - - if max_output_len is not None: - inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_) - - if top_k is not None: - inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_) - - if top_p is not None: - inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single) - - if temperature is not None: - inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single) - - if random_seed is not None: - inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.int_) - - if stop_words_list is not None: - stop_words_list = np.char.encode(stop_words_list, "utf-8") - inputs["stop_words_list"] = np.full((prompts.shape[0], len(stop_words_list)), stop_words_list) - - if bad_words_list is not None: - bad_words_list = np.char.encode(bad_words_list, "utf-8") - inputs["bad_words_list"] = np.full((prompts.shape[0], len(bad_words_list)), bad_words_list) - - if no_repeat_ngram_size is not None: - inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single) - - if task_id is not None: - task_id = np.char.encode(task_id, "utf-8") - inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id) - - if lora_uids is not None: - lora_uids = np.char.encode(lora_uids, "utf-8") - inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids) - - with DecoupledModelClient(self.url, self.model_name, init_timeout_s=init_timeout) as client: - for partial_result_dict in client.infer_batch(**inputs): - output_type = client.model_config.outputs[0].dtype - if output_type == np.bytes_: - sentences = np.char.decode(partial_result_dict["outputs"].astype("bytes"), "utf-8") - yield sentences - else: - yield partial_result_dict["outputs"] diff --git a/nemo/deploy/service/__init__.py b/nemo/deploy/service/__init__.py deleted file mode 100644 index 0349454da9e1..000000000000 --- a/nemo/deploy/service/__init__.py +++ 
/dev/null @@ -1,14 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from .rest_model_api import app diff --git a/nemo/deploy/service/fastapi_interface_to_pytriton.py b/nemo/deploy/service/fastapi_interface_to_pytriton.py deleted file mode 100644 index cdb043bce869..000000000000 --- a/nemo/deploy/service/fastapi_interface_to_pytriton.py +++ /dev/null @@ -1,326 +0,0 @@ -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -import numpy as np -import requests -from fastapi import FastAPI, HTTPException -from pydantic import BaseModel, model_validator -from pydantic_settings import BaseSettings - -from nemo.deploy.nlp import NemoQueryLLMPyTorch -from nemo.utils import logging - - -class TritonSettings(BaseSettings): - """ - TritonSettings class that gets the values of TRITON_HTTP_ADDRESS and TRITON_PORT. 
- """ - - _triton_service_port: int - _triton_service_ip: str - - def __init__(self): - super(TritonSettings, self).__init__() - try: - self._triton_service_port = int(os.environ.get('TRITON_PORT', 8000)) - self._triton_service_ip = os.environ.get('TRITON_HTTP_ADDRESS', '0.0.0.0') - except Exception as error: - logging.error("An exception occurred trying to retrieve set args in TritonSettings class. Error:", error) - return - - @property - def triton_service_port(self): - """ - Returns the port number for the Triton service. - """ - return self._triton_service_port - - @property - def triton_service_ip(self): - """ - Returns the IP address for the Triton service. - """ - return self._triton_service_ip - - -app = FastAPI() -triton_settings = TritonSettings() - - -class BaseRequest(BaseModel): - """ - Common parameters for completions and chat requests for the server. - - Attributes: - model (str): The name of the model to use for completion. - max_tokens (int): The maximum number of tokens to generate in the response. - temperature (float): Sampling temperature for randomness in generation. - top_p (float): Cumulative probability for nucleus sampling. - top_k (int): Number of highest-probability tokens to consider for sampling. - """ - - model: str - max_tokens: int = 512 - temperature: float = 1.0 - top_p: float = 0.0 - top_k: int = 0 - - @model_validator(mode='after') - def set_greedy_params(self): - """Validate parameters for greedy decoding.""" - if self.temperature == 0 and self.top_p == 0: - logging.warning("Both temperature and top_p are 0. Setting top_k to 1 to ensure greedy sampling.") - self.top_k = 1 - return self - - -class CompletionRequest(BaseRequest): - """ - Represents a request for text completion. - - Attributes: - prompt (str): The input text to generate a response from. - logprobs (int): Number of log probabilities to include in the response, if applicable. - echo (bool): Whether to return the input text as part of the response. 
- """ - - prompt: str - logprobs: int = None - echo: bool = False - - -class ChatCompletionRequest(BaseRequest): - """ - Represents a request for chat completion. - - Attributes: - messages (list[dict]): A list of message dictionaries for chat completion. - logprobs (bool): Whether to return log probabilities for output tokens. - top_logprobs (int): Number of log probabilities to include in the response, if applicable. - logprobs must be set to true if this parameter is used. - """ - - messages: list[dict] - - -@app.get("/v1/health") -def health_check(): - """ - Health check endpoint to verify that the API is running. - - Returns: - dict: A dictionary indicating the status of the application. - """ - return {"status": "ok"} - - -@app.get("/v1/triton_health") -async def check_triton_health(): - """ - This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while - running the REST or FastAPI application. - Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should - inform if the server is accessible. 
- """ - triton_url = ( - f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" - ) - logging.info(f"Attempting to connect to Triton server at: {triton_url}") - try: - response = requests.get(triton_url, timeout=5) - if response.status_code == 200: - return {"status": "Triton server is reachable and ready"} - else: - raise HTTPException(status_code=503, detail="Triton server is not ready") - except requests.RequestException as e: - raise HTTPException(status_code=503, detail=f"Cannot reach Triton server: {str(e)}") - - -def convert_numpy(obj): - """ - Convert NumPy arrays in output to lists - """ - if isinstance(obj, np.ndarray): - return obj.tolist() - elif isinstance(obj, dict): - return {k: convert_numpy(v) for k, v in obj.items()} - elif isinstance(obj, list): - return [convert_numpy(i) for i in obj] - else: - return obj - - -def _helper_fun( - url, - model, - prompts, - temperature, - top_k, - top_p, - compute_logprob, - max_length, - apply_chat_template, - n_top_logprobs, - echo, -): - """ - run_in_executor doesn't allow to pass kwargs, so we have this helper function to pass args as a list - """ - nq = NemoQueryLLMPyTorch(url=url, model_name=model) - output = nq.query_llm( - prompts=prompts, - temperature=temperature, - top_k=top_k, - top_p=top_p, - compute_logprob=compute_logprob, - max_length=max_length, - apply_chat_template=apply_chat_template, - n_top_logprobs=n_top_logprobs, - init_timeout=300, - echo=echo, - ) - return output - - -async def query_llm_async( - *, - url, - model, - prompts, - temperature, - top_k, - top_p, - compute_logprob, - max_length, - apply_chat_template, - n_top_logprobs, - echo, -): - """ - Sends requests to `NemoQueryLLMPyTorch.query_llm` in a non-blocking way, allowing the server to process - concurrent requests. This way enables batching of requests in the underlying Triton server. 
- """ - import asyncio - import concurrent - - loop = asyncio.get_event_loop() - with concurrent.futures.ThreadPoolExecutor() as pool: - result = await loop.run_in_executor( - pool, - _helper_fun, - url, - model, - prompts, - temperature, - top_k, - top_p, - compute_logprob, - max_length, - apply_chat_template, - n_top_logprobs, - echo, - ) - return result - - -@app.post("/v1/completions/") -async def completions_v1(request: CompletionRequest): - """ - Defines the completions endpoint and queries the model deployed on PyTriton server. - """ - url = f"http://{triton_settings.triton_service_ip}:{triton_settings.triton_service_port}" - logging.info(f"Request: {request}") - prompts = request.prompt - if not isinstance(request.prompt, list): - prompts = [request.prompt] - - output = await query_llm_async( - url=url, - model=request.model, - prompts=prompts, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - compute_logprob=(request.logprobs is not None and request.logprobs > 0), - max_length=request.max_tokens, - apply_chat_template=False, - n_top_logprobs=request.logprobs, - echo=request.echo, - ) - - output_serializable = convert_numpy(output) - output_serializable["choices"][0]["text"] = output_serializable["choices"][0]["text"][0][0] - if request.logprobs is not None and request.logprobs > 0: - output_serializable["choices"][0]["logprobs"]["token_logprobs"] = output_serializable["choices"][0][ - "logprobs" - ]["token_logprobs"][0] - output_serializable["choices"][0]["logprobs"]["top_logprobs"] = output_serializable["choices"][0]["logprobs"][ - "top_logprobs" - ][0] - if request.echo: - # output format requires empty logprobs for the 1st token - output_serializable["choices"][0]["logprobs"]["token_logprobs"].insert(0, None) - else: - output_serializable["choices"][0]["logprobs"] = None - logging.info(f"Output: {output_serializable}") - return output_serializable - - -def dict_to_str(messages): - """ - Serializes dict to str - """ - 
return json.dumps(messages) - - -@app.post("/v1/chat/completions/") -async def chat_completions_v1(request: ChatCompletionRequest): - """ - Defines the chat completions endpoint and queries the model deployed on PyTriton server. - """ - url = f"http://{triton_settings.triton_service_ip}:{triton_settings.triton_service_port}" - logging.info(f"Request: {request}") - prompts = request.messages - if not isinstance(request.messages, list): - prompts = [request.messages] - # Serialize the dictionary to a JSON string represnetation to be able to convert to numpy array - # (str_list2numpy) and back to list (str_ndarray2list) as required by PyTriton. Using the dictionaries directly - # with these methods is not possible as they expect string type. - json_prompts = [dict_to_str(prompts)] - output = await query_llm_async( - url=url, - model=request.model, - prompts=json_prompts, - temperature=request.temperature, - top_k=request.top_k, - top_p=request.top_p, - compute_logprob=False, # disable logprobs because we dont need them for any benchmark - max_length=request.max_tokens, - apply_chat_template=True, - n_top_logprobs=None, - echo=False, # chat request doesn't support echo - ) - # Add 'role' as 'assistant' key to the output dict - output["choices"][0]["message"] = {"role": "assistant", "content": output["choices"][0]["text"]} - output["object"] = "chat.completion" - output["choices"][0]["logprobs"] = None - - del output["choices"][0]["text"] - - output_serializable = convert_numpy(output) - output_serializable["choices"][0]["message"]["content"] = output_serializable["choices"][0]["message"]["content"][ - 0 - ][0] - - logging.info(f"Output: {output_serializable}") - return output_serializable diff --git a/nemo/deploy/service/rest_model_api.py b/nemo/deploy/service/rest_model_api.py deleted file mode 100644 index 64afea167295..000000000000 --- a/nemo/deploy/service/rest_model_api.py +++ /dev/null @@ -1,132 +0,0 @@ -# You may obtain a copy of the License at -# -# 
http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from pathlib import Path -import requests - -from fastapi import FastAPI, HTTPException -from pydantic import BaseModel -from pydantic_settings import BaseSettings - -from nemo.deploy.nlp import NemoQueryLLM -from nemo.utils import logging - - -class TritonSettings(BaseSettings): - _triton_service_port: int - _triton_service_ip: str - _triton_request_timeout: str - - def __init__(self): - super(TritonSettings, self).__init__() - try: - self._triton_service_port = int(os.environ.get('TRITON_PORT', 8080)) - self._triton_service_ip = os.environ.get('TRITON_HTTP_ADDRESS', '0.0.0.0') - self._triton_request_timeout = int(os.environ.get('TRITON_REQUEST_TIMEOUT', 60)) - self._openai_format_response = os.environ.get('OPENAI_FORMAT_RESPONSE', 'False').lower() == 'true' - self._output_generation_logits = os.environ.get('OUTPUT_GENERATION_LOGITS', 'False').lower() == 'true' - except Exception as error: - logging.error("An exception occurred trying to retrieve set args in TritonSettings class. Error:", error) - return - - @property - def triton_service_port(self): - return self._triton_service_port - - @property - def triton_service_ip(self): - return self._triton_service_ip - - @property - def triton_request_timeout(self): - return self._triton_request_timeout - - @property - def openai_format_response(self): - """ - Retuns the response from Triton server in OpenAI compatible format if set to True. - """ - return self._openai_format_response - - @property - def output_generation_logits(self): - """ - Retuns the generation logits along with text in Triton server output if set to True. 
- """ - return self._output_generation_logits - - -app = FastAPI() -triton_settings = TritonSettings() - - -class CompletionRequest(BaseModel): - model: str - prompt: str - max_tokens: int = 512 - temperature: float = 1.0 - top_p: float = 0.0 - top_k: int = 1 - stream: bool = False - stop: str | None = None - frequency_penalty: float = 1.0 - - -@app.get("/v1/health") -def health_check(): - return {"status": "ok"} - - -@app.get("/v1/triton_health") -async def check_triton_health(): - """ - This method exposes endpoint "/triton_health" which can be used to verify if Triton server is accessible while running the REST or FastAPI application. - Verify by running: curl http://service_http_address:service_port/v1/triton_health and the returned status should inform if the server is accessible. - """ - triton_url = ( - f"http://{triton_settings.triton_service_ip}:{str(triton_settings.triton_service_port)}/v2/health/ready" - ) - logging.info(f"Attempting to connect to Triton server at: {triton_url}") - try: - response = requests.get(triton_url, timeout=5) - if response.status_code == 200: - return {"status": "Triton server is reachable and ready"} - else: - raise HTTPException(status_code=503, detail="Triton server is not ready") - except requests.RequestException as e: - raise HTTPException(status_code=503, detail=f"Cannot reach Triton server: {str(e)}") - - -@app.post("/v1/completions/") -def completions_v1(request: CompletionRequest): - try: - url = triton_settings.triton_service_ip + ":" + str(triton_settings.triton_service_port) - nq = NemoQueryLLM(url=url, model_name=request.model) - output = nq.query_llm( - prompts=[request.prompt], - max_output_len=request.max_tokens, - # when these below params are passed as None - top_k=request.top_k, - top_p=request.top_p, - temperature=request.temperature, - init_timeout=triton_settings.triton_request_timeout, - openai_format_response=triton_settings.openai_format_response, - 
output_generation_logits=triton_settings.output_generation_logits, - ) - if triton_settings.openai_format_response: - return output - else: - return { - "output": output[0][0], - } - except Exception as error: - logging.error("An exception occurred with the post request to /v1/completions/ endpoint:", error) - return {"error": "An exception occurred"} diff --git a/nemo/deploy/triton_deployable.py b/nemo/deploy/triton_deployable.py deleted file mode 100644 index 7fa3a36e3a56..000000000000 --- a/nemo/deploy/triton_deployable.py +++ /dev/null @@ -1,31 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -from abc import ABC, abstractmethod -import numpy as np - - -class ITritonDeployable(ABC): - @abstractmethod - def get_triton_input(self): - pass - - @abstractmethod - def get_triton_output(self): - pass - - @abstractmethod - def triton_infer_fn(self, **inputs: np.ndarray): - pass diff --git a/nemo/deploy/utils.py b/nemo/deploy/utils.py deleted file mode 100644 index fbfb4a1bc7ad..000000000000 --- a/nemo/deploy/utils.py +++ /dev/null @@ -1,204 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import typing -from pathlib import Path - -import numpy as np -import torch -from PIL import Image -from pytriton.model_config import Tensor - -from nemo.export.tarutils import TarPath - -NEMO2 = "NEMO 2.0" -NEMO1 = "NEMO 1.0" - - -def typedict2tensor( - typedict_class, - overwrite_kwargs: typing.Optional[typing.Dict[str, typing.Any]] = None, - defaults: typing.Optional[typing.Dict[str, typing.Any]] = None, -): - """Converts a type dictionary class into a tuple of PyTriton Tensor objects. - - This function takes a class with type hints and converts each typed field into a PyTriton - Tensor specification, handling nested list types and mapping Python types to numpy dtypes. 
- - Args: - typedict_class: A class with type hints that will be converted to Tensor specs - overwrite_kwargs: Optional dictionary of kwargs to override default Tensor parameters - defaults: Optional dictionary of default values (unused) - - Returns: - tuple: A tuple of PyTriton Tensor objects, one for each typed field in the input class - - Raises: - Exception: If an unsupported type is encountered during type mapping - """ - - def _map_type(type_): - if type_ is int: - return np.int32 - elif type_ is float: - return np.float32 - elif type_ is bool: - return np.bool_ - elif type_ is str: - return bytes - else: - raise Exception(f"Unknown type {type_}") - - def _get_tensor_params(type_): - count = 0 - while typing.get_origin(type_) is list: - type_ = typing.get_args(type_)[0] - count += 1 - count -= 1 # we don't want to count the last dimension - shape = (-1,) * count if count > 1 else (1,) - return {"shape": shape, "dtype": _map_type(type_)} - - overwrite_kwargs = overwrite_kwargs or {} - return tuple( - Tensor(name=name, **_get_tensor_params(type_), **overwrite_kwargs) - for name, type_ in typing.get_type_hints(typedict_class).items() - ) - - -def nemo_checkpoint_version(path: str) -> str: - """Determines the version of a NeMo checkpoint from its file structure. - - Examines the provided checkpoint path to determine if it follows the NeMo 2.0 - or NeMo 1.0 format based on the presence of 'context' and 'weights' directories. - - Args: - path (str): Path to the NeMo checkpoint file or directory - - Returns: - str: Version string - either NEMO2 or NEMO1 constant indicating the checkpoint version - """ - - if os.path.isdir(path): - path = Path(path) - else: - path = TarPath(path) - - if (path / "context").exists() and (path / "weights").exists(): - return NEMO2 - else: - return NEMO1 - - -def str_list2numpy(str_list: typing.List[str]) -> np.ndarray: - """Converts a list of strings to a numpy array of UTF-8 encoded bytes. 
- - Takes a list of strings and converts it to a numpy array with an additional - dimension, then encodes the strings as UTF-8 bytes. - - Args: - str_list (List[str]): List of strings to convert - - Returns: - np.ndarray: Numpy array of UTF-8 encoded bytes with shape (N, 1) where N is - the length of the input list - """ - str_ndarray = np.array(str_list)[..., np.newaxis] - return np.char.encode(str_ndarray, "utf-8") - - -def str_ndarray2list(str_ndarray: np.ndarray) -> typing.List[str]: - """Converts a numpy array of UTF-8 encoded bytes back to a list of strings. - - Takes a numpy array of UTF-8 encoded bytes and decodes it back to strings, - removing any extra dimensions, and returns the result as a Python list. - - Args: - str_ndarray (np.ndarray): Numpy array of UTF-8 encoded bytes, typically - with shape (N, 1) where N is the length of the resulting list - - Returns: - List[str]: List of decoded strings - """ - str_ndarray = str_ndarray.astype("bytes") - str_ndarray = np.char.decode(str_ndarray, encoding="utf-8") - str_ndarray = str_ndarray.squeeze(axis=-1) - return str_ndarray.tolist() - - -def ndarray2img(img_ndarray: np.ndarray) -> typing.List[Image.Image]: - """Converts a numpy array of images to a list of PIL Image objects. - - Takes a numpy array containing one or more images and converts each image - to a PIL Image object using Image.fromarray(). - - Args: - img_ndarray (np.ndarray): Numpy array of images, where each image is a 2D or 3D array - representing pixel values - - Returns: - List[Image.Image]: List of PIL Image objects created from the input array - """ - - img_list = [Image.fromarray(i) for i in img_ndarray] - return img_list - - -def cast_output(data, required_dtype): - """Casts input data to a numpy array with the required dtype. - - Takes input data that may be a torch.Tensor, numpy array, or other sequence type - and converts it to a numpy array with the specified dtype. For string dtypes, - the data is encoded as UTF-8 bytes. 
The output array is ensured to have at least - 2 dimensions. - - Args: - data: Input data to cast. Can be a torch.Tensor, numpy array, or sequence type - that can be converted to a numpy array. - required_dtype: The desired numpy dtype for the output array. - - Returns: - np.ndarray: A numpy array containing the input data cast to the required dtype, - with at least 2 dimensions. - """ - - if isinstance(data, torch.Tensor): - data = data.cpu().numpy() - elif not isinstance(data, np.ndarray): - data = np.array(data) - - data_is_str = required_dtype in (object, np.object_, bytes, np.bytes_) - if data_is_str: - data = np.char.encode(data, "utf-8") - - if data.ndim < 2: - data = data[..., np.newaxis] - return data.astype(required_dtype) - - -def broadcast_list(data, src=0, group=None): - """Broadcasts a list of text data to all processes. - - Args: - data (list): List of strings to broadcast. - src (int, optional): Source rank. Defaults to 0. - group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. - """ - - if not torch.distributed.is_initialized(): - raise RuntimeError("Distributed environment is not initialized.") - - object_list = [data] if torch.distributed.get_rank() == src else [None] - torch.distributed.broadcast_object_list(object_list, src=src, group=group) - return object_list[0] diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py deleted file mode 100644 index 1a5e5f6afd5c..000000000000 --- a/nemo/export/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -# WAR for trtllm and lightning conflict -try: - from nemo.lightning import io - - __all__ = ["io"] -except (ImportError, ModuleNotFoundError): - pass - -import warnings - -warnings.warn( - "The 'nemo.export' is deprecated and will be removed in NeMo FW 25.09 container release. " - "For evaluation functionality, please use the new Eval repository: https://github.com/NVIDIA-NeMo/Export-Deploy", - DeprecationWarning, - stacklevel=2, -) diff --git a/nemo/export/multimodal/__init__.py b/nemo/export/multimodal/__init__.py deleted file mode 100644 index 341a77c5bc66..000000000000 --- a/nemo/export/multimodal/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py deleted file mode 100644 index f3a133cd65fe..000000000000 --- a/nemo/export/multimodal/build.py +++ /dev/null @@ -1,728 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import os -import shutil -import tarfile -import tempfile -from pathlib import Path -from time import time -from typing import List - -import tensorrt as trt -import torch -import yaml -from omegaconf import OmegaConf -from PIL import Image -from tensorrt_llm._common import check_max_num_tokens -from tensorrt_llm.builder import BuildConfig, Builder -from tensorrt_llm.commands.build import build as build_trtllm -from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import MLLaMAForCausalLM -from tensorrt_llm.plugin import PluginConfig -from transformers import AutoModel, AutoProcessor, MllamaForConditionalGeneration - -from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule -from nemo.core.classes.common import typecheck -from nemo.export.tensorrt_llm import TensorRTLLM -from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_nemo_model - -from .converter import convert_mllama_nemo_to_hf - -logger = trt.Logger(trt.Logger.INFO) - - -def build_trtllm_engine( - model_dir: str, - visual_checkpoint_path: str, - llm_checkpoint_path: str = None, - model_type: str = "neva", - llm_model_type: str = "llama", - tensor_parallelism_size: int = 1, - max_input_len: int = 256, - max_output_len: int = 256, - max_batch_size: int = 1, - max_multimodal_len: int = 1024, - dtype: str = "bfloat16", - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - 
max_lora_rank: int = 64, - lora_ckpt_list: List[str] = None, -): - """Build TRTLLM engine by nemo export""" - trt_llm_exporter = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False) - trt_llm_exporter.export( - nemo_checkpoint_path=visual_checkpoint_path if llm_checkpoint_path is None else llm_checkpoint_path, - model_type=llm_model_type, - tensor_parallelism_size=tensor_parallelism_size, - max_input_len=max_input_len, - max_output_len=max_output_len, - max_seq_len=max_input_len + max_output_len, - max_batch_size=max_batch_size, - max_prompt_embedding_table_size=max_multimodal_len, - dtype=dtype, - load_model=False, - use_lora_plugin=use_lora_plugin, - lora_target_modules=lora_target_modules, - max_lora_rank=max_lora_rank, - use_mcore_path=False, - ) - - -def build_mllama_trtllm_engine( - model_dir: str, - hf_model_path: str, - tensor_parallelism_size: int = 1, - max_input_len: int = 256, - max_output_len: int = 256, - max_batch_size: int = 1, - max_multimodal_len: int = 1024, - dtype: str = "bfloat16", - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, - lora_ckpt_list: List[str] = None, -): - """Build mllama TRTLLM engine from HF""" - if max_batch_size < 4: - print( - "TensorRT LLM may hit a runtime issue with batch size is smaller than 4 on some models." 
" Force set to 4" - ) - max_batch_size = 4 - - plugin_config = PluginConfig() - plugin_config.gpt_attention_plugin = "auto" - plugin_config.gemm_plugin = "auto" - plugin_config.enable_paged_kv_cache(tokens_per_block=128) - plugin_config.remove_input_padding = True - plugin_config.use_paged_context_fmha = True - - max_seq_len = max_input_len + max_output_len - max_num_tokens, opt_num_tokens = check_max_num_tokens( - max_num_tokens=None, - opt_num_tokens=None, - max_seq_len=max_seq_len, - max_batch_size=max_batch_size, - max_input_len=max_input_len, - max_beam_width=1, - remove_input_padding=True, - enable_context_fmha=plugin_config.context_fmha, - tokens_per_block=128, - multiple_profiles=False, - ) - - build_dict = { - 'max_input_len': max_input_len, - 'max_output_len': max_output_len, - 'max_encoder_input_len': max_multimodal_len, - 'max_batch_size': max_batch_size, - 'max_beam_width': 1, - 'max_seq_len': max_seq_len, - 'max_num_tokens': max_num_tokens, - 'opt_num_tokens': opt_num_tokens, - 'strongly_typed': True, - 'builder_opt': None, - } - build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config) - - for rank in range(tensor_parallelism_size): - mapping = Mapping(world_size=tensor_parallelism_size, rank=rank, tp_size=tensor_parallelism_size) - model = MLLaMAForCausalLM.from_hugging_face( - hf_model_path, - dtype, - mapping=mapping, - ) - - engine = build_trtllm(model, build_config) - engine.save(model_dir) - - -def export_visual_wrapper_onnx( - visual_wrapper, input, output_dir, input_names=['input'], dynamic_axes={'input': {0: 'batch'}} -): - """Export visual wrapper to ONNX""" - logger.log(trt.Logger.INFO, "Exporting onnx") - os.makedirs(f'{output_dir}/onnx', exist_ok=True) - torch.onnx.export( - visual_wrapper, - input, - f'{output_dir}/onnx/visual_encoder.onnx', - opset_version=17, - input_names=input_names, - output_names=['output'], - dynamic_axes=dynamic_axes, - ) - - -def export_perception_wrapper_onnx( - perception_wrapper, - input, 
- output_dir, - input_names=['processed_signal', 'processed_signal_length'], - output_names=['encoded', 'encoded_length'], - dynamic_axes={ - 'processed_signal': {0: 'batch', 2: 'time'}, - 'processed_signal_length': {0: 'batch'}, - 'encoded': {0: 'batch', 1: 'time'}, - 'encoded_length': {0: 'batch'}, - }, -): - """Export perception wrapper to ONNX""" - logger.log(trt.Logger.INFO, "Exporting onnx") - os.makedirs(f'{output_dir}/onnx', exist_ok=True) - torch.onnx.export( - perception_wrapper, - input, - f'{output_dir}/onnx/perception_encoder.onnx', - opset_version=17, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - ) - - -def build_trt_engine( - model_type, - input_sizes, - output_dir, - vision_max_batch_size, - dtype=torch.bfloat16, - image_size=None, - num_frames=None, - nemo_config=None, - part_name='visual_encoder', -): - """Build TRT engine from onnx""" - onnx_file = '%s/onnx/%s.onnx' % (output_dir, part_name) - engine_file = '%s/%s.engine' % (output_dir, part_name) - config_file = '%s/%s' % (output_dir, "config.json") - nemo_config_file = '%s/%s' % (output_dir, "nemo_config.yaml") - - with open(nemo_config_file, 'w') as f: - yaml.dump(nemo_config, f) - - logger.log(trt.Logger.INFO, "Building TRT engine for %s" % part_name) - - builder = trt.Builder(logger) - network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)) - profile = builder.create_optimization_profile() - - config_args = {"precision": str(dtype).split('.')[-1], "model_type": model_type} - if image_size is not None: - config_args["image_size"] = image_size - if num_frames is not None: - config_args["num_frames"] = num_frames - - config_wrapper = Builder().create_builder_config(**config_args) - config = config_wrapper.trt_builder_config - - parser = trt.OnnxParser(network, logger) - - with open(onnx_file, 'rb') as model: - if not parser.parse(model.read(), os.path.abspath(onnx_file)): - logger.log(trt.Logger.ERROR, "Failed parsing 
%s" % onnx_file) - for error in range(parser.num_errors): - logger.log(trt.Logger.ERROR, parser.get_error(error)) - logger.log(trt.Logger.INFO, "Succeeded parsing %s" % onnx_file) - - # Delete onnx files since we don't need them now - shutil.rmtree(f'{output_dir}/onnx') - - nBS = -1 - nMinBS = 1 - nOptBS = max(nMinBS, int(vision_max_batch_size / 2)) - nMaxBS = vision_max_batch_size - - inputT = network.get_input(0) - - # input sizes can be a list of ints (e.g., [3, H, W]) when inputs are images, - # or a list of three int lists (e.g., [[1, 1, 2700], [1, 500, 2700], [1, 4096, 2700]]). - # or a list of three list of lists - # (e.g., [{input1: min_shape, input2: min_shape, }, \ - # {input1: opt_shape, input2: opt_shape}, \ - # {input1: max_shape, input2: max_shape}] ) - assert isinstance(input_sizes, list), "input_sizes must be a list" - if isinstance(input_sizes[0], int): - logger.log(trt.Logger.INFO, f"Processed input sizes {input_sizes}") - inputT.shape = [nBS, *input_sizes] - min_size = opt_size = max_size = input_sizes - elif len(input_sizes) == 3 and isinstance(input_sizes[0], list): - min_size, opt_size, max_size = input_sizes - logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {min_size}/{opt_size}/{max_size}") - elif len(input_sizes) == 3 and isinstance(input_sizes[0], dict): - logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {input_sizes}") - else: - raise ValueError(f"invalid input sizes: {input_sizes}") - - if isinstance(input_sizes[0], dict): - for i in range(network.num_inputs): - inputT = network.get_input(i) - input_name = inputT.name - min_size = input_sizes[0][input_name] - opt_size = input_sizes[1][input_name] - max_size = input_sizes[2][input_name] - logger.log(trt.Logger.INFO, f"{input_name} min/opt/max input sizes {min_size}/{opt_size}/{max_size}") - profile.set_shape(input_name, min_size, opt_size, max_size) - else: - profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], [nMaxBS, *max_size]) - - 
config.add_optimization_profile(profile) - - t0 = time() - engine_string = builder.build_serialized_network(network, config) - t1 = time() - if engine_string is None: - raise RuntimeError("Failed building %s" % (engine_file)) - else: - logger.log(trt.Logger.INFO, "Succeeded building %s in %d s" % (engine_file, t1 - t0)) - with open(engine_file, 'wb') as f: - f.write(engine_string) - - Builder.save_config(config_wrapper, config_file) - - -def build_neva_engine( - model_type: str, - model_dir: str, - visual_checkpoint_path: str, - vision_max_batch_size: int = 1, -): - """Build neva visual engine""" - device = torch.device("cuda") if torch.cuda.is_available() else "cpu" - - if os.path.isdir(visual_checkpoint_path): - # load untar checkpoint - config_path = os.path.join(visual_checkpoint_path, 'model_config.yaml') - with open(config_path, 'r') as f: - nemo_config = yaml.safe_load(f) - try: - weights_path = os.path.join(visual_checkpoint_path, 'model_weights.ckpt') - mp0_weights = torch.load(weights_path, map_location=device) - except FileNotFoundError: - weights_path = os.path.join(visual_checkpoint_path, 'mp_rank_00/model_weights.ckpt') - mp0_weights = torch.load(weights_path, map_location=device) - else: - # extract NeMo checkpoint - with tempfile.TemporaryDirectory() as temp: - temp_path = Path(temp) - mp0_weights, nemo_config, _ = load_nemo_model(visual_checkpoint_path, temp_path) - - vision_config = nemo_config["mm_cfg"]["vision_encoder"] - - class DownSampleBlock(torch.nn.Module): - # pylint: disable=C0115,C0116 - def forward(self, x): - vit_embeds = x - h = w = int(vit_embeds.shape[1] ** 0.5) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) - vit_embeds = self.flat_square(vit_embeds) - vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1]) - return vit_embeds - - def flat_square(self, x): - n, w, h, c = x.size() - if w % 2 == 1: - x = torch.cat([x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous() 
- n, w, h, c = x.size() - if h % 2 == 1: - x = torch.cat([x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2).contiguous() - n, w, h, c = x.size() - x = x.view(n, w, int(h / 2), int(c * 2)) - x = x.permute(0, 2, 1, 3).contiguous() - x = x.view(n, int(h / 2), int(w / 2), int(c * 4)) - return x - - class VisionEncoderWrapper(torch.nn.Module): - # pylint: disable=C0115,C0116 - def __init__(self, encoder, connector): - super().__init__() - self.encoder = encoder - self.connector = connector - - def forward(self, images): - vision_x = self.encoder(pixel_values=images, output_hidden_states=True) - vision_x = vision_x.hidden_states[-2] - vision_x = self.connector(vision_x) - return vision_x - - encoder = AutoModel.from_pretrained( - vision_config["from_pretrained"], - torch_dtype=torch.bfloat16, - trust_remote_code=True, - attn_implementation='eager', - ) - vision_encoder = encoder.vision_model - hf_config = encoder.config - dtype = hf_config.torch_dtype - - # connector - if nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "mlp2x_gelu": - vision_connector = torch.nn.Sequential( - torch.nn.Linear(vision_config["hidden_size"], nemo_config["hidden_size"], bias=True), - torch.nn.GELU(), - torch.nn.Linear(nemo_config["hidden_size"], nemo_config["hidden_size"], bias=True), - ).to(dtype=dtype) - - key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector" - for layer in range(0, 3, 2): - vision_connector[layer].load_state_dict( - { - 'weight': mp0_weights[f"{key_prefix}.{layer}.weight"].to(dtype), - 'bias': mp0_weights[f"{key_prefix}.{layer}.bias"].to(dtype), - } - ) - elif nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "linear": - vision_connector = torch.nn.Linear(vision_config["hidden_size"], nemo_config["hidden_size"], bias=True) - key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector" - vision_connector.load_state_dict( - { - 'weight': mp0_weights[f"{key_prefix}.weight"].to(dtype), - 
'bias': mp0_weights[f"{key_prefix}.bias"].to(dtype), - } - ) - elif nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "mlp_downsample": - vision_connector = torch.nn.Sequential( - DownSampleBlock(), - torch.nn.LayerNorm(vision_config["hidden_size"] * 4), - torch.nn.Linear(vision_config["hidden_size"] * 4, nemo_config["hidden_size"], bias=True), - torch.nn.GELU(), - torch.nn.Linear(nemo_config["hidden_size"], nemo_config["hidden_size"], bias=True), - ).to(dtype=dtype) - key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector" - for layer in [1, 2, 4]: - vision_connector[layer].load_state_dict( - { - 'weight': mp0_weights[f"{key_prefix}.{layer}.weight"].to(dtype), - 'bias': mp0_weights[f"{key_prefix}.{layer}.bias"].to(dtype), - } - ) - - else: - raise ValueError(f"Unknown projector type: {nemo_config['mm_cfg']['mm_mlp_adapter_type']}") - - # export the whole wrapper - lita_num_frames = None - wrapper = VisionEncoderWrapper(vision_encoder, vision_connector).to(device, dtype) - if model_type == "lita" or model_type == "vila": - image_size = hf_config.image_size - if model_type == "lita": - lita_num_frames = nemo_config['mm_cfg']['lita']['sample_frames'] - else: - image_size = hf_config.vision_config.image_size - if model_type == "vita": - lita_num_frames = nemo_config['mm_cfg']['lita']['sample_frames'] - dummy_image = torch.empty( - 1, 3, image_size, image_size, dtype=dtype, device=device - ) # dummy image shape [B, C, H, W] - - export_visual_wrapper_onnx(wrapper, dummy_image, model_dir) - build_trt_engine( - model_type, - [3, image_size, image_size], - model_dir, - vision_max_batch_size, - dtype, - image_size=image_size, - num_frames=lita_num_frames if model_type == "lita" or model_type == 'vita' else None, - nemo_config=nemo_config, - ) - - -def build_video_neva_engine( - model_dir: str, - visual_checkpoint_path: str, - vision_max_batch_size: int = 1, -): - """Build video neva visual engine""" - device = torch.device("cuda") if 
torch.cuda.is_available() else "cpu" - # extract NeMo checkpoint - with tarfile.open(visual_checkpoint_path) as tar: - nemo_config = yaml.safe_load(tar.extractfile("./model_config.yaml")) - try: - # trained without TP - mp0_weights = torch.load(tar.extractfile("./model_weights.ckpt"), map_location=device) - except KeyError: - # trained with TP - mp0_weights = torch.load(tar.extractfile("./mp_rank_00/model_weights.ckpt"), map_location=device) - - vision_config = nemo_config["mm_cfg"]["vision_encoder"] - - class VisionEncoderWrapper(torch.nn.Module): - # pylint: disable=C0115,C0116 - def __init__(self, encoder, connector): - super().__init__() - self.encoder = encoder - self.connector = connector - - def forward(self, images): - b, num_frames, c, h, w = images.shape - images = images.view(b * num_frames, c, h, w) - vision_x = self.encoder(pixel_values=images, output_hidden_states=True) # [(B num_frames), C, H, W] - vision_x = vision_x.hidden_states[-2] - vision_x = vision_x[:, 1:] - - # reshape back to [B, num_frames, img_size, hidden_size] - vision_x = vision_x.view(b, num_frames, -1, vision_x.shape[-1]) - - vision_x = self.connector(vision_x) - return vision_x - - encoder = AutoModel.from_pretrained( - vision_config["from_pretrained"], - torch_dtype=torch.bfloat16, - trust_remote_code=True, - attn_implementation='eager', - ) - vision_encoder = encoder.vision_model - hf_config = encoder.config - dtype = hf_config.torch_dtype - - # connector - assert nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "linear" - vision_connector = torch.nn.Linear(vision_config["hidden_size"], nemo_config["hidden_size"], bias=True) - - key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector" - vision_connector.load_state_dict( - { - 'weight': mp0_weights[f"{key_prefix}.weight"].to(dtype), - 'bias': mp0_weights[f"{key_prefix}.bias"].to(dtype), - } - ) - - # export the whole wrapper - wrapper = VisionEncoderWrapper(vision_encoder, 
vision_connector).to(device, dtype) - image_size = hf_config.vision_config.image_size - num_frames = nemo_config['data']['num_frames'] - dummy_video = torch.empty(1, num_frames, 3, image_size, image_size, dtype=dtype, device=device) # dummy image - export_visual_wrapper_onnx(wrapper, dummy_video, model_dir) - build_trt_engine( - "video-neva", - [num_frames, 3, image_size, image_size], # [num_frames, 3, H, W] - model_dir, - vision_max_batch_size, - dtype, - image_size=image_size, - num_frames=num_frames, - ) - - -def build_perception_engine( - model_dir: str, - perception_checkpoint_path: str, - model_type: str = "salm", - max_batch_size: int = 1, -): - """Build perception engine""" - assert model_type == "salm", f"Invalid model type {model_type}" - - def load_perception_model(perception_checkpoint_path): - weights = "model_weights.ckpt" - perception_state_dict = torch.load(os.path.join(perception_checkpoint_path, weights)) - config = "model_config.yaml" - config = OmegaConf.load(os.path.join(perception_checkpoint_path, config)) - perception = AudioPerceptionModule(cfg=config) - perception.load_state_dict(perception_state_dict) - perception.eval() - return perception - - if not os.path.exists(model_dir): - os.makedirs(model_dir) - # load perception model - perception_model = load_perception_model(perception_checkpoint_path) - feature_extractor = perception_model.preprocessor - input_signal = torch.randn(1, 1000, dtype=torch.float32) - input_signal_length = torch.tensor([1000], dtype=torch.int32) - - processed_signal, processed_signal_length = feature_extractor( - input_signal=input_signal, length=input_signal_length - ) - processed_signal_length = processed_signal_length.to(torch.int32) - dump_path = model_dir + "/feature_extractor.ts" # dump the feature extractor as torchscript - feature_extractor.export(dump_path, (input_signal, input_signal_length)) - - class PerceptionWrapper(torch.nn.Module): - # pylint: disable=C0115,C0116 - def __init__(self, encoder, 
modality_adapter, proj): - super().__init__() - self.encoder = encoder - self.modality_adapter = modality_adapter - self.proj = proj - - @typecheck.disable_checks() - def forward(self, processed_signal, processed_signal_length): - encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length) - encoded, encoded_len = self.modality_adapter(audio_signal=encoded, length=encoded_len) - # b, c, t -> b, t, c - encoded = self.proj(encoded.transpose(1, 2)) - encoded_len = encoded_len.to(torch.int32) - return encoded, encoded_len - - perception = PerceptionWrapper(perception_model.encoder, perception_model.modality_adapter, perception_model.proj) - export_perception_wrapper_onnx(perception, (processed_signal, processed_signal_length), model_dir) - # export the onnx perception model to tensorrt engine - # 512 -> 5.12 sec, 3072 -> 30.72 sec - opt_batch_size = max(1, max_batch_size // 2) - shapes = [ - {"processed_signal": [1, 80, 64], "processed_signal_length": [1]}, - {"processed_signal": [opt_batch_size, 80, 512], "processed_signal_length": [opt_batch_size]}, - {"processed_signal": [max_batch_size, 80, 3072], "processed_signal_length": [max_batch_size]}, - ] - build_trt_engine( - model_type, - shapes, - model_dir, - max_batch_size, - dtype=torch.float16, - nemo_config=None, - part_name='perception_encoder', - ) - - -def build_mllama_visual_engine( - model_dir: str, - hf_model_path: str, - processor_name: str = "meta-llama/Llama-3.2-11B-Vision-Instruct", - vision_max_batch_size: int = 1, -): - """Build mllama visual engine""" - hf_model = MllamaForConditionalGeneration.from_pretrained(hf_model_path, torch_dtype="auto", device_map="auto") - model_dtype = hf_model.dtype - - class MLLaMAVisionWrapper(torch.nn.Module): - # pylint: disable=C0115,C0116 - def __init__(self, vision_model, output_proj): - super().__init__() - self.vision_model = vision_model - self.output_proj = output_proj - - def forward(self, pixel_values, aspect_ratio_ids, 
aspect_ratio_mask): - out = self.vision_model(pixel_values, aspect_ratio_ids, aspect_ratio_mask).last_hidden_state - out = self.output_proj(out) - return out - - wrapper = MLLaMAVisionWrapper(hf_model.vision_model, hf_model.multi_modal_projector) - - processor = AutoProcessor.from_pretrained(processor_name) - image = Image.new('RGB', [2048, 2688]) - inputs = processor(images=image, return_tensors="pt").to(model_dtype) - - export_visual_wrapper_onnx( - wrapper, - tuple([value for _, value in inputs.items()]), - model_dir, - input_names=[key for key in inputs], - dynamic_axes={key: {0: "batch"} for key in inputs}, - ) - shapes = [{k: list(v.shape) for k, v in inputs.items()}] * 3 - shapes[2] = shapes[0].copy() - for k, v in shapes[2].items(): - shapes[2][k] = [vision_max_batch_size] + v[1:] - build_trt_engine("mllama", shapes, model_dir, vision_max_batch_size, model_dtype) - - -def build_visual_engine( - model_dir: str, - visual_checkpoint_path: str, - model_type: str = "neva", - vision_max_batch_size: int = 1, -): - """Build visual engine""" - model_list = ['neva', 'lita', 'vila', 'vita'] - if model_type in model_list: - build_neva_engine(model_type, model_dir, visual_checkpoint_path, vision_max_batch_size) - elif model_type == "video-neva": - build_video_neva_engine(model_dir, visual_checkpoint_path, vision_max_batch_size) - else: - raise RuntimeError(f"Invalid model type {model_type}") - - -def extract_lora_ckpt( - lora_ckpt: str, - output_dir: str, -): - """Extrace lora from checkpoint""" - if os.path.exists(os.path.join(lora_ckpt, "model_weights.ckpt")): - model_weight = torch.load(os.path.join(lora_ckpt, "model_weights.ckpt")) - elif os.path.exists(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt")): - model_weight = torch.load(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt")) - else: - raise RuntimeError("Imcompatible lora checkpoint format") - - model_config = os.path.join(lora_ckpt, "model_config.yaml") - - if not 
os.path.exists(model_config): - raise RuntimeError("Imcompatible lora checkpoint format") - - llm_lora_weight = {} - - for k, v in model_weight.items(): - if "mm_projector" not in k: - llm_lora_weight[k] = v - - llm_lora_path = os.path.join(output_dir, "llm_lora.nemo") - with tempfile.TemporaryDirectory() as tmp_dir: - llm_weight_path = os.path.join(tmp_dir, "model_weights.ckpt") - torch.save(llm_lora_weight, llm_weight_path) - - with tarfile.open(llm_lora_path, "w") as tar: - tar.add(llm_weight_path, arcname="model_weights.ckpt") - tar.add(model_config, arcname="model_config.yaml") - - return llm_lora_path - - -def build_mllama_engine( - model_dir: str, - checkpoint_path: str, - processor_name: str = "meta-llama/Llama-3.2-11B-Vision-Instruct", - vision_max_batch_size: int = 1, - tensor_parallelism_size: int = 1, - max_input_len: int = 256, - max_output_len: int = 256, - max_batch_size: int = 1, - max_multimodal_len: int = 1024, - dtype: str = "bfloat16", - use_lora_plugin: str = None, - lora_target_modules: List[str] = None, - max_lora_rank: int = 64, - lora_ckpt_list: List[str] = None, -): - """Build mllama engine""" - new_state_dict, config = convert_mllama_nemo_to_hf(checkpoint_path, processor_name) - - hf_model = MllamaForConditionalGeneration(config) - hf_model = hf_model.to(torch.bfloat16) - hf_model.load_state_dict(new_state_dict) - - with tempfile.TemporaryDirectory() as tmp_dir: - hf_model_path = os.path.join(tmp_dir, "hf_checkpoint") - hf_model.save_pretrained(hf_model_path) - del hf_model, new_state_dict - - build_mllama_visual_engine( - os.path.join(model_dir, "visual_engine"), - hf_model_path, - vision_max_batch_size=vision_max_batch_size, - ) - build_mllama_trtllm_engine( - os.path.join(model_dir, "llm_engine"), - hf_model_path, - tensor_parallelism_size, - max_input_len, - max_output_len, - max_batch_size, - max_multimodal_len, - dtype, - ) diff --git a/nemo/export/multimodal/converter.py b/nemo/export/multimodal/converter.py deleted file mode 
100644 index 747ddf80eaea..000000000000 --- a/nemo/export/multimodal/converter.py +++ /dev/null @@ -1,412 +0,0 @@ -# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import torch -from transformers import AutoProcessor, MllamaConfig -from transformers.models.mllama.configuration_mllama import MllamaTextConfig, MllamaVisionConfig - -from nemo import lightning as nl -from nemo.collections import vlm - - -def split_qkv_weight(qkv_weight, model_config): - """Split attention qkv from nemo to hf format""" - hidden_size = model_config.hidden_size - head_num = model_config.num_attention_heads - num_query_groups = model_config.num_query_groups or head_num - head_size = model_config.kv_channels or (hidden_size // head_num) - heads_per_group = head_num // num_query_groups - qkv_weight = qkv_weight.reshape(-1, head_size, hidden_size) - q_weight = torch.empty((head_num, head_size, hidden_size), device=qkv_weight.device) - k_weight = torch.empty((num_query_groups, head_size, hidden_size), device=qkv_weight.device) - v_weight = torch.empty((num_query_groups, head_size, hidden_size), device=qkv_weight.device) - - qkv_index = 0 - for i in range(num_query_groups): - q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :] = qkv_weight[ - qkv_index : qkv_index + heads_per_group, :, : - ] - qkv_index += heads_per_group - k_weight[i, :, :] = qkv_weight[qkv_index, :, :] - qkv_index += 1 - v_weight[i, :, :] = 
qkv_weight[qkv_index, :, :] - qkv_index += 1 - - return [('q_proj', q_weight), ('k_proj', k_weight), ('v_proj', v_weight)] - - -def split_kv_weight(kv_weight, model_config): - """Split cross attention qkv from nemo to hf format""" - hidden_size = model_config.hidden_size - head_num = model_config.num_attention_heads - num_query_groups = model_config.num_query_groups or head_num - head_size = model_config.kv_channels or (hidden_size // head_num) - kv_weight = kv_weight.reshape(-1, head_size, hidden_size) - k_weight = torch.empty((num_query_groups, head_size, hidden_size), device=kv_weight.device) - v_weight = torch.empty((num_query_groups, head_size, hidden_size), device=kv_weight.device) - - kv_index = 0 - for i in range(num_query_groups): - k_weight[i, :, :] = kv_weight[kv_index, :, :] - kv_index += 1 - v_weight[i, :, :] = kv_weight[kv_index, :, :] - kv_index += 1 - - return [('k_proj', k_weight), ('v_proj', v_weight)] - - -def split_gate_weight(gate_weight): - """Split linear fc to gate""" - gate_weight = torch.chunk(gate_weight, 2, axis=0) - - return [('gate_proj', gate_weight[0]), ('up_proj', gate_weight[1])] - - -def convert_mllama_config(source_vision, source_text): - """Convert nemo mllama config to hf config""" - vision_config = MllamaVisionConfig( - num_hidden_layers=source_vision.num_layers, - hidden_size=source_vision.hidden_size, - attention_heads=source_vision.num_attention_heads, - image_size=source_vision.vision_chunk_size, - max_num_tiles=source_vision.vision_max_num_chunks, - torch_dtype="bfloat16", - ) - - cross_attention_layers = [ - x + i for i, x in enumerate(source_text._init_fusion_schedule(source_text.num_cross_attention_layers)) - ] - text_config = MllamaTextConfig( - rope_theta=source_text.rotary_base, - num_hidden_layers=source_text.num_layers + source_text.num_cross_attention_layers, - cross_attention_layers=cross_attention_layers, - hidden_size=source_text.hidden_size, - intermediate_size=source_text.ffn_hidden_size, - 
num_attention_heads=source_text.num_attention_heads, - num_key_value_heads=source_text.num_query_groups, - vocab_size=source_text.vocab_size, - rope_scaling={ - "factor": 8.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3", - }, - eos_token_id=[128001, 128008, 128009], - torch_dtype="bfloat16", - ) - - return MllamaConfig(vision_config, text_config, torch_dtype="bfloat16") - - -def convert_mllama_nemo_to_hf(checkpoint_path, processor_name): - """Convert nemo mllama to hf state dict and config""" - processor = AutoProcessor.from_pretrained(processor_name) - - strategy = nl.MegatronStrategy( - tensor_model_parallel_size=1, - ckpt_load_optimizer=False, - ckpt_save_optimizer=False, - ) - trainer = nl.Trainer( - devices=1, - max_steps=1000, - accelerator="gpu", - strategy=strategy, - plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), - val_check_interval=1000, - limit_val_batches=50, - ) - - fabric = trainer.to_fabric() - - tokenizer = processor.tokenizer - model = vlm.MLlamaModel(vlm.MLlamaConfig11BInstruct(), tokenizer=tokenizer) - config = model.config - vision_model_config = config.vision_model_config - language_model_config = config.language_model_config - model = fabric.load_model(checkpoint_path, model) - model = model.module.module.module.module - - state_dict = model.state_dict() - del model - - v = "vision_model.vision_encoder" - key_map = [ - ("vision_model.class_embedding", f"{v}.class_embedding"), - ("vision_model.gated_positional_embedding.embedding", f"{v}.positional_embedding"), - ( - "vision_model.gated_positional_embedding.tile_embedding.weight", - f"{v}.gated_tile_positional_embedding.weight", - ), - ("vision_model.gated_positional_embedding.gate", f"{v}.gated_positional_embedding_gate"), - ("vision_model.layernorm_post.bias", f"{v}.ln_post.bias"), - ("vision_model.layernorm_post.weight", f"{v}.ln_post.weight"), - ("vision_model.layernorm_pre.bias", f"{v}.ln_pre.bias"), 
- ("vision_model.layernorm_pre.weight", f"{v}.ln_pre.weight"), - ("vision_model.post_tile_positional_embedding.embedding.weight", f"{v}.post_tile_pos_embed.embedding.weight"), - ("vision_model.post_tile_positional_embedding.gate", f"{v}.post_tile_pos_embed.gate"), - ("vision_model.pre_tile_positional_embedding.embedding.weight", f"{v}.pre_tile_pos_embed.embedding.weight"), - ("vision_model.pre_tile_positional_embedding.gate", f"{v}.pre_tile_pos_embed.gate"), - ("multi_modal_projector.bias", "vision_model.vision_projection.encoder.bias"), - ("multi_modal_projector.weight", "vision_model.vision_projection.encoder.weight"), - ("language_model.model.norm.weight", "language_model.decoder.final_layernorm.weight"), - ("language_model.lm_head.weight", "language_model.output_layer.weight"), - ] - - for i in range(vision_model_config.num_layers): - key_map.extend( - [ - ( - f"vision_model.transformer.layers.{i}.self_attn.o_proj.weight", - f"{v}.transformer.layers.{i}.self_attention.linear_proj.weight", - ), - ( - f"vision_model.transformer.layers.{i}.input_layernorm.bias", - f"{v}.transformer.layers.{i}.input_layernorm.bias", - ), - ( - f"vision_model.transformer.layers.{i}.input_layernorm.weight", - f"{v}.transformer.layers.{i}.input_layernorm.weight", - ), - ( - f"vision_model.transformer.layers.{i}.post_attention_layernorm.bias", - f"{v}.transformer.layers.{i}.pre_mlp_layernorm.bias", - ), - ( - f"vision_model.transformer.layers.{i}.post_attention_layernorm.weight", - f"{v}.transformer.layers.{i}.pre_mlp_layernorm.weight", - ), - ( - f"vision_model.transformer.layers.{i}.mlp.fc1.bias", - f"{v}.transformer.layers.{i}.mlp.linear_fc1.bias", - ), - ( - f"vision_model.transformer.layers.{i}.mlp.fc1.weight", - f"{v}.transformer.layers.{i}.mlp.linear_fc1.weight", - ), - ( - f"vision_model.transformer.layers.{i}.mlp.fc2.bias", - f"{v}.transformer.layers.{i}.mlp.linear_fc2.bias", - ), - ( - f"vision_model.transformer.layers.{i}.mlp.fc2.weight", - 
f"{v}.transformer.layers.{i}.mlp.linear_fc2.weight", - ), - ] - ) - - for i in range(vision_model_config.num_global_layers): - key_map.extend( - [ - ( - f"vision_model.global_transformer.layers.{i}.self_attn.o_proj.weight", - f"{v}.global_transformer.layers.{i}.self_attention.linear_proj.weight", - ), - ( - f"vision_model.global_transformer.layers.{i}.gate_attn", - f"{v}.global_transformer.layers.{i}.gate_attn", - ), - ( - f"vision_model.global_transformer.layers.{i}.gate_ffn", - f"{v}.global_transformer.layers.{i}.gate_ffn", - ), - ( - f"vision_model.global_transformer.layers.{i}.input_layernorm.bias", - f"{v}.global_transformer.layers.{i}.input_layernorm.bias", - ), - ( - f"vision_model.global_transformer.layers.{i}.input_layernorm.weight", - f"{v}.global_transformer.layers.{i}.input_layernorm.weight", - ), - ( - f"vision_model.global_transformer.layers.{i}.post_attention_layernorm.bias", - f"{v}.global_transformer.layers.{i}.pre_mlp_layernorm.bias", - ), - ( - f"vision_model.global_transformer.layers.{i}.post_attention_layernorm.weight", - f"{v}.global_transformer.layers.{i}.pre_mlp_layernorm.weight", - ), - ( - f"vision_model.global_transformer.layers.{i}.mlp.fc1.bias", - f"{v}.global_transformer.layers.{i}.mlp.linear_fc1.bias", - ), - ( - f"vision_model.global_transformer.layers.{i}.mlp.fc1.weight", - f"{v}.global_transformer.layers.{i}.mlp.linear_fc1.weight", - ), - ( - f"vision_model.global_transformer.layers.{i}.mlp.fc2.bias", - f"{v}.global_transformer.layers.{i}.mlp.linear_fc2.bias", - ), - ( - f"vision_model.global_transformer.layers.{i}.mlp.fc2.weight", - f"{v}.global_transformer.layers.{i}.mlp.linear_fc2.weight", - ), - ] - ) - - cross_attention_frequency = language_model_config.num_layers // language_model_config.num_cross_attention_layers - toal_num_layer = language_model_config.num_layers + language_model_config.num_cross_attention_layers - prefix = "language_model.decoder" - for i in range(toal_num_layer): - cross_num = (i - 3) // 
(cross_attention_frequency + 1) - if (i - 3) % (cross_attention_frequency + 1) == 0: - xattn_index = cross_num * cross_attention_frequency + 3 - key_map.extend( - [ - ( - f"language_model.model.layers.{i}.cross_attn.o_proj.weight", - f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_proj.weight", - ), - ( - f"language_model.model.layers.{i}.cross_attn.q_proj.weight", - f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_q.weight", - ), - ( - f"language_model.model.layers.{i}.cross_attn.k_norm.weight", - f"{prefix}.xattn_layers.{xattn_index}.cross_attention.k_layernorm.weight", - ), - ( - f"language_model.model.layers.{i}.input_layernorm.weight", - f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_q.layer_norm_weight", - ), - ( - f"language_model.model.layers.{i}.cross_attn.q_norm.weight", - f"{prefix}.xattn_layers.{xattn_index}.cross_attention.q_layernorm.weight", - ), - ( - f"language_model.model.layers.{i}.post_attention_layernorm.weight", - f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.layer_norm_weight", - ), - ( - f"language_model.model.layers.{i}.mlp.down_proj.weight", - f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc2.weight", - ), - ( - f"language_model.model.layers.{i}.cross_attn_attn_gate", - f"{prefix}.xattn_layers.{xattn_index}.gate_attn", - ), - ( - f"language_model.model.layers.{i}.cross_attn_mlp_gate", - f"{prefix}.xattn_layers.{xattn_index}.gate_ffn", - ), - ] - ) - else: - attn_index = i - cross_num - 1 - key_map.extend( - [ - ( - f"language_model.model.layers.{i}.self_attn.o_proj.weight", - f"{prefix}.layers.{attn_index}.self_attention.linear_proj.weight", - ), - ( - f"language_model.model.layers.{i}.post_attention_layernorm.weight", - f"{prefix}.layers.{attn_index}.mlp.linear_fc1.layer_norm_weight", - ), - ( - f"language_model.model.layers.{i}.mlp.down_proj.weight", - f"{prefix}.layers.{attn_index}.mlp.linear_fc2.weight", - ), - ( - f"language_model.model.layers.{i}.input_layernorm.weight", - 
f"{prefix}.layers.{attn_index}.self_attention.linear_qkv.layer_norm_weight", - ), - ] - ) - - new_state_dict = {} - for new_key, old_key in key_map: - new_state_dict[new_key] = state_dict[old_key] - - def convert_vision_qkv_weight(state_dict, vision_model_config): - hidden_size = vision_model_config.hidden_size - - new_state_dict = {} - for i in range(vision_model_config.num_layers): - qkv_weights = state_dict[ - f"vision_model.vision_encoder.transformer.layers.{i}.self_attention.linear_qkv.weight" - ] - - for name, weight in split_qkv_weight(qkv_weights, vision_model_config): - new_key = f'vision_model.transformer.layers.{i}.self_attn.{name}.weight' - new_state_dict[new_key] = weight.reshape(-1, hidden_size) - - for i in range(vision_model_config.num_global_layers): - qkv_weights = state_dict[ - f"vision_model.vision_encoder.global_transformer.layers.{i}.self_attention.linear_qkv.weight" - ] - - for name, weight in split_qkv_weight(qkv_weights, vision_model_config): - new_key = f'vision_model.global_transformer.layers.{i}.self_attn.{name}.weight' - new_state_dict[new_key] = weight.reshape(-1, hidden_size) - - return new_state_dict - - def convert_patch_embeding(state_dict): - conv1_weight = state_dict["vision_model.vision_encoder.conv1._linear.weight"] - return {"vision_model.patch_embedding.weight": conv1_weight.reshape(conv1_weight.shape[0], 3, 14, 14)} - - def convert_language_qkv_weight(state_dict, language_model_config): - hidden_size = language_model_config.hidden_size - new_state_dict = {} - for i in range(toal_num_layer): - cross_num = (i - 3) // (cross_attention_frequency + 1) - if (i - 3) % (cross_attention_frequency + 1) == 0: - xattn_index = cross_num * cross_attention_frequency + 3 - kv_weights = state_dict[f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_kv.weight"] - for name, weight in split_kv_weight(kv_weights, language_model_config): - new_key = f"language_model.model.layers.{i}.cross_attn.{name}.weight" - new_state_dict[new_key] = 
weight.reshape(-1, hidden_size) - else: - attn_index = i - cross_num - 1 - qkv_weights = state_dict[f"{prefix}.layers.{attn_index}.self_attention.linear_qkv.weight"] - for name, weight in split_qkv_weight(qkv_weights, language_model_config): - new_key = f"language_model.model.layers.{i}.self_attn.{name}.weight" - new_state_dict[new_key] = weight.reshape(-1, hidden_size) - - return new_state_dict - - def convert_gate(state_dict): - new_state_dict = {} - for i in range(toal_num_layer): - cross_num = (i - 3) // (cross_attention_frequency + 1) - if (i - 3) % (cross_attention_frequency + 1) == 0: - xattn_index = cross_num * cross_attention_frequency + 3 - gate_weight = state_dict[f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.weight"] - else: - attn_index = i - cross_num - 1 - gate_weight = state_dict[f"{prefix}.layers.{attn_index}.mlp.linear_fc1.weight"] - - for name, weight in split_gate_weight(gate_weight): - new_key = f"language_model.model.layers.{i}.mlp.{name}.weight" - new_state_dict[new_key] = weight - - return new_state_dict - - def convert_embedding(state_dict): - word_embeddings = state_dict["language_model.embedding.word_embeddings.weight"] - learnable_embedding = state_dict["language_model.learnable_embedding.weight"] - - return {"language_model.model.embed_tokens.weight": torch.cat((word_embeddings, learnable_embedding), dim=0)} - - new_state_dict.update(convert_vision_qkv_weight(state_dict, vision_model_config)) - new_state_dict.update(convert_patch_embeding(state_dict)) - new_state_dict.update(convert_language_qkv_weight(state_dict, language_model_config)) - new_state_dict.update(convert_gate(state_dict)) - new_state_dict.update(convert_embedding(state_dict)) - - return new_state_dict, convert_mllama_config(vision_model_config, language_model_config) diff --git a/nemo/export/multimodal/run.py b/nemo/export/multimodal/run.py deleted file mode 100644 index d113f877b3c3..000000000000 --- a/nemo/export/multimodal/run.py +++ /dev/null @@ -1,1168 +0,0 @@ 
-# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import json -import os - -try: - import decord -except Exception: - import logging - - logging.warning("The package `decord` was not installed in this environment.") - -import einops -import numpy as np -import soundfile as sf -import tensorrt as trt -import tensorrt_llm -import tensorrt_llm.profiler as profiler -import torch -import yaml -from PIL import Image -from tensorrt_llm import logger -from tensorrt_llm._utils import str_dtype_to_trt, torch_dtype_to_trt -from tensorrt_llm.runtime import ModelRunner, Session, TensorInfo -from torch.nn import functional as F -from torchvision import transforms -from transformers import AutoProcessor, CLIPImageProcessor - -from nemo.export.utils.constants import TRTLLM_ENGINE_DIR - - -def trt_dtype_to_torch(dtype): - if dtype == trt.float16: - return torch.float16 - elif dtype == trt.float32: - return torch.float32 - elif dtype == trt.int32: - return torch.int32 - elif dtype == trt.bfloat16: - return torch.bfloat16 - else: - raise TypeError("%s is not supported" % dtype) - - -class MultimodalModelRunner: - - def __init__(self, visual_engine_dir, llm_engine_dir, modality='vision'): - self.modality = modality - self.runtime_rank = tensorrt_llm.mpi_rank() - device_id = self.runtime_rank % torch.cuda.device_count() - torch.cuda.set_device(device_id) - self.device = "cuda:%d" % (device_id) - - self.stream = 
torch.cuda.Stream(torch.cuda.current_device()) - torch.cuda.set_stream(self.stream) - - # parse model type from visual engine config - with open(os.path.join(visual_engine_dir, "config.json"), "r") as f: - config = json.load(f) - self.model_type = config['builder_config']['model_type'] - self.vision_precision = config['builder_config']['precision'] - self.modality_precision = config['builder_config']['precision'] - - self.num_frames = config['builder_config'].get('num_frames', None) - self.image_size = config['builder_config'].get('image_size', None) - - self.profiling_iterations = 20 - - if modality == 'vision': - self.init_image_encoder(visual_engine_dir) - self.init_tokenizer(llm_engine_dir) - self.init_llm(os.path.join(llm_engine_dir, TRTLLM_ENGINE_DIR)) # Engine is stored in subdirectory - if self.model_type == 'lita' or self.model_type == 'vila' or self.model_type == 'vita': - self.init_vision_preprocessor(visual_engine_dir) - - def init_tokenizer(self, llm_engine_dir): - if os.path.exists(os.path.join(llm_engine_dir, "tokenizer_config.json")): - from transformers import AutoTokenizer - - self.tokenizer = AutoTokenizer.from_pretrained(llm_engine_dir) - self.tokenizer.pad_token = self.tokenizer.eos_token - if self.model_type == 'vita': - self.tokenizer.im_start_id = self.tokenizer.convert_tokens_to_ids("") - self.tokenizer.im_end_id = self.tokenizer.convert_tokens_to_ids("") - self.tokenizer.vid_start_id = self.tokenizer.convert_tokens_to_ids("") - self.tokenizer.vid_end_id = self.tokenizer.convert_tokens_to_ids("") - else: - from sentencepiece import SentencePieceProcessor - - sp = SentencePieceProcessor(os.path.join(llm_engine_dir, 'tokenizer.model')) - - class return_obj: - - def __init__(self, input_ids): - self.input_ids = input_ids - - def __getitem__(self, name): - if name in "input_ids": - return self.input_ids - else: - raise AttributeError(f"'return_obj' has no item '{name}'") - - # sentencepiece does not follow the same interface as HF - class 
HFTokenizerInterface: - - def encode(self, x, return_tensors=None, **kwargs): - out = sp.encode(x) - if return_tensors == "pt": - out = torch.tensor(out) - return return_obj(out) - - def __call__(self, x, return_tensors=None, **kwargs): - return self.encode(x, return_tensors, **kwargs) - - def decode(self, x, **kwargs): - return sp.decode(x.tolist()) - - def batch_decode(self, x, **kwargs): - return self.decode(x, **kwargs) - - self.tokenizer = HFTokenizerInterface() - self.tokenizer.eos_token_id = sp.eos_id() - self.tokenizer.bos_token_id = sp.bos_id() - self.tokenizer.pad_token_id = sp.pad_id() - - self.tokenizer.padding_side = "right" - - if self.model_type == 'lita': - self.tokenizer.im_start_id = sp.piece_to_id("") - self.tokenizer.im_end_id = sp.piece_to_id("") - self.tokenizer.vid_start_id = sp.piece_to_id("") - self.tokenizer.vid_end_id = sp.piece_to_id("") - - def init_image_encoder(self, visual_engine_dir): - vision_encoder_path = os.path.join(visual_engine_dir, 'visual_encoder.engine') - logger.info(f'Loading engine from {vision_encoder_path}') - with open(vision_encoder_path, 'rb') as f: - engine_buffer = f.read() - logger.info(f'Creating session from engine {vision_encoder_path}') - self.visual_encoder_session = Session.from_serialized_engine(engine_buffer) - - def init_vision_preprocessor(self, visual_encoder_dir): - with open(os.path.join(visual_encoder_dir, 'nemo_config.yaml'), 'r') as f: - self.nemo_config = yaml.safe_load(f) - - vision_config = self.nemo_config["mm_cfg"]["vision_encoder"] - - if self.model_type == 'lita': - self.image_processor = AutoProcessor.from_pretrained( - vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True - ) - elif self.model_type == 'vila' or self.model_type == 'vita': - from transformers import SiglipImageProcessor - - self.image_processor = SiglipImageProcessor.from_pretrained( - vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True - ) - else: - raise 
ValueError(f"Invalid model type: {self.model_type}") - - def init_llm(self, llm_engine_dir): - self.model = ModelRunner.from_dir( - llm_engine_dir, - rank=tensorrt_llm.mpi_rank(), - debug_mode=False, - stream=self.stream, - ) - self.model_config = self.model.session._model_config - self.runtime_mapping = self.model.session.mapping - - def video_preprocess(self, video_path): - from decord import VideoReader - - if isinstance(video_path, str): - vr = VideoReader(video_path) - num_frames = self.num_frames - if num_frames == -1: - frames = [Image.fromarray(frame.asnumpy()).convert('RGB') for frame in vr] - else: - # equally sliced frames into self.num_frames frames - # if self.num_frames is greater than the number of frames in the video, we will repeat the last frame - num_frames = min(num_frames, len(vr)) - indices = np.linspace(0, len(vr) - 1, num=num_frames, dtype=int) - frames = [Image.fromarray(vr[idx].asnumpy()).convert('RGB') for idx in indices] - if len(frames) < num_frames: - frames += [frames[-1]] * (num_frames - len(frames)) - elif isinstance(video_path, np.ndarray): - num_frames = self.num_frames - if num_frames == -1: - frames = [Image.fromarray(frame).convert('RGB') for frame in video_path] - else: - # equally sliced frames into self.num_frames frames - # if self.num_frames is greater than the number of frames in the video, we will repeat the last frame - num_frames = min(num_frames, video_path.shape[0]) - indices = np.linspace(0, video_path.shape[0] - 1, num=num_frames, dtype=int) - frames = [Image.fromarray(video_path[idx]).convert('RGB') for idx in indices] - if len(frames) < num_frames: - frames += [frames[-1]] * (num_frames - len(frames)) - else: - frames = self.video_path - - processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16) - frames = processor.preprocess(frames, return_tensors="pt")['pixel_values'] - # make dtype consistent with vision encoder - media_tensors = frames.to( - 
tensorrt_llm._utils.str_dtype_to_torch(self.vision_precision) - ) # [num_frames, 3, H, W] - return media_tensors.unsqueeze(0) # [1, num_frames, 3, H, W] - - def insert_tokens_by_index(self, input_ids, num_frames): - im_start_id = self.tokenizer.im_start_id - im_end_id = self.tokenizer.im_end_id - vid_start_id = self.tokenizer.vid_start_id - vid_end_id = self.tokenizer.vid_end_id - - image_token_indices = (input_ids == 0).nonzero(as_tuple=False).squeeze().tolist() - input_ids = input_ids.squeeze().tolist() - offset = 0 - - # Insert the image tokens and corresponding start/end tokens - for i in range(num_frames): - idx = image_token_indices[1] + offset - input_ids.insert(idx + 1, im_end_id) - input_ids.insert(idx + 1, 0) - input_ids.insert(idx + 1, im_start_id) - offset += 3 - - # Insert the video start and end tokens around the video token - vid_idx = image_token_indices[1] + offset - input_ids.insert(vid_idx + 1, vid_end_id) - input_ids.insert(vid_idx + 1, 0) - input_ids.insert(vid_idx + 1, vid_start_id) - - input_ids.pop(image_token_indices[1]) - input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0) - - return input_ids - - def preprocess(self, warmup, pre_prompt, post_prompt, image, attention_mask, batch_size): - if not warmup: - profiler.start(self.modality.capitalize()) - - if not warmup: - profiler.stop(self.modality.capitalize()) - - if self.model_type == 'vila': - visual_features, visual_atts = self.get_visual_features(image, attention_mask) - input_ids = self.tokenizer_image_token(batch_size, pre_prompt[0] + post_prompt[0], self.tokenizer) - batch_split_prompts = self.split_prompt_by_images(input_ids) - first_batch_split_prompts = batch_split_prompts[0] - # compute prompt length + visual length - length = sum([ids.shape[1] for ids in first_batch_split_prompts]) - if batch_size == 1 and len(image) > 1: - # mode 1: multiple image as a whole, flatten visual dims - length += visual_atts.shape[0] * visual_atts.shape[1] - else: - # mode 2: multiple 
images individually (replicate prompt for each image) - length += visual_atts.shape[1] - - input_lengths = torch.IntTensor([length] * batch_size).to(torch.int32) - input_ids, ptuning_args = self.setup_fake_prompts_vila( - batch_size, visual_features, first_batch_split_prompts, input_lengths - ) - return input_ids, input_lengths, ptuning_args, visual_features - - elif self.model_type == 'lita' or self.model_type == 'vita': - visual_input = [] - for i, img in enumerate(image): - visual_features, visual_atts = self.get_visual_features(img, attention_mask) - visual_features = visual_features.unsqueeze(0) - im_tokens, vid_tokens, num_sample_frames = self.preprocess_lita_visual(visual_features, self.nemo_config) - visual_input.extend([im_tokens, vid_tokens]) - - input_ids = self.tokenizer_image_token(batch_size, pre_prompt[0] + post_prompt[0], self.tokenizer) - input_ids = self.insert_tokens_by_index(input_ids, num_sample_frames) - batch_splits = self.split_prompt_by_images(input_ids) - first_batch_split_prompts = batch_splits[0] - length = sum([ids.shape[1] for ids in first_batch_split_prompts]) - - # Update visual atts shape to match im_tokens shape and vid_tokens shape - im_tokens = im_tokens.view(1, -1, im_tokens.shape[-1]) - visual_features = torch.cat([im_tokens, vid_tokens], dim=1) - visual_atts = torch.ones(visual_features.size()[:-1], dtype=torch.long).to(image.device) - - if batch_size == 1: - length += visual_atts.shape[0] * visual_atts.shape[1] - else: - raise ValueError("Batch size greater than 1 is not supported for LITA and VITA models") - - input_lengths = torch.IntTensor([length] * batch_size).to(torch.int32) - input_ids, ptuning_args = self.setup_fake_prompts_vila( - batch_size, visual_input, first_batch_split_prompts, input_lengths - ) - return input_ids, input_lengths, ptuning_args, visual_features - else: - visual_features, visual_atts = self.get_visual_features(image, attention_mask) - pre_input_ids = self.tokenizer(pre_prompt, return_tensors="pt", 
padding=True).input_ids - if post_prompt[0] is not None: - post_input_ids = self.tokenizer(post_prompt, return_tensors="pt", padding=True).input_ids - if self.model_type == 'video-neva': - length = ( - pre_input_ids.shape[1] + post_input_ids.shape[1] + visual_atts.shape[2] * visual_atts.shape[1] - ) - else: - length = pre_input_ids.shape[1] + post_input_ids.shape[1] + visual_atts.shape[1] - else: - post_input_ids = None - length = pre_input_ids.shape[1] + visual_atts.shape[1] - - input_lengths = torch.IntTensor([length] * batch_size).to(torch.int32) - - input_ids, ptuning_args = self.setup_fake_prompts( - visual_features, pre_input_ids, post_input_ids, input_lengths - ) - - return input_ids, input_lengths, ptuning_args, visual_features - - @staticmethod - def tokenizer_image_token(batch_size, prompt, tokenizer, image_token_index=-200): - prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("")] - - def insert_separator(X, sep): - return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1] - - input_ids = [] - offset = 0 - if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id: - offset = 1 - input_ids.append(prompt_chunks[0][0]) - - for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)): - input_ids.extend(x[offset:]) - - input_ids = torch.tensor(input_ids, dtype=torch.long) - input_ids[input_ids == image_token_index] = 0 - input_ids = input_ids.unsqueeze(0).expand(batch_size, -1) - - return input_ids - - def split_prompt_by_images(self, tensor): - batch_splits = [] - for batch in tensor: - # Find indices where value is zero () - zero_indices = (batch == 0).nonzero(as_tuple=False).squeeze(0) - # Add starting point for slicing - start_idx = 0 - splits = [] - for idx in zero_indices: - if start_idx != idx: # Ensure not slicing zero-length tensors - splits.append(batch[start_idx:idx].unsqueeze(0)) - start_idx = idx + 1 # Move start index past the zero - if 
start_idx < len(batch): # Handle last segment if it's not zero-ending - splits.append(batch[start_idx:].unsqueeze(0)) - # Remove empty tensors resulting from consecutive zeros - splits = [split for split in splits if split.numel() > 0] - batch_splits.append(splits) - - return batch_splits - - def generate( - self, - pre_prompt, - post_prompt, - image, - decoder_input_ids, - max_new_tokens, - attention_mask, - warmup, - batch_size, - top_k, - top_p, - temperature, - repetition_penalty, - num_beams, - lora_uids=None, - ): - if not warmup: - profiler.start("Generate") - - input_ids, input_lengths, ptuning_args, visual_features = self.preprocess( - warmup, pre_prompt, post_prompt, image, attention_mask, batch_size - ) - - if warmup: - return None - - profiler.start("LLM") - end_id = self.tokenizer.eos_token_id - - ptuning_args[0] = torch.stack([ptuning_args[0]]) - output_ids = self.model.generate( - input_ids, - sampling_config=None, - prompt_table=ptuning_args[0], - max_new_tokens=max_new_tokens, - end_id=end_id, - pad_id=( - self.tokenizer.pad_token_id - if self.tokenizer.pad_token_id is not None - else self.tokenizer.all_special_ids[0] - ), - top_k=top_k, - top_p=top_p, - temperature=temperature, - repetition_penalty=repetition_penalty, - num_beams=num_beams, - output_sequence_lengths=False, - lora_uids=lora_uids, - return_dict=False, - ) - - profiler.stop("LLM") - - if tensorrt_llm.mpi_rank() == 0: - # Extract a list of tensors of shape beam_width x output_ids. 
- output_beams_list = [ - self.tokenizer.batch_decode( - output_ids[batch_idx, :, input_lengths[batch_idx] :], skip_special_tokens=True - ) - for batch_idx in range(batch_size) - ] - - stripped_text = [ - [output_beams_list[batch_idx][beam_idx].strip() for beam_idx in range(num_beams)] - for batch_idx in range(batch_size) - ] - profiler.stop("Generate") - return stripped_text - else: - profiler.stop("Generate") - return None - - def get_visual_features(self, image, attention_mask): - visual_features = {'input': image.to(tensorrt_llm._utils.str_dtype_to_torch(self.vision_precision))} - if attention_mask is not None: - visual_features['attention_mask'] = attention_mask - tensor_info = [TensorInfo('input', str_dtype_to_trt(self.vision_precision), image.shape)] - if attention_mask is not None: - tensor_info.append(TensorInfo('attention_mask', trt.DataType.INT32, attention_mask.shape)) - - visual_output_info = self.visual_encoder_session.infer_shapes(tensor_info) - - visual_outputs = { - t.name: torch.empty(tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device=image.device) - for t in visual_output_info - } - - ok = self.visual_encoder_session.run(visual_features, visual_outputs, self.stream.cuda_stream) - assert ok, "Runtime execution failed for vision encoder session" - self.stream.synchronize() - - image_embeds = visual_outputs['output'] - image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device) - - return image_embeds, image_atts - - def setup_fake_prompts(self, visual_features, pre_input_ids, post_input_ids, input_lengths): - # Assemble fake prompts which points to image embedding actually - if hasattr(self, 'num_frames') and (visual_features.shape[1] == self.num_frames): - visual_features = visual_features.view(visual_features.shape[0], -1, visual_features.shape[-1]) - - fake_prompt_id = torch.arange( - self.model_config.vocab_size, - self.model_config.vocab_size + visual_features.shape[0] * visual_features.shape[1], - ) - 
fake_prompt_id = fake_prompt_id.reshape(visual_features.shape[0], visual_features.shape[1]) - - if post_input_ids is not None: - input_ids = [pre_input_ids, fake_prompt_id, post_input_ids] - else: - input_ids = [fake_prompt_id, pre_input_ids] - input_ids = torch.cat(input_ids, dim=1).contiguous().to(torch.int32) - - ptuning_args = self.ptuning_setup(visual_features, input_ids, input_lengths) - - return input_ids, ptuning_args - - def setup_fake_prompts_vila(self, batch_size, visual_features, split_input_ids, input_lengths): - - if self.model_type == 'lita' or self.model_type == 'vita': - squeeze_img_tokens = visual_features[0].squeeze(0) - reshape_img_tokens = [t.unsqueeze(0) for t in squeeze_img_tokens] - visual_features = reshape_img_tokens + [visual_features[1]] - - fake_prompt_counter = self.model_config.vocab_size - if batch_size == 1: - # only check for multi-image inference (mode 1) - assert len(visual_features) <= len( - split_input_ids - ), "Unexpected number of visual features. Please check # in prompt and the #image files." - - input_ids = [] - if batch_size == 1: - input_ids = [split_input_ids[0]] - - if self.model_type == 'vila': - # mode 1: multiple image as a whole, concat all prompts together,
...
-                for idx, visual_feature in enumerate(visual_features):
-                    fake_prompt_id = torch.arange(fake_prompt_counter, fake_prompt_counter + visual_feature.shape[0])
-                    fake_prompt_counter += visual_feature.shape[0]
-                    fake_prompt_id = fake_prompt_id.unsqueeze(0)
-                    input_ids.append(fake_prompt_id)
-
-                    # in case no post prompt
-                    if len(split_input_ids) > idx + 1:
-                        input_ids.append(split_input_ids[idx + 1])
-            elif self.model_type == 'lita' or self.model_type == 'vita':
-                for idx, visual_f in enumerate(visual_features):
-                    fake_prompt_id = torch.arange(fake_prompt_counter, fake_prompt_counter + visual_f.shape[1])
-                    fake_prompt_id = fake_prompt_id.reshape(visual_f.shape[1])
-                    fake_prompt_counter += visual_f.shape[1]
-                    fake_prompt_id = fake_prompt_id.unsqueeze(0)
-                    input_ids.append(fake_prompt_id)
-
-                    # in case no post prompt
-                    if len(split_input_ids) > idx + 1:
-                        input_ids.append(split_input_ids[idx + 1])
-
-        elif batch_size > 1 and self.model_type == 'vila':
-            # mode 2: each image have individual prompt, 

-            for idx, visual_feature in enumerate(visual_features):
-                input_ids.append(split_input_ids[0])
-                fake_prompt_id = torch.arange(fake_prompt_counter, fake_prompt_counter + visual_feature.shape[0])
-                fake_prompt_counter += visual_feature.shape[0]
-                fake_prompt_id = fake_prompt_id.unsqueeze(0)
-                input_ids.append(fake_prompt_id)
-                if len(split_input_ids) > 1:
-                    input_ids.append(split_input_ids[1])
-
-        input_ids = torch.cat(input_ids, dim=1).contiguous().to(torch.int32)
-        input_ids = input_ids.reshape(batch_size, -1)
-        ptuning_args = self.ptuning_setup(visual_features, input_ids, input_lengths)
-        return input_ids, ptuning_args
-
-    def preprocess_lita_visual(self, visual_features, config):
-
-        b, t, s, d = visual_features.shape
-
-        num_frames = t
-        if (
-            'visual_token_format' in config['mm_cfg']['lita']
-            and config['mm_cfg']['lita']['visual_token_format'] == 'im_vid_start_end'
-        ):
-            num_image_frames = min(num_frames, config['mm_cfg']['lita']['sample_frames'])
-            idx = np.round(np.linspace(0, num_frames - 1, num_image_frames)).astype(int)
-
-            # Image and video features
-            im_features = visual_features[:, idx, ...]
-
-            vid_features = einops.reduce(visual_features, 'b t s d -> b t d', 'mean')
-            return im_features, vid_features, num_image_frames
-
-        elif (
-            'lita_video_arch' in config['mm_cfg']['lita']
-            and config['mm_cfg']['lita']['lita_video_arch'] == 'temporal_spatial_pool'
-        ):
-            pool_size = 2
-            selected_frames = np.round(np.linspace(0, visual_features.shape[1] - 1, pool_size * pool_size)).astype(int)
-            s_tokens = visual_features[:, selected_frames, ...]
-            s_tokens = einops.rearrange(s_tokens, 'b t (h w) d -> (b t) d h w', h=16, w=16)
-            s_tokens = F.avg_pool2d(s_tokens, kernel_size=pool_size)
-            s_tokens = einops.rearrange(s_tokens, '(b t) d h w -> b (t h w) d', b=b)
-
-            t_tokens = einops.reduce(visual_features, 'b t s d -> b t d', 'mean')
-
-            return t_tokens, s_tokens, pool_size**2
-
-        else:
-            raise ValueError(f'Invalid visual token format: {config["mm_cfg"]["lita"]["visual_token_format"]}')
-
-    def ptuning_setup(self, prompt_table, input_ids, input_lengths):
-        hidden_size = self.model_config.hidden_size * self.runtime_mapping.tp_size
-
-        if self.model_type == 'lita' or self.model_type == 'vita':
-            prompt_table = torch.cat(prompt_table, dim=1)
-        if prompt_table is not None:
-            task_vocab_size = torch.tensor(
-                [prompt_table.shape[1]],
-                dtype=torch.int32,
-            ).cuda()
-            prompt_table = prompt_table.view((prompt_table.shape[0] * prompt_table.shape[1], prompt_table.shape[2]))
-
-            assert prompt_table.shape[1] == hidden_size, "Prompt table dimensions do not match hidden size"
-
-            prompt_table = prompt_table.cuda().to(
-                dtype=tensorrt_llm._utils.str_dtype_to_torch(self.model_config.dtype)
-            )
-        else:
-            prompt_table = torch.empty([1, hidden_size]).cuda()
-            task_vocab_size = torch.zeros([1]).cuda()
-
-        if self.model_config.remove_input_padding:
-            tasks = torch.zeros([torch.sum(input_lengths)], dtype=torch.int32).cuda()
-        else:
-            tasks = torch.zeros(input_ids.shape, dtype=torch.int32).cuda()
-
-        return [prompt_table, tasks, task_vocab_size]
-
-    def expand2square_pt(self, images, background_color):
-        height, width = images.shape[-2:]
-        b = len(images)
-        background_color = torch.Tensor(background_color)
-        if width == height:
-            return images
-        elif width > height:
-            result = einops.repeat(background_color, 'c -> b c h w', b=b, h=width, w=width).clone()
-            paste_start = (width - height) // 2
-            paste_end = paste_start + height
-            result[:, :, paste_start:paste_end, :] = images
-            return result
-        else:
-            result = einops.repeat(background_color, 'c -> b c h w', b=b, h=height, w=height).clone()
-            paste_start = (height - width) // 2
-            paste_end = paste_start + width
-            result[:, :, :, paste_start:paste_end] = images
-            return result
-
-    def load_video(self, config, video_path, processor, num_frames=None):
-        frames = None
-        if isinstance(video_path, str):
-            decord.bridge.set_bridge('torch')
-            video_reader = decord.VideoReader(uri=video_path)
-            if num_frames is not None:
-                idx = np.round(np.linspace(0, len(video_reader) - 1, num_frames)).astype(int)
-                frames = video_reader.get_batch(idx)
-            else:
-                frames = torch.cat([torch.tensor(f.asnumpy()) for f in video_reader])
-        elif isinstance(video_path, np.ndarray):
-            frames = torch.tensor(video_path, dtype=torch.float32)
-
-        return self.preprocess_frames(frames, config, processor)
-
-    def preprocess_frames(self, frames, config, processor):
-        frames = einops.rearrange(frames, 't h w c -> t c h w')
-        if config['data']['image_aspect_ratio'] == 'pad':
-            frames = self.expand2square_pt(frames, tuple(int(x * 255) for x in processor.image_mean))
-        processed_frames = processor.preprocess(frames, return_tensors='pt')['pixel_values']
-        return processed_frames
-
-    def get_num_sample_frames(self, config, vid_len):
-        if (
-            'visual_token_format' in config['mm_cfg']['lita']
-            and config['mm_cfg']['lita']['visual_token_format'] == 'im_vid_start_end'
-        ):
-            max_frames = config['data']['num_frames']
-            if vid_len <= max_frames:
-                return vid_len
-            else:
-                subsample = int(np.ceil(float(vid_len) / max_frames))
-                return int(np.round(float(vid_len) / subsample))
-        else:
-            return config['mm_cfg']['lita']['sample_frames']
-
-    def process_lita_video(self, nemo_config, video_path, image_processor):
-        image = None
-        if isinstance(video_path, str):
-            vid_len = len(decord.VideoReader(video_path))
-            num_sample_frames = self.get_num_sample_frames(nemo_config, vid_len)
-            image = (
-                self.load_video(nemo_config, video_path, image_processor, num_sample_frames)
-                .unsqueeze(0)
-                .to(self.device, dtype=torch.bfloat16)
-            )
-        elif isinstance(video_path, np.ndarray):
-            image = (
-                self.load_video(nemo_config, video_path, image_processor)
-                .unsqueeze(0)
-                .to(self.device, dtype=torch.bfloat16)
-            )
-        return image
-
-    def process_image(self, image_file, image_processor, nemo_config, image_folder):
-        if isinstance(image_file, str):
-            if image_folder is not None:
-                image = Image.open(os.path.join(image_folder, image_file)).convert("RGB")
-            else:
-                image = Image.open(image_file).convert("RGB")
-        else:
-            # image is stored in bytearray
-            image = image_file
-
-        crop_size = nemo_config['mm_cfg']['vision_encoder']['crop_size']
-        crop_size = tuple(crop_size)
-        image = image.resize(crop_size)
-        if nemo_config['data']['image_aspect_ratio'] == 'pad':
-            image = self.expand2square_pt(image, tuple(int(x * 255) for x in image_processor.image_mean))
-            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
-        else:
-            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
-        return image
-
-    def process_vila_img(self, images):
-        new_images = [self.process_image(image, self.image_processor, self.nemo_config, None) for image in images]
-
-        if all(x.shape == new_images[0].shape for x in new_images):
-            new_images = torch.stack(new_images, dim=0)
-        return new_images
-
-    def setup_inputs(self, input_text, raw_image, batch_size):
-        attention_mask = None
-        image = None
-
-        if self.model_type == "neva":
-            image_size = self.image_size
-            dtype = torch.float32
-            transform = transforms.Compose(
-                [
-                    transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC),
-                    transforms.ToTensor(),
-                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
-                ]
-            )
-            image = transform(raw_image).to(dtype).unsqueeze(0)
-
-            if input_text is None:
-                input_text = "Hi! What is in this image?"
-
-            pre_prompt = "System\n\nUser\n"
-            post_prompt = f"\n{input_text}\nAssistant\n"
-        elif self.model_type == "video-neva":
-            image = self.video_preprocess(raw_image)  # shape (1, num_frames, 3, H, W)
-
-            if input_text is None:
-                input_text = "Hi! What is in this video?"
-
-            # SteerLM prompt template
-            pre_prompt = (
-                "System\nA chat between a curious user and an artificial intelligence assistant. "
-                "The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n"
-                "User"
-            )
-            post_prompt = (
-                f"\n{input_text}\nAssistant\n"
-                "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,"
-                "correctness:4,coherence:4,complexity:4,verbosity:4\n"
-            )
-        elif self.model_type in ['vila', 'lita', 'vita']:
-            if self.model_type == "vila" or self.model_type == "lita":
-                pre_prompt = (
-                    "A chat between a curious user and an artificial intelligence assistant. "
-                    "The assistant gives helpful, detailed, and polite answers to the user's questions. USER: "
-                )
-                if input_text is None:
-                    input_text = "\n Please elaborate what you see in the images?"
-                post_prompt = input_text + " ASSISTANT:"
-
-            elif self.model_type == "vita":
-                # llama3 prompt template
-                pre_prompt = (
-                    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
-                    "You are a helpful language and vision assistant. "
-                    "You are able to understand the visual content that the user provides, "
-                    "and assist the user with a variety of tasks using natural language. "
-                    "<|start_header_id|>user<|end_header_id|>\n\n"
-                )
-                if input_text is None:
-                    input_text = "\n Please elaborate what you see in the images?"
-                post_prompt = input_text + "<|start_header_id|>assistant<|end_header_id|>\n\n"
-
-        else:
-            raise RuntimeError(f"Invalid model type {self.model_type}")
-
-        if self.model_type == 'lita' or self.model_type == 'vita':
-            image = self.process_lita_video(self.nemo_config, raw_image, self.image_processor)
-
-        if self.model_type == 'vila':
-            raw_image = [raw_image] * batch_size
-            image = self.process_vila_img(raw_image)
-
-        # Repeat inputs to match batch size
-        pre_prompt = [pre_prompt] * batch_size
-        post_prompt = [post_prompt] * batch_size
-        if self.model_type not in ['vila', 'lita', 'vita']:
-            if image.dim() == 5:
-                image = image.expand(batch_size, -1, -1, -1, -1).contiguous()
-            else:
-                image = image.expand(batch_size, -1, -1, -1).contiguous()
-        image = image.to(self.device)
-
-        decoder_input_ids = None
-
-        return input_text, pre_prompt, post_prompt, image, decoder_input_ids, attention_mask
-
-    def run(
-        self,
-        input_text,
-        input_image,
-        max_new_tokens,
-        batch_size,
-        top_k,
-        top_p,
-        temperature,
-        repetition_penalty,
-        num_beams,
-        lora_uids=None,
-        run_profiling=False,
-        check_accuracy=False,
-    ):
-        input_text, pre_prompt, post_prompt, processed_image, decoder_input_ids, attention_mask = self.setup_inputs(
-            input_text, input_image, batch_size
-        )
-
-        self.generate(
-            pre_prompt,
-            post_prompt,
-            processed_image,
-            decoder_input_ids,
-            max_new_tokens,
-            attention_mask=attention_mask,
-            warmup=True,
-            batch_size=batch_size,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            num_beams=num_beams,
-            lora_uids=lora_uids,
-        )
-        num_iters = self.profiling_iterations if run_profiling else 1
-        for _ in range(num_iters):
-            output_text = self.generate(
-                pre_prompt,
-                post_prompt,
-                processed_image,
-                decoder_input_ids,
-                max_new_tokens,
-                attention_mask=attention_mask,
-                warmup=False,
-                batch_size=batch_size,
-                top_k=top_k,
-                top_p=top_p,
-                temperature=temperature,
-                repetition_penalty=repetition_penalty,
-                num_beams=num_beams,
-                lora_uids=lora_uids,
-            )
-        if self.runtime_rank == 0:
-            self.print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy)
-        return output_text
-
-    def print_result(self, input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy):
-        if not run_profiling and not check_accuracy:
-            return
-        logger.info("---------------------------------------------------------")
-        if self.model_type != 'nougat':
-            logger.info(f"\n[Q] {input_text}")
-        logger.info(f"\n[A] {output_text[0]}")
-
-        if num_beams == 1:
-            output_ids = self.tokenizer(output_text[0][0], add_special_tokens=False)['input_ids']
-            logger.info(f"Generated {len(output_ids)} tokens")
-
-        if check_accuracy:
-            for i in range(batch_size - 1):
-                if not (output_text[i] == output_text[i + 1]):
-                    logger.info(f"Output {i} and {i + 1} do not match")
-                    assert False
-
-                assert 'robot' in output_text[0][0].lower()
-
-        if run_profiling:
-            msec_per_batch = lambda name: 1000 * profiler.elapsed_time_in_sec(name) / self.profiling_iterations
-            logger.info('Latencies per batch (msec)')
-            logger.info(f'TRT {self.modality} encoder: %.1f' % (msec_per_batch(self.modality.capitalize())))
-            logger.info('TRTLLM LLM generate: %.1f' % (msec_per_batch('LLM')))
-            logger.info('Multimodal generate: %.1f' % (msec_per_batch('Generate')))
-
-        logger.info("---------------------------------------------------------")
-
-    def load_test_media(self, input_media):
-        media_model = ["video-neva", "lita", "vita"]
-        if self.model_type in media_model:
-            media = input_media
-        elif self.model_type == "neva" or self.model_type == "vila":
-            media = Image.open(input_media).convert('RGB')
-        else:
-            raise RuntimeError(f"Invalid model type {self.model_type}")
-
-        return media
-
-
-class SpeechllmModelRunner(MultimodalModelRunner):
-    def __init__(self, perception_engine_dir, llm_engine_dir, modality):
-        """
-        perception_engine_dir: path to the perception engine directory
-                               it should contain:
-                               config.json nemo_config.yaml
-                               perception_encoder.engine : tensorrt engine
-                               feature_extractor.ts  : torchscript model
-        llm_engine_dir: path to the LLM engine directory
-        """
-        super().__init__(perception_engine_dir, llm_engine_dir, modality)
-        assert self.model_type == 'salm'
-        # init preprocessor
-        feature_extractor_path = os.path.join(perception_engine_dir, 'feature_extractor.ts')
-        self.feature_extractor = self.init_speech_preprocessor(feature_extractor_path)
-        self.init_modality_encoder(perception_engine_dir)
-
-    def init_modality_encoder(self, engine_dir):
-        """
-        Initialize the modality encoder session from the prebuilt engine directory
-        Args:
-            engine_dir: str, path to the engine directory
-        """
-        # find file with .engine extension
-        engine_file = None
-        for file in os.listdir(engine_dir):
-            if file.endswith('.engine'):
-                engine_file = file
-                break
-        assert engine_file is not None, f"Engine file not found in {engine_dir}"
-        encoder_path = os.path.join(engine_dir, engine_file)
-        logger.info(f'Loading engine from {encoder_path}')
-        with open(encoder_path, 'rb') as f:
-            engine_buffer = f.read()
-        logger.info(f'Creating session from engine {encoder_path}')
-        self.modality_encoder_session = Session.from_serialized_engine(engine_buffer)
-
-    def init_speech_preprocessor(self, feature_extractor_path):
-        feature_extractor = torch.jit.load(feature_extractor_path)
-        feature_extractor.eval()
-        return feature_extractor
-
-    def process_audio(self, input_signal, input_signal_length):
-        """
-        Args:
-            input_signal: audio signal in numpy array
-            input_signal_length: length of the audio signal in numpy array
-
-        Returns:
-            processed_signal: torch.tensor [B, 80, T]
-            processed_signal_length [B]
-        """
-        input_signal = torch.tensor(input_signal, dtype=torch.float32)
-        input_signal_length = torch.tensor(input_signal_length, dtype=torch.int32)
-        processed_signal, processed_signal_length = self.feature_extractor(input_signal, input_signal_length)
-        return processed_signal, processed_signal_length
-
-    def setup_inputs(self, input_text, input_media, batch_size):
-        """
-        Args:
-            input_text: str or List[str] or None
-            input_media: Tuple[np.array, np.array]
-                input_signal: audio signal in numpy array [b, -1]
-                input_signal_length: length of the audio signal in numpy array [b]
-            batch_size: int
-
-        """
-        input_signal, input_signal_length = input_media
-        processed_signal, processed_signal_length = self.process_audio(input_signal, input_signal_length)
-        processed_signal = processed_signal.to(self.device)
-        processed_signal_length = processed_signal_length.to(self.device)
-        if input_text is None:
-            input_text = "Q: what's the transcription of the audio? A:"
-
-        if isinstance(input_text, str):
-            input_text = [input_text] * batch_size
-
-        assert len(input_text) == batch_size
-        pre_prompt = [''] * batch_size
-        post_prompt = input_text
-        decoder_input_ids = None
-        attention_mask = None
-        return (
-            input_text,
-            pre_prompt,
-            post_prompt,
-            processed_signal,
-            processed_signal_length,
-            decoder_input_ids,
-            attention_mask,
-        )
-
-    def load_test_media(self, input_media_path):
-        """
-        Args:
-            input_media_path: str, path to the audio file
-        Returns:
-            input_signal: np.array [1, -1]
-            input_signal_length: np.array [1]
-        """
-        waveform, sample_rate = sf.read(input_media_path, dtype=np.float32)
-        input_signal = np.array([waveform], dtype=np.float32)
-        input_signal_length = np.array([len(waveform)], dtype=np.int32)
-        return input_signal, input_signal_length
-
-    def get_modality_encoder_features(self, modality_features, attention_mask):
-        """
-        Do inference on the modality encoder engine
-        Args:
-            modality_features: dict {'input1': torch.tensor, 'input2': torch.tensor, ..}
-            attention_mask: None
-        Returns:
-        """
-
-        if attention_mask is not None:
-            modality_features['attention_mask'] = attention_mask
-
-        tensor_info = []
-        for key, tensor in modality_features.items():
-            tensor_info.append(TensorInfo(key, torch_dtype_to_trt(tensor.dtype), tensor.shape))
-
-        output_info = self.modality_encoder_session.infer_shapes(tensor_info)
-
-        outputs = {
-            t.name: torch.empty(tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device=self.device)
-            for t in output_info
-        }
-
-        ok = self.modality_encoder_session.run(modality_features, outputs, self.stream.cuda_stream)
-        assert ok, "Runtime execution failed for vision encoder session"
-        self.stream.synchronize()
-
-        return outputs
-
-    def preprocess(self, warmup, pre_prompt, post_prompt, processed_features, attention_mask, batch_size):
-        """
-        Args:
-            warmup: bool
-            pre_prompt: List[str]
-            post_prompt: List[str]
-            processed_features: Tuple[torch.tensor, torch.tensor]
-                processed_signal: torch.tensor [B, 80, T]
-                processed_signal_length: torch.tensor [B]
-            attention_mask: None
-            batch_size: int
-        Returns:
-            input_ids: torch.tensor [B, L]
-            input_lengths: torch.tensor [B]
-            ptuning_args: List[torch.tensor]
-            encoded_features: torch.tensor [B, L, D]
-        """
-        if not warmup:
-            profiler.start(self.modality.capitalize())
-
-        if not warmup:
-            profiler.stop(self.modality.capitalize())
-
-        assert self.model_type == 'salm', f"Invalid model type {self.model_type}"
-
-        processed_features = {
-            "processed_signal": processed_features[0],
-            "processed_signal_length": processed_features[1].to(torch.int32),
-        }
-        encoded_outputs = self.get_modality_encoder_features(processed_features, attention_mask)
-        encoded_features, encoded_length = encoded_outputs['encoded'], encoded_outputs['encoded_length']
-        pre_input_ids = self.tokenizer(pre_prompt).input_ids
-        post_input_ids = self.tokenizer(post_prompt).input_ids
-        input_lengths = []
-        input_ids = []
-        encoded_length = encoded_length.cpu().numpy()
-        fake_id_start = self.model.vocab_size
-        for i in range(batch_size):
-            feat_len = encoded_length[i]
-            feat_fake_ids = np.arange(fake_id_start, fake_id_start + feat_len)
-            cur_input_ids = np.concatenate([pre_input_ids[i], feat_fake_ids, post_input_ids[i]])
-            fake_id_start += feat_len
-            input_lengths.append(len(cur_input_ids))
-            input_ids.append(cur_input_ids)
-
-        max_length = max(input_lengths)
-        # convert input_ids to torch tensor with padding
-        input_ids = [
-            np.pad(ids, (0, max_length - len(ids)), 'constant', constant_values=self.tokenizer.pad_token_id)
-            for ids in input_ids
-        ]
-        input_ids = torch.tensor(input_ids, dtype=torch.int32)
-        input_lengths = torch.tensor(input_lengths, dtype=torch.int32)
-        ptuning_args = self.ptuning_setup(encoded_features, input_ids, input_lengths)
-
-        return input_ids, input_lengths, ptuning_args, encoded_features
-
-    def run(
-        self,
-        input_text,
-        input_media=None,
-        max_new_tokens: int = 30,
-        batch_size: int = 1,
-        top_k: int = 1,
-        top_p: float = 0.0,
-        temperature: float = 1.0,
-        repetition_penalty: float = 1.0,
-        num_beams: int = 1,
-        run_profiling=False,
-        check_accuracy=False,
-        input_signal=None,
-        input_signal_length=None,
-        lora_uids=None,
-    ):
-        """
-        Args:
-            input_text: str or List[str] or None
-            input_media: Tuple[np.array, np.array] or None
-                input_signal: audio signal in numpy array [b, -1]
-                input_signal_length: length of the audio signal in numpy array [b]
-            max_new_tokens: int
-            batch_size: int
-            top_k: int
-            top_p: float
-            temperature: float
-            repetition_penalty: float
-            num_beams: int
-            run_profiling: bool
-            check_accuracy: bool
-        """
-        if input_media is None:
-            assert input_signal is not None and input_signal_length is not None
-            input_media = (input_signal, input_signal_length)
-
-        (
-            input_text,
-            pre_prompt,
-            post_prompt,
-            processed_signal,
-            processed_signal_length,
-            decoder_input_ids,
-            attention_mask,
-        ) = self.setup_inputs(input_text, input_media, batch_size)
-        processed_media = (processed_signal, processed_signal_length)
-
-        self.generate(
-            pre_prompt,
-            post_prompt,
-            processed_media,
-            decoder_input_ids,
-            max_new_tokens,
-            attention_mask=attention_mask,
-            warmup=True,
-            batch_size=batch_size,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            repetition_penalty=repetition_penalty,
-            num_beams=num_beams,
-        )
-        num_iters = self.profiling_iterations if run_profiling else 1
-        for _ in range(num_iters):
-            output_text = self.generate(
-                pre_prompt,
-                post_prompt,
-                processed_media,
-                decoder_input_ids,
-                max_new_tokens,
-                attention_mask=attention_mask,
-                warmup=False,
-                batch_size=batch_size,
-                top_k=top_k,
-                top_p=top_p,
-                temperature=temperature,
-                repetition_penalty=repetition_penalty,
-                num_beams=num_beams,
-            )
-        if self.runtime_rank == 0:
-            self.print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy)
-        return output_text
diff --git a/nemo/export/onnx_llm_exporter.py b/nemo/export/onnx_llm_exporter.py
deleted file mode 100755
index 3204a3c75eeb..000000000000
--- a/nemo/export/onnx_llm_exporter.py
+++ /dev/null
@@ -1,475 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import warnings
-from pathlib import Path
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
-
-import numpy as np
-import torch
-import wrapt
-from transformers import AutoModel, AutoTokenizer
-
-from nemo.deploy import ITritonDeployable
-from nemo.export.utils import get_example_inputs, get_model_device_type, is_nemo2_checkpoint, validate_fp8_network
-from nemo.utils import logging
-
-if TYPE_CHECKING:
-    import tensorrt as trt
-
-
-@wrapt.decorator
-def noop_decorator(func):
-    """No op decorator"""
-
-    def wrapper(*args, **kwargs):
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-use_pytriton = True
-batch = noop_decorator
-try:
-    from pytriton.decorators import batch
-except Exception:
-    logging.warning("PyTriton is not available.")
-    use_pytriton = False
-
-
-use_onnxruntime = True
-try:
-    import onnxruntime
-except Exception:
-    logging.warning("onnxruntime is not available.")
-    use_onnxruntime = False
-
-
-use_trt = True
-try:
-    import tensorrt as trt
-except ImportError:
-    logging.warning("tensorrt is not available")
-    use_trt = False
-
-
-# pylint: disable=line-too-long
-class OnnxLLMExporter(ITritonDeployable):
-    """
-    Exports models to ONNX and run fast inference.
-
-    Example:
-        from nemo.export.onnx_llm_exporter import OnnxLLMExporter
-
-        onnx_llm_exporter = OnnxLLMExporter(
-            onnx_model_dir="/path/for/onnx_model/files",
-            model_name_or_path="/path/for/model/files",
-        )
-
-        onnx_llm_exporter.export(
-            input_names=["input_ids", "attention_mask", "dimensions"],
-            output_names=["embeddings"],
-        )
-
-        output = onnx_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
-        print("output: ", output)
-    """
-
-    def __init__(
-        self,
-        onnx_model_dir: str,
-        model: Optional[torch.nn.Module] = None,
-        tokenizer=None,
-        model_name_or_path: str = None,
-        load_runtime: bool = True,
-    ):
-        """
-        Initializes the ONNX Exporter.
-
-        Args:
-            onnx_model_dir (str): path for storing the ONNX model files.
-            model (Optional[torch.nn.Module]): torch model.
-            tokenizer (HF or NeMo tokenizer): tokenizer class.
-            model_name_or_path (str): a path for ckpt or HF model ID
-            load_runtime (bool): load ONNX runtime if there is any exported model available in
-                                 the onnx_model_dir folder.
-        """
-        self.onnx_model_dir = onnx_model_dir
-        self.model_name_or_path = model_name_or_path
-        self.onnx_model_path = str(Path(onnx_model_dir) / "model.onnx")
-        self.model = model
-        self.tokenizer = tokenizer
-        self.model_input_names = None
-        self.model_output_names = None
-        self.onnx_runtime_session = None
-        self.calibration_data = None
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.quant_max_batch_size = None
-
-        if self.model_name_or_path is not None:
-            if model is not None:
-                raise ValueError("A model was also passed but it will be overridden.")
-
-            if Path(self.model_name_or_path).is_dir():
-                if is_nemo2_checkpoint(self.model_name_or_path):
-                    raise NotImplementedError("NeMo 2.0 checkpoint will be supported later.")
-                else:
-                    self._load_hf_model()
-
-        if load_runtime:
-            self._load_runtime()
-
-    def _load_runtime(self):
-        if use_onnxruntime:
-            if Path(self.onnx_model_path).exists():
-                self.onnx_runtime_session = onnxruntime.InferenceSession(self.onnx_model_path)
-                self.model_input_names = [input.name for input in self.onnx_runtime_session.get_inputs()]
-                self.model_output_names = [output.name for output in self.onnx_runtime_session.get_outputs()]
-                self.tokenizer = AutoTokenizer.from_pretrained(
-                    Path(self.onnx_model_dir) / "tokenizer", trust_remote_code=True
-                )
-
-    def _load_hf_model(self):
-        self.model = AutoModel.from_pretrained(
-            self.model_name_or_path,
-            trust_remote_code=True,
-        ).eval()
-
-        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, trust_remote_code=True)
-
-    def export(
-        self,
-        input_names: list,
-        output_names: list,
-        example_inputs: dict = None,
-        opset: int = 20,
-        dynamic_axes_input: Optional[dict] = None,
-        dynamic_axes_output: Optional[dict] = None,
-        export_dtype: str = "fp32",
-        verbose: bool = False,
-    ):
-        """
-        Performs ONNX conversion from a PyTorch model.
-
-        Args:
-            input_names (list): input parameter names of the model that ONNX will export will use.
-            output_names (list): output parameter names of the model that ONNX will export will use.
-            example_inputs (dict): example input for the model to build the engine.
-            opset (int): ONNX opset version. Default is 20.
-            dynamic_axes_input (dict): Variable length axes for the input.
-            dynamic_axes_output (dict): Variable length axes for the output.
-            export_dtype (str): Export dtype, fp16 or fp32.
-            verbose (bool): Enable verbose or not.
-        """
-
-        self._export_to_onnx(
-            input_names=input_names,
-            example_inputs=example_inputs,
-            output_names=output_names,
-            opset=opset,
-            dynamic_axes_input=dynamic_axes_input,
-            dynamic_axes_output=dynamic_axes_output,
-            export_dtype=export_dtype,
-            verbose=verbose,
-        )
-        self._load_runtime()
-
-    def _export_to_onnx(
-        self,
-        input_names: list,
-        output_names: list,
-        example_inputs: dict = None,
-        opset: int = 20,
-        dynamic_axes_input: Optional[dict] = None,
-        dynamic_axes_output: Optional[dict] = None,
-        export_dtype: Union[torch.dtype, str] = "fp32",
-        verbose: bool = False,
-    ):
-
-        if example_inputs is None:
-            example_inputs = get_example_inputs(self.tokenizer)
-
-        if "dimensions" in input_names:
-            example_inputs["dimensions"] = torch.tensor([1] * example_inputs["input_ids"].shape[0])
-
-        if isinstance(export_dtype, str):
-            export_dtype = {"fp16": torch.float16, "fp32": torch.float32}[export_dtype]
-
-        self.model.to(export_dtype)
-
-        Path(self.onnx_model_dir).mkdir(parents=True, exist_ok=True)
-
-        with torch.autocast(device_type=get_model_device_type(self.model), dtype=export_dtype):
-            torch.onnx.export(
-                model=self.model,
-                args=(example_inputs,),
-                f=self.onnx_model_path,
-                input_names=input_names,
-                output_names=output_names,
-                dynamic_axes={**dynamic_axes_input, **dynamic_axes_output},
-                verbose=verbose,
-                opset_version=opset,
-            )
-        logging.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")
-
-        existing_directory_path = Path(self.onnx_model_dir) / "tokenizer"
-        existing_directory_path.mkdir(exist_ok=True)
-        self.tokenizer.save_pretrained(existing_directory_path)
-
-    def export_onnx_to_trt(
-        self,
-        trt_model_dir: str,
-        profiles=None,
-        override_layernorm_precision_to_fp32: bool = False,
-        override_layers_to_fp32: List = None,
-        trt_dtype: str = "fp16",
-        profiling_verbosity: str = "layer_names_only",
-        trt_builder_flags: List["trt.BuilderFlag"] = None,
-    ) -> None:
-        """Performs TensorRT conversion from an ONNX model.
-
-        Args:
-            trt_model_dir: path to store the TensorRT model.
-            profiles: TensorRT profiles.
-            override_layernorm_precision_to_fp32 (bool): whether to convert layers to fp32 or not.
-            override_layers_to_fp32 (List): Layer names to be converted to fp32.
-            trt_dtype (str): "fp16" or "fp32".
-            profiling_verbosity (str): Profiling verbosity. Default is "layer_names_only".
-            trt_builder_flags (List[trt.BuilderFlag]): TRT specific flags.
-        """
-        logging.info(f"Building TRT engine from ONNX model ({self.onnx_model_path})")
-        trt_logger = trt.Logger(trt.Logger.WARNING)
-        builder = trt.Builder(trt_logger)
-        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
-        config = builder.create_builder_config()
-        parser = trt.OnnxParser(network, trt_logger)
-
-        # we use parse_from_file() instead of parse() because it can be used for both single
-        # file models as well as externally stored models (required when model >2GiB)
-        if not parser.parse_from_file(self.onnx_model_path):
-            logging.warning("ONNX model could not be parsed")
-            for error in range(parser.num_errors):
-                logging.error(parser.get_error(error))
-            return
-
-        if profiles:
-            for profile in profiles:
-                optimization_profile = builder.create_optimization_profile()
-
-                for i in range(network.num_inputs):
-                    in_tensor = network.get_input(i)
-                    optimization_profile.set_shape(
-                        in_tensor.name,
-                        min=profile[in_tensor.name][0],
-                        opt=profile[in_tensor.name][1],
-                        max=profile[in_tensor.name][2],
-                    )
-
-                config.add_optimization_profile(optimization_profile)
-
-        if trt_dtype == "fp16":
-            logging.info("Setting Build Flag FP16")
-            config.set_flag(trt.BuilderFlag.FP16)
-        elif trt_dtype == "fp8":
-            # With FP8 export we want to also enable FP16 layers as a fallback instead of FP32
-            logging.info("Setting Build Flag FP8 and FP16")
-            config.set_flag(trt.BuilderFlag.FP8)
-            config.set_flag(trt.BuilderFlag.FP16)
-            validate_fp8_network(network)
-
-        # patch network
-        if override_layernorm_precision_to_fp32:
-            logging.info("Overriding TensorRT network LayerNorm precision to float32.")
-            self._override_layernorm_precision_to_fp32(network)
-
-        if override_layers_to_fp32:
-            logging.info("Overriding some layers to float32.")
-            self._override_layers_to_fp32(network, override_layers_to_fp32)
-
-        try:
-            config.profiling_verbosity = {
-                "detailed": trt.ProfilingVerbosity.DETAILED,
-                "layer_names_only": trt.ProfilingVerbosity.LAYER_NAMES_ONLY,
-                "none": trt.ProfilingVerbosity.NONE,
-            }[profiling_verbosity]
-        except KeyError:
-            error_msg = "Unknown profiling verbosity value."
-            raise ValueError(error_msg)
-        logging.info(f"Setting Profiling Verbosity to {config.profiling_verbosity}")
-
-        if trt_builder_flags is not None:
-            for flag in trt_builder_flags:
-                config.set_flag(flag)
-
-        engine_string = builder.build_serialized_network(network, config)
-        if engine_string is None:
-            raise Exception("Failed to serialize the TensorRT Engine. Please check the " "TensorRT logs for details")
-
-        trt_model_path = Path(trt_model_dir)
-        trt_model_path.mkdir(parents=True, exist_ok=True)
-        trt_model_path = trt_model_path / "model.plan"
-        trt_model_path.write_bytes(engine_string)
-        logging.info(f"Successfully exported ONNX model ({self.onnx_model_path}) " f"to TRT engine ({trt_model_path})")
-
-    def _override_layer_precision_to_fp32(self, layer: "trt.ILayer") -> None:
-        layer.precision = trt.float32
-        layer.set_output_type(0, trt.float32)
-
-    def _override_layers_to_fp32(self, network: "trt.INetworkDefinition", fp32_layer_patterns: list[str]) -> None:
-        for i in range(network.num_layers):
-            layer = network.get_layer(i)
-            layer_name = layer.name
-            if any(layer_name.startswith(pattern) for pattern in fp32_layer_patterns) and layer.precision in {
-                trt.float32,
-                trt.float16,
-            }:
-                if layer.type in {trt.LayerType.CAST}:
-                    logging.info(f"Skipping overriding {layer.type} layer {i} {layer_name} dtype")
-                    continue
-                if any(
-                    layer.get_input(input_idx).dtype in {trt.float32, trt.float16}
-                    for input_idx in range(layer.num_inputs)
-                ):
-                    # Note: Assigning to layer.precision (even the same value) sets precision_is_set=True,
-                    # which prevents TensorRT from changing this layer's precision
-                    layer.precision = trt.float32
-                    logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) precision to FP32")
-                for j in range(layer.num_outputs):
-                    if layer.get_output_type(j) in {trt.float32, trt.float16}:
-                        layer.set_output_type(j, trt.float32)
-                        logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) output type {j} to FP32")
-
-    def _override_layernorm_precision_to_fp32(self, network: "trt.INetworkDefinition") -> None:
-        """Set the precision of LayerNorm subgraphs to FP32 to preserve accuracy.
-
-        - https://nvbugs/4478448 (Mistral)
-        - https://nvbugs/3802112 (T5)
-
-        Args:
-            network: tensorrt.INetworkDefinition
-        """
-        # Logic originally from OSS T5 HF export script:
-        # https://gitlab-master.nvidia.com/TensorRT/Public/oss/-/blob/77495ec/demo/HuggingFace/T5/export.py
-        pow_ops = {}
-        for layer_index, layer in enumerate(network):
-            if layer.type == trt.LayerType.IDENTITY:
-                all_fp32 = all(
-                    [
-                        layer.output_type_is_set(o) and layer.get_output_type(o) == trt.float32
-                        for o in range(layer.num_outputs)
-                    ]
-                )
-                if all_fp32:
-                    if layer.get_input(0).dtype == trt.float32:
-                        layer.precision = trt.float32
-
-            if layer.type == trt.LayerType.ELEMENTWISE:
-                layer.__class__ = getattr(trt, "IElementWiseLayer")
-                if layer.op == trt.ElementWiseOperation.POW:
-                    pow_ops[layer] = layer_index
-                    self._override_layer_precision_to_fp32(layer)
-
-        for _, index in pow_ops.items():
-            # Iterate from few layers before pow to include residual add and cast op.
-            # Iterate till 10 layers after pow op to include all
-            # operations included in layer norm.
-            START_OFFSET = 4
-            END_OFFSET = 12
-            for i in range(index - START_OFFSET, index + END_OFFSET):
-                layer = network.get_layer(i)
-                if layer.type == trt.LayerType.REDUCE:
-                    self._override_layer_precision_to_fp32(layer)
-
-                if layer.type == trt.LayerType.ELEMENTWISE:
-                    layer.__class__ = getattr(trt, "IElementWiseLayer")
-                    if layer.op == trt.ElementWiseOperation.SUM:
-                        self._override_layer_precision_to_fp32(layer)
-
-                if layer.type == trt.LayerType.UNARY:
-                    layer.__class__ = getattr(trt, "IUnaryLayer")
-                    if layer.op == trt.UnaryOperation.SQRT:
-                        self._override_layer_precision_to_fp32(layer)
-
-                if layer.type == trt.LayerType.ELEMENTWISE:
-                    layer.__class__ = getattr(trt, "IElementWiseLayer")
-                    if layer.op == trt.ElementWiseOperation.DIV:
-                        self._override_layer_precision_to_fp32(layer)
-
-                if layer.type == trt.LayerType.ELEMENTWISE:
-                    layer.__class__ = getattr(trt, "IElementWiseLayer")
-                    if layer.op == trt.ElementWiseOperation.PROD:
-                        self._override_layer_precision_to_fp32(layer)
-
-    def forward(self, inputs: Union[List, Dict], dimensions: Optional[List] = None):
-        """Run inference for a given input.
-
-        Args:
-            inputs (Union[List, Dict]): Input for the model. If list, it should be a list of strings.
-                If dict, it should be a dictionary with keys as the model input names.
-            dimensions (Optional[List]): The dimensions parameter of the model. Required if the model
-                was exported to accept dimensions parameter and inputs is given as a list of strings.
-
-        Returns:
-            np.ndarray: Model output.
-        """
-
-        if self.onnx_runtime_session is None:
-            warnings.warn("ONNX Runtime is not available. Please install the onnxruntime-gpu and try again.")
-            return None
-
-        if isinstance(inputs, List):
-            if "dimensions" in self.model_input_names and dimensions is None:
-                raise ValueError("Dimensions should be provided for list input.")
-            inputs = dict(self.tokenizer(inputs))
-            inputs["dimensions"] = dimensions
-
-        output = self.onnx_runtime_session.run(self.model_output_names, inputs)
-        return output[0]
-
-    @property
-    def get_model(self):
-        """Returns the model"""
-
-        return self.model
-
-    @property
-    def get_tokenizer(self):
-        """Returns the tokenizer"""
-
-        return self.tokenizer
-
-    @property
-    def get_model_input_names(self):
-        """Returns the model input names"""
-
-        return self.model_input_names
-
-    @property
-    def get_triton_input(self):
-        """Get triton input"""
-
-        raise NotImplementedError("This function will be implemented later.")
-
-    @property
-    def get_triton_output(self):
-        """Get triton output"""
-
-        raise NotImplementedError("This function will be implemented later.")
-
-    @batch
-    def triton_infer_fn(self, **inputs: np.ndarray):
-        """PyTriton inference function"""
-
-        raise NotImplementedError("This function will be implemented later.")
diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py
deleted file mode 100644
index e32b12643387..000000000000
--- a/nemo/export/quantize/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .quantizer import Quantizer
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
deleted file mode 100644
index 98f24cd4e4a9..000000000000
--- a/nemo/export/quantize/quantizer.py
+++ /dev/null
@@ -1,277 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import tarfile
-from contextlib import nullcontext
-from typing import Callable, Optional, Union
-
-import torch
-import torch.distributed as dist
-from megatron.core import parallel_state
-from megatron.core.transformer.module import Float16Module
-from omegaconf.omegaconf import DictConfig, open_dict
-
-try:
-    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
-except (ImportError, ModuleNotFoundError):
-    from abc import ABC
-
-    MegatronGPTModel = ABC
-
-from nemo.utils import logging
-from nemo.utils.distributed import temporary_directory
-from nemo.utils.model_utils import save_artifacts, unwrap_model
-
-try:
-    import modelopt.torch.quantization as mtq
-    from modelopt.torch.export import export_tensorrt_llm_checkpoint
-
-    QUANT_CFG_CHOICES = {
-        "int8": mtq.INT8_DEFAULT_CFG,
-        "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
-        "fp8": mtq.FP8_DEFAULT_CFG,
-        "int4_awq": mtq.INT4_AWQ_CFG,
-        "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
-        "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
-        "nvfp4": mtq.NVFP4_DEFAULT_CFG,
-    }
-
-    HAVE_MODELOPT = True
-
-except (ImportError, ModuleNotFoundError) as e:
-    HAVE_MODELOPT = False
-    HAVE_MODELOPT_ERROR = e
-
-
-SUPPORTED_DTYPE = [16, "16", "bf16"]  # Default precision for non-quantized layers
-
-
-def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: Optional[bool] = None) -> torch.dtype:
-    """Mapping from PTL precision types to corresponding PyTorch parameter datatype."""
-    if megatron_amp_O2 is not None and megatron_amp_O2 is False:
-        return torch.float32
-
-    if precision in ['bf16', 'bf16-mixed']:
-        return torch.bfloat16
-    elif precision in [16, '16', '16-mixed']:
-        return torch.float16
-    elif precision in [32, '32', '32-true']:
-        return torch.float32
-    else:
-        raise ValueError(f"Could not parse the precision of `{precision}` to a valid torch.dtype")
-
-
-class Quantizer:
-    """Post-training quantization (PTQ) and TRT-LLM export of Nemo checkpoints.
-
-    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
-    The process consist of several steps:
-
-        1. Loading a Nemo model from disk using appropriate parallelism strategy
-        2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
-        3. Producing output directory or .qnemo tarball with model config (json),
-           quantized weights (safetensors) and tokenizer config (yaml).
-
-    The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox
-    for efficient inference. This can be achieved using Nemo inference containers.
-
-    Currently supported and tested model family is Llama2. Model type needs to be specified in
-    the quantization command with decoder_type parameter on exporting (see below). Quantizing other
-    model families is experimental and might not be fully supported.
-
-    Available quantization methods are listed in `QUANT_CFG_CHOICES` dictionary above.
-    Please consult Model Optimizer documentation https://nvidia.github.io/TensorRT-Model-Optimizer/ for details.
-    You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
-    for quantization algorithms and calibration data as well as recommended settings.
-
-    Quantization algorithm can also be conveniently set to 'null' to perform only weights export step
-    for TensorRT-LLM deployment. This is useful to getting baseline results for a full-precision model.
-    """
-
-    def __init__(self, quantization_config: Optional[DictConfig], export_config: Optional[DictConfig]):
-        """Initialize Quantizer with quantization and export configurations.
-
-        Expected keys in `quantization_config`:
-            - algorithm: str
-            - decoder_type: str
-            - awq_block_size: int (only for awq algorithms)
-            - sq_alpha: float (only for smooth quant algorithms)
-            - enable_kv_cache: bool (default: None i.e. auto-detect based on algorithm and decoder_type)
-
-        Expected keys in `export_config`:
-            - dtype: str/int
-            - decoder_type: str
-            - inference_tensor_parallel: int
-            - inference_pipeline_parallel: int
-            - save_path: str
-        """
-        if not HAVE_MODELOPT:
-            raise RuntimeError("nvidia-modelopt is needed to use Quantizer") from HAVE_MODELOPT_ERROR
-
-        self.quantization_config = quantization_config
-        self.export_config = export_config
-
-        # Quantization sanity checks
-        assert (
-            quantization_config.algorithm is None or quantization_config.algorithm in QUANT_CFG_CHOICES
-        ), f"Unsupported quantization algorithm: {quantization_config.algorithm}"
-        if quantization_config.algorithm is not None:
-            quant_cfg = QUANT_CFG_CHOICES[quantization_config.algorithm]
-
-            if "awq" in quantization_config.algorithm:
-                weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"]
-                if isinstance(weight_quantizer, list):
-                    weight_quantizer = weight_quantizer[0]
-                weight_quantizer["block_sizes"][-1] = quantization_config.awq_block_size
-
-            # Always turn on FP8 kv cache to save memory footprint.
-            # For int8_sq, we use int8 kv cache.
-            # TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron.
-            enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None)
-            if enable_quant_kv_cache is None:
-                enable_quant_kv_cache = (
-                    "int8" not in quantization_config.algorithm and quantization_config.decoder_type != "gpt"
-                )
-            logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
-            quant_cfg["quant_cfg"]["*output_quantizer"] = {
-                "num_bits": 8 if quantization_config.algorithm == "int8_sq" else (4, 3),
-                "axis": None,
-                "enable": enable_quant_kv_cache,
-            }
-            if quantization_config.algorithm == "int8_sq":
-                logging.info(f"Using int8_sq alpha = {quantization_config.sq_alpha}")
-                quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": quantization_config.sq_alpha}
-
-            self.quant_cfg = quant_cfg
-        else:
-            self.quant_cfg = None
-
-        # Export sanity checks
-        if export_config is not None:
-            assert export_config.dtype in SUPPORTED_DTYPE, f"Unsupported export dtype: {export_config.dtype}"
-
-    @staticmethod
-    def _setup(model: MegatronGPTModel):
-        """Setup model for quantization."""
-        try:
-            model.model.module.language_model.encoder.activations_checkpoint_method = None
-        except AttributeError:
-            pass
-
-        if not parallel_state.is_initialized():
-
-            def dummy():
-                return
-
-            if model.trainer.strategy.launcher is not None:
-                model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
-            model.trainer.strategy.setup_environment()
-
-    @staticmethod
-    def modify_model_config(model_cfg: DictConfig) -> DictConfig:
-        """Modify model config for quantization."""
-        with open_dict(model_cfg):
-            if model_cfg.get("sequence_parallel", False):
-                logging.warning("Disabling sequence parallelism for quantization...")
-                model_cfg.sequence_parallel = False
-            model_cfg.name = "modelopt"
-            model_cfg.apply_rope_fusion = False
-
-        return model_cfg
-
-    @staticmethod
-    def _sample_output(model: MegatronGPTModel):
-        """Generate sample output for a model instance."""
-        logging.info("Generating sample output for the model...")
-
-        response = model.generate(
-            inputs=[
-                "Born in north-east France, Soyer trained as a",
-                "Born in California, Soyer trained as a",
-            ],
-            length_params={
-                "max_length": 100,
-                "min_length": 100,
-            },
-        )
-
-        logging.info(f'Example NeMo output before export: {response["sentences"]}"')
-
-    def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTModel], None]):
-        """Quantize the model and calibrate using given forward loop."""
-        assert self.quant_cfg is not None, "Quantization algorithm is not set"
-
-        logging.info(f"Quantizing model to {self.quantization_config.algorithm}...")
-        self._setup(model)
-
-        model = mtq.quantize(model, self.quant_cfg, forward_loop)
-
-        if self.quantization_config.decoder_type == "gpt":
-            # We found squared_relu may have an under-calibration problem.
-            # Clamp the scaling_factor with a min threshold to avoid under-calibration.
-            maxbound = 0
-            if self.quantization_config.algorithm == "fp8":
-                maxbound = 448
-            elif self.quantization_config.algorithm == "int8_sq":
-                maxbound = 127
-            model = mtq.postprocess_amax(
-                model, "*input_quantizer", lambda amax: torch.clamp(amax, min=0.01 * maxbound)
-            )
-
-        if dist.get_rank() == 0:
-            mtq.print_quant_summary(model)
-
-        return model
-
-    def export(self, model: MegatronGPTModel):
-        """Export model to '.qnemo' format for TensorRT-LLM engine build."""
-        assert self.export_config is not None, "Export config is not set"
-        torch_dtype = torch_dtype_from_precision(self.export_config.dtype)
-
-        if self.export_config.get("sample_output", True):
-            self._sample_output(model)
-
-        if model.cfg.megatron_amp_O2:
-            model.model = unwrap_model(model.model, Float16Module)
-
-        # Setup model export handling: temporary directory for
-        # '.qnemo' tarball or directly write to export_config.save_path
-        compress = self.export_config.get("compress", False)
-        if compress:
-            export_handler = temporary_directory()
-        else:
-            export_handler = nullcontext(enter_result=self.export_config.save_path)
-
-        with export_handler as export_dir:
-            export_tensorrt_llm_checkpoint(
-                model=model,
-                decoder_type=self.export_config.decoder_type,
-                dtype=torch_dtype,
-                export_dir=export_dir,
-                inference_tensor_parallel=self.export_config.inference_tensor_parallel,
-                inference_pipeline_parallel=self.export_config.inference_pipeline_parallel,
-                use_nfs_workspace=model.trainer.num_nodes > 1,
-            )
-            dist.barrier()  # Wait until all ranks complete export_model_config step
-            logging.info(
-                "Exporting quantized weights, model artifacts,"
-                f" and tokenizer config to {self.export_config.save_path}..."
-            )
-            if dist.get_rank() == 0:
-                save_artifacts(model, export_dir)
-                if compress:
-                    os.makedirs(os.path.dirname(self.export_config.save_path), exist_ok=True)
-                    with tarfile.open(self.export_config.save_path, "w") as tar:
-                        tar.add(export_dir, arcname="./")
diff --git a/nemo/export/sentencepiece_tokenizer.py b/nemo/export/sentencepiece_tokenizer.py
deleted file mode 100644
index e6e09aa8b6d7..000000000000
--- a/nemo/export/sentencepiece_tokenizer.py
+++ /dev/null
@@ -1,280 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from typing import Dict, List, Optional, Union
-
-import numpy as np
-import sentencepiece
-import torch
-
-
-class SentencePieceTokenizer:
-    """
-    SentencePieceTokenizer https://github.com/google/sentencepiece
-
-        Args:
-        model_path: path to sentence piece tokenizer model.
-        special_tokens: either list of special tokens or dictionary of token name to token value
-        legacy: when set to True, the previous behavior of the SentecePiece wrapper will be restored,
-            including the possibility to add special tokens inside wrapper.
-        tokenizer: wraps an existing tokenizer
-    """
-
-    def __init__(
-        self,
-        model_path: Optional[str] = None,
-        special_tokens: Optional[Union[Dict[str, str], List[str]]] = None,
-        legacy: bool = False,
-        tokenizer: Optional[sentencepiece.SentencePieceProcessor] = None,
-    ):
-        model_path_provided = model_path is not None
-        tokenizer_provided = tokenizer is not None
-        if not (model_path_provided ^ tokenizer_provided):
-            raise ValueError("Exactly only one of the arguments 'model_path', 'tokenizer' should be provided")
-
-        if tokenizer_provided:
-            self.tokenizer = tokenizer
-        else:
-            if not model_path or not os.path.exists(model_path):
-                raise ValueError(f"model_path: {model_path} is invalid")
-            self.tokenizer = sentencepiece.SentencePieceProcessor()
-            self.tokenizer.Load(model_path)
-
-        self.original_vocab_size = self.tokenizer.get_piece_size()
-        self.vocab_size = self.tokenizer.get_piece_size()
-        self.legacy = legacy
-        self.special_token_to_id = {}
-        self.id_to_special_token = {}
-        if special_tokens:
-            if not self.legacy:
-                raise ValueError(
-                    "Special tokens must be None when legacy is set to False. Provide special tokens at train time."
-                )
-            self.add_special_tokens(special_tokens)
-        self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y')
-
-    def text_to_tokens(self, text):
-        if self.legacy:
-            tokens = []
-            idx = 0
-
-            while 1:
-                indices = {}
-
-                for token in self.special_token_to_id:
-                    try:
-                        indices[token] = text[idx:].index(token)
-                    except ValueError:
-                        continue
-
-                if len(indices) == 0:
-                    break
-
-                next_token = min(indices, key=indices.get)
-                next_idx = idx + indices[next_token]
-
-                tokens.extend(self.tokenizer.encode_as_pieces(text[idx:next_idx]))
-                tokens.append(next_token)
-                idx = next_idx + len(next_token)
-
-            tokens.extend(self.tokenizer.encode_as_pieces(text[idx:]))
-            return tokens
-
-        return self.tokenizer.encode_as_pieces(text)
-
-    def encode(self, text):
-        if self.legacy:
-            ids = []
-            idx = 0
-
-            while 1:
-                indices = {}
-
-                for token in self.special_token_to_id:
-                    try:
-                        indices[token] = text[idx:].index(token)
-                    except ValueError:
-                        continue
-
-                if len(indices) == 0:
-                    break
-
-                next_token = min(indices, key=indices.get)
-                next_idx = idx + indices[next_token]
-
-                ids.extend(self.tokenizer.encode_as_ids(text[idx:next_idx]))
-                ids.append(self.special_token_to_id[next_token])
-                idx = next_idx + len(next_token)
-
-            ids.extend(self.tokenizer.encode_as_ids(text[idx:]))
-            return ids
-
-        return self.tokenizer.encode_as_ids(text)
-
-    def tokens_to_text(self, tokens):
-        if isinstance(tokens, np.ndarray):
-            tokens = tokens.tolist()
-
-        return self.tokenizer.decode_pieces(tokens)
-
-    def batch_decode(self, ids):
-        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
-            ids = ids.tolist()
-
-        if self.legacy:
-            text = ""
-            last_i = 0
-
-            for i, id in enumerate(ids):
-                if id in self.id_to_special_token:
-                    text += self.tokenizer.decode_ids(ids[last_i:i]) + " "
-                    text += self.id_to_special_token[id] + " "
-                    last_i = i + 1
-
-            text += self.tokenizer.decode_ids(ids[last_i:])
-            return text.strip()
-
-        return self.tokenizer.decode(ids)
-
-    def token_to_id(self, token):
-        if self.legacy and token in self.special_token_to_id:
-            return self.special_token_to_id[token]
-
-        return self.tokenizer.piece_to_id(token)
-
-    def ids_to_tokens(self, ids):
-        tokens = []
-        for id in ids:
-            if id >= self.original_vocab_size:
-                tokens.append(self.id_to_special_token[id])
-            else:
-                tokens.append(self.tokenizer.id_to_piece(id))
-        return tokens
-
-    def tokens_to_ids(self, tokens: Union[str, List[str]]) -> Union[int, List[int]]:
-        if isinstance(tokens, str):
-            tokens = [tokens]
-        ids = []
-        for token in tokens:
-            ids.append(self.token_to_id(token))
-        return ids
-
-    def add_special_tokens(self, special_tokens):
-        if not self.legacy:
-            raise AttributeError("Special Token addition does not work when legacy is set to False.")
-
-        if isinstance(special_tokens, list):
-            for token in special_tokens:
-                if (
-                    self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id()
-                    and token not in self.special_token_to_id
-                ):
-                    self.special_token_to_id[token] = self.vocab_size
-                    self.id_to_special_token[self.vocab_size] = token
-                    self.vocab_size += 1
-        elif isinstance(special_tokens, dict):
-            for token_name, token in special_tokens.items():
-                setattr(self, token_name, token)
-                if (
-                    self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id()
-                    and token not in self.special_token_to_id
-                ):
-                    self.special_token_to_id[token] = self.vocab_size
-                    self.id_to_special_token[self.vocab_size] = token
-                    self.vocab_size += 1
-
-    @property
-    def pad_id(self):
-        if self.legacy:
-            pad_id = self.tokens_to_ids([self.pad_token])[0]
-        else:
-            pad_id = self.tokenizer.pad_id()
-        return pad_id
-
-    @property
-    def bos_token_id(self):
-        if self.legacy:
-            bos_id = self.tokens_to_ids([self.bos_token])[0]
-        else:
-            bos_id = self.tokenizer.bos_id()
-        return bos_id
-
-    @property
-    def eos_token_id(self):
-        if self.legacy:
-            eos_id = self.tokens_to_ids([self.eos_token])[0]
-        else:
-            eos_id = self.tokenizer.eos_id()
-        return eos_id
-
-    @property
-    def sep_id(self):
-        if self.legacy:
-            return self.tokens_to_ids([self.sep_token])[0]
-        else:
-            raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")
-
-    @property
-    def cls_id(self):
-        if self.legacy:
-            return self.tokens_to_ids([self.cls_token])[0]
-        else:
-            raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")
-
-    @property
-    def mask_id(self):
-        if self.legacy:
-            return self.tokens_to_ids([self.mask_token])[0]
-        else:
-            raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")
-
-    @property
-    def unk_id(self):
-        return self.tokenizer.unk_id()
-
-    @property
-    def additional_special_tokens_ids(self):
-        """Returns a list of the additional special tokens (excluding bos, eos, pad, unk). Used to return sentinel tokens for e.g. T5."""
-        special_tokens = set(
-            [self.bos_token, self.eos_token, self.pad_token, self.mask_token, self.cls_token, self.sep_token]
-        )
-        return [v for k, v in self.special_token_to_id.items() if k not in special_tokens]
-
-    @property
-    def vocab(self):
-        main_vocab = [self.tokenizer.id_to_piece(id) for id in range(self.tokenizer.get_piece_size())]
-        special_tokens = [
-            self.id_to_special_token[self.original_vocab_size + i]
-            for i in range(self.vocab_size - self.original_vocab_size)
-        ]
-        return main_vocab + special_tokens
-
-    # Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False):
-        return self.ids_to_tokens(ids)  # TODO: support skip_special_tokens
-
-    def convert_tokens_to_string(self, tokens: List[str]):
-        return self.tokens_to_text(tokens)
-
-    def __len__(self):
-        return self.vocab_size
-
-    @property
-    def is_fast(self):
-        return True
-
-    def get_added_vocab(self):
-        return None
diff --git a/nemo/export/tarutils.py b/nemo/export/tarutils.py
deleted file mode 100644
index 40add3162db6..000000000000
--- a/nemo/export/tarutils.py
+++ /dev/null
@@ -1,265 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import fnmatch
-import logging
-import os
-import tarfile
-
-from typing import IO, Union
-
-LOGGER = logging.getLogger("NeMo")
-
-try:
-    from zarr.storage import BaseStore
-
-    HAVE_ZARR = True
-except Exception as e:
-    LOGGER.warning(f"Cannot import zarr, support for zarr-based checkpoints is not available. {type(e).__name__}: {e}")
-    BaseStore = object
-    HAVE_ZARR = False
-
-
-class TarPath:
-    """
-    A class that represents a path inside a TAR archive and behaves like pathlib.Path.
-
-    Expected use is to create a TarPath for the root of the archive first, and then derive
-    paths to other files or directories inside the archive like so:
-
-    with TarPath('/path/to/archive.tar') as archive:
-        myfile = archive / 'filename.txt'
-        if myfile.exists():
-            data = myfile.read()
-            ...
-
-    Only read and enumeration operations are supported.
-    """
-
-    def __init__(self, tar: Union[str, tarfile.TarFile, 'TarPath'], *parts):
-        self._needs_to_close = False
-        self._relpath = ''
-        if isinstance(tar, TarPath):
-            self._tar = tar._tar
-            self._relpath = os.path.join(tar._relpath, *parts)
-        elif isinstance(tar, tarfile.TarFile):
-            self._tar = tar
-            if parts:
-                self._relpath = os.path.join(*parts)
-        elif isinstance(tar, str):
-            self._needs_to_close = True
-            self._tar = tarfile.open(tar, 'r')
-            if parts:
-                self._relpath = os.path.join(*parts)
-        else:
-            raise ValueError(f"Unexpected argument type for TarPath: {type(tar).__name__}")
-
-    def __del__(self):
-        if self._needs_to_close:
-            self._tar.close()
-
-    def __truediv__(self, key) -> 'TarPath':
-        return TarPath(self._tar, os.path.join(self._relpath, key))
-
-    def __str__(self) -> str:
-        return os.path.join(self._tar.name, self._relpath)
-
-    @property
-    def tarobject(self):
-        """
-        Returns the wrapped tar object.
-        """
-        return self._tar
-
-    @property
-    def relpath(self):
-        """
-        Returns the relative path of the path.
-        """
-        return self._relpath
-
-    @property
-    def name(self):
-        """
-        Returns the name of the path.
-        """
-        return os.path.split(self._relpath)[1]
-
-    @property
-    def suffix(self):
-        """
-        Returns the suffix of the path.
-        """
-        name = self.name
-        i = name.rfind('.')
-        if 0 < i < len(name) - 1:
-            return name[i:]
-        else:
-            return ''
-
-    def __enter__(self):
-        self._tar.__enter__()
-        return self
-
-    def __exit__(self, *args):
-        return self._tar.__exit__(*args)
-
-    def exists(self):
-        """
-        Checks if the path exists.
-        """
-        try:
-            self._tar.getmember(self._relpath)
-            return True
-        except KeyError:
-            try:
-                self._tar.getmember(os.path.join('.', self._relpath))
-                return True
-            except KeyError:
-                return False
-
-    def is_file(self):
-        """
-        Checks if the path is a file.
-        """
-        try:
-            self._tar.getmember(self._relpath).isreg()
-            return True
-        except KeyError:
-            try:
-                self._tar.getmember(os.path.join('.', self._relpath)).isreg()
-                return True
-            except KeyError:
-                return False
-
-    def is_dir(self):
-        """
-        Checks if the path is a directory.
-        """
-        try:
-            self._tar.getmember(self._relpath).isdir()
-            return True
-        except KeyError:
-            try:
-                self._tar.getmember(os.path.join('.', self._relpath)).isdir()
-                return True
-            except KeyError:
-                return False
-
-    def open(self, mode: str) -> IO[bytes]:
-        """
-        Opens a file in the archive.
-        """
-        if mode != 'r' and mode != 'rb':
-            raise NotImplementedError()
-
-        file = None
-        try:
-            # Try the relative path as-is first
-            file = self._tar.extractfile(self._relpath)
-        except KeyError:
-            try:
-                # Try the relative path with "./" prefix
-                file = self._tar.extractfile(os.path.join('.', self._relpath))
-            except KeyError:
-                raise FileNotFoundError()
-
-        if file is None:
-            raise FileNotFoundError()
-
-        return file
-
-    def glob(self, pattern):
-        """
-        Returns an iterator over the files in the directory, matching the pattern.
-        """
-        for member in self._tar.getmembers():
-            # Remove the "./" prefix, if any
-            name = member.name[2:] if member.name.startswith('./') else member.name
-
-            # If we're in a subdirectory, make sure the file is too, and remove that subdir component
-            if self._relpath:
-                if not name.startswith(self._relpath + '/'):
-                    continue
-                name = name[len(self._relpath) + 1 :]
-
-            # See if the name matches the pattern
-            if fnmatch.fnmatch(name, pattern):
-                yield TarPath(self._tar, os.path.join(self._relpath, name))
-
-    def rglob(self, pattern):
-        """
-        Returns an iterator over the files in the directory, including subdirectories.
-        """
-        for member in self._tar.getmembers():
-            # Remove the "./" prefix, if any
-            name = member.name[2:] if member.name.startswith('./') else member.name
-
-            # If we're in a subdirectory, make sure the file is too, and remove that subdir component
-            if self._relpath:
-                if not name.startswith(self._relpath + '/'):
-                    continue
-                name = name[len(self._relpath) + 1 :]
-
-            # See if any tail of the path matches the pattern, return full path if that's true
-            parts = name.split('/')
-            for i in range(len(parts)):
-                subname = '/'.join(parts[i:])
-                if fnmatch.fnmatch(subname, pattern):
-                    yield TarPath(self._tar, os.path.join(self._relpath, name))
-                    break
-
-    def iterdir(self):
-        """
-        Returns an iterator over the files in the directory.
-        """
-        return self.glob('*')
-
-
-class ZarrPathStore(BaseStore):
-    """
-    An implementation of read-only Store for zarr library
-    that works with pathlib.Path or TarPath objects.
-    """
-
-    def __init__(self, tarpath: TarPath):
-        assert HAVE_ZARR, "Package zarr>=2.18.2,<3.0.0 is required to use ZarrPathStore"
-        self._path = tarpath
-        self._writable = False
-        self._erasable = False
-
-    def __getitem__(self, key):
-        with (self._path / key).open('rb') as file:
-            return file.read()
-
-    def __contains__(self, key):
-        return (self._path / key).is_file()
-
-    def __iter__(self):
-        return self.keys()
-
-    def __len__(self):
-        return sum(1 for _ in self.keys())
-
-    def __setitem__(self, key, value):
-        raise NotImplementedError()
-
-    def __delitem__(self, key):
-        raise NotImplementedError()
-
-    def keys(self):
-        """
-        Returns an iterator over the keys in the store.
-        """
-        return self._path.iterdir()
diff --git a/nemo/export/tensorrt_lazy_compiler.py b/nemo/export/tensorrt_lazy_compiler.py
deleted file mode 100644
index 50b609087250..000000000000
--- a/nemo/export/tensorrt_lazy_compiler.py
+++ /dev/null
@@ -1,714 +0,0 @@
-# Copyright (c) MONAI Consortium
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#     http://www.apache.org/licenses/LICENSE-2.0
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import inspect
-import os
-import tempfile
-import threading
-from collections import OrderedDict
-from logging import getLogger
-from pathlib import Path
-from types import MethodType
-from typing import Any, Dict, List, Sequence, Tuple, Union
-
-import torch
-
-from nemo.utils.export_utils import add_casts_around_norms, replace_for_export
-from nemo.utils.import_utils import safe_import
-
-polygraphy, polygraphy_imported = safe_import("polygraphy")
-if polygraphy_imported:
-    from polygraphy.backend.common import bytes_from_path
-    from polygraphy.backend.trt import (
-        CreateConfig,
-        Profile,
-        engine_bytes_from_network,
-        engine_from_bytes,
-        network_from_onnx_path,
-    )
-
-trt, trt_imported = safe_import("tensorrt")
-torch_tensorrt, _ = safe_import("torch_tensorrt")
-cudart, _ = safe_import("cuda.cudart")
-
-lock_sm = threading.Lock()
-
-
-def trt_to_torch_dtype_dict():
-    """
-    Map of TRT dtype -> Torch dtype
-    """
-    return {
-        trt.int32: torch.int32,
-        trt.float32: torch.float32,
-        trt.float16: torch.float16,
-        trt.bfloat16: torch.float16,
-        trt.int64: torch.int64,
-        trt.int8: torch.int8,
-        trt.bool: torch.bool,
-    }
-
-
-def get_profile_shapes(input_shape: Sequence[int], dynamic_batchsize: Sequence[int] | None):
-    """
-    Given a sample input shape, calculate min/opt/max shapes according to dynamic_batchsize.
-    """
-
-    def scale_batch_size(input_shape: Sequence[int], scale_num: int):
-        scale_shape = [*input_shape]
-        scale_shape[0] = scale_num
-        return scale_shape
-
-    # Use the dynamic batchsize range to generate the min, opt and max model input shape
-    if dynamic_batchsize:
-        min_input_shape = scale_batch_size(input_shape, dynamic_batchsize[0])
-        opt_input_shape = scale_batch_size(input_shape, dynamic_batchsize[1])
-        max_input_shape = scale_batch_size(input_shape, dynamic_batchsize[2])
-    else:
-        min_input_shape = opt_input_shape = max_input_shape = input_shape
-    return min_input_shape, opt_input_shape, max_input_shape
-
-
-def get_dynamic_axes(profiles):
-    """
-    This method calculates dynamic_axes to use in onnx.export().
-    Args:
-       profiles: [[min,opt,max],...] list of profile dimensions
-    """
-    dynamic_axes: dict[str, list[int]] = {}
-    if not profiles:
-        return dynamic_axes
-    for profile in profiles:
-        for key in profile:
-            axes = []
-            vals = profile[key]
-            for i in range(len(vals[0])):
-                if vals[0][i] != vals[2][i]:
-                    axes.append(i)
-            if len(axes) > 0:
-                dynamic_axes[key] = axes
-    return dynamic_axes
-
-
-def cuassert(cuda_ret):
-    """
-    Error reporting method for CUDA calls.
-    Args:
-     cuda_ret: CUDA return code.
-    """
-    err = cuda_ret[0]
-    if err != 0:
-        raise RuntimeError(f"CUDA ERROR: {err}")
-    if len(cuda_ret) > 1:
-        return cuda_ret[1]
-    return None
-
-
-class ShapeError(Exception):
-    """
-    Exception class to report errors from setting TRT plan input shapes
-    """
-
-    pass
-
-
-class TRTEngine:
-    """
-    An auxiliary class to implement running of TRT optimized engines
-
-    """
-
-    def __init__(self, plan_path, logger=None):
-        """
-        Loads serialized engine, creates execution context and activates it
-        Args:
-          plan_path: path to serialized TRT engine.
-          logger: optional logger object
-        """
-        self.plan_path = plan_path
-        self.logger = logger or getLogger("trt_compile")
-        self.logger.info(f"Loading TensorRT engine: {self.plan_path}")
-        self.engine = engine_from_bytes(bytes_from_path(self.plan_path))
-        self.tensors = OrderedDict()
-        self.cuda_graph_instance = None  # cuda graph
-        self.context = self.engine.create_execution_context()
-        self.input_names = []
-        self.output_names = []
-        self.dtypes = []
-        self.cur_profile = 0
-        self.input_table = {}
-        dtype_dict = trt_to_torch_dtype_dict()
-        for idx in range(self.engine.num_io_tensors):
-            binding = self.engine[idx]
-            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
-                self.input_names.append(binding)
-            elif self.engine.get_tensor_mode(binding) == trt.TensorIOMode.OUTPUT:
-                self.output_names.append(binding)
-                dtype = dtype_dict[self.engine.get_tensor_dtype(binding)]
-                self.dtypes.append(dtype)
-        self.logger.info(
-            f"Loaded TensorRT engine: {self.plan_path}.\nInputs: {self.input_names}\nOutputs: {self.output_names}"
-        )
-
-    def allocate_buffers(self, device):
-        """
-        Allocates outputs to run TRT engine
-        Args:
-            device: GPU device to allocate memory on
-        """
-        ctx = self.context
-
-        for i, binding in enumerate(self.output_names):
-            shape = list(ctx.get_tensor_shape(binding))
-            if binding not in self.tensors or list(self.tensors[binding].shape) != shape:
-                t = torch.empty(shape, dtype=self.dtypes[i], device=device).contiguous()
-                self.tensors[binding] = t
-                ctx.set_tensor_address(binding, t.data_ptr())
-
-    def set_inputs(self, feed_dict, stream):
-        """
-        Sets input bindings for TRT engine according to feed_dict
-        Args:
-           feed_dict: a dictionary [str->Tensor]
-           stream: CUDA stream to use
-        """
-        e = self.engine
-        ctx = self.context
-
-        last_profile = self.cur_profile
-
-        def try_set_inputs():
-            for binding in self.input_names:
-                t = feed_dict.get(self.input_table[binding], None)
-                if t is not None:
-                    t = t.contiguous()
-                    shape = t.shape
-                    ctx.set_input_shape(binding, shape)
-                    ctx.set_tensor_address(binding, t.data_ptr())
-
-        while True:
-            try:
-                try_set_inputs()
-                break
-            except ShapeError:
-                next_profile = (self.cur_profile + 1) % e.num_optimization_profiles
-                if next_profile == last_profile:
-                    raise
-                self.cur_profile = next_profile
-                ctx.set_optimization_profile_async(self.cur_profile, stream)
-            except Exception:
-                raise
-        left = ctx.infer_shapes()
-        assert len(left) == 0
-
-    def infer(self, stream, use_cuda_graph=False):
-        """
-        Runs TRT engine.
-        Args:
-            stream: CUDA stream to run on
-            use_cuda_graph: use CUDA graph. Note: requires all inputs to be the same GPU memory between calls.
-        """
-        if use_cuda_graph:
-            if self.cuda_graph_instance is not None:
-                cuassert(cudart.cudaGraphLaunch(self.cuda_graph_instance, stream))
-                cuassert(cudart.cudaStreamSynchronize(stream))
-            else:
-                # do inference before CUDA graph capture
-                noerror = self.context.execute_async_v3(stream)
-                if not noerror:
-                    raise ValueError("ERROR: inference failed.")
-                # capture cuda graph
-                cuassert(
-                    cudart.cudaStreamBeginCapture(
-                        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal
-                    )
-                )
-                self.context.execute_async_v3(stream)
-                graph = cuassert(cudart.cudaStreamEndCapture(stream))
-                self.cuda_graph_instance = cuassert(cudart.cudaGraphInstantiate(graph, 0))
-                self.logger.info("CUDA Graph captured!")
-        else:
-            noerror = self.context.execute_async_v3(stream)
-            cuassert(cudart.cudaStreamSynchronize(stream))
-            if not noerror:
-                raise ValueError("ERROR: inference failed.")
-
-        return self.tensors
-
-
-def make_tensor(d):
-    """
-    Creates a new tensor from d, returns d if d is already a tensor
-    """
-    return d if isinstance(d, torch.Tensor) else torch.tensor(d).cuda()
-
-
-def unroll_input(input_names, input_example):
-    """
-    Simulates list/tuple unrolling during ONNX export
-    """
-    unrolled_input = {}
-    for name in input_names:
-        val = input_example[name]
-        if val is not None:
-            if isinstance(val, list) or isinstance(val, tuple):
-                for i in range(len(val)):
-                    unrolled_input[f"{name}_{i}"] = make_tensor(val[i])
-            else:
-                unrolled_input[name] = make_tensor(val)
-    return unrolled_input
-
-
-def parse_groups(
-    ret: List[torch.Tensor], output_lists: List[List[int]]
-) -> Tuple[Union[torch.Tensor, List[torch.Tensor]], ...]:
-    """
-    Implements parsing of 'output_lists' arg of trt_compile().
-
-    Args:
-      ret: plain list of Tensors
-
-      output_lists: list of output group sizes: to form some Lists/Tuples out of 'ret' List, this will be a list
-                    of group dimensions, like [[], [5], [-1]] for returning Tensor, list of 5 items and dynamic list.
-        Format: [[group_n] | [], ...]
-          [] or group_n == 0 : next output from ret is a scalar
-          group_n > 0  :       next output from ret is a list of group_n length
-          group_n == -1:       next output is a dynamic list. This entry can be at any
-                               position in output_lists, but can appear only once.
-    Returns:
-       Tuple of Union[torch.Tensor, List[torch.Tensor]], according to the grouping in output_lists
-
-    """
-    groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
-    cur = 0
-    for i in range(len(output_lists)):
-        gl = output_lists[i]
-        assert len(gl) == 0 or len(gl) == 1
-        if len(gl) == 0 or gl[0] == 0:
-            groups = (*groups, ret[cur])
-            cur = cur + 1
-        elif gl[0] > 0:
-            groups = (*groups, ret[cur : cur + gl[0]])
-            cur = cur + gl[0]
-        elif gl[0] == -1:
-            rev_groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
-            rcur = len(ret)
-            for rl in range(len(output_lists) - 1, i, -1):
-                rgl = output_lists[rl]
-                assert len(rgl) == 0 or len(rgl) == 1
-                if len(rgl) == 0 or rgl[0] == 0:
-                    rcur = rcur - 1
-                    rev_groups = (*rev_groups, ret[rcur])
-                elif rgl[0] > 0:
-                    rcur = rcur - rgl[0]
-                    rev_groups = (*rev_groups, ret[rcur : rcur + rgl[0]])
-                else:
-                    raise ValueError("Two -1 lists in output")
-            groups = (*groups, ret[cur:rcur], *rev_groups[::-1])
-            break
-    return groups
-
-
-class TrtCompiler:
-    """
-    This class implements:
-      - TRT lazy persistent export
-      - Running TRT with optional fallback to Torch
-        (for TRT engines with limited profiles)
-    """
-
-    def __init__(
-        self,
-        model,
-        plan_path,
-        precision="fp16",
-        method="onnx",
-        input_names=None,
-        output_names=None,
-        output_lists=None,
-        export_args=None,
-        build_args=None,
-        input_profiles=None,
-        dynamic_batchsize=None,
-        use_cuda_graph=False,
-        timestamp=None,
-        fallback=False,
-        forward_override=None,
-        logger=None,
-    ):
-        """
-        Initialization method:
-         Tries to load persistent serialized TRT engine
-         Saves its arguments for lazy TRT build on first forward() call
-        Args:
-            model: Model to "wrap".
-            plan_path : Path where to save persistent serialized TRT engine.
-            precision: TRT builder precision o engine model. Should be 'fp32'|'tf32'|'fp16'|'bf16'.
-            method: One of 'onnx'|'torch_trt'.
-                    Default is 'onnx' (torch.onnx.export()->TRT). This is the most stable and efficient option.
-                    'torch_trt' may not work for some nets. Also AMP must be turned off for it to work.
-            input_names: Optional list of input names. If None, will be read from the function signature.
-            output_names: Optional list of output names. Note: If not None, patched forward() will return a dictionary.
-            output_lists: Optional list of output group sizes: when forward() returns Lists/Tuples, this will be a list
-                          of their dimensions, like [[], [5], [-1]] for Tensor, list of 5 items and dynamic list.
-            export_args: Optional args to pass to export method. See onnx.export() and Torch-TensorRT docs for details.
-            build_args: Optional args to pass to TRT builder. See polygraphy.Config for details.
-            input_profiles: Optional list of profiles for TRT builder and ONNX export.
-                            Each profile is a map of the form : {"input id" : [min_shape, opt_shape, max_shape], ...}.
-            dynamic_batchsize: A sequence with three elements to define the input batch size range for the model to be
-                               converted. Should be a sequence like [MIN_BATCH, OPT_BATCH, MAX_BATCH].
-            [note]: If neither input_profiles nor dynamic_batchsize specified, static shapes will be used.
-            use_cuda_graph: Use CUDA Graph for inference. Note: inputs have to be the same GPU memory between calls!
-            timestamp: Optional timestamp to rebuild TRT engine (e.g. if config file changes).
-            fallback: Allow to fall back to Pytorch when TRT inference fails (e.g, shapes exceed max profile).
-        """
-
-        method_vals = ["onnx", "torch_trt"]
-        if method not in method_vals:
-            raise ValueError(f"trt_compile(): 'method' should be one of {method_vals}, got: {method}.")
-        precision_vals = ["fp32", "tf32", "fp16", "bf16"]
-        if precision not in precision_vals:
-            raise ValueError(f"trt_compile(): 'precision' should be one of {precision_vals}, got: {precision}.")
-
-        self.plan_path = plan_path
-        self.precision = precision
-        self.method = method
-        self.return_dict = output_names is not None
-        self.output_names = output_names or []
-        self.output_lists = output_lists or []
-        self.profiles = input_profiles or []
-        self.dynamic_batchsize = dynamic_batchsize
-        self.export_args = export_args or {}
-        self.build_args = build_args or {}
-        self.engine: TRTEngine | None = None
-        self.use_cuda_graph = use_cuda_graph
-        self.fallback = fallback
-        self.disabled = False
-
-        self.logger = logger or getLogger("trt_compile")
-        self.argspec = inspect.getfullargspec(model.forward)
-        # Normally we read input_names from forward() but can be overridden
-        if input_names is None:
-            input_names = self.argspec.args[1:]
-        self.defaults = {}
-        if self.argspec.defaults is not None:
-            for i in range(len(self.argspec.defaults)):
-                d = self.argspec.defaults[-i - 1]
-                if d is not None:
-                    d = make_tensor(d)
-                    self.defaults[self.argspec.args[-i - 1]] = d
-
-        self.input_names = input_names
-        self.old_forward = model.forward
-
-        # Force engine rebuild if older than the timestamp
-        if timestamp is not None and os.path.exists(self.plan_path) and os.path.getmtime(self.plan_path) < timestamp:
-            os.remove(self.plan_path)
-
-    def _inputs_to_dict(self, input_example):
-        trt_inputs = {}
-        for i, inp in enumerate(input_example):
-            input_name = self.input_names[i]
-            trt_inputs[input_name] = inp
-        return trt_inputs
-
-    def _load_engine(self):
-        """
-        Loads TRT plan from disk and activates its execution context.
-        """
-        try:
-            self.engine = TRTEngine(self.plan_path, self.logger)
-            # Make sure we have names correct
-            input_table = {}
-            for name in self.engine.input_names:
-                if name.startswith("__") and name not in self.input_names:
-                    orig_name = name[2:]
-                else:
-                    orig_name = name
-                input_table[name] = orig_name
-            self.engine.input_table = input_table
-            self.logger.info(f"Engine loaded, inputs:{self.engine.input_table}")
-        except Exception as e:
-            self.logger.info(f"Exception while loading the engine:\n{e}")
-
-    def forward(self, model, argv, kwargs):
-        """
-        Main forward method:
-         Builds TRT engine if not available yet.
-         Tries to run TRT engine
-         If exception thrown and self.callback==True: falls back to original Pytorch
-
-        Args: Passing through whatever args wrapped module's forward() has
-        Returns: Passing through wrapped module's forward() return value(s)
-
-        """
-        args = self.defaults
-        args.update(kwargs)
-        if len(argv) > 0:
-            args.update(self._inputs_to_dict(argv))
-
-        if self.engine is None and not self.disabled:
-            # Restore original forward for export
-            new_forward = model.forward
-            model.forward = self.old_forward
-            try:
-                self._load_engine()
-                if self.engine is None:
-                    build_args = args.copy()
-                    with torch.no_grad():
-                        self._build_and_save(model, build_args)
-                        # This will reassign input_names from the engine
-                    self._load_engine()
-                    assert self.engine is not None
-            except Exception as e:
-                if self.fallback:
-                    self.logger.info(f"Failed to build engine: {e}")
-                    self.disabled = True
-                else:
-                    raise e
-            if not self.disabled and not self.fallback:
-                # Delete all parameters
-                for param in model.parameters():
-                    del param
-                # Call empty_cache to release GPU memory
-                torch.cuda.empty_cache()
-            # restore TRT hook
-            model.forward = new_forward
-        # Run the engine
-        try:
-            if self.engine is not None:
-                # forward_trt is not thread safe as we do not use per-thread execution contexts
-                with lock_sm:
-                    device = torch.cuda.current_device()
-                    stream = torch.cuda.Stream(device=device)
-                    self.engine.set_inputs(unroll_input(self.input_names, args), stream.cuda_stream)
-                    self.engine.allocate_buffers(device=device)
-                    # Need this to synchronize with Torch stream
-                    stream.wait_stream(torch.cuda.current_stream())
-                    ret = self.engine.infer(stream.cuda_stream, use_cuda_graph=self.use_cuda_graph)
-                    # if output_names is not None, return dictionary
-                    if not self.return_dict:
-                        ret = list(ret.values())
-                        if self.output_lists:
-                            ret = parse_groups(ret, self.output_lists)
-                        elif len(ret) == 1:
-                            ret = ret[0]
-                    return ret
-        except Exception as e:
-            if self.fallback:
-                self.logger.info(f"Exception: {e}\nFalling back to Pytorch ...")
-            else:
-                raise e
-        return self.old_forward(*argv, **kwargs)
-
-    def _onnx_to_trt(self, onnx_path):
-        """
-        Builds TRT engine from ONNX file at onnx_path and saves to self.plan_path
-        """
-
-        profiles = []
-        for profile in self.profiles:
-            p = Profile()
-            for id, val in profile.items():
-                p.add(id, min=val[0], opt=val[1], max=val[2])
-            profiles.append(p)
-
-        build_args = self.build_args.copy()
-        build_args["tf32"] = self.precision != "fp32"
-        if self.precision == "fp16":
-            build_args["fp16"] = True
-        elif self.precision == "bf16":
-            build_args["bf16"] = True
-
-        self.logger.info(f"Building TensorRT engine for {onnx_path}: {self.plan_path}")
-        network = network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM])
-        return engine_bytes_from_network(network, config=CreateConfig(profiles=profiles, **build_args))
-
-    def _build_and_save(self, model, input_example):
-        """
-        If TRT engine is not ready, exports model to ONNX,
-        builds TRT engine and saves serialized TRT engine to the disk.
-        Args:
-             input_example: passed to onnx.export()
-        """
-
-        if self.engine is not None:
-            return
-
-        export_args = self.export_args
-        engine_bytes = None
-
-        add_casts_around_norms(model)
-        replace_for_export(model)
-
-        if self.method == "torch_trt":
-            enabled_precisions = [torch.float32]
-            if self.precision == "fp16":
-                enabled_precisions.append(torch.float16)
-            elif self.precision == "bf16":
-                enabled_precisions.append(torch.bfloat16)
-            inputs = list(input_example.values())
-
-            def get_torch_trt_input(input_shape, dynamic_batchsize):
-                min_input_shape, opt_input_shape, max_input_shape = get_profile_shapes(input_shape, dynamic_batchsize)
-                return torch_tensorrt.Input(
-                    min_shape=min_input_shape, opt_shape=opt_input_shape, max_shape=max_input_shape
-                )
-
-            tt_inputs = [get_torch_trt_input(i.shape, self.dynamic_batchsize) for i in inputs]
-            engine_bytes = torch_tensorrt.convert_method_to_trt_engine(
-                model,
-                "forward",
-                arg_inputs=tt_inputs,
-                enabled_precisions=enabled_precisions,
-                **export_args,
-            )
-        else:
-            dbs = self.dynamic_batchsize
-            if dbs:
-                if len(self.profiles) > 0:
-                    raise ValueError("ERROR: Both dynamic_batchsize and input_profiles set for TrtCompiler!")
-                if len(dbs) != 3:
-                    raise ValueError("dynamic_batchsize has to have len ==3 ")
-                profile = {}
-                for id, val in input_example.items():
-
-                    def add_profile(id, val):
-                        sh = val.shape
-                        if len(sh) > 0:
-                            sh = sh[1:]
-                            profile[id] = [[dbs[0], *sh], [dbs[1], *sh], [dbs[2], *sh]]
-
-                    if isinstance(val, list) or isinstance(val, tuple):
-                        for i in range(len(val)):
-                            add_profile(f"{id}_{i}", val[i])
-                    elif isinstance(val, torch.Tensor):
-                        add_profile(id, val)
-                self.profiles = [profile]
-
-            self.dynamic_axes = get_dynamic_axes(self.profiles)
-
-            if len(self.dynamic_axes) > 0:
-                export_args.update({"dynamic_axes": self.dynamic_axes})
-
-            # Use temporary directory for easy cleanup in case of external weights
-            with tempfile.TemporaryDirectory() as tmpdir:
-                if export_args.get("dynamo", False):
-                    input_names = None
-                else:
-                    input_names = list(unroll_input(self.input_names, input_example).keys())
-                onnx_path = str(Path(tmpdir) / "model.onnx")
-                self.logger.info(
-                    f"Exporting to {onnx_path}:\n"
-                    + f"output_names={self.output_names}\ninput_names={self.input_names}\nexport args: {export_args}"
-                )
-                torch.onnx.export(
-                    model,
-                    (input_example,),
-                    onnx_path,
-                    input_names=input_names,
-                    output_names=self.output_names,
-                    **export_args,
-                )
-                if polygraphy_imported:
-                    from polygraphy.backend.onnx.loader import fold_constants, onnx_from_path, save_onnx
-
-                    onnx_model = fold_constants(onnx_from_path(onnx_path), size_threshold=16 * 1000 * 1000)
-                    save_onnx(onnx_model, onnx_path)
-                self.logger.info("Export to ONNX successful.")
-                engine_bytes = self._onnx_to_trt(onnx_path)
-        if engine_bytes:
-            open(self.plan_path, "wb").write(engine_bytes)
-
-
-def trt_forward(self, *argv, **kwargs):
-    """
-    Patch function to replace original model's forward() with.
-    Redirects to TrtCompiler.forward()
-    """
-    return self._trt_compiler.forward(self, argv, kwargs)
-
-
-def trt_compile(
-    model: torch.nn.Module,
-    base_path: str,
-    args: Dict[str, Any] | None = None,
-    submodule: Union[str, List[str]] | None = None,
-    logger: Any | None = None,
-) -> torch.nn.Module:
-    """
-    Instruments model or submodule(s) with TrtCompiler and replaces its forward() with TRT hook.
-    Note: TRT 10.3 is recommended for best performance. Some nets may even fail to work with TRT 8.x
-    Args:
-      model: module to patch with TrtCompiler object.
-      base_path: TRT plan(s) saved to f"{base_path}[.{submodule}].plan" path.
-                 dirname(base_path) must exist, base_path does not have to.
-                 If base_path does point to existing file (e.g. associated checkpoint),
-                 that file becomes a dependency - its mtime is added to args["timestamp"].
-      args: Optional dict : unpacked and passed to TrtCompiler() - see TrtCompiler above for details.
-      submodule: Optional hierarchical id(s) of submodule to patch, e.g. ['image_decoder.decoder']
-                  If None, TrtCompiler patch is applied to the whole model.
-                  Otherwise, submodule (or list of) is being patched.
-      logger: Optional logger for diagnostics.
-    Returns:
-      Always returns same model passed in as argument. This is for ease of use in configs.
-    """
-
-    default_args: Dict[str, Any] = {
-        "method": "onnx",
-        "precision": "fp16",
-        "build_args": {"builder_optimization_level": 5, "precision_constraints": "obey"},
-    }
-
-    default_args.update(args or {})
-    args = default_args
-
-    if trt_imported and polygraphy_imported and torch.cuda.is_available():
-        # if "path" filename point to existing file (e.g. checkpoint)
-        # it's also treated as dependency
-        if os.path.exists(base_path):
-            timestamp = int(os.path.getmtime(base_path))
-            if "timestamp" in args:
-                timestamp = max(int(args["timestamp"]), timestamp)
-            args["timestamp"] = timestamp
-
-        def wrap(model, path):
-            if not hasattr(model, "_trt_compiler"):
-                model.orig_forward = model.forward
-                wrapper = TrtCompiler(model, path + ".plan", logger=logger, **args)
-                model._trt_compiler = wrapper
-                model.forward = MethodType(trt_forward, model)
-
-        def find_sub(parent, submodule):
-            idx = submodule.find(".")
-            # if there is "." in name, call recursively
-            if idx != -1:
-                parent_name = submodule[:idx]
-                parent = getattr(parent, parent_name)
-                submodule = submodule[idx + 1 :]
-                return find_sub(parent, submodule)
-            return parent, submodule
-
-        if submodule is not None:
-            if isinstance(submodule, str):
-                submodule = [submodule]
-            for s in submodule:
-                parent, sub = find_sub(model, s)
-                wrap(getattr(parent, sub), base_path + "." + s)
-        else:
-            wrap(model, base_path)
-    else:
-        logger = logger or getLogger("trt_compile")
-        logger.warning("TensorRT and/or polygraphy packages are not available! trt_compile() has no effect.")
-
-    return model
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
deleted file mode 100644
index a19d342713b7..000000000000
--- a/nemo/export/tensorrt_llm.py
+++ /dev/null
@@ -1,1804 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import json
-import logging
-import os
-import pickle
-import shutil
-import tempfile
-import warnings
-from glob import glob
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-import numpy as np
-import safetensors
-import tensorrt_llm
-import torch
-import torch.nn.functional as F
-import wrapt
-from tensorrt_llm._common import check_max_num_tokens
-from tensorrt_llm._utils import numpy_to_torch
-from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm.commands.build import build as build_trtllm
-from tensorrt_llm.mapping import Mapping
-from tensorrt_llm.models import (
-    BaichuanForCausalLM,
-    BertForQuestionAnswering,
-    BertForSequenceClassification,
-    BertModel,
-    BloomForCausalLM,
-    ChatGLMForCausalLM,
-    CogVLMForCausalLM,
-    CohereForCausalLM,
-    DbrxForCausalLM,
-    DeciLMForCausalLM,
-    DecoderModel,
-    DeepseekForCausalLM,
-    DeepseekV2ForCausalLM,
-    DiT,
-    EagleForCausalLM,
-    EncoderModel,
-    FalconForCausalLM,
-    GemmaForCausalLM,
-    GPTForCausalLM,
-    GPTJForCausalLM,
-    GPTNeoXForCausalLM,
-    GrokForCausalLM,
-    LLaMAForCausalLM,
-    MambaForCausalLM,
-    MedusaForCausalLm,
-    MLLaMAForCausalLM,
-    MPTForCausalLM,
-    OPTForCausalLM,
-    Phi3ForCausalLM,
-    PhiForCausalLM,
-    QWenForCausalLM,
-    RecurrentGemmaForCausalLM,
-    ReDrafterForCausalLM,
-    RobertaForQuestionAnswering,
-    RobertaForSequenceClassification,
-    RobertaModel,
-    WhisperEncoder,
-)
-from tensorrt_llm.plugin import PluginConfig
-from transformers import PreTrainedTokenizerBase
-
-from nemo.deploy import ITritonDeployable
-from nemo.export.tarutils import TarPath
-from nemo.export.trt_llm.converter.model_converter import determine_quantization_settings, model_to_trtllm_ckpt
-from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import dist_model_to_trt_llm_ckpt, get_layer_prefix
-from nemo.export.trt_llm.converter.utils import init_model_parallel_from_nemo
-from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import (
-    build_tokenizer,
-    get_model_type,
-    get_tokenizer,
-    get_weights_dtype,
-    load_nemo_model,
-)
-from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm
-from nemo.export.trt_llm.qnemo.tokenizer_utils import TOKENIZER_CONFIG_FILE, get_nmt_tokenizer
-from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint
-from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine
-from nemo.export.trt_llm.tensorrt_llm_run import (
-    generate,
-    generate_streaming,
-    load,
-    load_distributed,
-    refit,
-    unload_engine,
-)
-from nemo.export.trt_llm.utils import is_rank
-from nemo.export.utils import is_nemo_tarfile, prepare_directory_for_export, torch_dtype_from_precision
-from nemo.export.utils.constants import TRTLLM_ENGINE_DIR
-
-use_deploy = True
-try:
-    from nemo.deploy.utils import cast_output, str_ndarray2list
-except Exception:
-    use_deploy = False
-
-LOGGER = logging.getLogger("NeMo")
-
-
-@wrapt.decorator
-def noop_decorator(func):
-    """No op decorator"""
-
-    def wrapper(*args, **kwargs):
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-use_pytriton = True
-batch = noop_decorator
-try:
-    from pytriton.decorators import batch, first_value
-    from pytriton.model_config import Tensor
-except Exception:
-    use_pytriton = False
-
-
-# pylint: disable=line-too-long
-class TensorRTLLM(ITritonDeployable):
-    """
-    Exports nemo and huggingface checkpoints to TensorRT-LLM and run fast inference.
-
-    Example:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-
-        trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
-        trt_llm_exporter.export(
-            nemo_checkpoint_path="/path/for/nemo/checkpoint",
-            model_type="llama",
-            tensor_parallelism_size=1,
-        )
-
-        output = trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
-        print("output: ", output)
-
-    """
-
-    def __init__(
-        self,
-        model_dir: str,
-        lora_ckpt_list: List[str] = None,
-        load_model: bool = True,
-        use_python_runtime: bool = True,
-        enable_chunked_context: bool = None,
-        max_tokens_in_paged_kv_cache: int = None,
-        multi_block_mode: bool = False,
-    ):
-        """
-        Args:
-            model_dir (str): path for storing the TensorRT-LLM model files.
-            lora_ckpt_list (List[str]): lora checkpoint paths.
-            load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir.
-            use_python_runtime (bool): whether to use python or c++ runtime.
-            multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. Only available when using c++ runtime
-        """
-
-        if use_python_runtime:
-            if enable_chunked_context is not None or max_tokens_in_paged_kv_cache is not None:
-                raise Exception(
-                    "enable_chunked_context and max_tokens_in_paged_kv_cache options "
-                    "work only with the TensorRT-LLM C++ runtime. Please set "
-                    "use_python_runtime=False to use these options."
-                )
-
-        self.model_dir = model_dir
-        self.engine_dir = os.path.join(model_dir, TRTLLM_ENGINE_DIR)
-        self.lora_ckpt_list = lora_ckpt_list
-        self.use_python_runtime = use_python_runtime
-        self.enable_chunked_context = enable_chunked_context if enable_chunked_context is not None else False
-        self.max_tokens_in_paged_kv_cache = max_tokens_in_paged_kv_cache
-        self.multi_block_mode = multi_block_mode
-        self.model = None
-        self.tokenizer = None
-        self.config = None
-        self.ptuning_tables = []
-        self.p_table = None
-        self.task_vocab_size = 0
-        self.task_vtoken_counts = []
-        self.task_ids = {}
-
-        if load_model:
-            self._load()
-
-    def export(
-        self,
-        nemo_checkpoint_path: str,
-        model_type: Optional[str] = None,
-        delete_existing_files: bool = True,
-        tensor_parallelism_size: int = 1,
-        pipeline_parallelism_size: int = 1,
-        gpus_per_node: Optional[int] = None,
-        max_input_len: int = 256,
-        max_output_len: Optional[int] = None,
-        max_batch_size: int = 8,
-        max_prompt_embedding_table_size: Optional[int] = None,
-        use_parallel_embedding: bool = False,
-        use_embedding_sharing: bool = False,
-        paged_kv_cache: bool = True,
-        remove_input_padding: bool = True,
-        paged_context_fmha: bool = False,
-        dtype: Optional[str] = None,
-        load_model: bool = True,
-        use_lora_plugin: str = None,
-        lora_target_modules: List[str] = None,
-        max_lora_rank: int = 64,
-        max_num_tokens: Optional[int] = None,
-        opt_num_tokens: Optional[int] = None,
-        max_seq_len: Optional[int] = 512,
-        multiple_profiles: bool = False,
-        gpt_attention_plugin: str = "auto",
-        gemm_plugin: str = "auto",
-        use_mcore_path: bool = True,
-        reduce_fusion: bool = True,
-        fp8_quantized: Optional[bool] = None,
-        fp8_kvcache: Optional[bool] = None,
-        gather_context_logits: Optional[bool] = False,
-        gather_generation_logits: Optional[bool] = False,
-        build_rank: Optional[int] = 0,
-    ):
-        """
-        Exports nemo checkpoints to TensorRT-LLM.
-
-        Args:
-            nemo_checkpoint_path (str): path for the nemo checkpoint.
-            model_type (Optional[str]): type of the model (optional for NeMo 2.0 and quantized checkpoints).
-            delete_existing_files (bool): if True, deletes all the files in model_dir.
-            tensor_parallelism_size (int): tensor parallelism.
-            pipeline_parallelism_size (int): pipeline parallelism.
-            gpus_per_node (int): number of gpus per node.
-            max_input_len (int): max input length.
-            max_output_len (int): max output length.
-            max_batch_size (int): max batch size.
-            max_prompt_embedding_table_size (int): max prompt embedding size.
-            use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not
-            use_embedding_sharing (bool):
-            paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM.
-            paged_context_fmha (bool): whether to use paged context fmha feature of TRT-LLM or not
-            remove_input_padding (bool): enables removing input padding or not.
-            dtype (Optional[str]): Floating point type for model weights (supports 'bfloat16', 'float16' or 'float32').
-                If None, try to autodetect the type from model config.
-            load_model (bool): load TensorRT-LLM model after the export.
-            use_lora_plugin (str): use dynamic lora or not.
-            lora_target_modules (List[str]): list of the target lora modules.
-            max_lora_rank (int): maximum lora rank.
-            max_num_tokens (int):
-            opt_num_tokens (int):
-            max_seq_len (int): the maximum sequence length of a single request.
-            multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False
-            gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto"
-            gemm_plugin (str): enable the gpt plugin. Default = "auto"
-            use_mcore_path (bool) : Use the more recent mcore path for export
-            reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce
-            fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type.
-            fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type.
-            gather_context_logits (Optional[bool]): if True, enables gather_context_logits while building trtllm engine. Default: False
-            gather_generation_logits (Optional[bool]): if True, enables gather_generation_logits while building trtllm engine. Default: False
-            build_rank (Optional[int]): rank to export the model on. If None, builds on all ranks.
-        """
-        if not use_mcore_path:
-            warnings.warn(
-                "Exporting models using the local codebase with use_mcore_path=False is deprecated."
-                " Please install megatron-core and set use_mcore_path to True.",
-                stacklevel=2,
-            )
-
-        gpus_per_node = tensor_parallelism_size if gpus_per_node is None else gpus_per_node
-        prepare_directory_for_export(
-            self.model_dir, delete_existing_files=delete_existing_files, subdir=TRTLLM_ENGINE_DIR
-        )
-
-        if max_prompt_embedding_table_size is None:
-            max_prompt_embedding_table_size = 0
-
-        self.model = None
-
-        if max_output_len is not None:
-            warnings.warn(
-                "Parameter max_output_len is deprecated and will be removed.", DeprecationWarning, stacklevel=2
-            )
-            max_output_len = max_output_len if max_output_len is not None else 256
-
-            if max_seq_len is None:
-                max_seq_len = max_input_len + max_output_len
-            else:
-                warnings.warn(
-                    f"Parameter max_output_len will be overwritten by max_seq_len={max_seq_len}.",
-                    DeprecationWarning,
-                    stacklevel=2,
-                )
-
-        max_seq_len = max_seq_len if max_seq_len is not None else 512
-
-        if max_batch_size < 4:
-            warnings.warn(
-                "TensorRT LLM may hit a runtime issue with batch size is smaller than 4 on some models."
-                " Force set to 4",
-                stacklevel=2,
-            )
-            max_batch_size = 4
-
-        is_export_rank = is_rank(build_rank)
-
-        if is_export_rank:
-            tmp_dir = tempfile.TemporaryDirectory()
-            nemo_export_dir = Path(tmp_dir.name)
-
-            if is_qnemo_checkpoint(nemo_checkpoint_path):
-                if os.path.isdir(nemo_checkpoint_path):
-                    nemo_export_dir = nemo_checkpoint_path
-                else:
-                    raise ValueError("Checkpoint path must be a directory")
-
-                if os.path.exists(os.path.join(nemo_checkpoint_path, TOKENIZER_CONFIG_FILE)):
-                    # Instantiate tokenizer for a legacy "Nemo 1" quantized checkpoint from a tokenizer config.
-                    # Note that using the config is deprecated and it will be removed in future releases.
-                    LOGGER.warning("Detected legacy tokenizer_config.yaml, using it to build tokenizer.")
-                    self.tokenizer = get_nmt_tokenizer(nemo_checkpoint_path)
-                else:
-                    self.tokenizer = get_tokenizer(nemo_checkpoint_path)
-
-                model_config = None
-
-                qnemo_to_tensorrt_llm(
-                    nemo_checkpoint_path=nemo_checkpoint_path,
-                    engine_dir=self.engine_dir,
-                    max_input_len=max_input_len,
-                    max_seq_len=max_seq_len,
-                    max_batch_size=max_batch_size,
-                    max_prompt_embedding_table_size=max_prompt_embedding_table_size,
-                    tensor_parallel_size=tensor_parallelism_size,
-                    pipeline_parallel_size=pipeline_parallelism_size,
-                    use_parallel_embedding=use_parallel_embedding,
-                    paged_kv_cache=paged_kv_cache,
-                    paged_context_fmha=paged_context_fmha,
-                    remove_input_padding=remove_input_padding,
-                    use_lora_plugin=use_lora_plugin,
-                    lora_target_modules=lora_target_modules,
-                    max_lora_rank=max_lora_rank,
-                    max_num_tokens=max_num_tokens,
-                    opt_num_tokens=opt_num_tokens,
-                    multiple_profiles=multiple_profiles,
-                    reduce_fusion=reduce_fusion,
-                )
-            else:
-                if model_type is None:
-                    # For NeMo 2.0 models we can get model_type from the model class name
-                    model_type = get_model_type(nemo_checkpoint_path)
-
-                if model_type is None:
-                    raise ValueError(
-                        "Parameter model_type needs to be provided and cannot be inferred from the checkpoint. "
-                        "Please specify it explicitely."
-                    )
-
-                if model_type not in self.get_supported_models_list:
-                    raise ValueError(
-                        f"Model {model_type} is not currently a supported model type. "
-                        f"Supported model types are: {self.get_supported_models_list}."
-                    )
-
-                if dtype is None:
-                    dtype = get_weights_dtype(nemo_checkpoint_path)
-
-                if dtype is None:
-                    raise ValueError(
-                        "Parameter dtype needs to be provided and cannot be inferred from the checkpoint. "
-                        "Please specify it explicitely."
-                    )
-
-                model, model_config, self.tokenizer = load_nemo_model(
-                    nemo_checkpoint_path, nemo_export_dir, use_mcore_path
-                )
-                if use_mcore_path:
-                    from megatron.core.export.data_type import DataType
-                    from megatron.core.export.export_config import ExportConfig
-                    from megatron.core.export.model_type import ModelType
-                    from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import (
-                        DEFAULT_CONVERSION_DICT,
-                    )
-                    from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
-                    from tensorrt_llm.layers import MoeConfig
-
-                    share_embeddings_and_output_weights = model_config.get(
-                        "share_embeddings_and_output_weights", False
-                    )
-                    fp8_quantized, fp8_kvcache = determine_quantization_settings(
-                        model_config, fp8_quantized, fp8_kvcache
-                    )
-
-                    # We build the transformer config using the nemo model config.
-                    transformer_config = self.get_transformer_config(model_config)
-                    input_model_type = getattr(ModelType, model_type)
-
-                    # MCore export supports some default conversion dictionaries
-                    mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT
-
-                    # All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models start with "model.decoder.layers.4.blahblah". so we append model. to the keys
-                    nemo_model_conversion_dict = {
-                        f'model.{key}': value for key, value in mcore_model_conversion_dict.items()
-                    } | {  # Mapping for NeMo 2.0
-                        f'module.{key}': value for key, value in mcore_model_conversion_dict.items()
-                    }
-
-                    # TODO: Workaround: Gemma uses gated activation, while mcore does not handle openai-gelu
-                    # as a gated function. Remove once !11614 is merged.
-                    activation = model_config.get('activation', "gelu")
-                    if activation == "openai-gelu" and input_model_type.name == 'gemma':
-                        activation = "geglu"
-
-                    trtllm_helper = TRTLLMHelper(
-                        transformer_config=transformer_config,
-                        model_type=input_model_type,
-                        trtllm_conversion_dict=nemo_model_conversion_dict,
-                        position_embedding_type=model_config.get('position_embedding_type'),
-                        max_position_embeddings=model_config.get('max_position_embeddings'),
-                        rotary_percentage=model_config.get('rotary_percentage', 1.0),
-                        rotary_base=model_config.get('rotary_base', 10000),
-                        moe_tp_mode=model_config.get('moe_tp_mode', 2),
-                        multi_query_mode=model_config.get("multi_query_mode", False),
-                        activation=activation,
-                        seq_len_interpolation_factor=model_config.get("seq_len_interpolation_factor"),
-                        moe_renorm_mode=model_config.get(
-                            'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
-                        ),
-                        share_embeddings_and_output_weights=share_embeddings_and_output_weights,
-                    )
-
-                    input_dtype = getattr(DataType, dtype)
-                    export_config = ExportConfig(
-                        tensor_parallelism_size,
-                        pipeline_parallelism_size,
-                        use_parallel_embedding,
-                        share_embeddings_and_output_weights,
-                    )
-
-                    trtllm_model_weights_list, trtllm_model_config_list = (
-                        trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
-                            model_state_dict=model,
-                            export_config=export_config,
-                            dtype=input_dtype,
-                            state_dict_split_by_layer_numbers=False,
-                            fp8_quantized=fp8_quantized,
-                            fp8_kvcache=fp8_kvcache,
-                        )
-                    )
-
-                    for trtllm_model_weights, trtllm_model_config in zip(
-                        trtllm_model_weights_list, trtllm_model_config_list
-                    ):
-                        trtllm_helper.build_and_save_engine(
-                            max_input_len=max_input_len,
-                            max_output_len=max_output_len,
-                            max_batch_size=max_batch_size,
-                            engine_dir=self.engine_dir,
-                            trtllm_model_weights=trtllm_model_weights,
-                            trtllm_model_config=trtllm_model_config,
-                            lora_ckpt_list=self.lora_ckpt_list,
-                            use_lora_plugin=use_lora_plugin,
-                            max_lora_rank=max_lora_rank,
-                            lora_target_modules=lora_target_modules,
-                            max_prompt_embedding_table_size=max_prompt_embedding_table_size,
-                            paged_kv_cache=paged_kv_cache,
-                            remove_input_padding=remove_input_padding,
-                            paged_context_fmha=paged_context_fmha,
-                            use_refit=False,
-                            max_num_tokens=max_num_tokens,
-                            max_seq_len=max_seq_len,
-                            opt_num_tokens=opt_num_tokens,
-                            max_beam_width=1,
-                            tokens_per_block=128,
-                            multiple_profiles=multiple_profiles,
-                            gpt_attention_plugin=gpt_attention_plugin,
-                            gemm_plugin=gemm_plugin,
-                        )
-                else:
-                    if model_type == "gpt" or model_type == "starcoder":
-                        model_type = "gptnext"
-
-                    if model_type == "mixtral":
-                        model_type = "llama"
-
-                    trtllm_model_weights_list, trtllm_model_config_list = model_to_trtllm_ckpt(
-                        model=model,
-                        nemo_model_config=model_config,
-                        nemo_export_dir=nemo_export_dir,
-                        decoder_type=model_type,
-                        dtype=dtype,
-                        tensor_parallel_size=tensor_parallelism_size,
-                        pipeline_parallel_size=pipeline_parallelism_size,
-                        gpus_per_node=gpus_per_node,
-                        use_parallel_embedding=use_parallel_embedding,
-                        use_embedding_sharing=use_embedding_sharing,
-                        fp8_quantized=fp8_quantized,
-                        fp8_kvcache=fp8_kvcache,
-                    )
-
-                    for trtllm_model_weights, trtllm_model_config in zip(
-                        trtllm_model_weights_list, trtllm_model_config_list
-                    ):
-                        build_and_save_engine(
-                            max_input_len=max_input_len,
-                            max_output_len=max_output_len,
-                            max_batch_size=max_batch_size,
-                            model_config=trtllm_model_config,
-                            model_weights=trtllm_model_weights,
-                            model_dir=self.engine_dir,
-                            model_type=model_type,
-                            lora_ckpt_list=self.lora_ckpt_list,
-                            use_lora_plugin=use_lora_plugin,
-                            max_lora_rank=max_lora_rank,
-                            lora_target_modules=lora_target_modules,
-                            max_prompt_embedding_table_size=max_prompt_embedding_table_size,
-                            paged_kv_cache=paged_kv_cache,
-                            remove_input_padding=remove_input_padding,
-                            paged_context_fmha=paged_context_fmha,
-                            max_num_tokens=max_num_tokens,
-                            opt_num_tokens=opt_num_tokens,
-                            max_seq_len=max_seq_len,
-                            multiple_profiles=multiple_profiles,
-                            gpt_attention_plugin=gpt_attention_plugin,
-                            gemm_plugin=gemm_plugin,
-                            gather_context_logits=gather_context_logits,
-                            gather_generation_logits=gather_generation_logits,
-                        )
-
-            tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
-            tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context")
-            vocab_path = os.path.join(nemo_export_dir, "vocab.json")
-            if isinstance(self.tokenizer, PreTrainedTokenizerBase):
-                self.tokenizer.save_pretrained(self.model_dir)
-            elif os.path.exists(tokenizer_path):
-                shutil.copy(tokenizer_path, self.model_dir)
-            elif os.path.exists(tokenizer_path_nemo2):
-                # Copy HF tokenizer files to root model directory
-                for path in glob(os.path.join(tokenizer_path_nemo2, "nemo_tokenizer", "*.json")):
-                    shutil.copy(path, self.model_dir)
-                # Copy SentencePiece tokenizer.model
-                for path in glob(os.path.join(tokenizer_path_nemo2, "*.model")):
-                    shutil.copy(path, os.path.join(self.model_dir, "tokenizer.model"))
-            elif os.path.exists(vocab_path):
-                shutil.copy(vocab_path, os.path.join(self.model_dir, "vocab.json"))
-
-            nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml")
-            if os.path.exists(nemo_model_config):
-                shutil.copy(nemo_model_config, self.model_dir)
-
-            tmp_dir.cleanup()
-
-        if is_export_rank and model_config is not None:
-            self._export_to_nim_format(model_config, model_type)
-
-        if tensorrt_llm.mpi_world_size() > 1:
-            tensorrt_llm.mpi_barrier()
-
-        if is_export_rank and load_model:
-            self._load()
-
-    def export_hf_model(
-        self,
-        hf_model_path: str,
-        max_batch_size: int = 8,
-        tensor_parallelism_size: int = 1,
-        max_input_len: int = 256,
-        max_output_len: int = 256,
-        max_num_tokens: Optional[int] = None,
-        opt_num_tokens: Optional[int] = None,
-        dtype: Optional[str] = None,
-        max_seq_len: Optional[int] = 512,
-        gemm_plugin: str = "auto",
-        remove_input_padding: bool = True,
-        paged_context_fmha: bool = False,
-        paged_kv_cache: bool = True,
-        tokens_per_block: int = 128,
-        multiple_profiles: bool = False,
-        reduce_fusion: bool = False,
-        max_beam_width: int = 1,
-        use_refit: bool = False,
-        model_type: Optional[str] = None,
-        delete_existing_files: bool = True,
-    ):
-        """
-        Export a Hugging Face model checkpoint to TensorRT-LLM format.
-
-        Args:
-            hf_model_path (str): Path to the Hugging Face model directory
-            max_batch_size (int, optional): Maximum batch size for inference. Defaults to 8.
-            tensor_parallelism_size (int, optional): Size of tensor parallelism. Defaults to 1.
-            max_input_len (int, optional): Maximum input sequence length. Defaults to 256.
-            max_output_len (int, optional): Maximum output sequence length. Defaults to 256.
-            max_num_tokens (int, optional): Maximum number of tokens. Defaults to None.
-            opt_num_tokens (int, optional): Optimal number of tokens. Defaults to None.
-            dtype (str, optional): Data type for model weights. If None, inferred from model config.
-            max_seq_len (int, optional): Maximum total sequence length. Defaults to 512.
-            gemm_plugin (str, optional): GEMM plugin type. Defaults to "auto".
-            remove_input_padding (bool, optional): Whether to remove input padding. Defaults to True.
-            paged_context_fmha (bool, optional): Whether to use paged context FMHA. Defaults to False.
-            paged_kv_cache (bool, optional): Whether to use paged KV cache. Defaults to True.
-            tokens_per_block (int, optional): Number of tokens per block for paged KV cache. Defaults to 128.
-            multiple_profiles (bool, optional): Whether to use multiple TensorRT profiles. Defaults to False.
-            reduce_fusion (bool, optional): Whether to reduce operator fusion. Defaults to False.
-            max_beam_width (int, optional): Maximum beam width for beam search. Defaults to 1.
-            use_refit (bool, optional): Whether to use TensorRT refitting. Defaults to False.
-            model_type (str, optional): Type of the model architecture. Defaults to None.
-            delete_existing_files (bool, optional): Whether to delete existing files in export dir. Defaults to True.
-
-        Raises:
-            ValueError: If model_type is not supported or dtype cannot be determined
-        """
-        LOGGER.info("Starting HF export to TRT-LLM")
-        if model_type not in self.get_supported_hf_model_mapping:
-            raise ValueError(
-                f"Model {model_type} is not currently a supported model type. "
-                f"Supported model types are: {self.get_supported_hf_model_mapping.keys()}."
-            )
-
-        if dtype is None:
-            dtype = self.get_hf_model_dtype(hf_model_path)
-            if dtype is None:
-                raise ValueError("No dtype found in hf model config. Please specify a dtype.")
-
-        prepare_directory_for_export(
-            self.model_dir, delete_existing_files=delete_existing_files, subdir=TRTLLM_ENGINE_DIR
-        )
-
-        if max_batch_size < 4:
-            print("TensorRT-LLM may hit runtime issue with batch size is smaller than 4. Force set to 4")
-            max_batch_size = 4
-
-        plugin_config = PluginConfig()
-        plugin_config.gemm_plugin = gemm_plugin
-        if paged_kv_cache:
-            plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block)
-        else:
-            plugin_config.paged_kv_cache = False
-        plugin_config.remove_input_padding = remove_input_padding
-        plugin_config.use_paged_context_fmha = paged_context_fmha
-        plugin_config.multiple_profiles = multiple_profiles
-        plugin_config.reduce_fusion = reduce_fusion
-        max_seq_len = max_input_len + max_output_len
-        max_num_tokens, opt_num_tokens = check_max_num_tokens(
-            max_num_tokens=max_num_tokens,
-            opt_num_tokens=opt_num_tokens,
-            max_seq_len=max_seq_len,
-            max_batch_size=max_batch_size,
-            max_input_len=max_input_len,
-            max_beam_width=max_beam_width,
-            remove_input_padding=remove_input_padding,
-            enable_context_fmha=plugin_config.context_fmha,
-            tokens_per_block=tokens_per_block,
-            multiple_profiles=multiple_profiles,
-        )
-        build_dict = {
-            'max_input_len': max_input_len,
-            'max_output_len': max_output_len,
-            'max_batch_size': max_batch_size,
-            'max_beam_width': max_beam_width,
-            'max_seq_len': max_seq_len,
-            'max_num_tokens': max_num_tokens,
-            'opt_num_tokens': opt_num_tokens,
-            'strongly_typed': False,
-            'builder_opt': None,
-            'multiple_profiles': multiple_profiles,
-            'use_refit': use_refit,
-        }
-        build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
-        for rank in range(tensor_parallelism_size):
-            LOGGER.info(f"Iterating over rank:{rank}")
-            mapping = Mapping(world_size=tensor_parallelism_size, rank=rank, tp_size=tensor_parallelism_size)
-            trtllm_model_class = self.get_supported_hf_model_mapping[model_type]
-            model = trtllm_model_class.from_hugging_face(
-                hf_model_path,
-                dtype,
-                mapping=mapping,
-            )
-            engine = build_trtllm(model, build_config)
-            engine.save(self.engine_dir)
-        # Copy HF tokenizer files to root model directory
-        for path in glob(os.path.join(hf_model_path, "*.json")):
-            shutil.copy(path, self.model_dir)
-        # Copy sentencepiece model to model directory
-        for path in glob(os.path.join(hf_model_path, "*.model")):
-            shutil.copy(path, self.model_dir)
-        LOGGER.info(f"Generarated TRT-LLM checkpoint at dir:{self.model_dir}")
-        LOGGER.info(f"Loading the TRT-LLM checkpoint:{self.model_dir}")
-        self._load()
-
-    def get_hf_model_dtype(self, model_dir: str) -> Optional[str]:
-        """
-        Read the config file from a Hugging Face model directory and identify the model's data type.
-
-        Args:
-            model_dir (str): Path to the Hugging Face model directory
-
-        Returns:
-            Optional[str]: The model's data type if found in config, None otherwise
-        """
-        config_path = Path(model_dir) / 'config.json'
-
-        if not config_path.exists():
-            raise FileNotFoundError(f"Config file not found at {config_path}")
-
-        try:
-            with open(config_path, 'r') as f:
-                config = json.load(f)
-                # Check for dtype in different possible locations in the config
-                if 'torch_dtype' in config:
-                    return config['torch_dtype']
-                elif 'dtype' in config:
-                    return config['dtype']
-                elif 'pretrained_config' in config and 'dtype' in config['pretrained_config']:
-                    return config['pretrained_config']['dtype']
-
-                # If no explicit dtype found, check for other indicators
-                if 'fp16' in config and config['fp16']:
-                    return 'float16'
-                elif 'bf16' in config and config['bf16']:
-                    return 'bfloat16'
-
-            return None
-        except json.JSONDecodeError:
-            raise ValueError(f"Invalid JSON in config file at {config_path}")
-        except Exception as e:
-            raise RuntimeError(f"Error reading config file: {str(e)}")
-
-    def _export_to_nim_format(self, model_config: Dict[str, Any], model_type: str):
-        """
-        Exports the model configuration to a specific format required by NIM.
-        This method performs the following steps:
-
-        1. Copies the generation_config.json (if present) from the nemo_context directory to the root model directory.
-        2. Creates a dummy Hugging Face configuration file based on the provided model configuration and type.
-
-        Args:
-            model_config (dict): A dictionary containing the model configuration parameters.
-            model_type (str): The type of the model (e.g., "llama").
-        """
-
-        generation_config_path = os.path.join(self.model_dir, "nemo_context", "artifacts", "generation_config.json")
-        if os.path.isfile(generation_config_path):
-            shutil.copy(generation_config_path, self.model_dir)
-
-        # Fields "architectures" and "model_type" are required by HF but not relevant for NIM
-        seq_len_interpolation_factor = model_config.get("seq_len_interpolation_factor")
-        hf_config = {
-            "max_position_embeddings": model_config.get("encoder_seq_length"),
-            "architectures": ["LLaMAForCausalLM"],
-            "rope_scaling": (
-                None
-                if seq_len_interpolation_factor is None
-                else {
-                    "factor": seq_len_interpolation_factor,
-                    "rope_type": "default",
-                }
-            ),
-            "model_type": model_type,
-        }
-        with open(os.path.join(self.model_dir, "config.json"), "w") as f:
-            json.dump(hf_config, f, indent=2)
-            f.write("\n")
-
-    def get_transformer_config(self, nemo_model_config):
-        """Given nemo model config get transformer config"""
-        from megatron.core.transformer.transformer_config import TransformerConfig
-
-        normalization = nemo_model_config.get('normalization', 'layernorm')
-        transformer_config_normalization = 'LayerNorm'
-        layernorm_zero_centered_gamma = nemo_model_config.get('layernorm_zero_centered_gamma', False)
-        if normalization == 'layernorm1p':
-            layernorm_zero_centered_gamma = True
-        elif normalization == 'rmsnorm':
-            transformer_config_normalization = 'RMSNorm'
-
-        num_moe_experts = nemo_model_config.get('num_moe_experts', 0)
-        conf = TransformerConfig(
-            num_layers=nemo_model_config.get('num_layers'),
-            moe_router_topk=nemo_model_config.get('moe_router_topk', 0),
-            num_attention_heads=nemo_model_config.get('num_attention_heads'),
-            num_query_groups=nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']),
-            kv_channels=nemo_model_config.get("kv_channels", None),
-            hidden_size=nemo_model_config.get('hidden_size'),
-            ffn_hidden_size=nemo_model_config.get('ffn_hidden_size'),
-            layernorm_epsilon=nemo_model_config.get('layernorm_epsilon'),
-            add_bias_linear=nemo_model_config.get('bias'),
-            num_moe_experts=num_moe_experts if num_moe_experts > 0 else None,
-            normalization=transformer_config_normalization,
-            layernorm_zero_centered_gamma=layernorm_zero_centered_gamma,
-            gated_linear_unit=nemo_model_config.get('gated_linear_unit', False),
-        )
-        return conf
-
-    def convert_to_safe_tensors(
-        self,
-        nemo_checkpoint_path: str,
-        model_type: Optional[str] = None,
-        delete_existing_files: bool = True,
-        tensor_parallelism_size: int = 1,
-        pipeline_parallelism_size: int = 1,
-        gpus_per_node: int = None,
-        use_parallel_embedding: bool = False,
-        use_embedding_sharing: bool = False,
-        dtype: str = "bfloat16",
-    ):
-        """Convert to safe tensor"""
-        gpus_per_node = tensor_parallelism_size if gpus_per_node is None else gpus_per_node
-
-        if Path(self.model_dir).exists():
-            if delete_existing_files and len(os.listdir(self.model_dir)) > 0:
-                for files in os.listdir(self.model_dir):
-                    path = os.path.join(self.model_dir, files)
-                    try:
-                        shutil.rmtree(path)
-                    except OSError:
-                        os.remove(path)
-
-                if len(os.listdir(self.model_dir)) > 0:
-                    raise Exception("Couldn't delete all files.")
-            elif len(os.listdir(self.model_dir)) > 0:
-                raise Exception("There are files in this folder. Try setting delete_existing_files=True.")
-        else:
-            Path(self.model_dir).mkdir(parents=True, exist_ok=True)
-
-        if model_type == "gpt" or model_type == "starcoder":
-            model_type = "gptnext"
-
-        if model_type == "mixtral":
-            model_type = "llama"
-
-        if tensorrt_llm.mpi_rank() == 0:
-            tmp_dir = tempfile.TemporaryDirectory()
-            nemo_export_dir = Path(tmp_dir.name)
-
-            model, model_config, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir)
-            weights_dicts, model_configs = model_to_trtllm_ckpt(
-                model=model,
-                nemo_model_config=model_config,
-                nemo_export_dir=nemo_export_dir,
-                decoder_type=model_type,
-                dtype=dtype,
-                tensor_parallel_size=tensor_parallelism_size,
-                pipeline_parallel_size=pipeline_parallelism_size,
-                gpus_per_node=gpus_per_node,
-                use_parallel_embedding=use_parallel_embedding,
-                use_embedding_sharing=use_embedding_sharing,
-            )
-
-            for weight_dict, model_config in zip(weights_dicts, model_configs):
-                rank = model_config.mapping.tp_rank
-                for k, v in weight_dict.items():
-                    if isinstance(v, np.ndarray):
-                        weight_dict[k] = numpy_to_torch(v)
-                    else:
-                        weight_dict[k] = v
-
-                safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors'))
-            model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json'))
-
-            tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
-            if os.path.exists(tokenizer_path):
-                shutil.copy(tokenizer_path, self.model_dir)
-            else:
-                if self.tokenizer is not None:
-                    self.tokenizer.save_pretrained(self.model_dir)
-
-            nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml")
-            if os.path.exists(nemo_model_config):
-                shutil.copy(nemo_model_config, self.model_dir)
-
-            tmp_dir.cleanup()
-
-        if tensorrt_llm.mpi_world_size() > 1:
-            tensorrt_llm.mpi_barrier()
-
-    def gather_and_reshard_model(self, model_config, model, storage_dtype):
-        """
-        Accumulate all vp model chunks together, and reshard model (i.e) gather all pp ranks
-        if required and return the final model state dict
-        """
-
-        def _get_layer_index(split_key):
-            for index, key in enumerate(split_key):
-                if key == "layers":
-                    return index + 1
-            raise ValueError(f"Unknown layer name format: {split_key}")
-
-        def rename_layer_num(param_name, layer_num):
-            split_key = param_name.split(".")
-            layer_index = int(_get_layer_index(split_key))
-            split_key[layer_index] = str(layer_num)
-            return ".".join(split_key)
-
-        def get_layer_num(param_name):
-            split_key = param_name.split(".")
-            layer_index = int(_get_layer_index(split_key))
-            return int(split_key[layer_index])
-
-        from megatron.core import parallel_state
-
-        tp_size = parallel_state.get_tensor_model_parallel_world_size()
-        pp_rank = parallel_state.get_pipeline_model_parallel_rank()
-        pp_first_rank = parallel_state.get_pipeline_model_parallel_first_rank()
-        pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank()
-        pp_size = parallel_state.get_pipeline_model_parallel_world_size()
-        pp_group = parallel_state.get_pipeline_model_parallel_group()
-        vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
-        if not vp_size:
-            vp_size = 1
-
-        inference_tp_size = self.tp_size
-        inference_pp_size = self.pp_size
-        reshard_model = False
-        if inference_tp_size != tp_size or inference_pp_size != pp_size:
-            LOGGER.info("Training/Generation model parallelism resharding enabled")
-            if inference_pp_size == 1 and pp_size > 1 and inference_tp_size == tp_size:
-                reshard_model = True
-            else:
-                raise NotImplementedError(
-                    "NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
-                )
-
-        num_layers = model_config["num_layers"]
-        layers_per_pp = num_layers // pp_size
-        layers_per_chunk = layers_per_pp // vp_size
-
-        tl_params = {}
-        model_level_params = {}
-        if vp_size > 1:  # consolidate params across model chunks
-            for idx, model_chunk in enumerate(model):
-                for key, val in model_chunk.state_dict().items():
-                    # TODO: currently fp8 is not supported
-                    if torch.is_tensor(val) and '_extra_state' not in key:
-                        if 'layers' in key:
-                            key2 = rename_layer_num(key, get_layer_num(key) + idx * pp_size * layers_per_chunk)
-                            tl_params[key2] = val
-                        else:
-                            model_level_params[key] = val
-        else:
-            for key, val in model.state_dict().items():
-                # TODO: currently fp8 is not supported
-                if torch.is_tensor(val) and '_extra_state' not in key:
-                    if 'decoder.layers' in key:
-                        tl_params[key] = val
-                    else:
-                        model_level_params[key] = val
-
-        if vp_size > 1 or reshard_model:
-            # gather layers across pp ranks
-            gathered_params = {}
-            for key, val in tl_params.items():
-                weight_list = [torch.zeros_like(val) for _ in range(pp_size)]
-                torch.distributed.all_gather(weight_list, val, group=pp_group)
-                for idx in range(pp_size):
-                    layer_num = get_layer_num(key) + idx * layers_per_chunk
-                    key2 = rename_layer_num(key, layer_num)
-                    if not reshard_model:  # Save only layers of 1 single PP stage
-                        layers_start = layers_per_pp * pp_rank
-                        layers_end = layers_per_pp * (pp_rank + 1) - 1
-                        if layer_num >= layers_start and layer_num <= layers_end:
-                            key2 = rename_layer_num(key, layer_num % layers_per_pp)
-                            gathered_params[key2] = weight_list[idx]
-                    else:
-                        gathered_params[key2] = weight_list[idx]
-            tl_params = gathered_params
-
-        model_state_dict = model_level_params
-        model_state_dict.update(tl_params)
-
-        def get_tensor_if_available(key, pp_src_idx, group):
-            tensor = model_state_dict.get(key)
-            if tensor is not None:
-                tensor_shape = [tensor.shape]
-            else:
-                tensor_shape = [None]
-
-            torch.distributed.broadcast_object_list(tensor_shape, pp_src_idx, group=group)
-
-            if tensor_shape[0] is None:
-                return None
-            if torch.distributed.get_rank() != pp_src_idx:
-                tensor = torch.empty(tensor_shape[0], dtype=storage_dtype).cuda()
-
-            torch.distributed.broadcast(tensor.contiguous(), pp_src_idx, group=pp_group)
-            return tensor
-
-        if reshard_model:
-            key = 'decoder.final_layernorm.weight'
-            tensor = get_tensor_if_available(key, pp_last_rank, pp_group)
-            if tensor is not None:
-                model_state_dict[key] = tensor
-
-            key = 'decoder.final_layernorm.bias'
-            tensor = get_tensor_if_available(key, pp_last_rank, pp_group)
-            if tensor is not None:
-                model_state_dict[key] = tensor
-
-            key = 'embedding.word_embeddings.weight'
-            tensor = get_tensor_if_available(key, pp_first_rank, pp_group)
-            if tensor is not None:
-                model_state_dict[key] = tensor
-
-            key = 'output_layer.weight'
-            tensor = get_tensor_if_available(key, pp_last_rank, pp_group)
-            if tensor is not None:
-                model_state_dict[key] = tensor
-
-        return model_state_dict
-
-    def get_input_dtype(self, storage_dtype):
-        """
-        Return mcore export dtype given torch dtype
-        """
-        from megatron.core.export.data_type import DataType
-
-        if storage_dtype == torch.bfloat16:
-            return DataType.bfloat16
-        elif storage_dtype == torch.float32:
-            return DataType.float32
-        elif storage_dtype == torch.float16:
-            return DataType.float16
-
-    @staticmethod
-    def get_nemo_to_trtllm_conversion_dict(model_state_dict):
-        """MCore export supports some default conversion dictionaries
-        All Mcore conversion dicts start with "decoder.layers.4.blah.blah" , while nemo models sometimes start with "model.decoder.layers.4.blahblah". so we append model prefix. to the keys
-        """
-        from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT
-
-        model_prefix, _ = get_layer_prefix(layer_names=model_state_dict.keys(), is_mcore=True)
-
-        nemo_model_conversion_dict = {}
-        for key, value in DEFAULT_CONVERSION_DICT.items():
-            if model_prefix:
-                nemo_model_conversion_dict[f'{model_prefix}{key}'] = value
-            else:
-                nemo_model_conversion_dict[key] = value
-        return nemo_model_conversion_dict
-
-    def build(
-        self,
-        model,
-        model_config,
-        model_type,
-        gpus_per_node,
-        tokenizer,
-        max_input_len: int = 1024,
-        max_output_len: int = 1024,
-        max_batch_size: int = 4,
-        use_refit: bool = True,
-        reshard_model: bool = False,
-        use_mcore_path: bool = True,
-    ):
-        """
-        Convert a model parallel nemo model to TensorRT-LLM.
-        """
-        assert tensorrt_llm.mpi_rank() == torch.distributed.get_rank()
-        self.use_refit, self.model_type, self.gpus_per_node = use_refit, model_type, gpus_per_node
-        self.mp_rank, self.dp_rank, self.tp_size, self.pp_size, self.dp_size = init_model_parallel_from_nemo(
-            reshard_model
-        )
-        self.tokenizer = build_tokenizer(tokenizer)
-
-        if self.dp_size > 1:
-            self.model_dir = os.path.join(self.model_dir, f"dp_rank{self.dp_rank}")
-
-        if use_mcore_path:
-            from megatron.core.export.model_type import ModelType
-            from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
-            from tensorrt_llm.layers import MoeConfig
-
-            storage_dtype = torch_dtype_from_precision(model_config.precision)
-            model_state_dict = self.gather_and_reshard_model(model_config, model, storage_dtype)
-            # We build the transformer config using the nemo model config.
-            transformer_config = self.get_transformer_config(model_config)
-            input_model_type = getattr(ModelType, model_type)
-
-            nemo_model_conversion_dict = self.get_nemo_to_trtllm_conversion_dict(model_state_dict)
-            self.trtllm_helper = TRTLLMHelper(
-                transformer_config=transformer_config,
-                model_type=input_model_type,
-                trtllm_conversion_dict=nemo_model_conversion_dict,
-                position_embedding_type=model_config.get('position_embedding_type'),
-                max_position_embeddings=model_config.get('max_position_embeddings'),
-                rotary_percentage=model_config.get('rotary_percentage', 1.0),
-                rotary_base=model_config.get('rotary_base', 10000),
-                moe_tp_mode=model_config.get('moe_tp_mode', 2),
-                multi_query_mode=model_config.get("multi_query_mode", False),
-                activation=model_config.get('activation', "gelu"),
-                seq_len_interpolation_factor=model_config.get("seq_len_interpolation_factor"),
-                moe_renorm_mode=model_config.get(
-                    'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
-                ),
-                share_embeddings_and_output_weights=model_config.get("share_embeddings_and_output_weights", False),
-            )
-
-            input_dtype = self.get_input_dtype(storage_dtype)
-
-            trtllm_model_weights_list, trtllm_model_config_list = (
-                self.trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
-                    model_state_dict=model_state_dict,
-                    dtype=input_dtype,
-                    state_dict_split_by_layer_numbers=True,
-                    on_device_distributed_conversion=True,
-                    vocab_size=self.tokenizer.vocab_size,
-                    gpus_per_node=gpus_per_node,
-                )
-            )
-            trtllm_model_config = trtllm_model_config_list[0]
-            trtllm_model_weights = trtllm_model_weights_list[0]
-
-            if reshard_model:
-                assert self.pp_size == 1, 'Reshard is true, but pp size is not one'
-                # MCORE Export will use parallel_state to determine pp .
-                # Since we reshard to pp = 1, we need to modify the config and mapping
-                world_size = self.tp_size * self.pp_size
-                trtllm_model_config.pp_size = self.pp_size
-                trtllm_model_config.world_size = world_size
-                trtllm_model_config.mapping = tensorrt_llm.Mapping(
-                    world_size=world_size,
-                    rank=self.mp_rank,
-                    tp_size=self.tp_size,
-                    pp_size=self.pp_size,
-                )
-
-            engine = self.trtllm_helper.build_and_save_engine(
-                max_input_len=max_input_len,
-                max_output_len=max_output_len,
-                max_seq_len=max_input_len + max_output_len,
-                max_batch_size=max_batch_size,
-                trtllm_model_config=trtllm_model_config,
-                trtllm_model_weights=trtllm_model_weights,
-                engine_dir=self.model_dir,
-                use_refit=use_refit,
-            )
-        else:
-            weights, model_config = model_to_trtllm_ckpt(
-                model=model,
-                nemo_model_config=model_config,
-                nemo_export_dir=self.model_dir,
-                decoder_type=model_type,
-                tensor_parallel_size=self.tp_size,
-                pipeline_parallel_size=self.pp_size,
-                gpus_per_node=gpus_per_node,
-                use_parallel_embedding=True,
-                use_distributed_convert=True,
-                model_parallel_rank=self.mp_rank,
-                vocab_size=self.tokenizer.vocab_size,
-            )
-
-            engine = build_and_save_engine(
-                max_input_len=max_input_len,
-                max_output_len=max_output_len,
-                max_seq_len=max_input_len + max_output_len,
-                max_batch_size=max_batch_size,
-                model_config=model_config[0],
-                model_weights=weights[0],
-                model_dir=self.model_dir,
-                model_type=model_type,
-                use_refit=use_refit,
-            )
-
-        torch.distributed.barrier()
-
-        cfg_path = Path(os.path.join(self.model_dir, f'config_{torch.distributed.get_rank()}.json'))
-        with open(cfg_path, "w", encoding="utf-8") as f:
-            json.dump(engine.config.to_dict(), f, indent=4)
-
-        load_distributed(self.model_dir, self.mp_rank, gpus_per_node)
-
-    def refit(self, model, model_config, use_mcore_path=True):
-        """
-        Refits an TensorRT engine using an instantiated nemo model.
-        This function should only be used after calling build()
-        """
-        weights_dict = None
-        if use_mcore_path:
-            storage_dtype = torch_dtype_from_precision(model_config.precision)
-
-            model_state_dict = self.gather_and_reshard_model(model_config, model, storage_dtype)
-
-            nemo_model_conversion_dict = self.get_nemo_to_trtllm_conversion_dict(model_state_dict)
-            self.trtllm_helper.weights_converter.convert(
-                model_state_dict=model_state_dict,
-                tokenizer_vocab_size=self.tokenizer.vocab_size,
-                trtllm_conversion_dict=nemo_model_conversion_dict,
-            )
-            weights_dict = self.trtllm_helper.weights_converter.trtllm_model_weights
-
-        else:
-            weights_dict = dist_model_to_trt_llm_ckpt(
-                model=model,
-                nemo_model_config=model_config,
-                inference_tp_size=self.tp_size,
-                inference_pp_size=self.pp_size,
-                tokenizer_vocab_size=self.tokenizer.vocab_size,
-            )
-        load_distributed(self.model_dir, self.mp_rank, self.gpus_per_node)
-        gc.collect()
-        torch.cuda.empty_cache()
-        refit(weights_dict)
-
-    def forward(
-        self,
-        input_texts: List[str],
-        max_output_len: int = 64,
-        top_k: int = 1,
-        top_p: float = 0.0,
-        temperature: float = 1.0,
-        stop_words_list: List[str] = None,
-        bad_words_list: List[str] = None,
-        no_repeat_ngram_size: int = None,
-        task_ids: List[str] = None,
-        lora_uids: List[str] = None,
-        prompt_embeddings_table=None,
-        prompt_embeddings_checkpoint_path: str = None,
-        streaming: bool = False,
-        output_log_probs: bool = False,
-        output_context_logits: bool = False,
-        output_generation_logits: bool = False,
-        **sampling_kwargs,
-    ):
-        """
-        Exports nemo checkpoints to TensorRT-LLM.
-
-        Args:
-            input_texts (List(str)): list of sentences.
-            max_output_len (int): max generated tokens.
-            top_k (int): limits us to a certain number (K) of the top tokens to consider.
-            top_p (float): limits us to the top tokens within a certain probability mass (p).
-            temperature (float): A parameter of the softmax function, which is the last layer in the network.
-            stop_words_list (List(str)): list of stop words.
-            bad_words_list (List(str)): list of bad words.
-            no_repeat_ngram_size (int): no repeat ngram size.
-            task_ids (List(str)): list of the task ids for the prompt tables.
-            prompt_embeddings_table (List(float)): prompt embeddings table.
-            prompt_embeddings_checkpoint_path (str): path for the nemo checkpoint for the prompt embedding table.
-            output_generation_logits (bool): if True returns generation_logits in the outout of generate method.
-            sampling_kwargs: Additional kwargs to set in the SamplingConfig.
-        """
-
-        if self.model is None:
-            raise Exception(
-                "A nemo checkpoint should be exported to TensorRT-LLM and "
-                "then it should be loaded first to run inference."
-            )
-        else:
-            if prompt_embeddings_table is not None or prompt_embeddings_checkpoint_path is not None:
-                prompt_table = self._get_prompt_embedding_table(
-                    prompt_embeddings_table, prompt_embeddings_checkpoint_path
-                )
-                tv_size = prompt_table.size(dim=0)
-                task_vtoken_counts = [tv_size]
-            elif len(self.ptuning_tables) > 0:
-                prompt_table = self.p_table
-                tv_size = self.task_vocab_size
-                task_vtoken_counts = self.task_vtoken_counts
-            else:
-                prompt_table = None
-                tv_size = None
-                task_vtoken_counts = None
-
-            if task_ids is None:
-                assert prompt_table is None, "There is a prompt embedding table and task_ids cannot be None"
-                input_task_ids = None
-            else:
-                if prompt_table is None:
-                    input_task_ids = None
-                else:
-                    if len(task_ids) > 1:
-                        assert len(task_ids) == len(input_texts), (
-                            "Either len of the task_ids has to be 1 or" "it needs to match with len of input_texts."
-                        )
-
-                    if len(task_ids) == 1:
-                        assert task_ids[0] in self.task_ids.keys(), "Task: {0} doesn't exist in the task list.".format(
-                            task_ids[0]
-                        )
-                        input_task_ids = [self.task_ids[task_ids[0]] for i in range(len(input_texts))]
-                    else:
-                        input_task_ids = []
-                        for i in range(len(input_texts)):
-                            assert (
-                                task_ids[i] in self.task_ids.keys()
-                            ), "Task: {0} doesn't exist in the task list.".format(task_ids[i])
-                            input_task_ids.append(self.task_ids[task_ids[i]])
-            if not streaming:
-                if torch.distributed.is_initialized() or tensorrt_llm.mpi_world_size() > 1:
-                    multiprocessed_env = True
-                else:
-                    multiprocessed_env = False
-
-                return generate(
-                    input_texts=input_texts,
-                    max_output_len=max_output_len,
-                    host_context=self.model,
-                    top_k=top_k,
-                    top_p=top_p,
-                    temperature=temperature,
-                    prompt_table=prompt_table,
-                    task_vocab_size=tv_size,
-                    task_vtoken_counts=task_vtoken_counts,
-                    task_ids=input_task_ids,
-                    lora_uids=lora_uids,
-                    stop_words_list=stop_words_list,
-                    bad_words_list=bad_words_list,
-                    no_repeat_ngram_size=no_repeat_ngram_size,
-                    output_log_probs=output_log_probs,
-                    multiprocessed_env=multiprocessed_env,
-                    output_context_logits=output_context_logits,
-                    output_generation_logits=output_generation_logits,
-                    **sampling_kwargs,
-                )
-            else:
-                return generate_streaming(
-                    input_texts=input_texts,
-                    max_output_len=max_output_len,
-                    host_context=self.model,
-                    top_k=top_k,
-                    top_p=top_p,
-                    temperature=temperature,
-                    prompt_table=prompt_table,
-                    task_vocab_size=tv_size,
-                    task_vtoken_counts=task_vtoken_counts,
-                    task_ids=input_task_ids,
-                    lora_uids=lora_uids,
-                    stop_words_list=stop_words_list,
-                    bad_words_list=bad_words_list,
-                    no_repeat_ngram_size=no_repeat_ngram_size,
-                    **sampling_kwargs,
-                )
-
-    def add_prompt_table(self, task_name: str, prompt_embeddings_checkpoint_path: str):
-        """Add prompt table"""
-        if self.model is None:
-            raise Exception(
-                "A nemo checkpoint should be exported to TensorRT-LLM and "
-                "then it should be loaded first to run inference."
-            )
-
-        for pt in self.ptuning_tables:
-            if pt["task_name"] == task_name:
-                raise Exception("Task name: {0} has already added. Please pass a unique task name.".format(task_name))
-
-        prompt_table = self._get_prompt_embedding_table(
-            prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path
-        )
-
-        self.ptuning_tables.append({"table": prompt_table, "task_name": task_name})
-        with open(os.path.join(self.model_dir, 'prompt_tables.pkl'), 'wb') as f:
-            pickle.dump(self.ptuning_tables, f)
-
-        self._prep_ptuning_table()
-
-    def remove_prompt_table(self, task_name: str):
-        """Remove prompt table"""
-        if self.ptuning_tables is not None:
-            for i in range(len(self.ptuning_tables)):
-                if self.ptuning_tables[i]["task_name"] == task_name:
-                    self.ptuning_tables.pop(i)
-                    with open(os.path.join(self.model_dir, 'prompt_tables.pkl'), 'wb') as f:
-                        pickle.dump(self.ptuning_tables, f)
-                    return
-            self._prep_ptuning_table()
-
-    def _pad_logits(self, logits_tensor):
-        """
-        Pads the logits tensor with 0's on the right
-        """
-        padding_len = max([logit_tensor.shape[0] for logit_tensor in logits_tensor])
-        for i, tensor in enumerate(logits_tensor):
-            tensor_len = tensor.shape[0]
-            if tensor_len < padding_len:
-                padding_diff = padding_len - tensor_len
-                # padding_diff num of rows of zeros are added at the bottom
-                logits_tensor[i] = F.pad(tensor, (0, 0, 0, padding_diff), mode='constant', value=0)
-        return logits_tensor
-
-    @property
-    def get_supported_models_list(self):
-        """Supported model list"""
-        # gpt and gptnext are the same. Keeping the gptnext due to backward compatibility.
-        return ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"]
-
-    @property
-    def get_supported_hf_model_mapping(self):
-        """Supported HF Model Mapping"""
-        HF_MODEL_CLASS_MAP = {
-            'GPT2LMHeadModel': GPTForCausalLM,
-            'GPT2LMHeadCustomModel': GPTForCausalLM,
-            'GPTBigCodeForCausalLM': GPTForCausalLM,
-            'Starcoder2ForCausalLM': GPTForCausalLM,
-            'JAISLMHeadModel': GPTForCausalLM,
-            'GPTForCausalLM': GPTForCausalLM,
-            'NemotronForCausalLM': GPTForCausalLM,
-            'OPTForCausalLM': OPTForCausalLM,
-            'BloomForCausalLM': BloomForCausalLM,
-            'RWForCausalLM': FalconForCausalLM,
-            'FalconForCausalLM': FalconForCausalLM,
-            'PhiForCausalLM': PhiForCausalLM,
-            'Phi3ForCausalLM': Phi3ForCausalLM,
-            'Phi3VForCausalLM': Phi3ForCausalLM,
-            'Phi3SmallForCausalLM': Phi3ForCausalLM,
-            'PhiMoEForCausalLM': Phi3ForCausalLM,
-            'MambaForCausalLM': MambaForCausalLM,
-            'GPTNeoXForCausalLM': GPTNeoXForCausalLM,
-            'GPTJForCausalLM': GPTJForCausalLM,
-            'MptForCausalLM': MPTForCausalLM,
-            'MPTForCausalLM': MPTForCausalLM,
-            'GLMModel': ChatGLMForCausalLM,
-            'ChatGLMModel': ChatGLMForCausalLM,
-            'ChatGLMForCausalLM': ChatGLMForCausalLM,
-            'ChatGLMForConditionalGeneration': ChatGLMForCausalLM,
-            'LlamaForCausalLM': LLaMAForCausalLM,
-            'LlavaLlamaModel': LLaMAForCausalLM,
-            'ExaoneForCausalLM': LLaMAForCausalLM,
-            'MistralForCausalLM': LLaMAForCausalLM,
-            'MixtralForCausalLM': LLaMAForCausalLM,
-            'ArcticForCausalLM': LLaMAForCausalLM,
-            'Grok1ModelForCausalLM': GrokForCausalLM,
-            'InternLMForCausalLM': LLaMAForCausalLM,
-            'InternLM2ForCausalLM': LLaMAForCausalLM,
-            'InternLMXComposer2ForCausalLM': LLaMAForCausalLM,
-            'GraniteForCausalLM': LLaMAForCausalLM,
-            'GraniteMoeForCausalLM': LLaMAForCausalLM,
-            'MedusaForCausalLM': MedusaForCausalLm,
-            'MedusaLlamaForCausalLM': MedusaForCausalLm,
-            'ReDrafterForCausalLM': ReDrafterForCausalLM,
-            'BaichuanForCausalLM': BaichuanForCausalLM,
-            'BaiChuanForCausalLM': BaichuanForCausalLM,
-            'SkyworkForCausalLM': LLaMAForCausalLM,
-            'GEMMA': GemmaForCausalLM,
-            'GEMMA2': GemmaForCausalLM,
-            'QWenLMHeadModel': QWenForCausalLM,
-            'QWenForCausalLM': QWenForCausalLM,
-            'Qwen2ForCausalLM': QWenForCausalLM,
-            'Qwen2MoeForCausalLM': QWenForCausalLM,
-            'Qwen2ForSequenceClassification': QWenForCausalLM,
-            'Qwen2VLForConditionalGeneration': QWenForCausalLM,
-            'Qwen2VLModel': QWenForCausalLM,
-            'WhisperEncoder': WhisperEncoder,
-            'EncoderModel': EncoderModel,
-            'DecoderModel': DecoderModel,
-            'DbrxForCausalLM': DbrxForCausalLM,
-            'RecurrentGemmaForCausalLM': RecurrentGemmaForCausalLM,
-            'CogVLMForCausalLM': CogVLMForCausalLM,
-            'DiT': DiT,
-            'DeepseekForCausalLM': DeepseekForCausalLM,
-            'DeciLMForCausalLM': DeciLMForCausalLM,
-            'DeepseekV2ForCausalLM': DeepseekV2ForCausalLM,
-            'EagleForCausalLM': EagleForCausalLM,
-            'CohereForCausalLM': CohereForCausalLM,
-            'MLLaMAModel': MLLaMAForCausalLM,
-            'MllamaForConditionalGeneration': MLLaMAForCausalLM,
-            'BertForQuestionAnswering': BertForQuestionAnswering,
-            'BertForSequenceClassification': BertForSequenceClassification,
-            'BertModel': BertModel,
-            'RobertaModel': RobertaModel,
-            'RobertaForQuestionAnswering': RobertaForQuestionAnswering,
-            'RobertaForSequenceClassification': RobertaForSequenceClassification,
-        }
-        return HF_MODEL_CLASS_MAP
-
-    @property
-    def get_hidden_size(self):
-        """Get hidden size"""
-        if self.config is None:
-            return None
-        else:
-            return self.config["pretrained_config"]["hidden_size"]
-
-    @property
-    def get_triton_input(self):
-        """Get triton input"""
-        inputs = (
-            Tensor(name="prompts", shape=(-1,), dtype=bytes),
-            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
-            Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
-            Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
-            Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
-            Tensor(name="random_seed", shape=(-1,), dtype=np.int_, optional=True),
-            Tensor(name="stop_words_list", shape=(-1,), dtype=bytes, optional=True),
-            Tensor(name="bad_words_list", shape=(-1,), dtype=bytes, optional=True),
-            Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True),
-            Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True),
-            Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
-            Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False),
-            Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False),
-        )
-        return inputs
-
-    @property
-    def get_triton_output(self):
-        outputs = (
-            Tensor(name="outputs", shape=(-1,), dtype=bytes),
-            Tensor(name="generation_logits", shape=(-1,), dtype=np.single),
-            Tensor(name="context_logits", shape=(-1,), dtype=np.single),
-        )
-        return outputs
-
-    @batch
-    @first_value(
-        "max_output_len",
-        "top_k",
-        "top_p",
-        "temperature",
-        "random_seed",
-        "no_repeat_ngram_size",
-        "output_generation_logits",
-        "output_context_logits",
-    )
-    def triton_infer_fn(self, **inputs: np.ndarray):
-        """Triton infer function for streaming"""
-        output_dict = {}
-        context_logits_available = False
-        generation_logits_available = False
-        prompts = str_ndarray2list(inputs.pop("prompts"))
-        infer_input = {"input_texts": prompts}
-        try:
-            if "max_output_len" in inputs:
-                infer_input["max_output_len"] = inputs.pop("max_output_len")
-            if "top_k" in inputs:
-                infer_input["top_k"] = inputs.pop("top_k")
-            if "top_p" in inputs:
-                infer_input["top_p"] = inputs.pop("top_p")
-            if "temperature" in inputs:
-                infer_input["temperature"] = inputs.pop("temperature")
-            if "random_seed" in inputs:
-                infer_input["random_seed"] = inputs.pop("random_seed")
-            if "stop_words_list" in inputs:
-                stop_words_list = str_ndarray2list(inputs.pop("stop_words_list"))
-                infer_input["stop_words_list"] = [[stop_word] for stop_word in stop_words_list]
-            if "bad_words_list" in inputs:
-                bad_words_list = str_ndarray2list(inputs.pop("bad_words_list"))
-                infer_input["bad_words_list"] = [[bad_word] for bad_word in bad_words_list]
-            if "no_repeat_ngram_size" in inputs:
-                infer_input["no_repeat_ngram_size"] = inputs.pop("no_repeat_ngram_size")
-            if "task_id" in inputs:
-                task_id = np.char.decode(inputs.pop("task_id").astype("bytes"), encoding="utf-8")
-                infer_input["task_ids"] = task_id[0]
-            if "lora_uids" in inputs:
-                lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8")
-                infer_input["lora_uids"] = lora_uids[0].tolist()
-            if "output_generation_logits" in inputs:
-                generation_logits_available = inputs["output_generation_logits"]
-                infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")
-            if "output_context_logits" in inputs:
-                context_logits_available = inputs["output_context_logits"]
-                infer_input["output_context_logits"] = inputs.pop("output_context_logits")
-
-            if generation_logits_available:
-                # generation_logits is a 4d torch tensor of dim [BS,1,#generated_tokens,vocab_size]
-                output_texts, generation_logits = self.forward(**infer_input)
-                # convert generation_logits to numpy array. Note: from my understanding since generation_logits is
-                # returned as a torch tensor it won't have varying number of tokens across multiple sequences,
-                # likely due to TRTLLM taking care of padding hence no addtnl padding is needed.
-                output_dict["generation_logits"] = np.array(
-                    [generation_logit.cpu().numpy() for generation_logit in generation_logits]
-                )
-
-            elif context_logits_available:
-                output_texts, context_logits = self.forward(**infer_input)
-                # context_logits is a list of tensors shaped [#tokens, vocab_size] and the len of the list  is BS
-                # In case of batched inputs (i.e multiple prompts sent as a list) context_logits returned can have
-                # different seq_len. Following code pads them as it can otherwise error while converting to numpy array
-                context_logits = self._pad_logits(context_logits)
-                # Convert context_Logits to numpy array of shape [bS, 1, padding_len, vocab_size],.
-                context_logits = np.array([logit_tensor.unsqueeze(0).cpu().numpy() for logit_tensor in context_logits])
-                output_dict["context_logits"] = context_logits
-            else:
-                output_texts = self.forward(**infer_input)
-            output_dict["outputs"] = cast_output(output_texts, np.bytes_)
-        except Exception as error:
-            err_msg = "An error occurred: {0}".format(str(error))
-            output_dict["outputs"] = cast_output([err_msg] * len(prompts), np.bytes_)
-
-        return output_dict
-
-    @batch
-    @first_value("max_output_len", "top_k", "top_p", "temperature", "random_seed", "no_repeat_ngram_size")
-    def triton_infer_fn_streaming(self, **inputs: np.ndarray):
-        """Triton infer function for streaming"""
-        try:
-            infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
-            if "max_output_len" in inputs:
-                infer_input["max_output_len"] = inputs.pop("max_output_len")
-            if "top_k" in inputs:
-                infer_input["top_k"] = inputs.pop("top_k")
-            if "top_p" in inputs:
-                infer_input["top_p"] = inputs.pop("top_p")
-            if "temperature" in inputs:
-                infer_input["temperature"] = inputs.pop("temperature")
-            if "random_seed" in inputs:
-                infer_input["random_seed"] = inputs.pop("random_seed")
-            if "stop_words_list" in inputs:
-                stop_words_list = str_ndarray2list(inputs.pop("stop_words_list"))
-                infer_input["stop_words_list"] = [[stop_word] for stop_word in stop_words_list]
-            if "bad_words_list" in inputs:
-                bad_words_list = str_ndarray2list(inputs.pop("bad_words_list"))
-                infer_input["bad_words_list"] = [[bad_word] for bad_word in bad_words_list]
-            if "no_repeat_ngram_size" in inputs:
-                infer_input["no_repeat_ngram_size"] = inputs.pop("no_repeat_ngram_size")
-            if "task_id" in inputs:
-                task_id = np.char.decode(inputs.pop("task_id").astype("bytes"), encoding="utf-8")
-                infer_input["task_ids"] = task_id[0]
-            if "lora_uids" in inputs:
-                lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8")
-                infer_input["lora_uids"] = lora_uids[0].tolist()
-
-            partial_outputs = self.forward(**infer_input, streaming=True)
-            # On each request to this generator, run the model for one step and return a dict
-            # with full outputs generated until this step.
-            for output_texts in partial_outputs:
-                yield {"outputs": cast_output(output_texts, np.bytes_)}
-        except Exception as error:
-            err_msg = "An error occurred: {0}".format(str(error))
-            output = cast_output([err_msg], np.bytes_)
-            return {"outputs": output}
-
-    def _prep_ptuning_table(self):
-        self.task_vocab_size = 0
-        for pt in self.ptuning_tables:
-            if self.task_vocab_size < pt["table"].size(dim=0):
-                self.task_vocab_size = pt["table"].size(dim=0)
-
-        # pad tasks to longest task embedding table, remember the original task vtoken counts
-        vtokens_embeddings = []
-        self.task_vtoken_counts = []
-        self.task_ids = {}
-        tid = 0
-        for i, ptuning_table in enumerate(self.ptuning_tables):
-            original_table = ptuning_table["table"]
-            vtoken_count = original_table.size(dim=0)
-            padded_table = torch.zeros((self.task_vocab_size, self.get_hidden_size), dtype=original_table.dtype)
-            padded_table[:vtoken_count, :] = original_table
-            vtokens_embeddings.append(padded_table)
-            self.task_ids[ptuning_table["task_name"]] = tid
-            self.task_vtoken_counts.append(vtoken_count)
-            tid = tid + 1
-
-        if len(vtokens_embeddings) > 0:
-            self.p_table = torch.stack(vtokens_embeddings, dim=0).view(-1, self.get_hidden_size)
-
-            max_prompt_embedding_table_size = self.config['build_config']['max_prompt_embedding_table_size']
-            actual_prompt_table_size = self.p_table.shape[0]
-
-            if actual_prompt_table_size > max_prompt_embedding_table_size:
-                raise Exception(
-                    f"The size of the combined prompt embedding table ({actual_prompt_table_size}) is greater than max_prompt_embedding_table_size ({max_prompt_embedding_table_size})."
-                )
-        else:
-            self.p_table = None
-
-    def _load_prompt_tables(self):
-        if self.model_dir is not None:
-            pt_path = Path(os.path.join(self.model_dir, 'prompt_tables.pkl'))
-            if pt_path.exists():
-                with open(pt_path, 'rb') as f:
-                    self.ptuning_tables = pickle.load(f)
-                self._prep_ptuning_table()
-            else:
-                self.ptuning_tables = []
-
-    def _get_prompt_embedding_table_ckpt(self, prompt_embeddings_checkpoint_path):
-        with TarPath(prompt_embeddings_checkpoint_path) as checkpoint_archive:
-            mw_path = checkpoint_archive / "model_weights.ckpt"
-            if not mw_path.exists():
-                mw_path = checkpoint_archive / "mp_rank_00/model_weights.ckpt"
-                if not mw_path.exists():
-                    raise FileNotFoundError(
-                        "File: {0} could not be found in the nemo checkpoint. "
-                        "Please check the nemo checkpoint format for the prompt "
-                        "embedding table.".format(mw_path)
-                    )
-
-            with mw_path.open('rb') as mw_file:
-                weights = torch.load(mw_file)
-
-            weights_found = True
-            if "model.embedding.adapter_layer.ptuning_adapter.inference_table" in weights:
-                weights = weights["model.embedding.adapter_layer.ptuning_adapter.inference_table"]
-            elif (
-                "model.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight"
-                in weights
-            ):
-                weights = weights[
-                    "model.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight"
-                ]
-            elif 'prompt_table' in weights:
-                if "prompt_table.taskname.prompt_embeddings.weight" in weights['prompt_table']:
-                    weights = weights['prompt_table']["prompt_table.taskname.prompt_embeddings.weight"]
-                else:
-                    weights_found = False
-            else:
-                weights_found = False
-
-            if not weights_found:
-                raise Exception(
-                    "Could not find the embedding table in the {0}. Please check the nemo file format".format(
-                        prompt_embeddings_checkpoint_path
-                    )
-                )
-
-            return weights.cpu().detach()
-
-    def _get_prompt_embedding_table(
-        self,
-        prompt_embeddings_table=None,
-        prompt_embeddings_checkpoint_path=None,
-    ):
-        if prompt_embeddings_table is not None and prompt_embeddings_checkpoint_path is not None:
-            LOGGER.warning(
-                "prompt_embeddings_table will be used and "
-                "prompt_embeddings_checkpoint_path will be "
-                "ignored for ptuning."
-            )
-            p_tuning = "use_table"
-        elif prompt_embeddings_table is not None:
-            p_tuning = "use_table"
-        elif prompt_embeddings_checkpoint_path is not None:
-            p_tuning = "use_checkpoint"
-        else:
-            return None, None
-
-        if p_tuning == "use_table":
-            if not isinstance(prompt_embeddings_table, np.ndarray):
-                raise TypeError("Only numpy array is allowed for the prompt embeddings table.")
-
-            if len(prompt_embeddings_table.shape) != 2:
-                raise Exception("A two dimensional prompt embeddings table for a single task is only supported.")
-
-            prompt_embeddings_table = torch.from_numpy(prompt_embeddings_table)
-        elif p_tuning == "use_checkpoint":
-            if not is_nemo_tarfile(prompt_embeddings_checkpoint_path):
-                raise TypeError(prompt_embeddings_checkpoint_path + " is not a nemo file.")
-            prompt_embeddings_table = self._get_prompt_embedding_table_ckpt(prompt_embeddings_checkpoint_path)
-
-        dtype = self.config['pretrained_config']['dtype']
-        prompt_embeddings_table = prompt_embeddings_table.to(
-            dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype)
-        ).cuda()
-
-        if prompt_embeddings_table.size(dim=1) != self.config["pretrained_config"]["hidden_size"]:
-            raise Exception(
-                "Hidden dimension of the model is {0} and does not match with the dimension of the prompt table.".format(
-                    self.config["pretrained_config"]["hidden_size"]
-                )
-            )
-
-        return prompt_embeddings_table
-
-    def _load_config_file(self):
-        config_path = Path(self.engine_dir) / 'config.json'
-        if config_path.exists():
-            with open(config_path, 'r') as f:
-                self.config = json.load(f)
-        else:
-            raise FileNotFoundError(f"File: {config_path} could not be found.")
-
-    def _load(self):
-        self.model = None
-        self.tokenizer = None
-        self.config = None
-        self.ptuning_tables = []
-
-        if Path(self.model_dir).exists():
-            folders = os.listdir(self.model_dir)
-            if len(folders) > 0:
-                try:
-                    self._load_config_file()
-                    self.tokenizer = get_tokenizer(self.model_dir)
-                    self.model = load(
-                        tokenizer=self.tokenizer,
-                        engine_dir=self.engine_dir,
-                        lora_ckpt_list=self.lora_ckpt_list,
-                        use_python_runtime=self.use_python_runtime,
-                        enable_chunked_context=self.enable_chunked_context,
-                        max_tokens_in_paged_kv_cache=self.max_tokens_in_paged_kv_cache,
-                        multi_block_mode=self.multi_block_mode,
-                    )
-                    self._load_prompt_tables()
-                except Exception as error:
-                    raise RuntimeError(
-                        "Files in the TensorRT-LLM folder are corrupted and the model needs to be exported again."
-                    ) from error
-
-    def unload_engine(self):
-        """Unload engine"""
-        unload_engine()
diff --git a/nemo/export/tensorrt_mm_exporter.py b/nemo/export/tensorrt_mm_exporter.py
deleted file mode 100644
index 54914846fa79..000000000000
--- a/nemo/export/tensorrt_mm_exporter.py
+++ /dev/null
@@ -1,365 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import shutil
-import tempfile
-from pathlib import Path
-from typing import List
-
-import numpy as np
-import wrapt
-from tensorrt_llm.runtime import MultimodalModelRunner as TRTLLMRunner
-
-from nemo.deploy import ITritonDeployable
-from nemo.export.multimodal.build import (
-    build_mllama_engine,
-    build_perception_engine,
-    build_trtllm_engine,
-    build_visual_engine,
-    extract_lora_ckpt,
-)
-from nemo.export.multimodal.run import MultimodalModelRunner, SpeechllmModelRunner
-
-use_deploy = True
-try:
-    from nemo.deploy.utils import cast_output, ndarray2img, str_ndarray2list
-except Exception:
-    use_deploy = False
-
-
-@wrapt.decorator
-def noop_decorator(func):
-    """No op decorator"""
-
-    def wrapper(*args, **kwargs):
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-use_pytriton = True
-batch = noop_decorator
-try:
-    from pytriton.decorators import batch, first_value
-    from pytriton.model_config import Tensor
-except Exception:
-    use_pytriton = False
-
-
-LOGGER = logging.getLogger("NeMo")
-
-
-class TensorRTMMExporter(ITritonDeployable):
-    """
-    Exports nemo checkpoints to TensorRT and run fast inference.
-
-    Example:
-        from nemo.export import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir="/path/for/model/files")
-        exporter.export(
-            visual_checkpoint_path="/path/for/nemo/checkpoint",
-            model_type="neva",
-            tensor_parallel_size=1,
-        )
-
-        output = exporter.forward("Hi! What is in this image?", "/path/for/input_media")
-        print("output: ", output)
-
-    """
-
-    def __init__(
-        self,
-        model_dir: str,
-        load_model: bool = True,
-        modality: str = "vision",
-    ):
-        self.model_dir = model_dir
-        self.runner = None
-        # vision modality is for image and video
-        assert modality in ["vision", "audio"]
-        self.modality = modality
-
-        if load_model:
-            self._load()
-
-    def export(
-        self,
-        visual_checkpoint_path: str,
-        llm_checkpoint_path: str = None,
-        model_type: str = "neva",
-        llm_model_type: str = "llama",
-        tensor_parallel_size: int = 1,
-        max_input_len: int = 4096,
-        max_output_len: int = 256,
-        max_batch_size: int = 1,
-        vision_max_batch_size: int = 1,
-        max_multimodal_len: int = 3072,
-        dtype: str = "bfloat16",
-        delete_existing_files: bool = True,
-        load_model: bool = True,
-        use_lora_plugin: str = None,
-        lora_target_modules: List[str] = None,
-        lora_checkpoint_path: str = None,
-        max_lora_rank: int = 64,
-    ):
-        """Export multimodal models to TRTLLM"""
-        if Path(self.model_dir).exists():
-            if delete_existing_files and len(os.listdir(self.model_dir)) > 0:
-                for files in os.listdir(self.model_dir):
-                    path = os.path.join(self.model_dir, files)
-                    try:
-                        shutil.rmtree(path)
-                    except OSError:
-                        os.remove(path)
-
-                if len(os.listdir(self.model_dir)) > 0:
-                    raise Exception("Couldn't delete all files.")
-            elif len(os.listdir(self.model_dir)) > 0:
-                raise Exception("There are files in this folder. Try setting delete_existing_files=True.")
-        else:
-            Path(self.model_dir).mkdir(parents=True, exist_ok=True)
-
-        if model_type == "mllama":
-            build_mllama_engine(
-                model_dir=self.model_dir,
-                checkpoint_path=visual_checkpoint_path,
-                tensor_parallelism_size=tensor_parallel_size,
-                max_input_len=max_input_len,
-                max_output_len=max_output_len,
-                max_batch_size=max_batch_size,
-                vision_max_batch_size=vision_max_batch_size,
-                max_multimodal_len=max_multimodal_len,
-                dtype=dtype,
-            )
-        else:
-            if lora_checkpoint_path is not None:
-                tmp_dir = tempfile.TemporaryDirectory()
-                if os.path.isdir(lora_checkpoint_path):
-                    lora_dir = lora_checkpoint_path
-                else:
-                    raise ValueError("lora_checkpoint_path in nemo1 is not supported. It must be a directory")
-
-                llm_lora_path = [extract_lora_ckpt(lora_dir, tmp_dir.name)]
-            else:
-                tmp_dir = None
-                llm_lora_path = None
-                lora_dir = None
-
-            llm_dir = os.path.join(self.model_dir, "llm_engine")
-            build_trtllm_engine(
-                model_dir=llm_dir,
-                visual_checkpoint_path=visual_checkpoint_path,
-                llm_checkpoint_path=llm_checkpoint_path,
-                model_type=model_type,
-                llm_model_type=llm_model_type,
-                tensor_parallelism_size=tensor_parallel_size,
-                max_input_len=max_input_len,
-                max_output_len=max_output_len,
-                max_batch_size=max_batch_size,
-                max_multimodal_len=max_multimodal_len,
-                dtype=dtype,
-                use_lora_plugin=use_lora_plugin,
-                lora_target_modules=lora_target_modules,
-                max_lora_rank=max_lora_rank,
-                lora_ckpt_list=llm_lora_path,
-            )
-
-            if model_type == "salm":
-                perception_dir = os.path.join(self.model_dir, "perception_engine")
-                build_perception_engine(perception_dir, visual_checkpoint_path, model_type, vision_max_batch_size)
-            else:
-                visual_dir = os.path.join(self.model_dir, "visual_engine")
-                build_visual_engine(
-                    visual_dir,
-                    visual_checkpoint_path if lora_dir is None else lora_dir,
-                    model_type,
-                    vision_max_batch_size,
-                )
-
-            if tmp_dir is not None:
-                tmp_dir.cleanup()
-
-        if load_model:
-            self._load()
-
-    def forward(
-        self,
-        input_text: str,
-        input_media: str,
-        batch_size: int = 1,
-        max_output_len: int = 30,
-        top_k: int = 1,
-        top_p: float = 0.0,
-        temperature: float = 1.0,
-        repetition_penalty: float = 1.0,
-        num_beams: int = 1,
-        lora_uids: List[str] = None,
-    ):
-        """Run forward with loaded TRTLLM engine"""
-        if self.runner is None:
-            raise Exception(
-                "A nemo checkpoint should be exported and " "then it should be loaded first to run inference."
-            )
-
-        if isinstance(self.runner, TRTLLMRunner):
-            self.runner.args.image_path = input_media
-            self.runner.args.batch_size = batch_size
-            self.runner.args.top_k = top_k
-            self.runner.args.top_p = top_p
-            self.runner.args.temperature = temperature
-            self.runner.args.repetition_penalty = repetition_penalty
-            self.runner.args.num_beams = num_beams
-            raw_image = self.runner.load_test_data(input_media)
-            return self.runner.run(
-                input_text,
-                raw_image,
-                max_output_len,
-            )[1]
-        else:
-            input_media = self.runner.load_test_media(input_media)
-            return self.runner.run(
-                input_text,
-                input_media,
-                max_output_len,
-                batch_size,
-                top_k,
-                top_p,
-                temperature,
-                repetition_penalty,
-                num_beams,
-                lora_uids,
-            )
-
-    def get_input_media_tensors(self):
-        """Get input media tensors"""
-        if self.modality == "vision":
-            return [Tensor(name="input_media", shape=(-1, -1, -1, 3), dtype=np.uint8)]
-        elif self.modality == "audio":
-            return [
-                Tensor(name="input_signal", shape=(-1,), dtype=np.single),
-                Tensor(name="input_signal_length", shape=(1,), dtype=np.intc),
-            ]
-        return []
-
-    @property
-    def get_triton_input(self):
-        inputs = (
-            [Tensor(name="input_text", shape=(-1,), dtype=bytes)]
-            + self.get_input_media_tensors()
-            + [
-                Tensor(name="batch_size", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
-                Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
-                Tensor(name="repetition_penalty", shape=(-1,), dtype=np.single, optional=True),
-                Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True),
-                Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
-            ]
-        )
-        inputs = tuple(inputs)
-        return inputs
-
-    @property
-    def get_triton_output(self):
-        outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
-        return outputs
-
-    @batch
-    @first_value("batch_size", "max_output_len", "top_k", "top_p", "temperature", "repetition_penalty", "num_beams")
-    def triton_infer_fn(self, **inputs: np.ndarray):
-        try:
-            if self.runner is None:
-                raise Exception(
-                    "A nemo checkpoint should be exported and then it should be loaded first to run inference."
-                )
-
-            infer_input = {"input_text": str_ndarray2list(inputs.pop("input_text")[0])}
-            video_model_list = ["video-neva", "lita", "vita"]
-            if self.runner.model_type in ["neva", "vila", "mllama"]:
-                infer_input["input_image"] = ndarray2img(inputs.pop("input_media")[0])[0]
-            elif self.runner.model_type in video_model_list:
-                infer_input["input_image"] = inputs.pop("input_media")[0]
-            elif self.runner.model_type == "salm":
-                infer_input["input_signal"] = inputs.pop("input_signal")
-                infer_input["input_signal_length"] = inputs.pop("input_signal_length")[:, 0]
-            if "batch_size" in inputs:
-                infer_input["batch_size"] = inputs.pop("batch_size")
-            if "max_output_len" in inputs:
-                infer_input["max_new_tokens"] = inputs.pop("max_output_len")
-            if "top_k" in inputs:
-                infer_input["top_k"] = inputs.pop("top_k")
-            if "top_p" in inputs:
-                infer_input["top_p"] = inputs.pop("top_p")
-            if "temperature" in inputs:
-                infer_input["temperature"] = inputs.pop("temperature")
-            if "repetition_penalty" in inputs:
-                infer_input["repetition_penalty"] = inputs.pop("repetition_penalty")
-            if "num_beams" in inputs:
-                infer_input["num_beams"] = inputs.pop("num_beams")
-            if "lora_uids" in inputs:
-                lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8")
-                infer_input["lora_uids"] = lora_uids[0].tolist()
-
-            if isinstance(self.runner, TRTLLMRunner):
-                self.runner.args.batch_size = infer_input.pop("batch_size")
-                self.runner.args.top_k = infer_input.pop("top_k")
-                self.runner.args.top_p = infer_input.pop("top_p")
-                self.runner.args.temperature = infer_input.pop("temperature")
-                self.runner.args.repetition_penalty = infer_input.pop("repetition_penalty")
-                self.runner.args.num_beams = infer_input.pop("num_beams")
-                output_texts = self.runner.run(**infer_input)[1]
-            else:
-                output_texts = self.runner.run(**infer_input)
-            output = cast_output(output_texts, np.bytes_)
-        except Exception as error:
-            err_msg = "An error occurred: {0}".format(str(error))
-            output = cast_output([err_msg], np.bytes_)
-
-        return {"outputs": output}
-
-    def _load(self):
-        llm_dir = os.path.join(self.model_dir, "llm_engine")
-        if not os.path.exists(llm_dir):
-            return
-        if self.modality == "vision":
-            import json
-
-            visual_dir = os.path.join(self.model_dir, "visual_engine")
-            with open(os.path.join(visual_dir, "config.json"), "r") as f:
-                config = json.load(f)
-            if config["builder_config"]["model_type"] == "mllama":
-                from types import SimpleNamespace
-
-                args = SimpleNamespace(
-                    visual_engine_dir=visual_dir,
-                    visual_engine_name="visual_encoder.engine",
-                    llm_engine_dir=llm_dir,
-                    hf_model_dir='meta-llama/Llama-3.2-11B-Vision-Instruct',
-                    use_py_session=True,
-                    cross_kv_cache_fraction=0.5,
-                    enable_context_fmha_fp32_acc=None,
-                    enable_chunked_context=False,
-                    kv_cache_free_gpu_memory_fraction=0.9,
-                    multi_block_mode=True,
-                )
-                self.runner = TRTLLMRunner(args)
-            else:
-                self.runner = MultimodalModelRunner(visual_dir, llm_dir, self.modality)
-        elif self.modality == "audio":
-            perception_dir = os.path.join(self.model_dir, "perception_engine")
-            self.runner = SpeechllmModelRunner(perception_dir, llm_dir, self.modality)
diff --git a/nemo/export/tiktoken_tokenizer.py b/nemo/export/tiktoken_tokenizer.py
deleted file mode 100644
index 2dbfd736f450..000000000000
--- a/nemo/export/tiktoken_tokenizer.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import base64
-import json
-from pathlib import Path
-from typing import Dict, Optional
-
-import numpy as np
-import tiktoken
-import torch
-
-PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
-DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
-SPECIAL_TOKENS = ["", "", ""]
-SPECIAL_TOKEN_TEMPLATE = ""
-
-
-def reload_mergeable_ranks(
-    path: str,
-    max_vocab: Optional[int] = None,
-) -> Dict[bytes, int]:
-    """
-    Reload the tokenizer JSON file and convert it to Tiktoken format.
-    """
-    assert path.endswith(".json")
-
-    # reload vocab
-    with open(path, "r", encoding='utf-8') as f:
-        vocab = json.load(f)
-    assert isinstance(vocab, list)
-    print(f"Vocab size: {len(vocab)}")
-    if max_vocab is not None:
-        vocab = vocab[:max_vocab]
-        print(f"Cutting vocab to first {len(vocab)} tokens.")
-
-    # build ranks
-    ranks: Dict[bytes, int] = {}
-    for i, x in enumerate(vocab):
-        assert x.keys() == {"rank", "token_bytes", "token_str"}
-        assert x["rank"] == i
-        merge = base64.b64decode(x["token_bytes"])
-        assert i >= 256 or merge == bytes([i])
-        ranks[merge] = x["rank"]
-
-    # sanity check
-    assert len(ranks) == len(vocab)
-    assert set(ranks.values()) == set(range(len(ranks)))
-
-    return ranks
-
-
-class TiktokenTokenizer:
-    def __init__(self, vocab_file: str):
-
-        self.num_special_tokens = 1000
-        vocab_size = DEFAULT_TIKTOKEN_MAX_VOCAB
-        pattern = PATTERN_TIKTOKEN
-        special_tokens = SPECIAL_TOKENS.copy()
-        inner_vocab_size = vocab_size - self.num_special_tokens
-
-        token2id = reload_mergeable_ranks(vocab_file, max_vocab=inner_vocab_size)
-        self.tokenizer = tiktoken.Encoding(
-            name=Path(vocab_file).parent.name,
-            pat_str=pattern,
-            mergeable_ranks=token2id,
-            special_tokens={},  # special tokens are handled manually
-        )
-
-        # BOS / EOS / Pad token IDs
-        self._bos_id = special_tokens.index("")
-        self._eos_id = special_tokens.index("")
-
-    def encode(self, text):
-        tokens = self.tokenizer.encode(text)
-        tokens = [t + self.num_special_tokens for t in tokens]
-        return tokens
-
-    def decode(self, tokens):
-        # Filter out special tokens and adjust the remaining tokens
-        adjusted_tokens = [
-            t - self.num_special_tokens
-            for t in tokens
-            if t not in {self._bos_id, self._eos_id} and t >= self.num_special_tokens
-        ]
-
-        # Decode only if there are tokens left after filtering
-        if adjusted_tokens:
-            return self.tokenizer.decode(adjusted_tokens)
-        else:
-            return ""  # Return an empty string if all tokens were filtered out
-
-    def batch_decode(self, ids):
-        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
-            ids = ids.tolist()
-
-        if isinstance(ids[0], list):
-            ids = ids[0]
-
-        return self.decode(ids)
-
-    @property
-    def pad_id(self):
-        return self._eos_id
-
-    @property
-    def bos_token_id(self):
-        return self._bos_id
-
-    @property
-    def eos_token_id(self):
-        return self._eos_id
diff --git a/nemo/export/trt_llm/__init__.py b/nemo/export/trt_llm/__init__.py
deleted file mode 100644
index 4fc50543f1d2..000000000000
--- a/nemo/export/trt_llm/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/nemo/export/trt_llm/converter/__init__.py b/nemo/export/trt_llm/converter/__init__.py
deleted file mode 100644
index 4fc50543f1d2..000000000000
--- a/nemo/export/trt_llm/converter/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py
deleted file mode 100755
index e31ab9aed4b4..000000000000
--- a/nemo/export/trt_llm/converter/model_converter.py
+++ /dev/null
@@ -1,307 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from typing import Any, Dict, List, Optional, Tuple
-
-import numpy as np
-import tensorrt_llm
-import torch
-from tensorrt_llm._utils import pad_vocab_size
-from tensorrt_llm.functional import non_gated_version
-from tensorrt_llm.layers import MoeConfig
-from tensorrt_llm.models.modeling_utils import PretrainedConfig
-
-from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import (
-    convert_model_to_trt_llm_ckpt,
-    dist_model_to_trt_llm_ckpt,
-)
-from nemo.export.trt_llm.converter.utils import DECODER_MODEL_TYPE, split
-
-LOGGER = logging.getLogger("NeMo")
-
-
-def get_config(decoder_type, config):
-    DECODER_CONFIG = {
-        "llama": tensorrt_llm.models.llama.config.LLaMAConfig,
-        "gpt": tensorrt_llm.models.gpt.config.GPTConfig,
-        "gptnext": tensorrt_llm.models.gpt.config.GPTConfig,
-        "falcon": tensorrt_llm.models.falcon.config.FalconConfig,
-        "gemma": tensorrt_llm.models.GemmaConfig,
-    }
-    config_cls = DECODER_CONFIG[decoder_type] if decoder_type in DECODER_CONFIG else PretrainedConfig
-
-    return config_cls(**config)
-
-
-def prompt_convert(prompt_config, prompt_weights):
-    if "task_templates" in prompt_config:
-        prompt_templates = prompt_config["task_templates"]
-        actual_task_id = 0
-        vtokens_embeddings = []
-        vtokens_len = []
-        for task_name_id, prompt_task in enumerate(prompt_templates):
-            prompt_task_name = prompt_task["taskname"]
-            LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}")
-            prompt_task_weights = prompt_weights["prompt_table"].get(
-                f"prompt_table.{prompt_task_name}.prompt_embeddings.weight"
-            )
-            if prompt_task_weights is None:
-                continue
-            vtokens_embeddings.append(prompt_task_weights)
-            vtokens_len.append(prompt_task_weights.shape[0])
-            actual_task_id += 1
-
-        max_vtoken_len = max(vtokens_len)
-        embedding_dim = vtokens_embeddings[0].shape[1]
-
-        # pad tasks to longest task embedding table
-        for i, vtoken_emb_table in enumerate(vtokens_embeddings):
-            padded_table = torch.zeros((max_vtoken_len, embedding_dim))
-            padded_table[: vtoken_emb_table.shape[0], :] = vtoken_emb_table
-            vtokens_embeddings[i] = padded_table
-
-        vtokens_embeddings = torch.stack(vtokens_embeddings)
-    else:
-        vtokens_embeddings = prompt_weights["prompt_embeddings_weights"]
-
-    return vtokens_embeddings
-
-
-def determine_quantization_settings(
-    nemo_model_config: Dict[str, Any], fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None
-) -> Tuple[bool, bool]:
-    """
-    Determines the exported models quantization settings.
-    Reads from NeMo config, with optional override.
-
-    Args:
-        nemo_model_config (dict): NeMo model configuration
-        fp8_quantized (optional, bool): User-specified quantization flag
-        fp8_kvcache (optional, bool): User-specified cache quantization flag
-    Returns:
-        Tuple[bool, bool]:
-            - Model quantization flag
-            - Model kv-cache quantization flag
-    """
-
-    is_nemo_quantized: bool = nemo_model_config.get('fp8', False)
-    if fp8_quantized is None:
-        fp8_quantized = is_nemo_quantized
-    if fp8_kvcache is None:
-        fp8_kvcache = is_nemo_quantized
-
-    return fp8_quantized, fp8_kvcache
-
-
-def model_to_trtllm_ckpt(
-    model,
-    nemo_model_config,
-    nemo_export_dir,
-    decoder_type: str,
-    dtype: str = "bfloat16",
-    tensor_parallel_size: int = 1,
-    pipeline_parallel_size: int = 1,
-    gpus_per_node: int = None,
-    use_parallel_embedding: bool = False,
-    use_embedding_sharing: bool = False,
-    use_distributed_convert: bool = False,
-    model_parallel_rank: int = None,
-    vocab_size: Optional[int] = None,
-    fp8_quantized: Optional[bool] = None,
-    fp8_kvcache: Optional[bool] = None,
-) -> Tuple[List[Dict], List[PretrainedConfig]]:
-    if nemo_model_config.get("share_embeddings_and_output_weights", False) and not use_embedding_sharing:
-        LOGGER.info(
-            "Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True"
-        )
-        use_embedding_sharing = True
-
-    fp8_quantized, fp8_kvcache = determine_quantization_settings(nemo_model_config, fp8_quantized, fp8_kvcache)
-    # If the model has been sharded with model parallelism, convert the model in a gpu-distributed manner
-    if use_distributed_convert:
-        weights_dict = dist_model_to_trt_llm_ckpt(
-            model=model,
-            nemo_model_config=nemo_model_config,
-            inference_tp_size=tensor_parallel_size,
-            inference_pp_size=pipeline_parallel_size,
-            tokenizer_vocab_size=vocab_size,
-            fp8_quantized=fp8_quantized,
-            fp8_kvcache=fp8_kvcache,
-        )
-        vocab_size_padded = vocab_size
-    else:
-        weights_dict = convert_model_to_trt_llm_ckpt(
-            model=model,
-            nemo_model_config=nemo_model_config,
-            nemo_export_dir=nemo_export_dir,
-            inference_tp_size=tensor_parallel_size,
-            processes=1,
-            storage_type=dtype,
-            use_parallel_embedding=use_parallel_embedding,
-            decoder_type=decoder_type,
-            fp8_quantized=fp8_quantized,
-            fp8_kvcache=fp8_kvcache,
-        )
-
-        has_lm_head = "lm_head.weight" in weights_dict
-        if has_lm_head:
-            lm_head_weight = weights_dict["lm_head.weight"]
-        if vocab_size is None:
-            vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0]
-        vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size
-
-        if has_lm_head and vocab_size_padded != vocab_size:
-            pad_width = vocab_size_padded - vocab_size
-            lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0)
-
-    world_size = tensor_parallel_size * pipeline_parallel_size
-    hidden_act = nemo_model_config.get('activation')
-    hidden_act = (
-        hidden_act.split("-")[-1] if nemo_model_config.get('num_moe_experts', 0) else non_gated_version(hidden_act)
-    )
-
-    config = {
-        'architecture': DECODER_MODEL_TYPE[decoder_type],
-        'dtype': dtype,
-        'num_hidden_layers': nemo_model_config.get('num_layers'),
-        'num_attention_heads': nemo_model_config.get('num_attention_heads'),
-        'num_key_value_heads': nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']),
-        'head_size': nemo_model_config.get('kv_channels'),
-        'hidden_size': nemo_model_config.get('hidden_size'),
-        'intermediate_size': nemo_model_config.get('ffn_hidden_size'),
-        'norm_epsilon': nemo_model_config.get('layernorm_epsilon'),
-        'vocab_size': vocab_size_padded,
-        'position_embedding_type': (
-            "rope_gpt_neox" if nemo_model_config.get('position_embedding_type') == "rope" else "learned_absolute"
-        ),
-        'max_position_embeddings': nemo_model_config.get('max_position_embeddings'),
-        'hidden_act': hidden_act,
-        'use_parallel_embedding': use_parallel_embedding,
-        'embedding_sharding_dim': 0,
-        'share_embedding_table': use_embedding_sharing,
-        'quantization': {
-            'quant_algo': "FP8" if fp8_quantized else None,
-            'kv_cache_quant_algo': "FP8" if fp8_kvcache else None,
-        },
-        'bias': nemo_model_config.get('bias'),
-        'apply_query_key_layer_scaling': False,
-        'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0),
-        'rotary_base': nemo_model_config.get('rotary_base', 10000),
-        'moe_num_experts': nemo_model_config.get('num_moe_experts', 0),
-        'moe_top_k': nemo_model_config.get('moe_router_topk', 0),
-        'moe_normalization_mode': nemo_model_config.get(
-            'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
-        ),
-        'moe_tp_mode': nemo_model_config.get(
-            'moe_tp_mode', 2
-        ),  # change MoeConfig.ParallelismMode.TENSOR_PARALLEL to 2
-        'logits_dtype': 'float32',
-        'world_size': world_size,
-        'tp_size': tensor_parallel_size,
-        'pp_size': pipeline_parallel_size,
-    }
-    model_configs = []
-    weights_dicts = []
-    num_layers = nemo_model_config.get('num_layers')
-    rotary_scaling = nemo_model_config.get("seq_len_interpolation_factor")
-
-    if decoder_type == "falcon":
-        config["new_decoder_architecture"] = False if num_layers == 32 else True
-        config["parallel_attention"] = True
-    if rotary_scaling is not None:
-        config["rotary_scaling"] = {"type": "linear", "factor": float(rotary_scaling)}
-
-    if use_distributed_convert:
-        config["gpus_per_node"] = gpus_per_node
-        model_configs.append(get_config(decoder_type, config))
-        model_configs[0].mapping = tensorrt_llm.Mapping(
-            world_size=world_size,
-            rank=model_parallel_rank,
-            tp_size=tensor_parallel_size,
-            pp_size=pipeline_parallel_size,
-        )
-        weights_dicts.append(weights_dict)
-        return weights_dicts, model_configs
-
-    pp_key = {
-        "transformer.vocab_embedding.weight",
-        "transformer.position_embedding.weight",
-        "lm_head.weight",
-        "transformer.ln_f.weight",
-        "transformer.ln_f.bias",
-    }
-
-    for i in range(world_size):
-        mapping = tensorrt_llm.Mapping(
-            world_size=world_size,
-            rank=i,
-            tp_size=tensor_parallel_size,
-            pp_size=pipeline_parallel_size,
-        )
-        layers_range = mapping.pp_layers(num_layers)
-
-        weights_dict_local = {}
-        for k, v in weights_dict.items():
-            if k in pp_key:
-                continue
-            new_key = k
-            if new_key.endswith(".bin"):  # TP split
-                if new_key.endswith(f"{mapping.tp_rank}.bin"):
-                    new_key = new_key.replace(f".{mapping.tp_rank}.bin", "")
-                else:
-                    continue
-            if "layers" in new_key:  # PP
-                layer_num = int(new_key.split(".")[2])
-                if layer_num in layers_range:
-                    new_key = new_key.replace(f"layers.{layer_num}", f"layers.{layer_num-layers_range[0]}")
-                else:
-                    continue
-            if config.get("new_decoder_architecture", False) and "post_layernorm" in new_key:
-                new_key = new_key.replace("post_layernorm", "mlp_layernorm")
-            weights_dict_local[new_key] = v
-
-        if mapping.is_first_pp_rank():
-            embedding_weight = (
-                split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank)
-                if use_parallel_embedding
-                else weights_dict["transformer.vocab_embedding.weight"]
-            )
-
-            weights_dict_local["transformer.vocab_embedding.weight"] = embedding_weight
-
-            pos_embedding_weight = weights_dict.get("transformer.position_embedding.weight")
-            if pos_embedding_weight is not None:
-                if use_parallel_embedding:
-                    pos_embedding_weight = split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank)
-                weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight
-
-        if mapping.is_last_pp_rank():
-            if has_lm_head:
-                weights_dict_local["lm_head.weight"] = split(
-                    lm_head_weight, mapping.tp_size, mapping.tp_rank
-                ).contiguous()
-            weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"]
-
-            ln_f_bias = weights_dict.get("transformer.ln_f.bias")
-            if ln_f_bias is not None:
-                weights_dict_local["transformer.ln_f.bias"] = ln_f_bias
-
-        config["gpus_per_node"] = gpus_per_node
-        model_config = get_config(decoder_type, config)
-        model_config.mapping = mapping
-        model_configs.append(model_config)
-        weights_dicts.append(weights_dict_local)
-
-    return weights_dicts, model_configs
diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
deleted file mode 100644
index 043c8bc48dd9..000000000000
--- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
+++ /dev/null
@@ -1,496 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-# pylint: disable=missing-function-docstring
-
-
-import logging
-import multiprocessing
-from collections import defaultdict
-from pathlib import Path
-
-import torch
-from tensorrt_llm._utils import pad_vocab_size, str_dtype_to_torch
-from tqdm import tqdm
-
-from nemo.export.trt_llm.converter.utils import save_scaling_factor, save_val, split_and_save_weight, weights_dict
-from nemo.export.utils import torch_dtype_from_precision
-
-LOGGER = logging.getLogger("NeMo")
-
-layer_names = {
-    "position_embedding": "embedding.position_embeddings.weight",
-    "word_embedding": "embedding.word_embeddings.weight",
-    "output_layer": "output_layer.weight",
-    "final_layernorm.weight": "final_layernorm.weight",
-    "final_layernorm.bias": "final_layernorm.bias",
-}
-
-
-def extract_layers_with_prefix(model_, prefix):
-    length_to_trim = len(prefix)
-    model_state = model_.get("state_dict", model_)
-    return {key[length_to_trim:]: model_state[key] for key in model_state.keys() if key.startswith(prefix)}
-
-
-def get_layer_name(layer_type: str, prefix: str):
-    layer_dict = layer_names
-    if layer_type in layer_dict:
-        return prefix + layer_dict[layer_type]
-    else:
-        raise ValueError(f"Unknown layer type {layer_type}")
-
-
-def get_layer_prefix(layer_names, is_mcore):
-    transformer_layer_prefix = None
-
-    for layer_name in layer_names:
-        if not layer_name.startswith('optimizer') and 'self_attention' in layer_name:
-            transformer_layer_prefix = layer_name.split('layers')[0]
-            break
-    assert transformer_layer_prefix is not None, f"Cannot extract transformer layer prefix from {layer_name}"
-    if is_mcore:
-        model_prefix = transformer_layer_prefix.split('decoder')[0]
-    else:
-        model_prefix = transformer_layer_prefix.split('encoder')[0]
-    assert model_prefix is not None, "Cannot extract model prefix from {layer_name}"
-
-    return model_prefix, transformer_layer_prefix
-
-
-def rename_key(new_key: str):
-    if "self_attention" in new_key:
-        new_key = new_key.replace("self_attention", "attention")
-    if "attention.linear_qkv.layer_norm_weight" in new_key:
-        new_key = new_key.replace("attention.linear_qkv.layer_norm_weight", "input_layernorm.weight")
-    if "attention.linear_qkv.layer_norm_bias" in new_key:
-        new_key = new_key.replace("attention.linear_qkv.layer_norm_bias", "input_layernorm.bias")
-    if "mlp.linear_fc1.layer_norm_weight" in new_key:
-        new_key = new_key.replace("mlp.linear_fc1.layer_norm_weight", "post_attention_layernorm.weight")
-    if "mlp.linear_fc1.layer_norm_bias" in new_key:
-        new_key = new_key.replace("mlp.linear_fc1.layer_norm_bias", "post_attention_layernorm.bias")
-
-    return new_key
-
-
-def rename_key_dist_ckpt(old_key: str, layer: int):
-    new_key = old_key
-    if "layers." in old_key:
-        split_key = old_key.split(".")
-        split_key.insert(1, str(layer))
-        new_key = ".".join(split_key)
-
-    return rename_key(new_key)
-
-
-def is_scaling_factor(key: str) -> bool:
-    return "extra_state" in key
-
-
-def load_scaling_factors(model: dict, num_layers: int, export_config: dict) -> dict:
-    if not export_config.get('fp8_quantized', False):
-        return {}
-
-    scaling_factors = {}
-    for key, val in model.items():
-        if is_scaling_factor(key):
-            for layer in range(num_layers):
-                renamed_key = rename_key_dist_ckpt(key, layer)
-                scaling_factors = save_scaling_factor(scaling_factors, renamed_key, val[layer], export_config)
-
-    return scaling_factors
-
-
-@torch.no_grad()
-def convert_model_to_trt_llm_ckpt(
-    nemo_model_config,
-    model,
-    nemo_export_dir,
-    storage_type,
-    inference_tp_size,
-    decoder_type,
-    use_parallel_embedding,
-    processes,
-    fp8_quantized=False,
-    fp8_kvcache=False,
-):
-
-    # if checkpoints files could be found - start preparing output dir
-    out_dir = create_export_dir(nemo_export_dir)
-    storage_type = str_dtype_to_torch(storage_type)
-    is_mcore = nemo_model_config.get("mcore_gpt", False)
-
-    # load position_embedding from rank 0
-    model_state_dict = model.get("state_dict", model)
-
-    prefix, transformer_layer_prefix = get_layer_prefix(model_state_dict.keys(), is_mcore)
-
-    has_position_embedding = get_layer_name("position_embedding", prefix) in model_state_dict
-    has_lm_head = get_layer_name("output_layer", prefix) in model_state_dict
-
-    num_layers = nemo_model_config["num_layers"]
-    training_tp_size = 1
-    training_pp_size = 1
-    num_kv_heads = nemo_model_config.get("num_query_groups", 0)
-    multi_query_mode = nemo_model_config.get("multi_query_mode", False)
-    num_attention_heads = nemo_model_config["num_attention_heads"]
-    kv_channels = nemo_model_config.get("kv_channels", None)
-
-    if num_kv_heads == 0:
-        if multi_query_mode:
-            num_kv_heads = 1
-        else:
-            num_kv_heads = num_attention_heads
-
-    export_config = {
-        "apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p"
-        or nemo_model_config.get("layernorm_zero_centered_gamma", False),
-        "tp_size": training_tp_size,
-        "split_gated_activation": nemo_model_config.get("activation", "gelu")
-        in ["swiglu", "geglu", "fast-swiglu", "fast-geglu", "openai-gelu"]
-        and (decoder_type == "gptnext" or is_mcore),
-        "num_attention_heads": num_attention_heads,
-        "num_kv_heads": num_kv_heads,
-        "kv_channels": kv_channels,
-        "use_attention_nemo_shape": True,
-        "transpose_weights": True,
-        "use_parallel_embedding": use_parallel_embedding,
-        "fp8_quantized": fp8_quantized,
-        "fp8_kvcache": fp8_kvcache,
-    }
-
-    # split_factor: in how many parts a TP training node is split
-    split_factor = inference_tp_size
-    model_level_weights = defaultdict(list)
-
-    def handle_model_level_weights(model, tp_idx: int, pp_idx: int):
-        if tp_idx == 0 and pp_idx == 0:
-            if has_position_embedding:
-                val = model[get_layer_name("position_embedding", prefix)]
-                val = val.to(storage_type).cpu()
-                model_level_weights["transformer.position_embedding.weight"].append(val)
-        if pp_idx == 0:
-            val = model.get("state_dict", model)[get_layer_name("word_embedding", prefix)]
-
-            vocab_size = val.shape[0]
-            if use_parallel_embedding:
-                # Pad vocab_size first
-                if vocab_size % inference_tp_size != 0:
-                    vocab_size_padded = pad_vocab_size(vocab_size, inference_tp_size)
-                    pad_width = vocab_size_padded - vocab_size
-                    val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0)
-
-            val = val.to(storage_type).cpu()
-            model_level_weights["transformer.vocab_embedding.weight"].append(val)
-        if has_lm_head and pp_idx == training_pp_size - 1 and decoder_type != "gemma":
-            val = model.get("state_dict", model)[get_layer_name("output_layer", prefix)]
-            val = val.to(storage_type).cpu()
-            model_level_weights["lm_head.weight"].append(val)
-
-    weights_dict = {}
-    tp_rank = 0
-
-    handle_model_level_weights(model, 0, 0)
-    model = extract_layers_with_prefix(model, transformer_layer_prefix)
-    scaling_factors = load_scaling_factors(model, num_layers, export_config)
-
-    starmap_args = []
-    for key, val in model.items():
-        if "_extra_state" not in key:
-            if len(val.size()) == 1:
-                starmap_args.append(
-                    (
-                        tp_rank,
-                        out_dir,
-                        split_factor,
-                        # Let's rename/map the key to the old layer name previously. You can try printing out
-                        # the rename_key output of the old llama checkpoint and compare.
-                        rename_key_dist_ckpt(key, 0),
-                        # Since the state dict value has the full layers,
-                        # let's select the ith layer weights/biases here.
-                        [val],
-                        storage_type,
-                        None,
-                        export_config,
-                        scaling_factors,
-                    )
-                )
-            else:
-                for i in range(num_layers):
-                    starmap_args.append(
-                        (
-                            tp_rank,
-                            out_dir,
-                            split_factor,
-                            # Let's rename/map the key to the old layer name previously. You can try printing out
-                            # the rename_key output of the old llama checkpoint and compare.
-                            rename_key_dist_ckpt(key, i),
-                            # Since the state dict value has the full layers,
-                            # let's select the ith layer weights/biases here.
-                            [val[i]],
-                            storage_type,
-                            None,
-                            export_config,
-                            scaling_factors,
-                        )
-                    )
-
-    starmap_args = tqdm(starmap_args, desc="saving weights")
-
-    if processes > 1:
-        with multiprocessing.Pool(processes) as pool:
-            weights_dicts = pool.starmap(split_and_save_weight, starmap_args)
-            weights_dict_local = {k: v for d in weights_dicts for k, v in d.items()}
-    else:
-        # simpler for debug situations
-        for starmap_arg in starmap_args:
-            weights_dict_local = split_and_save_weight(*starmap_arg)
-
-    weights_dict.update(weights_dict_local)
-
-    for key, values in model_level_weights.items():
-        model_level_weights[key] = torch.concatenate(values, axis=0)
-        weights_dict[key] = model_level_weights[key]
-
-    weights_dict.update(scaling_factors)
-    return weights_dict
-
-
-def _get_layer_index(split_key):
-    for index, key in enumerate(split_key):
-        if key == "layers":
-            return index + 1
-    raise ValueError(f"Unknown layer name format: {split_key}")
-
-
-def rename_layer_num(param_name, layer_num):
-    split_key = param_name.split(".")
-    layer_index = int(_get_layer_index(split_key))
-    split_key[layer_index] = str(layer_num)
-    return ".".join(split_key)
-
-
-def get_layer_num(param_name):
-    split_key = param_name.split(".")
-    layer_index = int(_get_layer_index(split_key))
-    return int(split_key[layer_index])
-
-
-@torch.no_grad()
-def dist_model_to_trt_llm_ckpt(
-    model,
-    nemo_model_config,
-    inference_tp_size,
-    inference_pp_size,
-    tokenizer_vocab_size,
-    fp8_quantized=False,
-    fp8_kvcache=False,
-):
-    from megatron.core import parallel_state
-    from megatron.core.tensor_parallel.utils import VocabUtility
-
-    tp_rank = parallel_state.get_tensor_model_parallel_rank()
-    tp_size = parallel_state.get_tensor_model_parallel_world_size()
-    tp_group = parallel_state.get_tensor_model_parallel_group()
-    pp_rank = parallel_state.get_pipeline_model_parallel_rank()
-    pp_first_rank = parallel_state.get_pipeline_model_parallel_first_rank()
-    pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank()
-    pp_size = parallel_state.get_pipeline_model_parallel_world_size()
-    pp_group = parallel_state.get_pipeline_model_parallel_group()
-    pp_is_last = parallel_state.is_pipeline_last_stage()
-    pp_is_first = parallel_state.is_pipeline_first_stage()
-    vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
-    if not vp_size:
-        vp_size = 1
-
-    reshard_model = False
-    if inference_tp_size != tp_size or inference_pp_size != pp_size:
-        LOGGER.info("Training/Generation model parallelism resharding enabled")
-        if inference_pp_size == 1 and pp_size > 1 and inference_tp_size == tp_size:
-            reshard_model = True
-        else:
-            raise NotImplementedError(
-                "NeMo currently only supports PP>1 -> PP=1 resharding,"
-                " other types of resharding will come in future releases."
-            )
-
-    num_layers = nemo_model_config["num_layers"]
-    is_mcore = nemo_model_config.get("mcore_gpt", False)
-    storage_type = torch_dtype_from_precision(nemo_model_config.precision)
-    sample_state_dict = model[0].state_dict() if vp_size > 1 else model.state_dict()
-    prefix, transformer_layer_prefix = get_layer_prefix(sample_state_dict, is_mcore)
-    assert is_mcore, "Only megatron-core inflight model conversion is supported"
-
-    export_config = {
-        "apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p",
-        "tp_size": tp_size,
-        "split_gated_activation": nemo_model_config.get("activation", "gelu")
-        in ["swiglu", "geglu", "fast-swiglu", "fast-geglu", "openai-gelu"],
-        "num_attention_heads": nemo_model_config["num_attention_heads"],
-        "num_kv_heads": nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']),
-        "convert_on_device": True,
-        "use_attention_nemo_shape": True,
-        "transpose_weights": True,
-        "fp8_quantized": fp8_quantized,
-        "fp8_kvcache": fp8_kvcache,
-    }
-
-    starmap_config = {
-        "tp_rank": None,
-        "saved_dir": None,  # unused
-        "split_factor": 0,
-        "storage_type": storage_type,
-        "act_range": None,
-        "config": export_config,
-    }
-
-    tl_params = {}
-    model_level_params = {}
-    starmap_args = []
-    layers_per_pp = num_layers // pp_size
-    layers_per_chunk = layers_per_pp // vp_size
-
-    if vp_size > 1:  # consolidate params across model chunks
-        for idx, model_chunk in enumerate(model):
-            for key, val in model_chunk.state_dict().items():
-                if torch.is_tensor(val):
-                    if 'layers' in key:
-                        key2 = rename_layer_num(key, get_layer_num(key) + idx * pp_size * layers_per_chunk)
-                        tl_params[key2] = val
-                    else:
-                        model_level_params[key] = val
-    else:
-        for key, val in model.state_dict().items():
-            if torch.is_tensor(val):
-                if 'decoder.layers' in key:
-                    tl_params[key] = val
-                else:
-                    model_level_params[key] = val
-
-    if vp_size > 1 or reshard_model:
-        # gather layers across pp ranks
-        gathered_params = {}
-        for key, val in tl_params.items():
-            weight_list = [torch.zeros_like(val) for _ in range(pp_size)]
-            torch.distributed.all_gather(weight_list, val, group=pp_group)
-            for idx in range(pp_size):
-                layer_num = get_layer_num(key) + idx * layers_per_chunk
-                key2 = rename_layer_num(key, layer_num)
-                if not reshard_model:  # Save only layers of 1 single PP stage
-                    layers_start = layers_per_pp * pp_rank
-                    layers_end = layers_per_pp * (pp_rank + 1) - 1
-                    if layer_num >= layers_start and layer_num <= layers_end:
-                        key2 = rename_layer_num(key, layer_num % layers_per_pp)
-                        gathered_params[key2] = weight_list[idx]
-                else:
-                    gathered_params[key2] = weight_list[idx]
-        tl_params = gathered_params
-
-    # ----------------Convert layer level weights----------------
-    layer_params = extract_layers_with_prefix(tl_params, transformer_layer_prefix)
-    layer_params = {k: v for k, v in layer_params.items() if k.startswith("layers.")}
-    for key, val in layer_params.items():
-        starmap_args.append(starmap_config | {'key': rename_key(key), 'vals': val})
-
-    def broadcast_item(item, group, src_rank):
-        item = [item]
-        torch.distributed.broadcast_object_list(item, src_rank, group=group)
-        return item[0]
-
-    def try_get_model_level_weight(src_key_or_tensor, pp_src_idx):
-        have_tensor = False
-        if torch.distributed.get_rank() == pp_src_idx:
-            if isinstance(src_key_or_tensor, str):
-                tensor = model_level_params.get(src_key_or_tensor, None)
-                have_tensor = torch.is_tensor(tensor)
-            else:
-                assert torch.is_tensor(src_key_or_tensor)
-                tensor = src_key_or_tensor
-                have_tensor = True
-        if reshard_model:
-            have_tensor = broadcast_item(have_tensor, pp_group, pp_src_idx)
-        if not have_tensor:
-            return None
-
-        if reshard_model:  # Broadcast tensor to all PP groups
-            if torch.distributed.get_rank() == pp_src_idx:
-                shape = tensor.shape
-            else:
-                shape = [None]
-            shape = broadcast_item(shape, pp_group, pp_src_idx)
-            if torch.distributed.get_rank() != pp_src_idx:
-                tensor = torch.zeros(shape, dtype=storage_type).cuda()
-            torch.distributed.broadcast(tensor.contiguous(), pp_src_idx, group=pp_group)
-        return tensor
-
-    # ----------------Convert Final Layernorm----------------
-    if pp_is_last or reshard_model:
-        ln_f = try_get_model_level_weight(
-            get_layer_name("final_layernorm.weight", transformer_layer_prefix), pp_last_rank
-        )
-        if ln_f is not None:
-            starmap_args.append(starmap_config | {'key': "final_layernorm.weight", 'vals': ln_f})
-
-        ln_f_bias = try_get_model_level_weight(
-            get_layer_name("final_layernorm.bias", transformer_layer_prefix), pp_last_rank
-        )
-        if ln_f_bias is not None:
-            starmap_args.append(starmap_config | {'key': "final_layernorm.bias", 'vals': ln_f_bias})
-
-    # ----------------Convert Embeddings----------------
-    def get_remove_vocab_padding(tensor_name):
-        tensor = model_level_params.get(tensor_name, None)
-        if tensor is None:
-            return None
-
-        if tp_size > 1:  # Gather padded tensor chunks
-            vocab_size_padded = tensor.shape[0] * tp_size
-            vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size(
-                vocab_size_padded, tp_rank, tp_size
-            )
-            dim_size = list(tensor.size())
-            dim_size[0] = vocab_size_padded
-            gathered_tensor = torch.zeros(dim_size, dtype=tensor.dtype, device=torch.cuda.current_device())
-            gathered_tensor[vocab_start_index:vocab_end_index] = tensor
-            torch.distributed.all_reduce(gathered_tensor, group=tp_group)
-            tensor = gathered_tensor
-        unpadded = tensor[:tokenizer_vocab_size]
-        if tp_size > 1:  # Split gathered tensor for tensor parallel embedding
-            vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size(
-                tokenizer_vocab_size, tp_rank, tp_size
-            )
-            unpadded = unpadded[vocab_start_index:vocab_end_index]
-        return unpadded.T  # TRTLLM expects (vocab_size, hidden_size) so need extra transpose
-
-    if pp_is_first or reshard_model:
-        vocab_embed = get_remove_vocab_padding(get_layer_name("word_embedding", prefix))
-        vocab_embed = try_get_model_level_weight(vocab_embed, pp_first_rank)
-        save_val(vocab_embed, dir=None, key='transformer.vocab_embedding.weight', tp_num=None)
-
-    if pp_is_last or reshard_model:
-        lm_head = get_remove_vocab_padding(get_layer_name("output_layer", prefix))
-        lm_head = try_get_model_level_weight(lm_head, pp_last_rank)
-        save_val(lm_head, dir=None, key='lm_head.weight', tp_num=None)
-
-    for starmap_arg in tqdm(starmap_args, desc="saving weights"):
-        split_and_save_weight(**starmap_arg)
-
-    return weights_dict
-
-
-def create_export_dir(nemo_export_dir):
-    out_dir = Path(nemo_export_dir)
-    if not out_dir.exists():
-        out_dir.mkdir(parents=True)
-    return out_dir
diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py
deleted file mode 100755
index a3a2e21dab02..000000000000
--- a/nemo/export/trt_llm/converter/utils.py
+++ /dev/null
@@ -1,598 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from typing import List, Optional, Tuple, Union
-import numpy as np
-import tensorrt_llm
-import torch
-from tensorrt_llm._utils import mpi_comm, torch_to_numpy
-
-# A global dicts to store exported weights.
-# This is set to be a global variable to avoid extra code modification from tensorrt_llm.
-weights_dict = {}
-
-
-DECODER_MODEL_TYPE = {
-    "gptj": 'GPTForCausalLM',
-    "gptnext": 'GPTForCausalLM',
-    "llama": 'LlamaForCausalLM',
-    "gemma": 'GemmaForCausalLM',
-    "falcon": 'FalconForCausalLM',
-}
-
-post_layernorm_keys = [
-    "post_attention_layernorm.weight",
-    "post_attention_layernorm.bias",
-    "post_self_attn_layernorm.weight",
-]
-mlp_proj_bias_keys = ["mlp.linear_fc2.bias", "mlp.dense_4h_to_h.bias"]
-attention_dense_bias_keys = ["attention.linear_proj.bias", "attention.dense.bias"]
-input_layernorm_keys = ["input_layernorm.weight", "input_layernorm.bias"]
-pre_layernorm_keys = ["pre_mlp_layernorm.weight", "pre_mlp_layernorm.bias"]
-attention_dense_weight_keys = ["attention.linear_proj.weight", "attention.dense.weight"]
-mlp_proj_weight_keys = ["mlp.linear_fc2.weight", "mlp.dense_4h_to_h.weight"]
-mlp_fc_keys = ["mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias", "mlp.linear_fc1.weight", "mlp.linear_fc1.bias"]
-attention_qkv_bias_keys = ["attention.query_key_value.bias", "attention.linear_qkv.bias"]
-attention_qkv_weight_keys = ["attention.query_key_value.weight", "attention.linear_qkv.weight"]
-mlp_router_keys = ["mlp.router.weight"]
-mlp_fc_expert_keys = ["experts.linear_fc1.weight"]
-mlp_proj_experts_keys = ["experts.linear_fc2.weight"]
-final_layernorm_keys = ["final_layernorm.weight", "final_layernorm.bias"]
-mlp_dense_2_keys = ["mlp.dense_h_to_4h_2.weight", "mlp.dense_h_to_4h_2.bias"]
-attention_not_mapped_keys = [
-    "attention.query.weight",
-    "attention.query.bias",
-    "attention.key_value.weight",
-    "attention.key_value.bias",
-]
-
-weight_scaling_suffix = '.weights_scaling_factor'
-activation_scaling_suffix = '.activation_scaling_factor'
-
-
-def save_val(val, dir, key, tp_num=None):
-    suffix = "" if tp_num is None else f".{tp_num}.bin"
-    global weights_dict
-
-    # Transpose linear layer weights to the correct shape.
-    if torch.is_tensor(val):
-        val = val.detach().contiguous()
-        if len(val.shape) >= 2:
-            val = val.reshape(val.shape[0], -1)
-            val = torch.transpose(val, 0, 1)
-        if key not in weights_dict:
-            weights_dict[f"{key}{suffix}"] = torch.empty(
-                val.size(), dtype=val.dtype, layout=val.layout, device="cpu", pin_memory=True
-            )
-        weights_dict[f"{key}{suffix}"].copy_(val, non_blocking=True)
-    else:
-        if len(val.shape) >= 2:
-            val = np.ascontiguousarray(np.transpose(val.reshape(val.shape[0], -1), [1, 0]))
-        weights_dict[f"{key}{suffix}"] = val
-
-
-def save_split(split_vals, dir, key, i, split_factor):
-    for j, val in enumerate(split_vals):
-        save_val(val, dir, key, i * split_factor + j)
-
-
-def save_expert_split(split_vals, dir, key, i, split_factor):
-    for j, val in enumerate(split_vals):
-        tp_num = i * split_factor + j
-        suffix = "" if tp_num is None else f".{tp_num}.bin"
-
-        global weights_dict
-        weights_dict[f"{key}{suffix}"] = val
-
-
-def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False):
-    """This function has two purposes:
-    - compute quantized weights, scaled either per-tensor or per-column
-    - compute scaling factors.
-
-    Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ.
-    CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W.
-    CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor.
-
-    Here is the list of what we need (T means per-tensor, C per-column):
-    - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8).
-    Used before the GEMM. (T)
-    - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T)
-    - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C)
-    - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32)
-    to quant range (int8) (used for CUBLAS) (T, C)
-
-    Note that we don't do anything special about row-parallel GEMM.
-    Theoretically, we could have per-GPU scaling factors too,
-    but then the model would change depending on the number of GPUs used.
-
-    For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection,
-    we consider it
-    as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V.
-    """
-    # compute weight scaling factors for fp->int8 and int8->fp
-    if is_qkv and not multi_query_mode:
-        scale_w_orig_quant_t = 127.0 / act_range["w"].reshape(3, -1).max(dim=-1, keepdims=True)[0].cpu().numpy()
-        scale_w_orig_quant_c = 127.0 / act_range["w"].reshape(3, -1).cpu().numpy()
-    elif is_qkv and multi_query_mode:
-        raise ValueError("Multi-query w/ int8 quant has not been supported yet")
-    else:
-        scale_w_orig_quant_t = 127.0 / act_range["w"].max().cpu().numpy()
-        scale_w_orig_quant_c = 127.0 / act_range["w"].cpu().numpy()
-    scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t
-    scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c
-
-    # compute the rest of needed scaling factors
-    scale_x_orig_quant_t = np.array(127.0 / act_range["x"].max().item())
-    scale_y_orig_quant_t = np.array(127.0 / act_range["y"].max().item())
-    scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.0)
-    scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * scale_w_orig_quant_t)
-    scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * scale_w_orig_quant_c)
-    if is_qkv:
-        scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, scale_w_orig_quant_c.shape)
-        scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, scale_w_orig_quant_c.shape)
-
-    def to_i8(x):
-        return x.round().clip(-127, 127).astype(np.int8)
-
-    return {
-        "weight.int8": to_i8(weights * scale_w_orig_quant_t),
-        "weight.int8.col": to_i8(weights * scale_w_orig_quant_c),
-        "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32),
-        "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32),
-        "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32),
-        "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32),
-        "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32),
-        "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32),
-    }
-
-
-def write_int8(vals, dir, base_key, split_dim, tp_rank, split_factor, kv_cache_only=False):
-    if not kv_cache_only:
-        save_split(
-            np.split(vals["weight.int8"], split_factor, axis=split_dim),
-            dir,
-            f"{base_key}.weight.int8",
-            tp_rank,
-            split_factor,
-        )
-        save_split(
-            np.split(vals["weight.int8.col"], split_factor, axis=split_dim),
-            dir,
-            f"{base_key}.weight.int8.col",
-            tp_rank,
-            split_factor,
-        )
-
-    saved_keys_once = ["scale_y_quant_orig"]
-    if not kv_cache_only:
-        saved_keys_once += ["scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant"]
-    # per-column scaling factors are loaded per-gpu for ColumnParallel GEMMs (QKV, FC1)
-    if not kv_cache_only:
-        if split_dim == -1:
-            save_split(
-                np.split(vals["scale_w_quant_orig.col"], split_factor, axis=split_dim),
-                dir,
-                f"{base_key}.scale_w_quant_orig.col",
-                tp_rank,
-                split_factor,
-            )
-            save_split(
-                np.split(vals["scale_y_accum_quant.col"], split_factor, axis=split_dim),
-                dir,
-                f"{base_key}.scale_y_accum_quant.col",
-                tp_rank,
-                split_factor,
-            )
-        else:
-            saved_keys_once += ["scale_w_quant_orig.col", "scale_y_accum_quant.col"]
-
-    if tp_rank == 0:
-        for save_key in saved_keys_once:
-            save_val(vals[save_key], dir, f"{base_key}.{save_key}")
-
-
-def get_suffix(key: str) -> str:
-    return '.' + key.split('.')[-1]
-
-
-def get_trt_llm_prefix(key: str) -> str:
-    layer_num = key.split(".")[1]
-    return f'transformer.layers.{layer_num}'
-
-
-def any_word_in_key(key: str, words: List[str]) -> bool:
-    return any([word in key for word in words])
-
-
-def sequential_key_map(key: str, mapping: List[Tuple[List[str], str]]) -> Optional[str]:
-    for keywords, mapped in mapping:
-        if any_word_in_key(key, keywords):
-            return mapped
-
-    return None
-
-
-def get_trt_llm_infix(key: str) -> Optional[str]:
-    mapping = [
-        (post_layernorm_keys, '.post_layernorm'),
-        (mlp_proj_bias_keys, '.mlp.proj'),
-        (attention_dense_bias_keys, '.attention.dense'),
-        (input_layernorm_keys, '.input_layernorm'),
-        (pre_layernorm_keys, '.post_layernorm'),
-        (attention_dense_weight_keys, '.attention.dense'),
-        (mlp_proj_weight_keys, '.mlp.proj'),
-        (mlp_fc_keys, '.mlp.fc'),
-        (attention_qkv_bias_keys + attention_qkv_weight_keys, '.attention.qkv'),
-        (mlp_router_keys, '.mlp.router'),
-        (mlp_fc_expert_keys, '.mlp.fc'),
-        (mlp_proj_experts_keys, '.mlp.proj'),
-    ]
-    return sequential_key_map(key, mapping)
-
-
-def get_trt_llm_keyname(key: str) -> str:
-    if any_word_in_key(key, final_layernorm_keys):
-        return key.replace("final_layernorm", "transformer.ln_f")
-
-    if infix := get_trt_llm_infix(key):
-        return get_trt_llm_prefix(key) + infix + get_suffix(key)
-
-    return key
-
-
-def is_scaling_factor(key: str) -> bool:
-    return "scale_fwd" in key
-
-
-def get_scaling_factor_keys(key: str) -> Tuple[Tuple[str, str], Tuple[str, str]]:
-    # Reuses existing mapping of NeMo -> TRT LLM weights key via swapping suffixes
-    corresponding_weight_key = '.'.join(key.split('.')[:-2]) + '.weight'
-    corresponding_trt_llm_weight_key = get_trt_llm_keyname(corresponding_weight_key)
-    base_key = '.'.join(corresponding_trt_llm_weight_key.split('.')[:-1])
-
-    weight_scale = base_key + weight_scaling_suffix
-    activation_scale = base_key + activation_scaling_suffix
-    keys = (weight_scale, activation_scale)
-
-    layer_prefix = get_trt_llm_prefix(key)
-    mapped_key = layer_prefix + '.mlp.gate'
-    gate_activation = mapped_key + activation_scaling_suffix
-    gate_weight = mapped_key + weight_scaling_suffix
-    gate_keys = (gate_activation, gate_weight)
-
-    return keys, gate_keys
-
-
-def save_scaling_factor(scaling_factors: dict, key: str, val: torch.Tensor, config: dict):
-    if not is_scaling_factor(key):
-        return scaling_factors
-
-    activation_factor = 1 / val[0].view(1)
-    weights_factor = 1 / val[1].view(1)
-
-    (weights_key, activation_key), gate_keys = get_scaling_factor_keys(key)
-    scaling_factors[activation_key] = activation_factor
-    scaling_factors[weights_key] = weights_factor
-
-    split_gated_activation = config.get("split_gated_activation", False)
-    if split_gated_activation and any_word_in_key(key, ["mlp.dense_h_to_4h", "mlp.linear_fc1"]):
-        (gate_activation_key, gate_weight_key) = gate_keys
-        scaling_factors[gate_activation_key] = activation_factor
-        scaling_factors[gate_weight_key] = weights_factor
-
-    return scaling_factors
-
-
-def cast_val_datatype(vals, trt_llm_key, storage_type, is_fp8_model, scaling_factors):
-    if not is_fp8_model:
-        return [val.to(storage_type) for val in vals]
-
-    fp8_storage_type = torch.float8_e4m3fn
-    quantized_keys = [
-        k.split(weight_scaling_suffix)[0] for k in scaling_factors.keys() if k.endswith(weight_scaling_suffix)
-    ]
-    for k in quantized_keys:
-        if k in trt_llm_key:
-            storage_type = fp8_storage_type
-            scale = scaling_factors[k + weight_scaling_suffix]
-            vals = [val.to(torch.float32) / scale for val in vals]
-            break
-
-    return [val.to(storage_type) for val in vals]
-
-
-def split_val_gate(vals: List[np.ndarray], convert_on_device: bool):
-    if convert_on_device:
-        return [[n] for n in torch.chunk(vals[0], 2, axis=-1)]
-
-    splits = [np.split(val, 2, axis=-1) for val in vals]
-    return list(zip(*splits))
-
-
-# Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head
-# are not split as there is only one head per key/value.
-@torch.no_grad()
-def split_and_save_weight(
-    tp_rank, saved_dir, split_factor, key, vals, storage_type, act_range, config, scaling_factors={}
-):
-    use_attention_nemo_shape = config.get("use_attention_nemo_shape", False)
-    split_gated_activation = config.get("split_gated_activation", False)
-    num_attention_heads = config.get("num_attention_heads", 0)
-    tp_size = config.get("tp_size", 1)
-    int8_outputs = config.get("int8_outputs", None)
-    multi_query_mode = config.get("multi_query_mode", False)
-    num_kv_heads = config.get("num_kv_heads", num_attention_heads)
-    size_per_head = config.get("kv_channels", None)
-    convert_on_device = config.get("convert_on_device", False)
-    is_fp8_model = config.get("fp8_quantized", False)
-    use_fp8_kv_cache = config.get("fp8_kvcache", False)
-    save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only"
-
-    trt_llm_key = get_trt_llm_keyname(key)
-    if not isinstance(vals, list):
-        vals = [vals]
-
-    if config.get("transpose_weights", False) and vals[0].ndim == 2:
-        vals = [val.T for val in vals]
-    if "layernorm.weight" in key and config.get("apply_layernorm_1p", False):
-        vals = [val.float() + 1.0 for val in vals]
-
-    vals = cast_val_datatype(vals, trt_llm_key, storage_type, is_fp8_model, scaling_factors)
-    if convert_on_device:
-        assert len(vals) == 1  # Should only convert a single device param per call
-        assert torch.is_tensor(vals[0])
-    elif torch.is_tensor(vals[0]):
-        vals = [torch_to_numpy(val.cpu()) for val in vals]
-
-    if any_word_in_key(
-        key,
-        input_layernorm_keys
-        + pre_layernorm_keys
-        + attention_dense_bias_keys
-        + post_layernorm_keys
-        + mlp_proj_bias_keys
-        + final_layernorm_keys,
-    ) and (tp_rank == 0 or convert_on_device):
-        # shared weights, only need to convert the weights of rank 0
-        save_val(vals[0], saved_dir, trt_llm_key)
-
-    elif any_word_in_key(key, attention_dense_weight_keys + mlp_proj_weight_keys):
-        if convert_on_device:
-            save_val(vals[0], saved_dir, trt_llm_key)
-        else:
-            cat_dim = 0
-            val = np.concatenate(vals, axis=cat_dim)
-            split_vals = np.split(val, split_factor, axis=cat_dim)
-            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
-
-        if act_range is not None and int8_outputs == "all":
-            base_key = trt_llm_key.replace(".weight", "")
-            vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode)
-            write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor)
-
-    elif any_word_in_key(key, mlp_fc_keys):
-        if split_gated_activation:
-            vals, gates = split_val_gate(vals, convert_on_device)
-
-        if convert_on_device:
-            save_val(vals[0], saved_dir, trt_llm_key)
-        else:
-            cat_dim = -1
-            val = np.concatenate(vals, axis=cat_dim)
-            split_vals = np.split(val, split_factor, axis=cat_dim)
-            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
-
-        if act_range is not None and int8_outputs == "all":
-            base_key = trt_llm_key.replace(".weight", "")
-            vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode)
-            write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor)
-
-        if split_gated_activation:
-            assert not save_int8
-            layer_prefix = get_trt_llm_prefix(key)
-            gate_key = layer_prefix + '.mlp.gate' + get_suffix(trt_llm_key)
-            if convert_on_device:
-                save_val(gates[0], saved_dir, gate_key)
-            else:
-                gate = np.concatenate(gates, axis=cat_dim)
-                split_vals = np.split(gate, split_factor, axis=cat_dim)
-                save_split(split_vals, saved_dir, gate_key, tp_rank, split_factor)
-
-    elif any_word_in_key(key, mlp_dense_2_keys):
-        if convert_on_device:
-            save_val(vals[0], saved_dir, trt_llm_key)
-        else:
-            cat_dim = -1
-            val = np.concatenate(vals, axis=cat_dim)
-            split_vals = np.split(val, split_factor, axis=cat_dim)
-            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
-
-        if act_range is not None and int8_outputs == "all":
-            base_key = trt_llm_key.replace(".weight", "")
-            vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode)
-            write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor)
-
-    elif any_word_in_key(key, attention_qkv_bias_keys):
-        qkv_hidden_dim = vals[0].shape[0]
-        size_per_head = qkv_hidden_dim // (num_attention_heads + 2 * num_kv_heads)
-        q_num = num_attention_heads // num_kv_heads
-
-        # We first concat all sub weights per tp rank together.
-        len_vals = len(vals)
-        if convert_on_device:
-            val = vals[0]
-        else:
-            val = np.concatenate(vals, axis=0)
-        val = val.reshape(num_kv_heads * len_vals // tp_size, q_num + 2, size_per_head)
-
-        # Split the QKV to separate variables.
-        if convert_on_device:
-            qkv = torch.split(val, [q_num, 1, 1], dim=1)
-            split_vals = torch.concatenate([qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=1)
-            save_val(split_vals, saved_dir, trt_llm_key)
-        else:
-            qkv = np.split(val, [q_num, q_num + 1], axis=1)
-            q_split = np.split(qkv[0], split_factor, axis=0)
-            k_split = np.split(qkv[1], split_factor, axis=0)
-            v_split = np.split(qkv[2], split_factor, axis=0)
-
-            # Concatenate Q, K, and V together
-            split_vals = [
-                np.concatenate([q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], axis=0)
-                for i in range(split_factor)
-            ]
-            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
-
-    elif any_word_in_key(key, attention_qkv_weight_keys):
-        assert use_attention_nemo_shape, "Only support NEMO shape for QKV weights"
-        hidden_dim = vals[0].shape[0]
-        if size_per_head is None:
-            size_per_head = hidden_dim // num_attention_heads
-        q_num = num_attention_heads // num_kv_heads
-
-        # When the merge factor exceeds 1, the 'vals' list will have multiple entries.
-        # Depending on the format, 'vals' can look like either [QQQQ..KV, QQQQ..KV, ...](for GQA) or [QKV, QKV, ...](for MHA).
-        # We first concat all sub weights per tp rank together.
-        if convert_on_device:
-            val = vals[0].reshape(hidden_dim, num_kv_heads // tp_size, q_num + 2, size_per_head)
-            qkv = torch.split(val, [q_num, 1, 1], dim=2)
-            split_vals = torch.concatenate(
-                [qkv[0].reshape(hidden_dim, -1), qkv[1].reshape(hidden_dim, -1), qkv[2].reshape(hidden_dim, -1)], dim=1
-            )
-            save_val(split_vals, saved_dir, trt_llm_key)
-        else:
-            len_vals = len(vals)
-            val = np.concatenate(vals, axis=1)
-            val = val.reshape(hidden_dim, num_kv_heads * len_vals // tp_size, q_num + 2, size_per_head)
-
-            # Split the QKV to separate variables.
-            qkv = np.split(val, [q_num, q_num + 1], axis=2)
-
-            query_groups_shape = qkv[0].shape
-            if len(query_groups_shape) > 1:
-                if (query_groups_shape[1] % split_factor) != 0:
-                    raise Exception(
-                        "Number of query groups of the models is {0}. Please select tensor parallelism size "
-                        "that can split the number of query groups to equal number of query matrices in the "
-                        "each GPU.".format(query_groups_shape[1])
-                    )
-
-            q_split = np.split(qkv[0], split_factor, axis=1)
-            k_split = np.split(qkv[1], split_factor, axis=1)
-            v_split = np.split(qkv[2], split_factor, axis=1)
-
-            # Concatenate Q, K, and V together
-            split_vals = [
-                np.concatenate(
-                    [
-                        q_split[i].reshape(hidden_dim, -1),
-                        k_split[i].reshape(hidden_dim, -1),
-                        v_split[i].reshape(hidden_dim, -1),
-                    ],
-                    axis=1,
-                )
-                for i in range(split_factor)
-            ]
-            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
-
-        if save_int8:
-            base_key = trt_llm_key.replace(".weight", "")
-            vals_i8 = generate_int8(val, act_range, is_qkv=True, multi_query_mode=multi_query_mode)
-            write_int8(
-                vals_i8,
-                saved_dir,
-                base_key,
-                cat_dim,
-                tp_rank,
-                split_factor,
-                kv_cache_only=int8_outputs == "kv_cache_only",
-            )
-
-        if use_fp8_kv_cache:
-            base_key = trt_llm_key.replace('.qkv.weight', '')
-            scaling_factor = torch.FloatTensor([1.0])
-            save_val(scaling_factor, dir, base_key + '.kv_cache_scaling_factor')
-
-    elif any_word_in_key(key, attention_not_mapped_keys):
-        pass
-
-    elif any_word_in_key(key, mlp_router_keys):
-        val = np.concatenate(vals, axis=1)
-        save_val(val, saved_dir, trt_llm_key)
-
-    elif any_word_in_key(key, mlp_fc_expert_keys):
-        cat_dim = -1
-        val = np.concatenate(vals, axis=cat_dim)
-        w1, w3 = np.split(val, 2, axis=1)
-        # w1 splits
-        split_w1s = np.split(w1, split_factor, axis=1)
-        # w3 splits
-        split_w3s = np.split(w3, split_factor, axis=1)
-
-        split_vals = [np.concatenate(item, axis=1) for item in zip(split_w3s, split_w1s)]
-        save_expert_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
-
-    elif any_word_in_key(key, mlp_proj_experts_keys):
-        cat_dim = -1
-        val = np.concatenate(vals, axis=cat_dim)
-        split_vals = np.split(val, split_factor, axis=cat_dim)
-        save_expert_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
-    else:
-        print(f"[WARNING] {key} not handled by converter")
-
-    global weights_dict
-    return weights_dict
-
-
-def split(v: Union[np.ndarray, torch.Tensor], tp_size: int, idx: int, dim: int = 0):
-    """Splits the np tensor v on dim and return the idx's slice."""
-    if tp_size == 1:
-        return v
-
-    dim = dim if len(v.shape) != 1 else 0
-    if torch.is_tensor(v):
-        return torch.split(v, v.size(dim) // tp_size, dim=dim)[idx].contiguous()
-
-    return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx])
-
-
-def init_model_parallel_from_nemo(reshard_model):
-    from megatron.core import parallel_state
-
-    pp_size = parallel_state.get_pipeline_model_parallel_world_size()
-    tp_size = parallel_state.get_tensor_model_parallel_world_size()
-    dp_size = parallel_state.get_data_parallel_world_size()
-    tp_rank = parallel_state.get_tensor_model_parallel_rank()
-    pp_rank = parallel_state.get_pipeline_model_parallel_rank()
-    dp_rank = parallel_state.get_data_parallel_rank()
-
-    if reshard_model and pp_size > 1:
-        dp_size = dp_size * pp_size
-        dp_rank = torch.distributed.get_rank() // tp_size
-        pp_rank = 0
-        pp_size = 1
-
-    mp_rank = tp_size * pp_rank + tp_rank
-    # Need to split cpp MPI World Comm because TensorRT-LLM NCCL plugins refer to the locally split comm.
-    # High level call structure is: MpiComm::split -> MpiComm::setSession -> LOCAL_COMM_SESSION (used in allReducePlugin.cpp)
-    tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
-    # Also split the python mpi communicator and set the global world one to the local split one
-    new_comm = mpi_comm().Split(color=dp_rank, key=mp_rank)
-    from mpi4py import MPI
-
-    MPI.COMM_WORLD = new_comm
-
-    return mp_rank, dp_rank, tp_size, pp_size, dp_size
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
deleted file mode 100644
index 341a77c5bc66..000000000000
--- a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
deleted file mode 100644
index cd547db25664..000000000000
--- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
+++ /dev/null
@@ -1,706 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import functools
-import json
-import logging
-import os
-import pickle
-import shutil
-from io import BytesIO
-from pathlib import Path
-from typing import Any, Dict, Optional, Union
-
-import numpy as np
-import torch
-import yaml
-from transformers import AutoTokenizer, GPT2Tokenizer, PreTrainedTokenizer
-
-from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
-from nemo.export.tarutils import TarPath
-from nemo.export.tiktoken_tokenizer import TiktokenTokenizer
-from nemo.export.utils import load_model_weights, nemo_to_path, torch_dtype_from_precision
-
-try:
-    from nemo.lightning import io
-
-    HAVE_NEMO2 = True
-except (ImportError, ModuleNotFoundError):
-    HAVE_NEMO2 = False
-
-LOGGER = logging.getLogger("NeMo")
-EXTRA_STATE = "extra_state"
-
-
-def load_extra_state_from_bytes(val: Optional[Union[torch.Tensor, BytesIO]]) -> Optional[dict]:
-    """Loads single extra_state from bytes storage.
-
-    Args:
-        val (torch.Tensor | BytesIO): Bytes storage of extra_state
-    Returns:
-        Optional[dict]: Deserialized extra_state, or None if the bytes storage is empty.
-    """
-    if val is None:
-        return None
-
-    # TransformerEngine shifted from storing extra_states bytes storage from _io.BytesIO to torch.Tensor
-    if isinstance(val, torch.Tensor):
-        if val.numel() == 0:
-            return None
-
-        val = val.detach().numpy(force=True).tobytes()
-        return pickle.loads(val)
-
-    val.seek(0)
-    return torch.load(val, weights_only=True)
-
-
-def preprocess_scaling_factors_for_local_export(state_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Scaling factors are kept in BufferIO objects.
-    This function reads the exact scales, preparing them for export.
-    Used only for local (non-mcore) path.
-
-    Args:
-        state_dict (dict): Model state dictionary
-    Returns:
-        dict: The same dictionary, with explicitly loaded extra states from bytes.
-    """
-    scales_dict = {k: v for k, v in state_dict.items() if EXTRA_STATE in k and 'core_attention' not in k}
-    state_dict = {k: v for k, v in state_dict.items() if EXTRA_STATE not in k}
-    scales = {}
-
-    for key, value in scales_dict.items():
-        extra_state = load_extra_state_from_bytes(value)
-
-        if extra_state is not None and 'scale_fwd' in extra_state:
-            scales[key + '.scale_fwd'] = extra_state['scale_fwd'].cpu()
-
-    combined_scales = {}
-    for key in scales:
-        if '.decoder.layers.0' not in key:
-            continue
-
-        # Key has a structure "model.decoder.layers.."
-        decomposed = key.split('.')
-        layer_num_idx = 3
-
-        # Merges scales from "model.decoder.layers.." to
-        # larger dimensional tensor with "model.decoder.layers." key
-        combined = []
-        layer_num = 0
-        decomposed[layer_num_idx] = str(layer_num)
-        while (scale := scales.get('.'.join(decomposed))) is not None:
-            combined.append(scale)
-            layer_num += 1
-            decomposed[layer_num_idx] = str(layer_num)
-
-        del decomposed[layer_num_idx]
-        combined_scales['.'.join(decomposed)] = torch.stack(combined)
-
-    return state_dict | combined_scales
-
-
-def rename_extra_states(state_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    This function preprocesses extra states for Megatron export.
-
-    Args:
-        state_dict (dict): Model state dictionary
-    Returns:
-        dict: Model state dictionary, with extra states consumable by mcore export
-    """
-    mcore_extra_states = {}
-
-    for key, value in state_dict.items():
-        if EXTRA_STATE not in key:
-            continue
-
-        # Keys with the extra states have the following format:
-        # .layers.._extra_state/shard__
-        key_base, shard_key = key.split('/')
-        if '_' not in shard_key:
-            continue
-
-        shard_layer = shard_key.split('_')[1]
-        if not shard_layer.isnumeric():
-            continue
-
-        # Renames keys to:
-        # .layers..._extra_state
-        mcore_key = key_base.replace("layers", f"layers.{shard_layer}")
-        if isinstance(value, list):
-            value = value[0]
-        mcore_extra_states[mcore_key] = value
-
-    state_dict = {k: v for k, v in state_dict.items() if EXTRA_STATE not in k}
-    return state_dict | mcore_extra_states
-
-
-def torch_to_numpy_state_dict(state_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """
-    Transforms model state dictionary with torch tensors to numpy arrays.
-
-    Args:
-        state_dict (dict): Model state dictionary.
-    Returns:
-        dict: State dictionary using numpy arrays.
-    """
-    for k, v in state_dict.items():
-        if v.dtype == torch.bfloat16:
-            from tensorrt_llm._utils import np_bfloat16
-
-            state_dict[k] = v.view(torch.int16).numpy().view(np_bfloat16)
-        else:
-            state_dict[k] = v.numpy()
-
-    return state_dict
-
-
-def update_tokenizer_paths(tokenizer_config: Dict, unpacked_checkpoints_dir):
-    """Updates tokenizer paths in the tokenizer config."""
-
-    def _update_config_entry(key, file_pattern):
-        old_path = tokenizer_config.get(key, None)
-        if old_path is None:
-            return
-        old_path = Path(old_path)
-        new_path = unpacked_checkpoints_dir.get_tokenizer_file_path("tokenizer", key, file_pattern)
-        if new_path:
-            LOGGER.debug(f"Update tokenizer {key} {old_path} -> {new_path}")
-            tokenizer_config[key] = new_path
-        elif not old_path.exists():
-            LOGGER.warning(f"Tokenizer {key}'s path {old_path} does not exists: set it to None")
-            tokenizer_config[key] = None
-
-    _update_config_entry("model", "*.model")
-    _update_config_entry("vocab_file", "*vocab*")
-    _update_config_entry("merge_file", "*merge*.txt")
-
-    return tokenizer_config
-
-
-def copy_tokenizer_files(config, out_dir):
-    """Copies tokenizer files to the output directory."""
-    basenames = {
-        "model": "tokenizer",
-        "vocab_file": "vocab",
-        "merge_file": "merges",
-    }
-
-    for key in basenames.keys():
-        if config.get(key, None) is None:
-            continue
-
-        path = config[key]
-
-        if isinstance(path, str):
-            path = Path(path)
-
-        if not path.exists():
-            LOGGER.debug(f"Tokenizer {key}: {path} file not found")
-            continue
-
-        dst_path = out_dir / f"{basenames[key]}{path.suffix}"
-        config[key] = str(dst_path)
-        LOGGER.debug(f"Copy tokenizer {key}: {path}->{dst_path}")
-
-        # Copy 'path' to 'dst_path' without shutil.copy(...) because 'path' may be a TarPath
-        with path.open('rb') as infile:
-            with open(dst_path, 'wb') as outfile:
-                outfile.write(infile.read())
-
-    return config
-
-
-def get_tokenizer_from_nemo2_context(model_context_dir: Path):
-    """
-    Retrieve tokenizer configuration from NeMo 2.0 context and instantiate the tokenizer.
-
-    Args:
-        model_context_dir (Path): Path to the model context directory.
-
-    Returns:
-        The instantiated tokenizer (various classes possible).
-    """
-
-    if HAVE_NEMO2:
-        # Use NeMo tokenizer loaded from the NeMo 2.0 model context
-        tokenizer_spec = io.load_context(model_context_dir, subpath="model.tokenizer")
-        return build_tokenizer(tokenizer_spec)
-    else:
-        # Use local nemo.export SentencePieceTokenizer implementation
-        # or directly a HuggingFace tokenizer based on the model config
-        with (model_context_dir / "model.yaml").open("r") as stream:
-            model_config = yaml.safe_load(stream)
-
-        tokenizer_config = model_config["tokenizer"]
-        target_class = tokenizer_config["_target_"]
-        tokenizer_module = "nemo.collections.common.tokenizers."
-        assert target_class.startswith(tokenizer_module)
-        target_class = target_class.removeprefix(tokenizer_module)
-
-        if target_class == "sentencepiece_tokenizer.SentencePieceTokenizer":
-            tokenizer = SentencePieceTokenizer(
-                model_path=str(model_context_dir / tokenizer_config["model_path"]),
-                special_tokens=tokenizer_config.get("special_tokens", None),
-                legacy=tokenizer_config.get("legacy", False),
-            )
-        elif target_class == "huggingface.auto_tokenizer.AutoTokenizer":
-            tokenizer = AutoTokenizer.from_pretrained(
-                str(model_context_dir / tokenizer_config["pretrained_model_name"])
-            )
-        else:
-            raise ValueError(f"Unsupported tokenizer type: {tokenizer_module}{target_class}.")
-
-    return tokenizer
-
-
-def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:
-    """Loads the tokenizer from the decoded NeMo weights dir."""
-    tokenizer_dir_or_path = Path(tokenizer_dir_or_path)
-    if (tokenizer_dir_or_path / "nemo_context").exists():
-        return get_tokenizer_from_nemo2_context(tokenizer_dir_or_path / "nemo_context")
-    elif (tokenizer_dir_or_path / "tokenizer_config.json").exists():
-        return AutoTokenizer.from_pretrained(tokenizer_dir_or_path)
-    elif os.path.exists(os.path.join(tokenizer_dir_or_path, "vocab.json")):
-        vocab_path = tokenizer_dir_or_path / "vocab.json" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
-        tokenizer_config = {"library": "tiktoken", "vocab_file": str(vocab_path)}
-        return build_tokenizer(tokenizer_config)
-    else:
-        model_path = (
-            tokenizer_dir_or_path / "tokenizer.model" if tokenizer_dir_or_path.is_dir() else tokenizer_dir_or_path
-        )
-        tokenizer_config = {"library": "sentencepiece", "model": str(model_path)}
-        return build_tokenizer(tokenizer_config)
-
-
-def build_tokenizer(tokenizer):
-    """Builds tokenizer for trt-llm export."""
-    if isinstance(tokenizer, dict):
-        tokenizer_config = tokenizer
-        if tokenizer_config["library"] == "sentencepiece":
-            return SentencePieceTokenizer(model_path=tokenizer_config["model"])
-        elif tokenizer_config["library"] == "tiktoken":
-            return TiktokenTokenizer(vocab_file=tokenizer_config["vocab_file"])
-        elif "GPT2" in tokenizer_config["type"]:
-            tokenizer = GPT2Tokenizer(tokenizer_config["vocab_file"], tokenizer_config["merge_file"])
-        else:
-            raise ValueError(f'Tokenizer type {tokenizer_config["library"]} not handled')
-
-        if tokenizer.bos_token_id is None:
-            tokenizer.add_special_tokens({"bos_token": ""})
-        if tokenizer.eos_token_id is None:
-            tokenizer.add_special_tokens({"eos_token": ""})
-    else:
-        # For NeMo tokenizers, monkey patch encode & batch_decode methods for unified interface
-        import nemo.collections.common.tokenizers as nemo_tokenizers
-
-        if isinstance(tokenizer, nemo_tokenizers.TokenizerSpec):
-            if isinstance(tokenizer, nemo_tokenizers.AutoTokenizer):
-                # Unwrap the original methods of HF tokenizer
-                batch_decode = tokenizer.tokenizer.batch_decode
-                encode = tokenizer.tokenizer.encode
-            elif isinstance(tokenizer, nemo_tokenizers.SentencePieceTokenizer):
-                # Define HF equivalents based on available SP methods
-                def batch_decode(self, ids):
-                    if torch.is_tensor(ids):
-                        ids = ids.cpu().numpy()
-                    if isinstance(ids, np.ndarray):
-                        ids = ids.tolist()
-                    return self.tokenizer.decode(ids)
-
-                encode = tokenizer.tokenizer.encode_as_ids
-            else:
-                raise NotImplementedError(f"Patching tokenizer methods for {type(tokenizer)} is not available")
-
-            tokenizer.bos_token_id = tokenizer.bos_id
-            tokenizer.eos_token_id = tokenizer.eos_id
-            nemo_tokenizers.TokenizerSpec.encode = encode
-            nemo_tokenizers.TokenizerSpec.batch_decode = batch_decode
-
-    return tokenizer
-
-
-def load_nemo_config(nemo_ckpt: Union[str, Path]) -> Dict[Any, Any]:
-    """
-    Load the model configuration from a NeMo checkpoint.
-
-    This function handles both NeMo 1.0 and NeMo 2.0 checkpoint structures.
-    For NeMo 2.0, it reads the configuration from the 'context/model.yaml' file.
-    For NeMo 1.0, it uses the UnpackedNemoCheckpointDir to load the model configuration.
-
-    Args:
-        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file or directory.
-    Returns:
-        Dict[Any, Any]: The configuration dictionary.
-    """
-    if Path(nemo_ckpt).is_dir():
-        nemo_ckpt = Path(nemo_ckpt)
-    else:
-        nemo_ckpt = TarPath(nemo_ckpt)
-
-    if (nemo_ckpt / "weights").exists() and (nemo_ckpt / "context").exists():  # Stucture of NeMo 2.0 checkpoints
-        with (nemo_ckpt / "context" / "model.yaml").open("r") as stream:
-            config = yaml.safe_load(stream)
-    else:  # Assume NeMo 1.0 case
-        unpacked_checkpoint_dir = UnpackedNemoCheckpointDir(nemo_ckpt, load_checkpoints_to_cpu=True)
-        config = unpacked_checkpoint_dir.model_config
-
-    return config
-
-
-def get_model_type(nemo_ckpt: Union[str, Path]) -> Optional[str]:
-    """
-    Determine the model type from a NeMo checkpoint for TensorRT-LLM engine build.
-
-    Args:
-        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file.
-    Returns:
-        Optional[str]: The model type if it can be determined, otherwise None.
-    """
-    model_config = load_nemo_config(nemo_ckpt)
-    model_type = None
-
-    if model_class := model_config.get("_target_"):
-        # NeMo 2.0 case
-        NEMO2_TO_MODEL_TYPE = {
-            "nemo.collections.llm.gpt.model.base.GPTModel": "gpt",
-            "nemo.collections.llm.gpt.model.llama.LlamaModel": "llama",
-            "nemo.collections.llm.gpt.model.mistral.MistralModel": "llama",
-            "nemo.collections.llm.gpt.model.mixtral.MixtralModel": "llama",
-            "nemo.collections.llm.gpt.model.starcoder.StarcoderModel": "gpt",
-            "nemo.collections.llm.gpt.model.starcoder2.Starcoder2Model": "gpt",
-            "nemo.collections.llm.gpt.model.nemotron.NemotronModel": "gpt",
-            "nemo.collections.llm.gpt.model.gemma.GemmaModel": "gemma",
-            "nemo.collections.llm.gpt.model.phi3mini.Phi3Model": "phi3",
-            "nemo.collections.llm.gpt.model.baichuan.Baichuan2Model": "baichuan",
-            "nemo.collections.llm.gpt.model.chatglm.ChatGLMModel": "chatglm",
-            "nemo.collections.llm.gpt.model.qwen2.Qwen2Model": "qwen",
-        }
-        try:
-            model_type = NEMO2_TO_MODEL_TYPE[model_class]
-            LOGGER.info(f"Determined model_type='{model_type}' for {nemo_ckpt} checkpoint.")
-
-        except KeyError:
-            LOGGER.error(
-                f"Model {model_class} not found in the NEMO2_TO_MODEL_TYPE mapping, "
-                "try providing the model_type explicitely for exporting:\n"
-                f"{json.dumps(NEMO2_TO_MODEL_TYPE, indent=2)}"
-            )
-            raise
-    else:
-        LOGGER.warning(f"Parameter model_type cannot be determined for {nemo_ckpt} checkpoint.")
-    return model_type
-
-
-def get_weights_dtype(nemo_ckpt: Union[str, Path]) -> Optional[str]:
-    """Determine the weights data type from a NeMo checkpoint for TensorRT-LLM engine build.
-
-    Args:
-        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file.
-    Returns:
-        Optional[str]: The dtype if it can be determined, otherwise None.
-    """
-    model_config = load_nemo_config(nemo_ckpt)
-    torch_dtype = None
-    dtype = None
-
-    is_nemo2 = "_target_" in model_config
-    if is_nemo2:
-        torch_dtype = model_config["config"]["params_dtype"]["_target_"]
-    elif precision := model_config.get("precision", None):
-        torch_dtype = str(torch_dtype_from_precision(precision))
-
-    if torch_dtype is not None:
-        dtype = torch_dtype.removeprefix("torch.")
-        LOGGER.info(f"Determined weights dtype='{dtype}' for {nemo_ckpt} checkpoint.")
-    else:
-        LOGGER.warning(
-            f"Parameter dtype for model weights cannot be determined for {nemo_ckpt} checkpoint. "
-            "There is no 'precision' field specified in the model_config.yaml file."
-        )
-
-    return dtype
-
-
-def load_distributed_model_weights(
-    nemo_checkpoint: Union[str, Path], mcore_scales_format: bool, torch_tensor: bool = True
-) -> Dict[str, Any]:
-    """
-    Loads model weights in `torch_dist` format from the model path.
-    Preprocesses the scaling factors for local export if mcore_scales_format is set to False.
-
-    Args:
-        nemo_checkpoint (str | Path): Path to the nemo checkpoint.
-        mcore_scales_format (bool): Flag for local vs megatron.core export.
-        torch_tensor (bool): If set to False, converts returns weights in numpy format.
-    Returns:
-        dict: Model state dictionary.
-    """
-    state_dict = load_model_weights(nemo_checkpoint, load_extra_states=True)
-    if not torch_tensor:
-        state_dict = torch_to_numpy_state_dict(state_dict)
-
-    state_dict = rename_extra_states(state_dict)
-    if not mcore_scales_format:
-        state_dict.update({k: v[0] for k, v in state_dict.items() if EXTRA_STATE in k and isinstance(v, list)})
-        state_dict = preprocess_scaling_factors_for_local_export(state_dict)
-
-    return state_dict
-
-
-def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Path], mcore_scales_format: bool = True):
-    """Unified model loading for trt-llm export."""
-    if not os.path.exists(nemo_ckpt):
-        raise TypeError("%s does not exist", nemo_ckpt)
-
-    nemo_dir = nemo_to_path(nemo_ckpt)
-
-    tokenizer = None
-    try:
-        unpacked_checkpoint_dir = UnpackedNemoCheckpointDir(nemo_dir, load_checkpoints_to_cpu=True)
-
-        if (nemo_dir / "model_weights").exists():
-            model = load_distributed_model_weights(nemo_ckpt, mcore_scales_format)
-
-            nemo_model_config = unpacked_checkpoint_dir.model_config
-
-            if nemo_model_config["tokenizer"].get("library", None) == "huggingface":
-                tokenizer = AutoTokenizer.from_pretrained(
-                    nemo_model_config["tokenizer"]["type"],
-                    use_fast=nemo_model_config["tokenizer"].get("use_fast", False),
-                )
-            else:
-                tokenizer_config = update_tokenizer_paths(nemo_model_config["tokenizer"], unpacked_checkpoint_dir)
-                tokenizer_config = copy_tokenizer_files(tokenizer_config, nemo_export_dir)
-
-                tokenizer = build_tokenizer(tokenizer_config)
-        elif (nemo_dir / "weights").exists():
-            model = load_distributed_model_weights(nemo_ckpt, mcore_scales_format)
-            io_folder = nemo_dir / "context"
-
-            if (io_folder / "model.yaml").exists():
-                with open(io_folder / "model.yaml", 'r') as stream:
-                    config = yaml.safe_load(stream)
-
-                nemo_model_config = {}
-                for k, v in config["config"].items():
-                    if isinstance(v, (float, int, str, bool)):
-                        nemo_model_config[k] = v
-                    elif k == "activation_func":
-                        nemo_model_config["activation"] = v["_target_"].rsplit('.', 1)[-1]
-            else:
-                assert HAVE_NEMO2, "nemo_toolkit>=2.0.0 is required to load the model context."
-
-                config = io.load_context(io_folder, subpath="model.config")
-
-                nemo_model_config = {}
-                for k, v in config.__dict__.items():
-                    if isinstance(v, (float, int, str, bool)):
-                        nemo_model_config[k] = v
-                    elif k == "activation_func":
-                        if isinstance(v, torch.jit.ScriptFunction):
-                            nemo_model_config["activation"] = v.name
-                        else:
-                            nemo_model_config["activation"] = v.__name__
-
-            if nemo_model_config.get("num_moe_experts") is None:
-                nemo_model_config["num_moe_experts"] = 0
-                nemo_model_config["moe_router_topk"] = 0
-            if nemo_model_config["activation"] == "silu":
-                nemo_model_config["activation"] = "fast-swiglu"
-            elif nemo_model_config["activation"] == "openai_gelu":
-                nemo_model_config["activation"] = "openai-gelu"
-            elif nemo_model_config["activation"] == "squared_relu":
-                nemo_model_config["activation"] = "squared-relu"
-
-            if nemo_model_config.get("add_bias_linear"):
-                nemo_model_config["bias"] = True
-
-            nemo_model_config["mcore_gpt"] = True
-            nemo_model_config["max_position_embeddings"] = nemo_model_config.get("seq_length", 4096)
-            nemo_model_config["rotary_percentage"] = nemo_model_config.get("rotary_percent", 1.0)
-
-            shutil.copytree(io_folder, nemo_export_dir / "nemo_context")
-        else:
-            raise Exception("Not a supported NeMo file format: only distributed MCore NeMo checkpoints are supported.")
-    finally:
-        if isinstance(nemo_dir, TarPath):
-            nemo_dir.tarobject.close()
-
-    return model, nemo_model_config, tokenizer
-
-
-def cpu_map_location(storage, loc):
-    """Maps storage to CPU."""
-    return storage.cpu()
-
-
-def gpu_map_location(storage, loc):
-    """Maps storage to GPU."""
-    if loc.startswith("cuda"):
-        training_gpu_idx = int(loc.split(":")[1])
-        inference_gpu_idx = training_gpu_idx % torch.cuda.device_count()
-        return storage.cuda(inference_gpu_idx)
-    elif loc.startswith("cpu"):
-        return storage.cpu()
-    else:
-        raise ValueError(f"Not handled {loc}")
-
-
-class UnpackedNemoCheckpointDir:
-    """
-    Caches model config and tokenizer file path when loading from a packed NeMo checkpoint directory.
-    """
-
-    def __init__(
-        self,
-        checkpoints_dir: Union[Path, TarPath],
-        load_checkpoints_to_cpu: bool = False,
-    ):
-        assert isinstance(checkpoints_dir, (Path, TarPath))
-        self._checkpoints_dir = checkpoints_dir
-        self._load_checkpoints_to_cpu = load_checkpoints_to_cpu
-
-    @property
-    @functools.lru_cache
-    def model_config(self):
-        """Returns model config dictionary."""
-        model_config = None
-
-        model_config_filename = "model_config.yaml"
-        model_configs_paths = list(self._checkpoints_dir.rglob(model_config_filename))
-        if model_configs_paths:
-            if len(model_configs_paths) > 1:
-                LOGGER.debug(f"There are more than single {model_config_filename} in" f" {self._checkpoints_dir}")
-            model_config_path = model_configs_paths[0]
-            LOGGER.debug("Loading model config from %s", model_config_path)
-            with model_config_path.open("r") as model_config_file:
-                model_config = yaml.load(model_config_file, Loader=yaml.SafeLoader)
-        else:
-            LOGGER.debug("Searching model config in checkpoints")
-            # try to obtain from checkpoint
-            checkpoint_name = self.checkpoint_name
-            checkpoints_paths = sorted(self._checkpoints_dir.rglob(checkpoint_name))
-            if checkpoints_paths:
-                # assume that parallel ranks 0 checkpoint should have model config embedded
-                checkpoint_path = checkpoints_paths[0]
-
-                map_location_fn = cpu_map_location if self._load_checkpoints_to_cpu else gpu_map_location
-
-                model_00 = torch.load(checkpoint_path, map_location=map_location_fn)
-                if "hyper_parameters" in model_00 and "cfg" in model_00["hyper_parameters"]:
-                    model_config = model_00["hyper_parameters"]["cfg"]
-                    LOGGER.debug("Loaded model config from checkpoint %s", checkpoint_path)
-                else:
-                    LOGGER.debug("Could not find model config in checkpoint %s", checkpoint_path)
-
-                del model_00
-
-        if model_config is None:
-            LOGGER.warning("Could not find checkpoint with NeMo model config in %s", self._checkpoints_dir)
-
-        LOGGER.debug("Loaded model config %s", model_config)
-
-        return model_config
-
-    @property
-    def checkpoints_dir(self):
-        """Returns path to checkpoints directory."""
-        return self._checkpoints_dir
-
-    def get_checkpoints_paths(self, tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
-        """Injects tensor/pipeline model parallel ranks into the filepath.
-        Does nothing if not using model parallelism.
-        """
-        checkpoint_path_without_rank = self.checkpoints_dir / self.checkpoint_name
-
-        def _inject_parallel_ranks(tp_rank, pp_rank):
-            if tensor_model_parallel_size > 1 or pipeline_model_parallel_size > 1:
-                if pipeline_model_parallel_size is None or pipeline_model_parallel_size == 1:
-                    checkpoint_path = (
-                        checkpoint_path_without_rank.parent
-                        / f"mp_rank_{tp_rank:02d}"
-                        / checkpoint_path_without_rank.name
-                    )
-                else:
-                    checkpoint_path = (
-                        checkpoint_path_without_rank.parent
-                        / f"tp_rank_{tp_rank:02d}_pp_rank_{pp_rank:03d}"
-                        / checkpoint_path_without_rank.name
-                    )
-                return checkpoint_path
-            else:
-                return checkpoint_path_without_rank
-
-        return [
-            [
-                _inject_parallel_ranks(tp_rank=tp_rank, pp_rank=pp_rank)
-                for pp_rank in range(pipeline_model_parallel_size)
-            ]
-            for tp_rank in range(tensor_model_parallel_size)
-        ]
-
-    @property
-    @functools.lru_cache
-    def checkpoint_name(self):
-        """Returns the name of the checkpoint file."""
-        patterns = [
-            "model_weights.ckpt",  # older megatron checkpoints
-            "*last.ckpt",  # newer format of checkpoints
-        ]
-        for pattern in patterns:
-            model_files = sorted(list(self._checkpoints_dir.rglob(pattern)))
-            if model_files:
-                return model_files[0].name
-
-        raise ValueError(f"Could not find checkpoint files in {self._checkpoints_dir}")
-
-    @functools.lru_cache
-    def get_tokenizer_file_path(self, tokenizer_key, file_key, default_filename_pattern):
-        """Returns path to tokenizer file."""
-        model_config = self.model_config
-        file_property = None
-        if tokenizer_key in model_config and file_key in model_config[tokenizer_key]:
-            file_property = model_config[tokenizer_key][file_key]
-        elif file_key in model_config:
-            file_property = model_config[file_key]
-
-        LOGGER.debug("model_config[%s][%s]=%s", tokenizer_key, file_key, file_property)
-
-        if file_property and file_property.startswith("nemo:"):
-            filename = file_property.split("nemo:")[1]
-            filename_pattern = f"*{filename}"
-        elif file_property and file_property.startswith("/artifacts/"):
-            filename = Path(file_property).name
-            filename_pattern = f"*{filename}"
-        elif file_property is None or file_property == "None":
-            filename_pattern = None
-        else:
-            filename_pattern = default_filename_pattern
-            LOGGER.warning(
-                f"Tokenizer file from config: {tokenizer_key}.{file_key}={file_property} "
-                f"looks like unsupported path. Pattern {filename_pattern} will be used."
-            )
-
-        file_path = None
-        if filename_pattern is not None:
-            files_paths = list(self._checkpoints_dir.glob(filename_pattern))
-            if files_paths:
-                assert len(files_paths) == 1
-                file_path = files_paths[0]
-
-        return file_path
diff --git a/nemo/export/trt_llm/qnemo/__init__.py b/nemo/export/trt_llm/qnemo/__init__.py
deleted file mode 100644
index c8d1fa8f690a..000000000000
--- a/nemo/export/trt_llm/qnemo/__init__.py
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm
diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
deleted file mode 100644
index 003d1aba2a2c..000000000000
--- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import glob
-import itertools
-import os
-import subprocess
-import warnings
-from typing import List, Optional
-
-from tensorrt_llm.models import PretrainedConfig
-
-from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME
-
-
-def qnemo_to_tensorrt_llm(
-    nemo_checkpoint_path: str,
-    engine_dir: str,
-    max_input_len: int,
-    max_seq_len: Optional[int],
-    max_batch_size: int,
-    max_prompt_embedding_table_size: int,
-    tensor_parallel_size: Optional[int] = None,
-    pipeline_parallel_size: Optional[int] = None,
-    use_parallel_embedding: bool = False,
-    paged_kv_cache: bool = True,
-    paged_context_fmha: bool = False,
-    remove_input_padding: bool = True,
-    use_lora_plugin: Optional[str] = None,
-    lora_target_modules: Optional[List[str]] = None,
-    max_lora_rank: int = 64,
-    max_num_tokens: Optional[int] = None,
-    opt_num_tokens: Optional[int] = None,
-    max_beam_width: int = 1,
-    multiple_profiles: bool = False,
-    reduce_fusion: bool = True,
-):
-    """Build TensorRT-LLM engine with trtllm-build command in a subprocess."""
-    assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}"
-
-    warnings.warn(
-        "Note that setting tensor_parallel_size, pipeline_parallel_size and use_parallel_embedding "
-        " parameters for quantized models is done on the calibration step (in PTQ workflow)."
-        " These parameters are ignored when building and running TensorRT-LLM engine below.",
-        UserWarning,
-        stacklevel=3,
-    )
-
-    num_build_workers = len(glob.glob(os.path.join(nemo_checkpoint_path, WEIGHTS_NAME.format("*"))))
-    assert num_build_workers, f"No TensorRT-LLM weight files found in {nemo_checkpoint_path}"
-
-    config = PretrainedConfig.from_json_file(os.path.join(nemo_checkpoint_path, CONFIG_NAME))
-
-    log_level = "warning"
-
-    quant_algo = config.quantization.quant_algo
-
-    use_fused_mlp = True
-    if config.quantization.exclude_modules:
-        for module_name in config.quantization.exclude_modules:
-            # For AutoQuant, fc and gate might not be quantized at the same time
-            # TODO: relax this limitation on the TRT-LLM side
-            if "gate" in module_name or "fc" in module_name:
-                use_fused_mlp = False
-    use_fused_mlp = use_fused_mlp and 'RecurrentGemma' not in config.architecture
-
-    use_qdq = quant_algo in ["FP8", "W8A8_SQ_PER_CHANNEL"]
-
-    speculative_decoding_mode = "medusa" if "Medusa" in config.architecture else None
-
-    build_cmd = ["trtllm-build"]
-    build_cmd.extend(["--checkpoint_dir", nemo_checkpoint_path])
-    build_cmd.extend(["--log_level", log_level])
-    build_cmd.extend(["--output_dir", engine_dir])
-    build_cmd.extend(["--workers", str(num_build_workers)])
-    build_cmd.extend(["--max_batch_size", str(max_batch_size)])
-    build_cmd.extend(["--max_input_len", str(max_input_len)])
-    build_cmd.extend(["--max_beam_width", str(max_beam_width)])
-    build_cmd.extend(["--max_prompt_embedding_table_size", str(max_prompt_embedding_table_size)])
-    build_cmd.extend(["--paged_kv_cache", "enable" if paged_kv_cache else "disable"])
-    build_cmd.extend(["--use_paged_context_fmha", "enable" if paged_context_fmha else "disable"])
-    build_cmd.extend(["--remove_input_padding", "enable" if remove_input_padding else "disable"])
-    build_cmd.extend(["--multiple_profiles", "enable" if multiple_profiles else "disable"])
-    build_cmd.extend(["--reduce_fusion", "enable" if reduce_fusion else "disable"])
-    build_cmd.extend(["--use_fused_mlp", "enable" if use_fused_mlp else "disable"])
-
-    if not use_qdq:
-        build_cmd.extend(["--gemm_plugin", "auto"])
-
-    if max_seq_len is not None:
-        build_cmd.extend(["--max_seq_len", str(max_seq_len)])
-
-    if max_num_tokens is not None:
-        build_cmd.extend(["--max_num_tokens", str(max_num_tokens)])
-    else:
-        build_cmd.extend(["--max_num_tokens", str(max_batch_size * max_input_len)])
-
-    if opt_num_tokens is not None:
-        build_cmd.extend(["--opt_num_tokens", str(opt_num_tokens)])
-
-    if speculative_decoding_mode:
-        build_cmd.extend(["--speculative_decoding_mode", speculative_decoding_mode])
-
-    print("trtllm-build command:")
-    print("".join(itertools.chain.from_iterable(zip(build_cmd, itertools.cycle(["\n ", " "])))).strip())
-
-    subprocess.run(build_cmd, shell=False, check=True)
diff --git a/nemo/export/trt_llm/qnemo/tokenizer_utils.py b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
deleted file mode 100644
index b3cc88de7caf..000000000000
--- a/nemo/export/trt_llm/qnemo/tokenizer_utils.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-
-from omegaconf import OmegaConf
-from transformers import AutoTokenizer
-
-from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
-from nemo.export.tiktoken_tokenizer import TiktokenTokenizer
-
-# TODO: use get_nmt_tokenizer helper below to instantiate tokenizer once environment / dependencies get stable
-# from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
-
-TOKENIZER_CONFIG_FILE = "tokenizer_config.yaml"
-TOKENIZER_DIR = "tokenizer"
-LOGGER = logging.getLogger("NeMo")
-
-
-def get_nmt_tokenizer(nemo_checkpoint_path: str):
-    """Build tokenizer from Nemo tokenizer config."""
-
-    LOGGER.info(f"Initializing tokenizer from {TOKENIZER_CONFIG_FILE}")
-    tokenizer_cfg = OmegaConf.load(os.path.join(nemo_checkpoint_path, TOKENIZER_CONFIG_FILE))
-
-    library = tokenizer_cfg.library
-    legacy = tokenizer_cfg.get("sentencepiece_legacy", library == "sentencepiece")
-
-    if library == "huggingface":
-        LOGGER.info(f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_cfg.type}")
-        tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg["type"], use_fast=tokenizer_cfg.get("use_fast", False))
-    elif library == "sentencepiece":
-        LOGGER.info(f"Getting SentencePieceTokenizer with model: {tokenizer_cfg.model}")
-        tokenizer = SentencePieceTokenizer(
-            model_path=os.path.join(nemo_checkpoint_path, tokenizer_cfg.model), legacy=legacy
-        )
-    elif library == "tiktoken":
-        print(f"Getting TiktokenTokenizer with file: {tokenizer_cfg.vocab_file}")
-        tokenizer = TiktokenTokenizer(vocab_file=os.path.join(nemo_checkpoint_path, tokenizer_cfg.vocab_file))
-    else:
-        raise NotImplementedError("Currently we only support 'huggingface' and 'sentencepiece' tokenizer libraries.")
-
-    return tokenizer
diff --git a/nemo/export/trt_llm/qnemo/utils.py b/nemo/export/trt_llm/qnemo/utils.py
deleted file mode 100644
index b64b9d07431e..000000000000
--- a/nemo/export/trt_llm/qnemo/utils.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-from pathlib import Path
-
-from nemo.export.tarutils import TarPath
-
-CONFIG_NAME = "config.json"
-WEIGHTS_NAME = "rank{}.safetensors"
-
-
-def is_qnemo_checkpoint(path: str) -> bool:
-    """Detect if a given path is a TensorRT-LLM a.k.a. "qnemo" checkpoint based on config & tensor data presence."""
-    if os.path.isdir(path):
-        path = Path(path)
-    else:
-        path = TarPath(path)
-    config_path = path / CONFIG_NAME
-    tensor_path = path / WEIGHTS_NAME.format(0)
-    return config_path.exists() and tensor_path.exists()
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
deleted file mode 100755
index 2b7b0cff9965..000000000000
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ /dev/null
@@ -1,133 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import logging
-import tensorrt_llm
-from tensorrt_llm._common import check_max_num_tokens
-from tensorrt_llm.builder import BuildConfig
-from tensorrt_llm.commands.build import build as build_trtllm
-from tensorrt_llm.logger import logger
-from tensorrt_llm.lora_manager import LoraConfig
-from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights
-from tensorrt_llm.plugin import PluginConfig
-
-MODEL_NAME = "NeMo"
-
-LOGGER = logging.getLogger("NeMo")
-
-
-def build_and_save_engine(
-    max_input_len=1024,
-    max_output_len=1024,
-    max_batch_size=4,
-    model_dir=None,
-    model_weights=None,
-    model_config=None,
-    model_type='gpt',
-    lora_ckpt_list=None,
-    use_lora_plugin=None,
-    max_lora_rank=64,
-    lora_target_modules=None,
-    max_prompt_embedding_table_size=0,
-    paged_kv_cache: bool = True,
-    remove_input_padding: bool = True,
-    paged_context_fmha: bool = False,
-    use_refit: bool = False,
-    max_num_tokens: int = None,
-    max_seq_len: int = None,
-    opt_num_tokens: int = None,
-    max_beam_width: int = 1,
-    tokens_per_block: int = 128,
-    multiple_profiles: bool = False,
-    gpt_attention_plugin: str = "auto",
-    gemm_plugin: str = "auto",
-    reduce_fusion: bool = False,
-    gather_context_logits: bool = False,
-    gather_generation_logits: bool = False,
-):
-    architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture
-    try:
-        model_cls = getattr(tensorrt_llm.models, architecture)
-    except Exception:
-        raise AttributeError(f"Could not find TRTLLM model type: {model_type}!")
-
-    logger.set_level("info")
-    plugin_config = PluginConfig()
-    plugin_config.gpt_attention_plugin = gpt_attention_plugin
-    plugin_config.gemm_plugin = gemm_plugin
-    if paged_kv_cache:
-        plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block)
-    else:
-        plugin_config.paged_kv_cache = False
-    plugin_config.remove_input_padding = remove_input_padding
-    plugin_config.use_paged_context_fmha = paged_context_fmha
-    plugin_config.multiple_profiles = multiple_profiles
-    plugin_config.reduce_fusion = reduce_fusion
-
-    max_num_tokens, opt_num_tokens = check_max_num_tokens(
-        max_num_tokens=max_num_tokens,
-        opt_num_tokens=opt_num_tokens,
-        max_seq_len=max_seq_len,
-        max_batch_size=max_batch_size,
-        max_input_len=max_input_len,
-        max_beam_width=max_beam_width,
-        remove_input_padding=remove_input_padding,
-        enable_context_fmha=plugin_config.context_fmha,
-        tokens_per_block=tokens_per_block,
-        multiple_profiles=multiple_profiles,
-    )
-
-    build_dict = {
-        'max_input_len': max_input_len,
-        'max_output_len': max_output_len,
-        'max_batch_size': max_batch_size,
-        'max_beam_width': max_beam_width,
-        'max_seq_len': max_seq_len,
-        'max_num_tokens': max_num_tokens,
-        'opt_num_tokens': opt_num_tokens,
-        'max_prompt_embedding_table_size': max_prompt_embedding_table_size,
-        'gather_context_logits': gather_context_logits,
-        'gather_generation_logits': gather_generation_logits,
-        'strongly_typed': False,
-        'builder_opt': None,
-        'use_refit': use_refit,
-        'multiple_profiles': multiple_profiles,
-    }
-    build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
-
-    if use_lora_plugin is not None:
-        # build_config.plugin_config.set_lora_plugin(use_lora_plugin)
-        build_config.plugin_config._lora_plugin = use_lora_plugin
-        lora_config = LoraConfig(
-            lora_dir=lora_ckpt_list,
-            lora_ckpt_source='nemo',
-            max_lora_rank=max_lora_rank,
-        )
-        if lora_target_modules is not None:
-            lora_config.lora_target_modules = lora_target_modules
-        build_config.lora_config = lora_config
-
-    model = model_cls.from_config(model_config)
-    model = optimize_model(
-        model,
-        use_parallel_embedding=model_config.use_parallel_embedding,
-        share_embedding_table=model_config.share_embedding_table,
-    )
-    preprocess_weights(model_weights, model_config)
-    model.load(model_weights)
-    engine = build_trtllm(model, build_config)
-    engine.save(model_dir)
-
-    return engine
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
deleted file mode 100644
index 3a61d781193c..000000000000
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ /dev/null
@@ -1,931 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import csv
-import json
-import logging
-import os
-import tempfile
-from dataclasses import dataclass
-from pathlib import Path
-from typing import List, Optional
-
-import numpy as np
-import tensorrt as trt
-import tensorrt_llm
-import torch
-from mpi4py.futures import MPIPoolExecutor
-from tensorrt_llm.builder import Engine
-from tensorrt_llm.lora_manager import LoraManager
-from tensorrt_llm.quantization import QuantMode
-from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig
-from transformers import PreTrainedTokenizer
-
-LOGGER = logging.getLogger("NeMo")
-
-use_trtllm_bindings = True
-try:
-    from tensorrt_llm.bindings import GptJsonConfig
-except Exception:
-    use_trtllm_bindings = False
-
-TRTLLM_SUPPORTS_DEVICE_DISABLE = True
-try:
-    from tensorrt_llm.runtime.generation import DISABLE_TORCH_DEVICE_SET
-except (ImportError, ModuleNotFoundError):
-    TRTLLM_SUPPORTS_DEVICE_DISABLE = False
-
-
-@dataclass
-class TensorrtLLMHostContext:
-    """The host side context for TRT LLM inference."""
-
-    executor: MPIPoolExecutor = None
-    world_size: int = 1
-    tokenizer: PreTrainedTokenizer = None
-    max_batch_size: int = 0
-    max_input_len: int = 0
-    add_bos: bool = False
-
-
-@dataclass
-class TensorrtLLMWorkerContext:
-    """The MPI worker side context for TRT LLM inference."""
-
-    decoder: ModelRunner | ModelRunnerCpp = None
-    sampling_config: SamplingConfig = None
-    lora_manager: LoraManager = None
-    max_batch_size: int = 0
-    max_input_len: int = 0
-
-
-# This is a global context that will be initialized during the model loading process as MPI worker.
-tensorrt_llm_worker_context = TensorrtLLMWorkerContext()
-
-
-def _read_config(config_path: Path):
-    with open(config_path, "r") as f:
-        config = json.load(f)
-
-    tensor_parallel_size = config["builder_config"]["tensor_parallel"]
-    pipeline_parallel_size = config["builder_config"]["pipeline_parallel"]
-    world_size = tensor_parallel_size * pipeline_parallel_size
-
-    assert world_size <= torch.cuda.device_count(), f"Not enough GPUs, requesting {world_size}"
-
-    num_heads = config["builder_config"]["num_heads"]
-    num_kv_heads = config["builder_config"].get("num_kv_heads", num_heads)
-    head_size = config["builder_config"]["head_size"]
-    hidden_size = config["builder_config"]["hidden_size"] // tensor_parallel_size
-
-    num_heads = num_heads // tensor_parallel_size
-    num_kv_heads = (num_kv_heads + tensor_parallel_size - 1) // tensor_parallel_size
-
-    if "tokens_per_block" in config["plugin_config"]:
-        tokens_per_block = config["plugin_config"]["tokens_per_block"]
-    else:
-        tokens_per_block = config["builder_config"]["tokens_per_block"]
-
-    if quantization := config["builder_config"].get("quantization"):
-        # Field "quantization" (dict) is introduced for quantized Nemo checkpoints support.
-        # For regular Nemo checkpoints "quant_mode" field should be used (default: 0).
-        quant_mode = QuantMode.from_quant_algo(quantization['quant_algo'], quantization['kv_cache_quant_algo'])
-    else:
-        quant_mode = QuantMode(config["builder_config"]["quant_mode"])
-
-    model_config = ModelConfig(
-        model_name=config["builder_config"]["name"],
-        max_batch_size=config["builder_config"]["max_batch_size"],
-        max_beam_width=config["builder_config"]["max_beam_width"],
-        vocab_size=config["builder_config"]["vocab_size"],
-        num_layers=config["builder_config"]["num_layers"],
-        num_heads=num_heads,
-        num_kv_heads=num_kv_heads,
-        hidden_size=hidden_size,
-        head_size=head_size,
-        gpt_attention_plugin=config["plugin_config"]["gpt_attention_plugin"],
-        remove_input_padding=config["plugin_config"]["remove_input_padding"],
-        paged_kv_cache=config["plugin_config"]["paged_kv_cache"],
-        tokens_per_block=tokens_per_block,
-        max_prompt_embedding_table_size=config["builder_config"]["max_prompt_embedding_table_size"],
-        dtype=config["builder_config"]["precision"],
-        lora_plugin=config["plugin_config"]["lora_plugin"],
-        lora_target_modules=config["builder_config"]["lora_target_modules"],
-        quant_mode=quant_mode,
-        use_context_fmha_for_generation=config["plugin_config"]["use_context_fmha_for_generation"],
-        gather_context_logits=config["builder_config"]["gather_context_logits"],
-        gather_generation_logits=config["builder_config"]["gather_generation_logits"],
-    )
-
-    dtype = config["builder_config"]["precision"]
-    max_input_len = config["builder_config"]["max_input_len"]
-    max_batch_size = config["builder_config"]["max_batch_size"]
-
-    return model_config, world_size, tensor_parallel_size, pipeline_parallel_size, dtype, max_input_len, max_batch_size
-
-
-def _load(
-    tokenizer: PreTrainedTokenizer,
-    engine_dir,
-    lora_ckpt_list=None,
-    num_beams=1,
-    use_python_runtime: bool = True,
-    enable_chunked_context: bool = False,
-    max_tokens_in_paged_kv_cache: int = None,
-    multi_block_mode: bool = False,
-):
-    """The impl of `load` API for on a single GPU worker."""
-    try:
-        tensorrt_llm.logger.set_level("info")
-
-        engine_dir = Path(engine_dir)
-        config_path = engine_dir / "config.json"
-        # model_config, world_size, tp_size, pp_size, dtype, max_input_len, max_batch_size = _read_config(config_path)
-
-        with open(config_path, "r") as f:
-            config = json.load(f)
-
-        max_batch_size = config["build_config"]["max_batch_size"]
-        max_input_len = config["build_config"]["max_input_len"]
-        # max_output_len = config["build_config"]["max_output_len"]
-        max_beam_width = config["build_config"]["max_beam_width"]
-
-        runtime_rank = tensorrt_llm.mpi_rank()
-
-        if use_python_runtime:
-            if enable_chunked_context:
-                logging.warning("enable_chunked_context is disabled when using python runtime")
-            if multi_block_mode:
-                logging.warning("multi_block_mode is disabled when using python runtime")
-
-            decoder = ModelRunner.from_dir(
-                engine_dir=engine_dir,
-                lora_dir=lora_ckpt_list,
-                lora_ckpt_source="nemo",
-                rank=runtime_rank,
-                debug_mode=False,
-            )
-        else:
-            decoder = ModelRunnerCpp.from_dir(
-                engine_dir=engine_dir,
-                lora_dir=lora_ckpt_list,
-                lora_ckpt_source="nemo",
-                rank=runtime_rank,
-                max_batch_size=max_batch_size,
-                max_input_len=max_input_len,
-                # max_output_len=max_output_len,
-                max_beam_width=max_beam_width,
-                enable_chunked_context=enable_chunked_context,
-                max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache,
-                multi_block_mode=multi_block_mode,
-                debug_mode=False,
-            )
-
-        sampling_config = SamplingConfig(
-            end_id=tokenizer.eos_token_id, pad_id=tokenizer.eos_token_id, num_beams=num_beams
-        )
-
-        # Initialize the global context so it can be used during `run` API.
-        global tensorrt_llm_worker_context
-        tensorrt_llm_worker_context.decoder = decoder
-        tensorrt_llm_worker_context.sampling_config = sampling_config
-        tensorrt_llm_worker_context.max_batch_size = max_batch_size
-        tensorrt_llm_worker_context.max_input_len = max_input_len
-
-    except Exception as e:
-        print(e)
-        raise e
-
-
-def _forward(
-    input_tensors: List[torch.IntTensor],
-    max_output_len: int,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    prompt_table=None,
-    task_vocab_size=None,
-    task_ids: List[int] = None,
-    lora_uids: List[str] = None,
-    stop_words_list=None,
-    bad_words_list=None,
-    no_repeat_ngram_size=None,
-    streaming: bool = False,
-    multiprocessed_env=False,
-    **sampling_kwargs,
-) -> Optional[torch.IntTensor]:
-    """The impl of `forward` API for on a single GPU worker with tensor as IO.
-
-    Returns:
-        the output tokens tensor with shape [batch_size, num_beams, output_len].
-    """
-    try:
-        # Loading the global context initialized from the `load` API.
-        global tensorrt_llm_worker_context
-        decoder = tensorrt_llm_worker_context.decoder
-        assert decoder is not None, "Invalid worker context, decoder is not loaded."
-        sampling_config = tensorrt_llm_worker_context.sampling_config
-        max_batch_size = tensorrt_llm_worker_context.max_batch_size
-        max_input_len = tensorrt_llm_worker_context.max_input_len
-
-        batch_size = len(input_tensors)
-        assert batch_size <= max_batch_size, f"batch size {batch_size} exceedng max batch size {max_batch_size}"
-        input_lengths = [t.shape[0] for t in input_tensors]
-        max_length = max(input_lengths)
-        assert max_length <= max_input_len, f"input length {max_length} exceedng max input length {max_input_len}"
-        pad_id = sampling_config.pad_id
-        end_id = sampling_config.end_id
-        num_beams = sampling_config.num_beams
-
-        for k in sampling_kwargs.keys():
-            if not hasattr(sampling_config, k):
-                raise TypeError(f"Unknown sampling args '{k}'")
-
-        with torch.no_grad():
-            prompt_tasks = None if task_ids is None else ",".join(str(task) for task in task_ids)
-
-            if prompt_table is not None:
-                prompt_table = prompt_table.reshape(1, *prompt_table.shape)
-                tmp_dir = tempfile.TemporaryDirectory()
-                prompt_table_path = os.path.join(tmp_dir.name, 'prompt_table.npy')
-                np.save(prompt_table_path, prompt_table.cpu().float().numpy())
-                prompt_table = prompt_table_path
-
-            outputs = decoder.generate(
-                input_tensors,
-                max_new_tokens=max_output_len,
-                end_id=end_id,
-                pad_id=pad_id,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                num_beams=num_beams,
-                stop_words_list=stop_words_list,
-                bad_words_list=bad_words_list,
-                lora_uids=lora_uids,
-                prompt_table_path=prompt_table,
-                prompt_table=prompt_table,
-                prompt_tasks=prompt_tasks,
-                streaming=streaming,
-                output_sequence_lengths=True,
-                return_dict=True,
-                **sampling_kwargs,
-            )
-
-            torch.cuda.synchronize()
-
-            if prompt_table is not None:
-                tmp_dir.cleanup()
-
-        runtime_rank = tensorrt_llm.mpi_rank()
-        if runtime_rank == 0 or multiprocessed_env:
-            return outputs
-        else:
-            return None
-
-    except Exception as e:
-        print(e)
-        raise e
-
-
-def load(
-    tokenizer: PreTrainedTokenizer,
-    engine_dir: str,
-    lora_ckpt_list: List[str] = None,
-    num_beams: int = 1,
-    use_python_runtime: bool = True,
-    enable_chunked_context: bool = False,
-    max_tokens_in_paged_kv_cache: int = None,
-    multi_block_mode: bool = False,
-) -> TensorrtLLMHostContext:
-    """Loaded the compiled LLM model and run it.
-
-    It also supports running the TRT LLM model on multi-GPU.
-    """
-    # the parent dir of the engine_dir
-    config_path = os.path.join(engine_dir, "config.json")
-    with open(config_path, "r") as f:
-        config = json.load(f)
-    world_size = config["pretrained_config"]["mapping"]["world_size"]
-    if world_size == 1:
-        _load(
-            tokenizer,
-            engine_dir,
-            lora_ckpt_list,
-            num_beams,
-            use_python_runtime,
-            enable_chunked_context,
-            max_tokens_in_paged_kv_cache,
-            multi_block_mode,
-        )
-        executor = None
-    elif tensorrt_llm.mpi_world_size() > 1:
-        _load(
-            tokenizer,
-            engine_dir,
-            lora_ckpt_list,
-            num_beams,
-            use_python_runtime,
-            enable_chunked_context,
-            max_tokens_in_paged_kv_cache,
-        )
-        executor = None
-        tensorrt_llm.mpi_barrier()
-    else:
-        executor = MPIPoolExecutor(max_workers=world_size)
-        futures = []
-        for _ in range(world_size):
-            future = executor.submit(
-                _load,
-                tokenizer,
-                engine_dir,
-                lora_ckpt_list,
-                num_beams,
-                use_python_runtime,
-                enable_chunked_context,
-                max_tokens_in_paged_kv_cache,
-            )
-            futures.append(future)
-        for future in futures:
-            future.result()
-
-    max_batch_size = config["build_config"]["max_batch_size"]
-    max_input_len = config["build_config"]["max_input_len"]
-    architectures_that_need_bos_token = [
-        "GemmaForCausalLM",
-        "LLaMAForCausalLM",
-        "MistralForCausalLM",
-        "MixtralForCausalLM",
-    ]
-    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
-
-    return TensorrtLLMHostContext(
-        executor=executor,
-        world_size=world_size,
-        tokenizer=tokenizer,
-        max_batch_size=max_batch_size,
-        max_input_len=max_input_len,
-        add_bos=add_bos,
-    )
-
-
-def forward(
-    input_tensors: List[torch.IntTensor],
-    max_output_len: int,
-    host_context: TensorrtLLMHostContext,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    prompt_table=None,
-    task_vocab_size=None,
-    task_ids: List[int] = None,
-    lora_uids: List[str] = None,
-    stop_words_list=None,
-    bad_words_list=None,
-    no_repeat_ngram_size=None,
-    streaming: bool = False,
-    multiprocessed_env=False,
-    **sampling_kwargs,
-) -> Optional[torch.IntTensor]:
-    """Run the loaded model with the host_context provided from the `load` API."""
-    batch_size = len(input_tensors)
-    max_batch_size = host_context.max_batch_size
-    assert batch_size <= max_batch_size, f"batch size {batch_size} exceedng max batch size {max_batch_size}"
-    max_length = max([t.shape[0] for t in input_tensors])
-    max_input_len = host_context.max_input_len
-    assert max_length <= max_input_len, f"input length {max_length} exceedng max input length {max_input_len}"
-
-    world_size = host_context.world_size
-    if world_size == 1 or multiprocessed_env:
-        return _forward(
-            input_tensors=input_tensors,
-            max_output_len=max_output_len,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            prompt_table=prompt_table,
-            task_vocab_size=task_vocab_size,
-            task_ids=task_ids,
-            lora_uids=lora_uids,
-            stop_words_list=stop_words_list,
-            bad_words_list=bad_words_list,
-            no_repeat_ngram_size=no_repeat_ngram_size,
-            streaming=streaming,
-            multiprocessed_env=multiprocessed_env,
-            **sampling_kwargs,
-        )
-    else:
-        executor = host_context.executor
-        futures = []
-        for _ in range(world_size):
-            future = executor.submit(
-                _forward,
-                input_tensors=input_tensors,
-                max_output_len=max_output_len,
-                top_k=top_k,
-                top_p=top_p,
-                temperature=temperature,
-                prompt_table=prompt_table,
-                task_vocab_size=task_vocab_size,
-                task_ids=task_ids,
-                lora_uids=lora_uids,
-                stop_words_list=stop_words_list,
-                bad_words_list=bad_words_list,
-                no_repeat_ngram_size=no_repeat_ngram_size,
-                streaming=streaming,
-                **sampling_kwargs,
-            )
-            futures.append(future)
-        for future in futures:
-            result = future.result()
-            if result is not None:
-                return result
-
-        raise RuntimeError("Internal error")
-
-
-def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
-    """Loads TRTLLM engines in a distributed gpu environment, in particular
-    this function creates a custom mapping of device_id to WorldConfig
-    """
-    global tensorrt_llm_worker_context
-    if isinstance(tensorrt_llm_worker_context.decoder, ModelRunner):
-        return
-
-    config_path = Path(engine_dir) / f"config_{torch.distributed.get_rank()}.json"
-    json_config = GptJsonConfig.parse_file(config_path)
-    model_config = json_config.model_config
-
-    max_batch_size = model_config.max_batch_size
-    max_input_len = model_config.max_input_len
-
-    tp_size = json_config.tensor_parallelism
-    assert tp_size <= gpus_per_node, "Multinode TP is not unsupported"
-
-    # TRTLLM asserts that rank equals the device num however this
-    # is not true for the megatron mapping of TP->DP->PP.
-    # So we manipulate TRTLLM to emulate a TP->PP single node setup
-    # TRTLLM is expected to fix this in future releases
-    offset = (torch.cuda.current_device() - model_parallel_rank % gpus_per_node + gpus_per_node) % gpus_per_node
-    device_ids = [i for i in range(gpus_per_node)]
-    for _ in range(offset):
-        device_ids.append(device_ids.pop(0))
-    engine_index = model_parallel_rank
-    # mpi_rank = mpi_comm().Get_rank()
-    # Copied from worldConfig.h (getDevice())
-    # mpi_device = mpi_rank % gpus_per_node
-    # TODO: Consider re-enabling
-    # assert torch.cuda.current_device() == mpi_device
-
-    # TODO: check if API exists (copied from gptJsonConfig.cpp)
-    # https://github.com/terrykong/TensorRT-LLM/blob/05316d3313360012536ace46c781518f5afae75e/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp#L478
-    engine_filename = f"rank{engine_index}.engine"
-    serialize_path = Path(engine_dir) / engine_filename
-    with open(serialize_path, "rb") as f:
-        engine_data = bytearray(f.read())
-
-    with open(config_path) as f:
-        json_config_str = f.read()
-
-    engine = Engine.from_buffer(engine_buffer=engine_data, json_config_str=json_config_str, rank=model_parallel_rank)
-
-    if not TRTLLM_SUPPORTS_DEVICE_DISABLE:
-        raise RuntimeError(
-            "TensorRT-LLM does not support torch device disabling. "
-            "Please upgrade TensorRT-LLM to make use of this feature."
-        )
-    elif not DISABLE_TORCH_DEVICE_SET:
-        raise RuntimeError(
-            "To use TensorRT-LLM's python ModelRunner API in load_distributed(...) "
-            "you must set the env var DISABLE_TORCH_DEVICE_SET=1"
-        )
-
-    default_kwargs = {
-        "max_output_len": None,
-        "lora_dir": None,
-        "debug_mode": False,
-        "lora_ckpt_source": "hf",
-        "medusa_choices": None,
-        "stream": None,
-        "gpu_weights_percent": 1.0,
-        "enable_context_fmha_fp32_acc": False,
-        "multi_block_mode": True,
-    }
-
-    decoder = ModelRunner.from_engine(
-        engine=engine,
-        # We want the engine to have the mp_rank,
-        # but the python runtime to not resassign the device of the current process
-        # So we will set it to the current device
-        rank=torch.cuda.current_device(),
-        **default_kwargs,
-    )
-
-    tensorrt_llm_worker_context.decoder = decoder
-    tensorrt_llm_worker_context.max_batch_size = max_batch_size
-    tensorrt_llm_worker_context.max_input_len = max_input_len
-
-
-def maybe_cast_to_trt_dtype(dtype):
-    """
-    Cast input dtype to TensorRT dtype if applicable.
-
-    Args:
-        dtype: Input dtype (torch.dtype or trt.DataType)
-
-    Returns:
-        trt.DataType: Corresponding TensorRT dtype
-    """
-    if isinstance(dtype, trt.DataType):
-        return dtype
-    elif isinstance(dtype, torch.dtype):
-        return tensorrt_llm._utils.torch_dtype_to_trt(dtype)
-    else:
-        raise NotImplementedError(f"Expects the type to be a tensorrt.DataType or torch.dtype, but got {type(dtype)=}")
-
-
-def refit(weights_dict: dict):
-    """
-    Refit TensorRT-LLM by hot-swapping its engine weights.
-
-    Args:
-        weights_dict: Dictionary containing new weights
-    """
-    global tensorrt_llm_worker_context
-    decoder = tensorrt_llm_worker_context.decoder
-    if not isinstance(decoder, ModelRunner):
-        raise ValueError(
-            f"Refit is only supported with ModelRunner, but export has been configured with {type(decoder)=}"
-        )
-
-    engine = decoder.session.runtime.engine
-    # The session dtype plumbs the model_config's dtype
-    model_dtype = maybe_cast_to_trt_dtype(decoder.session.dtype)
-    assert engine.refittable, "Tried refitting engine without refit enabled"
-
-    refitter = trt.Refitter(engine=engine, logger=trt.Logger(trt.Logger.ERROR))
-    remaining_refit_weights = set(refitter.get_all_weights())
-    skipped_weights = []
-    for trt_name, weight in weights_dict.items():
-        if trt_name not in remaining_refit_weights:
-            skipped_weights.append(trt_name)
-            continue
-        trt_weight = trt.Weights(model_dtype, weight.data_ptr(), torch.numel(weight))
-        trt_wt_location = trt.TensorLocation.DEVICE if weight.is_cuda else trt.TensorLocation.HOST
-        assert (
-            model_dtype == refitter.get_weights_prototype(trt_name).dtype == maybe_cast_to_trt_dtype(weight.dtype)
-        ), (
-            f"Expected all three of these dtypes to be the same:\n"
-            f"  {model_dtype=}\n"
-            f"  {refitter.get_weights_prototype(trt_name).dtype=}\n"
-            f"  weight.dtype={maybe_cast_to_trt_dtype(weight.dtype)}"
-        )
-
-        refitter.set_named_weights(
-            trt_name, trt_weight, trt_wt_location
-        ), f"Unable to set {trt_name=} {trt_weight=} {trt_wt_location=}"
-        remaining_refit_weights.remove(trt_name)
-    if skipped_weights:
-        logging.warning(
-            f"These weights were ignored during refit since they are not present in engine: {skipped_weights}"
-        )
-    if remaining_refit_weights:
-        logging.warning(f"Weights dict did not contain weights for these named TRT weights: {remaining_refit_weights}")
-
-    if not refitter.refit_cuda_engine():
-        raise ValueError("Refit failed!")
-
-
-def unload_engine():
-    """
-    Deletes the ModelRunner which should free up device memory
-    """
-    global tensorrt_llm_worker_context
-    decoder = tensorrt_llm_worker_context.decoder
-    if not isinstance(decoder, ModelRunner):
-        raise ValueError(
-            f"unload_engine is only supported with ModelRunner, but export has been configured with {type(decoder)=}"
-        )
-
-    logging.info("Unloading engine...")
-    del tensorrt_llm_worker_context.decoder
-    tensorrt_llm_worker_context.decoder = None
-    logging.info("Engine unloaded!")
-
-
-def prepare_input_tensors(
-    input_texts: List[str],
-    host_context: TensorrtLLMHostContext,
-    prompt_table=None,
-    task_vtoken_counts: List[int] = None,
-    task_ids: List[int] = None,
-):
-    """
-    Prepare input tensors from text input.
-
-    Args:
-        input_texts: List of input text strings
-        host_context: Context containing tokenizer and configuration
-        prompt_table: a lookup table containing trained embeddings for vtoken used in p-tuning
-        task_vtoken_counts: Optional list of vtoken counts per task
-        task_ids: Optional list of task IDs
-
-    Returns:
-        dict: Prepared input tensors for model
-    """
-
-    tokenizer = host_context.tokenizer
-
-    if host_context.add_bos:
-        bos_tokens = [tokenizer.bos_token_id]
-    else:
-        bos_tokens = []
-
-    input_tokens = [bos_tokens + tokenizer.encode(t) for t in input_texts]
-
-    # If p-tuning is used, we need to prepend vtokens to each input.
-    if prompt_table is not None:
-
-        # Go over the tokenized prompts and prepend vtokens.
-        # The number of vtokens could be different for each task.
-        for prompt_index in range(len(input_texts)):
-            # Find out the number of vtokens to generate
-            task_id = task_ids[prompt_index]
-            num_vtokens = task_vtoken_counts[task_id]
-
-            # Create a tensor with vtokens, e.g. 32000, 32001, 32002... when vocab_size=32000
-            # TRT-LLM will convert each vtoken into its corresponding embedding row from the prompt table.
-            vocab_size = tokenizer.vocab_size
-            vtokens = list(range(vocab_size, vocab_size + num_vtokens))
-
-            # Concatenate the vtokens with the real tokens
-            real_tokens = input_tokens[prompt_index]
-            input_tokens[prompt_index] = vtokens + real_tokens
-
-    # Convert input token lists to tensors
-    input_tensors = [torch.IntTensor(token_list) for token_list in input_tokens]
-
-    return input_tensors
-
-
-def generate(
-    input_texts: List[str],
-    max_output_len: int,
-    host_context: TensorrtLLMHostContext,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    prompt_table=None,
-    task_vocab_size=None,
-    task_vtoken_counts: List[int] = None,
-    task_ids: List[int] = None,
-    lora_uids: List[str] = None,
-    stop_words_list=None,
-    bad_words_list=None,
-    no_repeat_ngram_size=None,
-    streaming: bool = False,
-    output_log_probs=False,
-    multiprocessed_env=False,
-    output_context_logits=False,
-    output_generation_logits=False,
-    **sampling_kwargs,
-) -> Optional[List[List[str]]]:
-    """Generate the output sequence from the input sequence.
-
-    Returns a 2D string list with shape [batch_size, num_beams].
-    """
-    tokenizer = host_context.tokenizer
-    input_tensors = prepare_input_tensors(input_texts, host_context, prompt_table, task_vtoken_counts, task_ids)
-
-    stop_words_list_tensors = None
-    if stop_words_list is not None:
-        stop_words_arrays = to_word_list_format(stop_words_list, tokenizer)
-        stop_words_list_tensors = (
-            torch.Tensor(stop_words_arrays).to(torch.int32).to(torch.cuda.current_device()).contiguous()
-        )
-
-    bad_words_list_tensors = None
-    if bad_words_list is not None:
-        bad_words_arrays = to_word_list_format(bad_words_list, tokenizer)
-        bad_words_list_tensors = (
-            torch.Tensor(bad_words_arrays).to(torch.int32).to(torch.cuda.current_device()).contiguous()
-        )
-
-    if no_repeat_ngram_size is not None:
-        no_repeat_ngram_size = torch.IntTensor(no_repeat_ngram_size).to(torch.cuda.current_device())
-
-    outputs = forward(
-        input_tensors=input_tensors,
-        max_output_len=max_output_len,
-        host_context=host_context,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        prompt_table=prompt_table,
-        task_vocab_size=task_vocab_size,
-        task_ids=task_ids,
-        lora_uids=lora_uids,
-        stop_words_list=stop_words_list_tensors,
-        bad_words_list=bad_words_list_tensors,
-        no_repeat_ngram_size=no_repeat_ngram_size,
-        streaming=False,
-        output_log_probs=output_log_probs,
-        multiprocessed_env=multiprocessed_env,
-        **sampling_kwargs,
-    )
-
-    assert outputs is not None
-    if tensorrt_llm.mpi_rank() != 0:
-        return None
-
-    output_ids = outputs['output_ids']
-    sequence_lengths = outputs['sequence_lengths']
-    input_lengths = [t.shape[0] for t in input_tensors]
-
-    output_lines_list = [
-        tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]])
-        for b in range(output_ids.shape[0])
-    ]
-
-    if output_generation_logits:
-        return output_lines_list, outputs['generation_logits']
-    elif output_context_logits:
-        return output_lines_list, outputs['context_logits']
-    return output_lines_list
-
-
-def generate_streaming(
-    input_texts: List[str],
-    max_output_len: int,
-    host_context: TensorrtLLMHostContext,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    prompt_table=None,
-    task_vocab_size=None,
-    task_vtoken_counts: List[int] = None,
-    task_ids: List[int] = None,
-    lora_uids: List[str] = None,
-    stop_words_list=None,
-    bad_words_list=None,
-    no_repeat_ngram_size=None,
-    **sampling_kwargs,
-) -> Optional[List[List[str]]]:
-    """Generate the output sequence from the input sequence.
-
-    Returns a 2D string list with shape [batch_size, num_beams].
-    """
-    tokenizer = host_context.tokenizer
-    input_tensors = prepare_input_tensors(input_texts, host_context, prompt_table, task_vtoken_counts, task_ids)
-
-    batch_size = len(input_texts)
-
-    stop_words_list_tensors = None
-    if stop_words_list is not None:
-        stop_words_list_tensors = [tokenizer.encode(t) for t in stop_words_list]
-        stop_words_list_tensors = torch.IntTensor(stop_words_list_tensors)
-        stop_words_list_tensors = (
-            stop_words_list_tensors.unsqueeze(0).repeat(batch_size, 1, 1).to(torch.cuda.current_device())
-        )
-
-    bad_words_list_tensors = None
-    if bad_words_list is not None:
-        bad_words_list_tensors = [tokenizer.encode(t) for t in bad_words_list]
-        bad_words_list_tensors = torch.IntTensor(bad_words_list_tensors)
-        bad_words_list_tensors = (
-            bad_words_list_tensors.unsqueeze(0).repeat(batch_size, 1, 1).to(torch.cuda.current_device())
-        )
-
-    if no_repeat_ngram_size is not None:
-        no_repeat_ngram_size = torch.IntTensor(no_repeat_ngram_size).to(torch.cuda.current_device())
-
-    outputs = forward(
-        input_tensors=input_tensors,
-        max_output_len=max_output_len,
-        host_context=host_context,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        prompt_table=prompt_table,
-        task_vocab_size=task_vocab_size,
-        task_ids=task_ids,
-        lora_uids=lora_uids,
-        stop_words_list=stop_words_list_tensors,
-        bad_words_list=bad_words_list_tensors,
-        no_repeat_ngram_size=no_repeat_ngram_size,
-        streaming=True,
-        **sampling_kwargs,
-    )
-    assert outputs is not None
-
-    input_lengths = [t.shape[0] for t in input_tensors]
-
-    # 'outputs' is a generator that yields one generator, not sure why... Unwrap that.
-    for output in outputs:
-        output_ids = output['output_ids']
-        # Now iterate over the partial outputs, decode and yield each intermediate result.
-        generated_tokens = 0
-        for partial_outputs in output_ids:
-            if partial_outputs is None:
-                break
-            # partial_outputs is a tensor with shape=(len(input_texts), 1, output_length),
-            # where the last dimension contains a progressively increasing number of valid, generated tokens.
-            assert partial_outputs.shape[0] == len(input_texts)
-            outputs = []
-            generated_tokens += 1
-
-            # For each input in the batch...
-            for input_index in range(len(input_texts)):
-                # Extract the generated part of the output tensor and decode it.
-                input_length = input_lengths[input_index]
-                decoded_output = tokenizer.batch_decode(
-                    partial_outputs[input_index, :, input_length : input_length + generated_tokens]
-                )[0]
-                outputs.append(decoded_output)
-
-            # Yield the list of decoded partial responses.
-            yield outputs
-        # See above - 'outputs' yields just one item.
-        break
-
-
-def unload(host_context: TensorrtLLMHostContext):
-    """Frees the GPU resource from the TensorrtLLMHostContext and reset the host_context."""
-    if host_context.executor is not None:
-        host_context.executor.shutdown(wait=True)
-        host_context.executor = None
-        return
-
-    global tensorrt_llm_worker_context
-    tensorrt_llm_worker_context.decoder = None
-    tensorrt_llm_worker_context = TensorrtLLMWorkerContext()
-
-
-def to_word_list_format(
-    word_dict: List[List[str]],
-    tokenizer=None,
-    ref_str="",
-):
-    '''
-    format of word_dict
-        len(word_dict) should be same to batch_size
-        word_dict[i] means the words for batch i
-        len(word_dict[i]) must be 1, which means it only contains 1 string
-        This string can contains several sentences and split by ",".
-        For example, if word_dict[2] = " I am happy, I am sad", then this function will return
-        the ids for two short sentences " I am happy" and " I am sad".
-    '''
-    assert tokenizer is not None, "need to set tokenizer"
-
-    flat_ids = []
-    offsets = []
-    # The encoding of a single word can't always be trusted. See
-    #   https://github.com/NVIDIA/NeMo/blob/bb575b72fd0be51ae10cc77d9f89ddb9e9d3b96d/nemo/collections/nlp/modules/common/text_generation_strategy.py#L229  # pylint: disable=C0301
-    ids_ref = tokenizer.encode(ref_str)
-    for word_dict_item in word_dict:
-        item_flat_ids = []
-        item_offsets = []
-
-        if isinstance(word_dict_item[0], bytes):
-            word_dict_item = [word_dict_item[0].decode()]
-
-        words = list(csv.reader(word_dict_item))[0]
-        for word in words:
-            ids = tokenizer.encode(f"{ref_str}{word}")
-            if ids[0 : len(ids_ref)] == ids_ref:
-                # It worked! We can obtain the token(s) associated to `word` by stripping the prefix tokens.
-                ids = ids[len(ids_ref) :]
-            else:
-                # Unfortunately the prefix was merged with `word`. We could try with a different prefix, but
-                # for now we just use the basic encoding since this should be a very rare edge case.
-                ids = tokenizer.encode(word)
-                logging.warning(f"The encoding of word '{word}' into tokens {ids} might be incorrect")
-
-            if len(ids) == 0:
-                continue
-
-            item_flat_ids += ids
-            item_offsets.append(len(ids))
-
-        flat_ids.append(np.array(item_flat_ids))
-        offsets.append(np.cumsum(np.array(item_offsets)))
-
-    pad_to = max(1, max(len(ids) for ids in flat_ids))
-
-    for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
-        flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
-        offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)
-
-    return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
diff --git a/nemo/export/trt_llm/utils.py b/nemo/export/trt_llm/utils.py
deleted file mode 100644
index d24183923281..000000000000
--- a/nemo/export/trt_llm/utils.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional
-
-import tensorrt_llm
-
-
-def is_rank(rank: Optional[int]) -> bool:
-    """
-    Check if the current MPI rank matches the specified rank.
-
-    Args:
-        rank (Optional[int]): The rank to check against.
-
-    Returns:
-        bool: True if the current rank matches the specified rank or if rank is None.
-    """
-    current_rank = tensorrt_llm.mpi_rank()
-    if rank is None:
-        return True
-    if isinstance(rank, int):
-        return current_rank == rank
-    raise ValueError(f"Invalid rank argument {rank} of type {type(rank)}.")
diff --git a/nemo/export/utils/__init__.py b/nemo/export/utils/__init__.py
deleted file mode 100644
index ed7ee448bc6e..000000000000
--- a/nemo/export/utils/__init__.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from nemo.export.utils.lora_converter import convert_lora_nemo_to_canonical
-from nemo.export.utils.model_loader import (
-    load_model_weights,
-    load_sharded_metadata_torch_dist,
-    load_sharded_metadata_zarr,
-    nemo_to_path,
-)
-from nemo.export.utils.utils import (
-    get_example_inputs,
-    get_model_device_type,
-    is_nemo2_checkpoint,
-    is_nemo_tarfile,
-    prepare_directory_for_export,
-    torch_dtype_from_precision,
-    validate_fp8_network,
-)
-
-__all__ = [
-    "convert_lora_nemo_to_canonical",
-    "load_model_weights",
-    "load_sharded_metadata_torch_dist",
-    "load_sharded_metadata_zarr",
-    "nemo_to_path",
-    "is_nemo2_checkpoint",
-    "is_nemo_tarfile",
-    "prepare_directory_for_export",
-    "torch_dtype_from_precision",
-    "get_model_device_type",
-    "get_example_inputs",
-    "validate_fp8_network",
-]
diff --git a/nemo/export/utils/_mock_import.py b/nemo/export/utils/_mock_import.py
deleted file mode 100644
index 0eabda79a926..000000000000
--- a/nemo/export/utils/_mock_import.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import importlib
-import logging
-import sys
-import types
-from contextlib import contextmanager
-
-LOGGER = logging.getLogger("NeMo")
-
-"""
-Utility to mock imports of unavailable modules.
-
-Created for the purpose of using NeMo checkpoints produced with nvcr.io/nvidia/nemo:25.02.rc2
-containers (or later) and used in the environments where Megatron-Core is not available. This
-currently includes NIM containers.
-"""
-
-
-@contextmanager
-def _mock_import(module: str):
-    """
-    Context manager to mock the import of a specified module if it is not available.
-
-    Args:
-        module (str): The name of the module to mock.
-
-    Yields:
-        Yields control back to the caller.
-    """
-
-    class DummyModule(types.ModuleType):
-        """DummyModule."""
-
-        def __getattr__(self, name):
-            class Dummy:
-                """Dummy."""
-
-                pass
-
-            return Dummy
-
-    try:
-        importlib.import_module(module)
-    except ModuleNotFoundError:
-        LOGGER.warning(f"Module '{module}' is not available, mocking with a dummy module.")
-        sys_modules_backup = sys.modules.copy()
-
-        dummy_module = DummyModule("dummy")
-        module_name, *submodules = module.split(".")
-        sys.modules[module_name] = dummy_module
-        modules_mocked = [module_name]
-        for submodule in submodules:
-            module_name += f".{submodule}"
-            sys.modules[module_name] = dummy_module
-            modules_mocked.append(module_name)
-
-        yield
-
-        # Restore the original sys.modules
-        for module_name in modules_mocked:
-            if module_name in sys_modules_backup:
-                sys.modules[module_name] = sys_modules_backup[module_name]
-            else:
-                del sys.modules[module_name]
-    else:
-        yield
diff --git a/nemo/export/utils/constants.py b/nemo/export/utils/constants.py
deleted file mode 100644
index b7360e5f1f22..000000000000
--- a/nemo/export/utils/constants.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Export
-TRTLLM_ENGINE_DIR = "trtllm_engine"
diff --git a/nemo/export/utils/lora_converter.py b/nemo/export/utils/lora_converter.py
deleted file mode 100644
index 020a87ac9f70..000000000000
--- a/nemo/export/utils/lora_converter.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-import re
-import tarfile
-import tempfile
-from pathlib import Path
-from typing import Any, Dict, List, Tuple
-
-import torch
-import yaml
-
-from nemo.export.tarutils import TarPath
-
-
-def replace_number_add_offset(key, offset_value):
-    # This function finds the layer number in the state dict key and adds a numeric offset to that number
-
-    if offset_value == 0:
-        return key
-
-    pattern = r'layers.(\d+)'
-
-    def add_offset(match):
-        return "layers." + str(int(match.group(1)) + offset_value)
-
-    return re.sub(pattern, add_offset, key)
-
-
-def rename_qkv_keys(key):
-    new_keys = []
-    new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.q_adapter."))
-    new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.k_adapter."))
-    new_keys.append(key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.v_adapter."))
-    return new_keys
-
-
-def reformat_module_names_to_hf(tensors: Dict[str, torch.Tensor]) -> Tuple[Dict[str, torch.Tensor], List[str]]:
-    new_tensors = dict()
-    module_names = set()
-    known_module_names = ["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "gate_proj", "up_proj"]
-    for module_name, module_weight in tensors.items():
-        # map linear_in and linear_out to lora_a/lora_b counterparts
-        new_module_name = "base_model." + module_name.replace("linear_in", "lora_A").replace("linear_out", "lora_B")
-
-        # map target modules to their vLLM/HF counterparts
-        new_module_name = new_module_name.replace("q_adapter", "q_proj")
-        new_module_name = new_module_name.replace("k_adapter", "k_proj")
-        new_module_name = new_module_name.replace("v_adapter", "v_proj")
-        new_module_name = new_module_name.replace("lora_dense_attention_adapter", "o_proj")
-        new_module_name = new_module_name.replace("lora_4htoh_adapter", "down_proj")
-        new_module_name = new_module_name.replace("gate_adapter", "gate_proj")
-        new_module_name = new_module_name.replace("up_adapter", "up_proj")
-
-        # map other parts of the module names to fit vLLM/huggingface
-        new_module_name = new_module_name.replace(".adapter_layer", "")
-        new_module_name = new_module_name.replace(".lora_unfused_kqv_proj", "")
-        new_module_name = new_module_name.replace(".lora_unfused_hto4h_adapter", "")
-        new_module_name = new_module_name.replace("self_attention", "self_attn")
-        new_module_name = new_module_name.replace("decoder", "model")
-
-        new_tensors[new_module_name] = module_weight
-
-        # keep track of the modules that we've added to store them in the config file
-        for kmn in known_module_names:
-            if f'.{kmn}' in new_module_name:
-                module_names.add(kmn)
-
-    return (new_tensors, list(module_names))
-
-
-def convert_lora_weights_to_canonical(
-    config: Dict[str, Any], lora_weights: Dict[str, torch.Tensor]
-) -> Dict[str, torch.Tensor]:
-    """This function converts nemo style (fused) lora weights to canonical (unfused)
-    LoRA weights. Namely, it unfuses the QKV adapter layers and the H-to-4H adapter layers.
-
-    Returns:
-        Dict[str, torch.Tensor]: The new LoRA weights with unfused layers.
-    """
-
-    hidden_size = int(config["hidden_size"])
-    num_heads = int(config["num_attention_heads"])
-    head_size = hidden_size // num_heads
-    num_query_groups = int(config.get("num_query_groups", num_heads))  # num_kv_heads
-
-    heads_per_group = num_heads // num_query_groups
-    qkv_total_dim = num_heads + 2 * num_query_groups
-
-    adapter_size = config['peft']['lora_tuning']['adapter_dim']
-
-    q_slice = torch.cat(
-        [
-            torch.arange((heads_per_group + 2) * group_idx, (heads_per_group + 2) * group_idx + heads_per_group)
-            for group_idx in range(num_query_groups)
-        ]
-    )
-    k_slice = torch.arange(heads_per_group, qkv_total_dim, heads_per_group + 2)
-    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, heads_per_group + 2)
-
-    qkv_keys_to_update = []
-    hto4h_keys_to_update = []
-    for key in lora_weights.keys():
-        if "lora_kqv_adapter" in key:
-            qkv_keys_to_update.append(key)
-        if "lora_hto4h_adapter" in key:
-            hto4h_keys_to_update.append(key)
-
-    # unfuse QKV layer
-    for key in qkv_keys_to_update:
-        if "linear_in" in key:
-            assert lora_weights[key].size(0) == adapter_size
-            for new_key in rename_qkv_keys(key):
-                lora_weights[new_key] = lora_weights[key]
-                assert len(lora_weights[new_key].size()) == 2
-        elif "linear_out" in key:
-            assert lora_weights[key].size(1) == adapter_size
-            for new_key, size in zip(rename_qkv_keys(key), [q_slice, k_slice, v_slice]):
-                lora_weights[new_key] = (
-                    lora_weights[key]
-                    .reshape((qkv_total_dim, head_size, adapter_size))[size]
-                    .reshape((-1, adapter_size))
-                )
-                assert len(lora_weights[new_key].size()) == 2
-        lora_weights.pop(key)
-
-    # This maps to gate_up_proj in HF, but we need to split it up into gate_proj and up_proj
-    for key in hto4h_keys_to_update:
-        gate_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.gate_adapter.")
-        up_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.up_adapter.")
-
-        module_weight = lora_weights[key]
-        if "linear_in" in key:
-            # lora_a gets duplicated
-            lora_weights[gate_proj_key] = module_weight
-            lora_weights[up_proj_key] = module_weight
-        elif "linear_out" in key:
-            # lora_b gets split
-            split_size = module_weight.shape[0]
-            gate_up_split = module_weight.split(split_size // 2)
-            lora_weights[gate_proj_key] = gate_up_split[0]
-            lora_weights[up_proj_key] = gate_up_split[1]
-        lora_weights.pop(key)
-    return lora_weights
-
-
-def convert_lora_nemo_to_canonical(lora_nemo, save_path, hf_format=False, donor_hf_config=None):
-    with TarPath(lora_nemo) as archive:
-        with (archive / "model_config.yaml").open("r") as config_file:
-            lora_config = yaml.load(config_file, Loader=yaml.SafeLoader)
-
-        tp_size = lora_config.get('tensor_model_parallel_size', 1)
-        pp_size = lora_config.get('pipeline_model_parallel_size', 1)
-
-        lora_state_dict = [{}] * tp_size
-
-        for pp in range(pp_size):
-            for tp in range(tp_size):
-                if tp_size == 1:
-                    ckpt_file = archive / "model_weights.ckpt"
-                elif pp_size == 1:
-                    ckpt_file = archive / f"mp_rank_{tp:02d}/model_weights.ckpt"
-                else:
-                    ckpt_file = archive / f"tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt"
-
-                with ckpt_file.open("rb") as f:
-                    weights = torch.load(f, map_location=torch.device('cpu'))
-
-                if pp == 0:
-                    lora_state_dict[tp] = weights
-                else:
-                    # calculate layer offset
-                    layer_offset = lora_config['num_layers'] // pp_size * pp
-                    for key, value in weights.items():
-                        new_key = replace_number_add_offset(key, layer_offset)
-                        lora_state_dict[tp][new_key] = value
-
-        # TODO: currently suport tp=1
-        lora_state_dict = lora_state_dict[0]
-        if lora_config['peft']['lora_tuning'].get('variant', 'nemo') == "nemo":
-            lora_config['peft']['lora_tuning']['variant'] = "canonical"
-            lora_state_dict = convert_lora_weights_to_canonical(lora_config, lora_state_dict)
-
-        if hf_format:
-            lora_state_dict, target_modules = reformat_module_names_to_hf(lora_state_dict)
-            Path(save_path).mkdir(parents=True, exist_ok=True)
-            torch.save(lora_state_dict, f"{save_path}/adapter_model.bin")
-            if donor_hf_config is not None:
-                with open(donor_hf_config) as hf_config_file:
-                    adapter_config = json.load(hf_config_file)
-            else:
-                adapter_config = {}
-            adapter_config['peft_type'] = "LORA"
-            adapter_config['r'] = lora_config['peft']['lora_tuning']['adapter_dim']
-            adapter_config['lora_alpha'] = lora_config['peft']['lora_tuning']['alpha']
-            adapter_config['target_modules'] = target_modules
-            with open(f"{save_path}/adapter_config.json", "w") as f:
-                json.dump(adapter_config, f, indent=4)
-        else:
-            with tempfile.TemporaryDirectory() as tmpdir:
-                with open(f"{tmpdir}/model_config.yaml", "w") as f:
-                    yaml.dump(lora_config, f)
-                torch.save(lora_state_dict, f"{tmpdir}/model_weights.ckpt")
-
-                dirname = os.path.dirname(save_path)
-                os.makedirs(dirname, exist_ok=True)
-                with tarfile.open(save_path, "w:") as tar:
-                    tar.add(tmpdir, arcname=".")
-
-    return lora_state_dict, lora_config
diff --git a/nemo/export/utils/model_loader.py b/nemo/export/utils/model_loader.py
deleted file mode 100644
index 64173a8e5cb3..000000000000
--- a/nemo/export/utils/model_loader.py
+++ /dev/null
@@ -1,209 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import logging
-import os.path
-from io import BytesIO
-from pathlib import Path
-from typing import Any, Dict, Union
-
-import numpy
-
-# tenosrstore is needed to register 'bfloat16' dtype with numpy for zarr compatibility
-import tensorstore  # noqa: F401 pylint: disable=unused-import
-import torch
-from torch.distributed.checkpoint import FileSystemReader, load
-from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
-
-from nemo.export.tarutils import TarPath, ZarrPathStore
-from nemo.export.utils._mock_import import _mock_import
-
-LOGGER = logging.getLogger("NeMo")
-
-
-def nemo_to_path(nemo_checkpoint: Union[Path, str]) -> Union[Path, TarPath]:
-    """
-    Creates Path / TarPath object suitable for navigating inside the nemo checkpoint.
-
-    Args:
-        nemo_checkpoint (Path, str): Path to the NeMo checkpoint.
-    Returns:
-        Path | TarPath: Suitable Path object for navigating through the checkpoint.
-    """
-    string_path = str(nemo_checkpoint)
-
-    if os.path.isdir(string_path):
-        return Path(string_path)
-    return TarPath(string_path)
-
-
-class TarFileSystemReader(FileSystemReader):
-    """Reader that accepts both Path and TarPath checkpoint directory.
-
-    The FileSystemReader works with TarPath, but expects a pure Path.
-    It's enough to skip the Path check in __init__.
-    """
-
-    def __init__(self, path: Union[Path, TarPath]) -> None:
-        """Makes sure that super().__init__ gets a pure path as expected."""
-        super_path = str(path) if isinstance(path, TarPath) else path
-        super().__init__(super_path)
-        if isinstance(path, TarPath):
-            self.path = path  # overwrites path set in super().__init__ call
-
-
-def load_sharded_metadata_torch_dist(
-    checkpoint_dir: Union[Path, TarPath], load_extra_states: bool = False
-) -> Dict[str, Any]:
-    """
-    Loads model state dictionary from torch_dist checkpoint.
-
-    Args:
-        checkpoint_dir (Path | TarPath): Path to the model weights directory.
-        load_extra_states (bool): If set to true, loads BytesIO objects, related to the extra states.
-    Returns:
-        dict: Loaded model state dictionary (weights are stored in torch tensors).
-    """
-    fs_reader = TarFileSystemReader(checkpoint_dir)
-    metadata = fs_reader.read_metadata()
-
-    state_dict = {
-        k: torch.empty(tp.size, dtype=tp.properties.dtype)
-        for k, tp in metadata.state_dict_metadata.items()
-        if isinstance(tp, TensorStorageMetadata)
-    }
-
-    if load_extra_states:
-        state_dict.update(
-            {k: [] for k, tp in metadata.state_dict_metadata.items() if isinstance(tp, BytesStorageMetadata)}
-        )
-
-    load(state_dict, storage_reader=fs_reader)
-    return state_dict
-
-
-def load_sharded_pickle_extra_state_scale(dir: Union[Path, TarPath]) -> Dict[str, BytesIO]:
-    """
-    Loads model extra states from the .pt shards.
-
-    Args:
-        dir (Path | TarPath): Path to the directory with sharded extra states.
-    Returns:
-        dict: State dictionary corresponding to the loaded extra states.
-    """
-    pt_files = list(dir.glob('shard_*_*.pt'))
-    extra_states = {}
-    for file in pt_files:
-        shard_name = file.name.split('.')[0]
-        with file.open('rb') as opened_file:
-            extra_states[dir.name + '/' + shard_name] = torch.load(opened_file, weights_only=True)
-
-    return extra_states
-
-
-def contains_extra_states(subdir: Union[Path, TarPath]) -> bool:
-    """
-    Checks if zarr directory contains extra states.
-
-    Args:
-        subdir (Path | TarPath): Directory inside the zarr checkpoint.
-    Returns:
-        bool: Is a directory with extra states
-    """
-    return list(subdir.glob('shard_0_*.pt')) != []
-
-
-def load_sharded_metadata_zarr(
-    checkpoint_dir: Union[Path, TarPath], load_extra_states: bool = False
-) -> Dict[str, Any]:
-    """
-    Loads model dictionary from the zarr format.
-
-    Args:
-        checkpoint_dir (Path | TarPath): Path to the NeMo checkpoint.
-        load_extra_states (bool): If set to True, the function will load BufferIO objects with extra states.
-    Returns:
-        dict: Model state dictionary.
-    """
-    if load_extra_states:
-        torch.serialization.add_safe_globals([BytesIO])
-
-    sharded_state_dict = {}
-    for subdir in checkpoint_dir.iterdir():
-        if not subdir.is_dir():
-            continue
-
-        if load_extra_states and contains_extra_states(subdir):
-            sharded_state_dict.update(load_sharded_pickle_extra_state_scale(subdir))
-
-        elif (subdir / '.zarray').exists():
-            key = subdir.name
-            zstore = ZarrPathStore(subdir)
-
-            import zarr
-
-            arr = zarr.open(zstore, 'r')
-
-            if arr.dtype.name == "bfloat16":
-                sharded_state_dict[key] = torch.from_numpy(arr[:].view(numpy.int16)).view(torch.bfloat16)
-            else:
-                sharded_state_dict[key] = torch.from_numpy(arr[:])
-
-    return sharded_state_dict
-
-
-def nemo_weights_directory(nemo_path: Union[Path, TarPath]) -> Union[Path, TarPath]:
-    """
-    Returns a Path pointing to the weights directory inside the NeMo checkpoint.
-
-    Args:
-        nemo_path (Path | TarPath): Path to the nemo checkpoint.
-    Returns:
-        Path | TarPath: Path to the weights directory inside the model checkpoint.
-    """
-    if (nemo_path / "model_weights").exists():
-        return nemo_path / "model_weights"
-
-    if (nemo_path / "weights").exists():
-        return nemo_path / "weights"
-
-    return nemo_path
-
-
-def load_model_weights(checkpoint_path: Union[str, Path], load_extra_states: bool = False) -> Dict[str, Any]:
-    """
-    Loads NeMo state dictionary. Weights are stored in torch.Tensor
-
-    Args:
-        checkpoint_path (str | Path): Path to the NeMo checkpoint.
-        load_extra_states (bool): If True, loads BytesIO objects, corresponding to the extra states.
-    Returns:
-        dict: Model state dictionary.
-    """
-
-    nemo_path = nemo_to_path(checkpoint_path)
-    nemo_weights = nemo_weights_directory(nemo_path)
-
-    with (nemo_weights / 'metadata.json').open(mode='r') as f:
-        config_dict = json.load(f)
-
-    if config_dict['sharded_backend'] == 'zarr':
-        return load_sharded_metadata_zarr(nemo_weights, load_extra_states=load_extra_states)
-    elif config_dict['sharded_backend'] == 'torch_dist':
-        # TODO: Remove mocking imports once MCore is available in NIM containers
-        with _mock_import("megatron.core.dist_checkpointing.strategies.torch"):
-            return load_sharded_metadata_torch_dist(nemo_weights, load_extra_states=load_extra_states)
-
-    raise NotImplementedError(f'Distributed checkpoint backend {config_dict["sharded_backend"]} not supported')
diff --git a/nemo/export/utils/utils.py b/nemo/export/utils/utils.py
deleted file mode 100755
index fa2034ed70ac..000000000000
--- a/nemo/export/utils/utils.py
+++ /dev/null
@@ -1,155 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import shutil
-from collections import Counter
-from pathlib import Path
-from typing import Dict, Optional, Union
-
-import torch
-
-
-def is_nemo2_checkpoint(checkpoint_path: str) -> bool:
-    """
-    Checks if the checkpoint is in NeMo 2.0 format.
-    Args:
-        checkpoint_path (str): Path to a checkpoint.
-    Returns:
-        bool: True if the path points to a NeMo 2.0 checkpoint; otherwise false.
-    """
-
-    ckpt_path = Path(checkpoint_path)
-    return (ckpt_path / 'context').is_dir()
-
-
-def prepare_directory_for_export(
-    model_dir: Union[str, Path], delete_existing_files: bool, subdir: Optional[str] = None
-) -> None:
-    """
-    Prepares model_dir path for the TensorRTT-LLM / vLLM export.
-    Makes sure that the model_dir directory exists and is empty.
-
-    Args:
-        model_dir (str): Path to the target directory for the export.
-        delete_existing_files (bool): Attempt to delete existing files if they exist.
-        subdir (Optional[str]): Subdirectory to create inside the model_dir.
-
-    Returns:
-        None
-    """
-    model_path = Path(model_dir)
-
-    if model_path.exists():
-        if delete_existing_files:
-            shutil.rmtree(model_path)
-        elif any(model_path.iterdir()):
-            raise RuntimeError(f"There are files in {model_path} folder: try setting delete_existing_files=True.")
-
-    if subdir is not None:
-        model_path /= subdir
-    model_path.mkdir(parents=True, exist_ok=True)
-
-
-def is_nemo_tarfile(path: str) -> bool:
-    """
-    Checks if the path exists and points to packed NeMo 1 checkpoint.
-
-    Args:
-        path (str): Path to possible checkpoint.
-    Returns:
-        bool: NeMo 1 checkpoint exists and is in '.nemo' format.
-    """
-    checkpoint_path = Path(path)
-    return checkpoint_path.exists() and checkpoint_path.suffix == '.nemo'
-
-
-# Copied from nemo.collections.nlp.parts.utils_funcs to avoid introducing extra NeMo dependencies:
-def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: bool = True) -> torch.dtype:
-    """
-    Mapping from PyTorch Lighthing (PTL) precision types to corresponding PyTorch parameter data type.
-
-    Args:
-        precision (Union[int, str]): The PTL precision type used.
-        megatron_amp_O2 (bool): A flag indicating if Megatron AMP O2 is enabled.
-
-    Returns:
-        torch.dtype: The corresponding PyTorch data type based on the provided precision.
-    """
-    if not megatron_amp_O2:
-        return torch.float32
-
-    if precision in ['bf16', 'bf16-mixed']:
-        return torch.bfloat16
-    elif precision in [16, '16', '16-mixed']:
-        return torch.float16
-    elif precision in [32, '32', '32-true']:
-        return torch.float32
-    else:
-        raise ValueError(f"Could not parse the precision of '{precision}' to a valid torch.dtype")
-
-
-def get_model_device_type(module: torch.nn.Module) -> str:
-    """Find the device type the model is assigned to and ensure consistency."""
-    # Collect device types of all parameters and buffers
-    param_device_types = {param.device.type for param in module.parameters()}
-    buffer_device_types = {buffer.device.type for buffer in module.buffers()}
-    all_device_types = param_device_types.union(buffer_device_types)
-
-    if len(all_device_types) > 1:
-        raise ValueError(
-            f"Model parameters and buffers are on multiple device types: {all_device_types}. "
-            "Ensure all parameters and buffers are on the same device type."
-        )
-
-    # Return the single device type, or default to 'cpu' if no parameters or buffers
-    return all_device_types.pop() if all_device_types else "cpu"
-
-
-def get_example_inputs(tokenizer) -> Dict[str, torch.Tensor]:
-    """Gets example data to feed to the model during ONNX export.
-
-    Returns:
-        Dictionary of tokenizer outputs.
-    """
-    example_inputs = dict(
-        tokenizer(
-            ["example query one", "example query two"],
-            ["example passage one", "example passage two"],
-            return_tensors="pt",
-        )
-    )
-
-    return example_inputs
-
-
-def validate_fp8_network(network) -> None:
-    """Checks the network to ensure it's compatible with fp8 precison.
-
-    Raises:
-        ValueError if netowrk doesn't container Q/DQ FP8 layers
-    """
-
-    import tensorrt as trt
-
-    quantize_dequantize_layers = []
-    for layer in network:
-        if layer.type in {trt.LayerType.QUANTIZE, trt.LayerType.DEQUANTIZE}:
-            quantize_dequantize_layers.append(layer)
-    if not quantize_dequantize_layers:
-        error_msg = "No Quantize/Dequantize layers found"
-        raise ValueError(error_msg)
-    quantize_dequantize_layer_dtypes = Counter(layer.precision for layer in quantize_dequantize_layers)
-    if trt.DataType.FP8 not in quantize_dequantize_layer_dtypes:
-        error_msg = "Found Quantize/Dequantize layers. But none with FP8 precision."
-        raise ValueError(error_msg)
diff --git a/nemo/export/vllm/__init__.py b/nemo/export/vllm/__init__.py
deleted file mode 100644
index 341a77c5bc66..000000000000
--- a/nemo/export/vllm/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/nemo/export/vllm/model_config.py b/nemo/export/vllm/model_config.py
deleted file mode 100644
index 8550f8bcbbc1..000000000000
--- a/nemo/export/vllm/model_config.py
+++ /dev/null
@@ -1,252 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-from pathlib import Path
-from typing import Any, Dict, Optional, Union
-
-import torch
-import yaml
-from hydra.utils import instantiate
-from omegaconf import OmegaConf
-from transformers import AutoConfig
-from vllm.config import ModelConfig, ModelImpl, PoolerConfig, _get_and_verify_dtype, _get_and_verify_max_len
-from vllm.transformers_utils.config import get_hf_text_config
-
-from nemo.export.tarutils import TarPath
-from nemo.export.utils import is_nemo2_checkpoint
-from nemo.export.vllm.model_converters import get_model_converter
-
-
-class NemoModelConfig(ModelConfig):
-    """
-    This class pretents to be a vllm.config.ModelConfig (with extra fields) but skips
-    some of its initialization code, and initializes the configuration from a Nemo checkpoint instead.
-    """
-
-    def __init__(
-        self,
-        nemo_checkpoint: str,
-        model_dir: str,
-        model_type: str,
-        tokenizer_mode: str,
-        dtype: Union[str, torch.dtype],
-        seed: int,
-        revision: Optional[str] = None,
-        override_neuron_config: Optional[Dict[str, Any]] = None,
-        code_revision: Optional[str] = None,
-        rope_scaling: Optional[dict] = None,
-        rope_theta: Optional[float] = None,
-        tokenizer_revision: Optional[str] = None,
-        max_model_len: Optional[int] = None,
-        quantization: Optional[str] = None,
-        quantization_param_path: Optional[str] = None,
-        enforce_eager: bool = False,
-        max_seq_len_to_capture: Optional[int] = 8192,
-        max_logprobs: int = 5,
-        disable_sliding_window: bool = False,
-        disable_cascade_attn: bool = False,
-        use_async_output_proc: bool = False,
-        disable_mm_preprocessor_cache: bool = False,
-        logits_processor_pattern: Optional[str] = None,
-        override_pooler_config: Optional[PoolerConfig] = None,
-        override_generation_config: Optional[Dict[str, Any]] = None,
-        enable_sleep_mode: bool = False,
-        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
-    ) -> None:
-        # Don't call ModelConfig.__init__ because we don't want it to call
-        # transformers.AutoConfig.from_pretrained(...)
-
-        # TODO: Do something about vLLM's call to _load_generation_config_dict in LLMEngine.__init__
-        # because it calls transformers.GenerationConfig.from_pretrained(...), which tries to download things
-
-        self.nemo_checkpoint = nemo_checkpoint
-        self.model = model_dir
-        self.model_type = model_type
-        self.tokenizer = None
-        self.tokenizer_mode = tokenizer_mode
-        self.skip_tokenizer_init = False
-        self.trust_remote_code = False
-        self.seed = seed
-        self.revision = revision
-        self.code_revision = code_revision
-        self.override_neuron_config = override_neuron_config
-        self.rope_scaling = rope_scaling
-        self.rope_theta = rope_theta
-        self.tokenizer_revision = tokenizer_revision
-        self.model_impl = model_impl
-        self.quantization = quantization
-        self.quantization_param_path = quantization_param_path
-        self.enforce_eager = enforce_eager
-        self.max_seq_len_to_capture = max_seq_len_to_capture
-        self.max_logprobs = max_logprobs
-        self.disable_sliding_window = disable_sliding_window
-        self.disable_cascade_attn = disable_cascade_attn
-        self.served_model_name = nemo_checkpoint
-        self.multimodal_config = None
-        self.mm_processor_kwargs = {}
-        self.use_async_output_proc = use_async_output_proc
-        self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
-        self.logits_processor_pattern = logits_processor_pattern
-        self.generation_config = None
-        self.task = "generate"  # Only the generate task is supported
-        self.is_hybrid = False  # No hybrid models are supported
-        self.attention_chunk_size = None  # Llama4-specific parameter
-        self.override_generation_config = override_generation_config
-
-        if self.task in ("draft", "generate"):
-            self.truncation_side = "left"
-        else:
-            self.truncation_side = "right"
-
-        self.encoder_config = self._get_encoder_config()
-        self.pooler_config = self._init_pooler_config(override_pooler_config)
-        self.enable_sleep_mode = enable_sleep_mode
-
-        from vllm.platforms import current_platform  # vLLM uses local import for current_platform
-
-        if self.enable_sleep_mode and not current_platform.is_cuda():
-            raise ValueError("Sleep mode is only supported on CUDA devices.")
-
-        self.model_converter = get_model_converter(model_type)
-        if self.model_converter is None:
-            raise RuntimeError(f'Unknown model type "{model_type}"')
-
-        if is_nemo2_checkpoint(nemo_checkpoint):
-            nemo_checkpoint: Path = Path(nemo_checkpoint)
-            tokenizer_config = OmegaConf.load(nemo_checkpoint / "context/model.yaml").tokenizer
-            if ('additional_special_tokens' in tokenizer_config) and len(
-                tokenizer_config['additional_special_tokens']
-            ) == 0:
-                del tokenizer_config['additional_special_tokens']
-
-            tokenizer_config = self._change_paths_to_absolute_paths(tokenizer_config, nemo_checkpoint)
-            with (nemo_checkpoint / "context/model.yaml").open('r') as config_file:
-                self.nemo_model_config: dict = yaml.load(config_file, Loader=yaml.SafeLoader)
-            hf_args = self._load_hf_arguments(self.nemo_model_config['config'])
-
-            tokenizer = instantiate(tokenizer_config)
-            hf_args['vocab_size'] = tokenizer.original_vocab_size
-            self.model_converter.convert_config(self.nemo_model_config['config'], hf_args)
-            # In transformers ~= 4.52.0, the config for model_type="mixtral" loads with head_dim=None
-            # which causes issues down the way in vLLM in MixtralAttention class. One possible fix is
-            # to delete head_dim from the config if it is None.
-            self.hf_config = AutoConfig.for_model(model_type, **hf_args)
-            assert "huggingface" in tokenizer_config["_target_"]
-            tokenizer_id = tokenizer_config["pretrained_model_name"]
-        else:
-            with TarPath(nemo_checkpoint) as archive:
-                with (archive / "model_config.yaml").open("r") as model_config_file:
-                    self.nemo_model_config = yaml.load(model_config_file, Loader=yaml.SafeLoader)
-                    hf_args = self._load_hf_arguments(self.nemo_model_config)
-                    self.model_converter.convert_config(self.nemo_model_config, hf_args)
-                self.hf_config = AutoConfig.for_model(model_type, **hf_args)
-            assert self.nemo_model_config["tokenizer"]["library"] == "huggingface"
-            tokenizer_id = self.nemo_model_config["tokenizer"]["type"]
-        self.tokenizer = tokenizer_id
-
-        self.hf_config.architectures = [self.model_converter.get_architecture()]
-        if self.rope_scaling is not None:
-            self.hf_config['rope_scaling'] = rope_scaling
-
-        self.hf_text_config = get_hf_text_config(self.hf_config)
-        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
-        self.max_model_len = _get_and_verify_max_len(
-            hf_config=self.hf_text_config,
-            max_model_len=max_model_len,
-            disable_sliding_window=self.disable_sliding_window,
-            sliding_window_len=self.get_hf_config_sliding_window(),
-        )
-        self.is_attention_free = self._init_attention_free()
-        self.has_inner_state = self._init_has_inner_state()
-        self.has_noops = self._init_has_noops()
-
-        self._verify_tokenizer_mode()
-        self._verify_quantization()
-        self._verify_cuda_graph()
-
-    @staticmethod
-    def _change_paths_to_absolute_paths(tokenizer_config: Dict[Any, Any], nemo_checkpoint: Path) -> Dict[Any, Any]:
-        """
-        Creates absolute path to the local tokenizers. Used for NeMo 2.0.
-
-        Args:
-            tokenizer_config (dict): Parameters for instantiating the tokenizer.
-            nemo_checkpoint (path): Path to the NeMo2 checkpoint.
-        Returns:
-            dict: Updated tokenizer config.
-        """
-        context_path = nemo_checkpoint / 'context'
-
-        # 'pretrained_model_name' -- huggingface tokenizer case
-        # 'model_path' -- sentencepiece tokenizer
-        path_keys = ['pretrained_model_name', 'model_path']
-
-        for path_key in path_keys:
-            if path := tokenizer_config.get(path_key, None):
-                tokenizer_path = context_path / path
-                if not tokenizer_path.exists():
-                    continue
-
-                tokenizer_config[path_key] = str(tokenizer_path.resolve())
-
-        return tokenizer_config
-
-    def _load_hf_arguments(self, nemo_config: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Maps argument names used in NeMo to their corresponding names in HF.
-        """
-
-        hf_to_nemo_dict = {
-            'hidden_size': 'hidden_size',
-            'intermediate_size': 'ffn_hidden_size',
-            'num_hidden_layers': 'num_layers',
-            'num_attention_heads': 'num_attention_heads',
-            'num_key_value_heads': 'num_query_groups',
-            # 'hidden_act': 'activation', ## <- vLLM has good defaults for the models, nemo values are wrong
-            'num_local_experts': 'num_moe_experts',
-            'max_position_embeddings': ['max_position_embeddings', 'encoder_seq_length'],
-            'tie_word_embeddings': 'share_embeddings_and_output_weights',
-            'rms_norm_eps': 'layernorm_epsilon',
-            'attention_dropout': 'attention_dropout',
-            'initializer_range': 'init_method_std',
-            'norm_epsilon': 'layernorm_epsilon',
-            'rope_theta': 'rotary_base',
-            'use_bias': ['bias', 'add_bias_linear'],
-        }
-
-        hf_args = {}
-        for hf_arg, nemo_arg in hf_to_nemo_dict.items():
-            if not isinstance(nemo_arg, list):
-                nemo_arg = [nemo_arg]
-
-            for nemo_arg_option in nemo_arg:
-                value = nemo_config.get(nemo_arg_option)
-                if value is not None:
-                    hf_args[hf_arg] = value
-                    break
-
-        return hf_args
-
-    def try_get_generation_config(self, *args, **kwargs):
-        """
-        Prevent vLLM from trying to load a generation config
-        """
-        nemo_path = Path(self.nemo_checkpoint)
-        generation_config_path = nemo_path / "context" / "artifacts" / "generation_config.json"
-        if generation_config_path.exists():
-            with generation_config_path.open("r") as f:
-                return json.load(f)
-
-        return {}
diff --git a/nemo/export/vllm/model_converters.py b/nemo/export/vllm/model_converters.py
deleted file mode 100644
index 87b670560c9c..000000000000
--- a/nemo/export/vllm/model_converters.py
+++ /dev/null
@@ -1,421 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from abc import ABC, abstractmethod
-from typing import Generator, Optional, Tuple
-
-import torch
-
-
-class ModelConverter(ABC):
-    """
-    Abstract class that defines the interface for a converter that implements model-specific conversion functions
-    for deploying NeMo checkpoints on vLLM.
-    """
-
-    def __init__(self, model_type: str):
-        self.model_type = model_type
-
-    @abstractmethod
-    def get_architecture(self) -> Optional[str]:
-        """
-        Returns the HF architecture name for the current model, such as 'LlamaForCausalLM'.
-        """
-        pass
-
-    def convert_config(self, nemo_model_config: dict, hf_config: dict) -> None:
-        """
-        Implements any custom HF configuration adjustments in the 'hf_config' dict that are necessary
-        for this model after the common translation takes place in NemoModelConfig's constructor.
-        """
-        pass
-
-    @abstractmethod
-    def convert_weights(
-        self, nemo_model_config: dict, state_dict: dict
-    ) -> Generator[Tuple[str, torch.tensor], None, None]:
-        """
-        Returns or yields a sequence of (name, tensor) tuples that contain model weights in the HF format.
-        """
-        pass
-
-    def requires_bos_token(self) -> bool:
-        """
-        Returns True if the model requires a 'bos' token to be used at the beginning of the input sequence.
-        NeMo checkpoints do not store this information.
-        """
-        return False
-
-
-class LlamaConverter(ModelConverter):
-
-    def get_architecture(self):
-        if self.model_type == 'llama':
-            return 'LlamaForCausalLM'
-        if self.model_type == 'mistral':
-            return 'MistralForCausalLM'
-        return None
-
-    def convert_weights(self, nemo_model_config, state_dict):
-        hidden_size = nemo_model_config["hidden_size"]
-        head_num = nemo_model_config["num_attention_heads"]
-        num_query_groups = nemo_model_config["num_query_groups"]
-        num_layers = nemo_model_config["num_layers"]
-        head_size = hidden_size // head_num
-        heads_per_group = head_num // num_query_groups
-        qkv_total_dim = head_num + 2 * num_query_groups
-
-        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
-        yield ('model.norm.weight', state_dict['model.decoder.final_layernorm.weight'])
-        if not nemo_model_config.get("share_embeddings_and_output_weights", False):
-            yield ('lm_head.weight', state_dict['model.output_layer.weight'])
-
-        for layer in range(int(num_layers)):
-            qkv_weights = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
-            qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])
-
-            q_slice = torch.cat(
-                [
-                    torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
-                    for i in range(num_query_groups)
-                ]
-            )
-            k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
-            v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
-
-            for name, slice in [('q_proj', q_slice), ('k_proj', k_slice), ('v_proj', v_slice)]:
-                weight_name = f'model.layers.{layer}.self_attn.{name}.weight'
-                yield (weight_name, qkv_weights[slice].reshape(-1, hidden_size))
-
-            linear_proj_weight = state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer]
-            yield (f'model.layers.{layer}.self_attn.o_proj.weight', linear_proj_weight)
-
-            gate_proj_weight, up_proj_weight = torch.chunk(
-                state_dict['model.decoder.layers.mlp.linear_fc1.weight'][layer], 2, dim=0
-            )
-            yield (f'model.layers.{layer}.mlp.gate_proj.weight', gate_proj_weight)
-            yield (f'model.layers.{layer}.mlp.up_proj.weight', up_proj_weight)
-
-            mlp_up_weight = state_dict['model.decoder.layers.mlp.linear_fc2.weight'][layer]
-            yield (f'model.layers.{layer}.mlp.down_proj.weight', mlp_up_weight)
-
-            input_layernorm_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][
-                layer
-            ]
-            yield (f'model.layers.{layer}.input_layernorm.weight', input_layernorm_weight)
-
-            post_attn_layernorm_weight = state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_weight'][layer]
-            yield (f'model.layers.{layer}.post_attention_layernorm.weight', post_attn_layernorm_weight)
-
-    def requires_bos_token(self):
-        return True
-
-
-class MixtralConverter(ModelConverter):
-
-    def get_architecture(self):
-        if self.model_type == 'mixtral':
-            return 'MixtralForCausalLM'
-        return None
-
-    def convert_weights(self, nemo_model_config, state_dict):
-        hidden_size = nemo_model_config["hidden_size"]
-        head_num = nemo_model_config["num_attention_heads"]
-        num_query_groups = nemo_model_config["num_query_groups"]
-        num_layers = nemo_model_config["num_layers"]
-        num_moe_experts = nemo_model_config["num_moe_experts"]
-        head_size = hidden_size // head_num
-        heads_per_group = head_num // num_query_groups
-        qkv_total_dim = head_num + 2 * num_query_groups
-
-        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
-        yield ('model.norm.weight', state_dict['model.decoder.final_layernorm.weight'])
-        yield ('lm_head.weight', state_dict['model.output_layer.weight'])
-
-        for layer in range(int(num_layers)):
-            qkv_weights = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
-            qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])
-
-            q_slice = torch.cat(
-                [
-                    torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
-                    for i in range(num_query_groups)
-                ]
-            )
-            k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
-            v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
-
-            for name, slice in [('q_proj', q_slice), ('k_proj', k_slice), ('v_proj', v_slice)]:
-                weight_name = f'model.layers.{layer}.self_attn.{name}.weight'
-                yield (weight_name, qkv_weights[slice].reshape(-1, hidden_size))
-
-            linear_proj_weight = state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer]
-            yield (f'model.layers.{layer}.self_attn.o_proj.weight', linear_proj_weight)
-
-            mlp_router_weight = state_dict['model.decoder.layers.mlp.router.weight'][layer]
-            yield (f'model.layers.{layer}.block_sparse_moe.gate.weight', mlp_router_weight)
-
-            for expert in range(num_moe_experts):
-                linear_fc1_weight = state_dict['model.decoder.layers.mlp.experts.experts.linear_fc1.weight'][layer][
-                    expert
-                ]
-                gate_proj_weight, up_proj_weight = torch.chunk(linear_fc1_weight, 2, dim=0)
-                yield (f'model.layers.{layer}.block_sparse_moe.experts.{expert}.w1.weight', gate_proj_weight)
-                yield (f'model.layers.{layer}.block_sparse_moe.experts.{expert}.w3.weight', up_proj_weight)
-
-                linear_fc2_weight = state_dict['model.decoder.layers.mlp.experts.experts.linear_fc2.weight'][layer][
-                    expert
-                ]
-                yield (f'model.layers.{layer}.block_sparse_moe.experts.{expert}.w2.weight', linear_fc2_weight)
-
-            input_layernorm_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][
-                layer
-            ]
-            yield (f'model.layers.{layer}.input_layernorm.weight', input_layernorm_weight)
-
-            post_attn_layernorm_weight = state_dict['model.decoder.layers.pre_mlp_layernorm.weight'][layer]
-            yield (f'model.layers.{layer}.post_attention_layernorm.weight', post_attn_layernorm_weight)
-
-    def requires_bos_token(self):
-        return True
-
-
-class GemmaConverter(ModelConverter):
-
-    def get_architecture(self):
-        if self.model_type == 'gemma':
-            return 'GemmaForCausalLM'
-        return None
-
-    def convert_weights(self, nemo_model_config, state_dict):
-        num_layers = nemo_model_config["num_layers"]
-        num_query_groups = nemo_model_config["num_query_groups"]
-        head_num = nemo_model_config["num_attention_heads"]
-        head_size = nemo_model_config["kv_channels"]
-        hidden_size = nemo_model_config["hidden_size"]
-        zero_centered_gamma = nemo_model_config.get("layernorm_zero_centered_gamma", False)
-        heads_per_group = head_num // num_query_groups
-
-        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
-
-        final_layernorm_weight = state_dict['model.decoder.final_layernorm.weight']
-        if not zero_centered_gamma:
-            final_layernorm_weight -= 1.0
-        yield ('model.norm.weight', final_layernorm_weight)
-
-        for layer in range(int(num_layers)):
-            input_layernorm_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][
-                layer
-            ]
-            if not zero_centered_gamma:
-                input_layernorm_weight -= 1.0
-            yield (f'model.layers.{layer}.input_layernorm.weight', input_layernorm_weight)
-
-            post_attention_layernorm_weight = state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_weight'][
-                layer
-            ]
-            if not zero_centered_gamma:
-                post_attention_layernorm_weight -= 1.0
-            yield (f'model.layers.{layer}.post_attention_layernorm.weight', post_attention_layernorm_weight)
-
-            gate_up_combined_weight = state_dict['model.decoder.layers.mlp.linear_fc1.weight'][layer]
-            gate_size = gate_up_combined_weight.shape[0] // 2
-            yield (f'model.layers.{layer}.mlp.gate_proj.weight', gate_up_combined_weight[:gate_size, :])
-            yield (f'model.layers.{layer}.mlp.up_proj.weight', gate_up_combined_weight[gate_size:, :])
-
-            down_proj_weight = state_dict['model.decoder.layers.mlp.linear_fc2.weight'][layer]
-            yield (f'model.layers.{layer}.mlp.down_proj.weight', down_proj_weight)
-
-            self_attn_o_proj_weight = state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer]
-            yield (f'model.layers.{layer}.self_attn.o_proj.weight', self_attn_o_proj_weight)
-
-            qkv_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
-            qkv_intermediate_size = head_num + 2 * num_query_groups
-            qkv_weight = qkv_weight.reshape(qkv_intermediate_size, head_size, hidden_size)
-
-            q_weight = torch.empty((head_num, head_size, hidden_size), dtype=qkv_weight.dtype)
-            k_weight = torch.empty((num_query_groups, head_size, hidden_size), dtype=qkv_weight.dtype)
-            v_weight = torch.empty((num_query_groups, head_size, hidden_size), dtype=qkv_weight.dtype)
-
-            ptr = 0
-            for i in range(num_query_groups):
-                q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :] = qkv_weight[
-                    ptr : ptr + heads_per_group, ::
-                ]
-                ptr += heads_per_group
-                k_weight[i : i + 1, :, :] = qkv_weight[ptr : ptr + 1, :, :]
-                ptr += 1
-                v_weight[i : i + 1, :, :] = qkv_weight[ptr : ptr + 1, :, :]
-                ptr += 1
-            assert ptr == qkv_intermediate_size
-
-            q_weight = q_weight.reshape(head_num * head_size, hidden_size)
-            k_weight = k_weight.reshape(num_query_groups * head_size, hidden_size)
-            v_weight = v_weight.reshape(num_query_groups * head_size, hidden_size)
-
-            yield (f'model.layers.{layer}.self_attn.q_proj.weight', q_weight)
-            yield (f'model.layers.{layer}.self_attn.k_proj.weight', k_weight)
-            yield (f'model.layers.{layer}.self_attn.v_proj.weight', v_weight)
-
-    def requires_bos_token(self):
-        return True
-
-
-class Starcoder2Converter(ModelConverter):
-
-    def get_architecture(self):
-        if self.model_type == 'starcoder2':
-            return 'Starcoder2ForCausalLM'
-        return None
-
-    def convert_config(self, nemo_model_config, hf_config):
-        window_sizes = nemo_model_config.get('window_size')
-        if window_sizes is not None:
-            hf_config['sliding_window'] = window_sizes[0]
-
-        # 'tie_word_embeddings = False' means that there is a 'lm_head.weight' tensor.
-        # This converter assumes that it's always there.
-        # If there is a version of starcoder2 where it's not there, we'll need to copy
-        # 'model.embed_tokens.weight' into 'lm_head.weight' and still set 'tie_word_embeddings = False'
-        # because at this point we don't know if the weight is there or not, and this configuration
-        # is not stored in NeMo checkpoints.
-        hf_config['tie_word_embeddings'] = False
-
-    def convert_weights(self, nemo_model_config, state_dict):
-        num_layers = nemo_model_config["num_layers"]
-        num_query_groups = nemo_model_config["num_query_groups"]
-        head_num = nemo_model_config["num_attention_heads"]
-        hidden_size = nemo_model_config["hidden_size"]
-        head_size = hidden_size // head_num
-        heads_per_group = head_num // num_query_groups
-        qkv_total_dim = head_num + 2 * num_query_groups
-
-        if 'bias' in nemo_model_config:
-            has_bias = nemo_model_config["bias"]
-        else:
-            has_bias = nemo_model_config["add_bias_linear"]
-
-        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
-
-        yield ('model.norm.weight', state_dict['model.decoder.final_layernorm.weight'])
-        if has_bias:
-            yield ('model.norm.bias', state_dict['model.decoder.final_layernorm.bias'])
-
-        yield ('lm_head.weight', state_dict['model.output_layer.weight'])
-
-        for layer in range(int(num_layers)):
-            # q,k,v
-            qkv_weights = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
-            qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])
-            if has_bias:
-                qkv_bias = state_dict['model.decoder.layers.self_attention.linear_qkv.bias'][layer]
-                qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size])
-
-            q_slice = torch.cat(
-                [
-                    torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
-                    for i in range(num_query_groups)
-                ]
-            )
-            k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
-            v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
-
-            for name, slice in [('q_proj', q_slice), ('k_proj', k_slice), ('v_proj', v_slice)]:
-                qkv_weights_slice = qkv_weights[slice].reshape(-1, hidden_size)
-                yield (f'model.layers.{layer}.self_attn.{name}.weight', qkv_weights_slice)
-                if has_bias:
-                    qkv_bias_slice = qkv_bias[slice].reshape(-1)
-                    yield (f'model.layers.{layer}.self_attn.{name}.bias', qkv_bias_slice)
-
-            # Attention dense
-            yield (
-                f'model.layers.{layer}.self_attn.o_proj.weight',
-                state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer],
-            )
-            if has_bias:
-                yield (
-                    f'model.layers.{layer}.self_attn.o_proj.bias',
-                    state_dict['model.decoder.layers.self_attention.linear_proj.bias'][layer],
-                )
-
-            # MLP FC1
-            yield (
-                f'model.layers.{layer}.mlp.c_fc.weight',
-                state_dict['model.decoder.layers.mlp.linear_fc1.weight'][layer],
-            )
-            if has_bias:
-                yield (
-                    f'model.layers.{layer}.mlp.c_fc.bias',
-                    state_dict['model.decoder.layers.mlp.linear_fc1.bias'][layer],
-                )
-
-            # MLP FC2
-            yield (
-                f'model.layers.{layer}.mlp.c_proj.weight',
-                state_dict['model.decoder.layers.mlp.linear_fc2.weight'][layer],
-            )
-            if has_bias:
-                yield (
-                    f'model.layers.{layer}.mlp.c_proj.bias',
-                    state_dict['model.decoder.layers.mlp.linear_fc2.bias'][layer],
-                )
-
-            # Input LayerNorm
-            yield (
-                f'model.layers.{layer}.input_layernorm.weight',
-                state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][layer],
-            )
-            if has_bias:
-                yield (
-                    f'model.layers.{layer}.input_layernorm.bias',
-                    state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_bias'][layer],
-                )
-
-            # Post-attention LayerNorm
-            yield (
-                f'model.layers.{layer}.post_attention_layernorm.weight',
-                state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_weight'][layer],
-            )
-            if has_bias:
-                yield (
-                    f'model.layers.{layer}.post_attention_layernorm.bias',
-                    state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_bias'][layer],
-                )
-
-
-_MODEL_CONVERTERS = {
-    'llama': LlamaConverter,
-    'mistral': LlamaConverter,
-    'mixtral': MixtralConverter,
-    'gemma': GemmaConverter,
-    'starcoder2': Starcoder2Converter,
-}
-
-
-def register_model_converter(model_type, cls):
-    """
-    Establishes a mapping from short model type to a class that converts the model from Nemo format
-    to a vLLM compatible format.
-    """
-    _MODEL_CONVERTERS[model_type] = cls
-
-
-def get_model_converter(model_type) -> Optional[ModelConverter]:
-    """
-    Returns an instance of the the model conversion class for the given model type, or None.
-    """
-    cls = _MODEL_CONVERTERS.get(model_type, None)
-    if cls is None:
-        return None
-    return cls(model_type)
diff --git a/nemo/export/vllm/model_loader.py b/nemo/export/vllm/model_loader.py
deleted file mode 100644
index a4c5cef39db6..000000000000
--- a/nemo/export/vllm/model_loader.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os.path
-from typing import Any, Dict
-
-import safetensors.torch
-import torch
-from vllm.config import ModelConfig
-from vllm.model_executor.model_loader.loader import BaseModelLoader, _initialize_model
-from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-
-from nemo.export.utils import load_model_weights
-from nemo.export.vllm.model_config import NemoModelConfig
-
-LOGGER = logging.getLogger("NeMo")
-
-
-class NemoModelLoader(BaseModelLoader):
-    """
-    Implements a custom ModelLoader for vLLM that reads the weights from a Nemo checkpoint
-    and converts them to a vLLM compatible format at load time.
-
-    Also supports an ahead-of-time conversion that stores new weights in a Safetensors file,
-    see convert_and_store_nemo_weights(...)
-    """
-
-    @staticmethod
-    def _load_nemo_checkpoint_state(nemo_file: str) -> Dict[str, Any]:
-        LOGGER.info(f'Loading weights from {nemo_file}...')
-        return load_model_weights(nemo_file)
-
-    def download_model(self, model_config: ModelConfig) -> None:  # pylint: disable=missing-function-docstring
-        raise NotImplementedError
-
-    def load_model(
-        self,
-        *,
-        vllm_config: NemoModelConfig,
-    ) -> torch.nn.Module:
-        """
-        Overrides the load_model function from BaseModelLoader to convert Nemo weights at load time.
-        """
-        model_config = vllm_config.model_config
-        device_config = vllm_config.device_config
-
-        assert isinstance(model_config, NemoModelConfig)
-        state_dict = NemoModelLoader._load_nemo_checkpoint_state(model_config.nemo_checkpoint)
-
-        with set_default_torch_dtype(model_config.dtype):
-            with torch.device(device_config.device):
-                model = _initialize_model(vllm_config)
-
-            config = model_config.nemo_model_config
-            if 'config' in config:
-                config = config['config']
-            state_dict = NemoModelLoader._standardize_nemo2_naming(state_dict)
-
-            weights_iterator = model_config.model_converter.convert_weights(config, state_dict)
-            model.load_weights(weights_iterator)
-
-        return model.eval()
-
-    @staticmethod
-    def convert_and_store_nemo_weights(model_config: NemoModelConfig, safetensors_file: str):
-        """
-        Converts Nemo weights and stores the converted weights in a Safetensors file.
-        """
-
-        assert isinstance(model_config, NemoModelConfig)
-        assert os.path.exists(model_config.model)
-
-        state_dict = NemoModelLoader._load_nemo_checkpoint_state(model_config.nemo_checkpoint)
-
-        config = model_config.nemo_model_config
-
-        # NeMo2 checkpoint loads the whole TrainerContext where the config is stored under 'config' key
-        if 'config' in config:
-            config = config['config']
-        state_dict = NemoModelLoader._standardize_nemo2_naming(state_dict)
-
-        tensors = {name: tensor for name, tensor in model_config.model_converter.convert_weights(config, state_dict)}
-
-        LOGGER.info(f'Saving weights to {safetensors_file}...')
-        safetensors.torch.save_file(tensors, safetensors_file)
-
-    @staticmethod
-    def _standardize_nemo2_naming(state_dict: Dict[str, Any]) -> Dict[str, Any]:
-        return {k.replace('module', 'model'): v for k, v in state_dict.items()}
diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py
deleted file mode 100644
index 6aeaa4877bd8..000000000000
--- a/nemo/export/vllm_exporter.py
+++ /dev/null
@@ -1,537 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import logging
-import os.path
-from typing import Iterable, List, Optional, Union
-
-import numpy
-import vllm.envs as envs
-import wrapt
-from vllm import RequestOutput, SamplingParams
-from vllm.config import (
-    CacheConfig,
-    DeviceConfig,
-    LoadConfig,
-    LoadFormat,
-    LoRAConfig,
-    ObservabilityConfig,
-    ParallelConfig,
-    SchedulerConfig,
-    VllmConfig,
-)
-from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.lora.request import LoRARequest
-from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
-from vllm.v1.engine.llm_engine import LLMEngine
-
-from nemo.deploy import ITritonDeployable
-from nemo.deploy.utils import cast_output
-from nemo.export.utils import convert_lora_nemo_to_canonical, prepare_directory_for_export
-from nemo.export.vllm.model_config import NemoModelConfig
-from nemo.export.vllm.model_loader import NemoModelLoader
-
-LOGGER = logging.getLogger("NeMo")
-
-
-@wrapt.decorator
-def noop_decorator(func):
-    """Used as batch if pytriton is not supported"""
-
-    def wrapper(*args, **kwargs):
-        return func(*args, **kwargs)
-
-    return wrapper
-
-
-batch = noop_decorator
-use_pytriton = True
-try:
-    from pytriton.decorators import batch
-    from pytriton.model_config import Tensor
-except Exception:
-    use_pytriton = False
-
-
-class vLLMExporter(ITritonDeployable):
-    """
-    The vLLMExporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
-    loading the model in vLLM, and binding that model to a Triton server.
-
-    Example:
-        from nemo.export.vllm_exporter import vLLMExporter
-        from nemo.deploy import DeployPyTriton
-
-        exporter = vLLMExporter()
-
-        exporter.export(
-            nemo_checkpoint='/path/to/checkpoint.nemo',
-            model_dir='/path/to/temp_dir',
-            model_type='llama',
-        )
-
-        server = DeployPyTriton(
-            model=exporter,
-            triton_model_name='LLAMA',
-        )
-
-        server.deploy()
-        server.serve()
-    """
-
-    def __init__(self):
-        self.request_id = 0
-        assert envs.VLLM_USE_V1, "Only vLLM V1 is supported"
-
-    def export(
-        self,
-        nemo_checkpoint: str,
-        model_dir: str,
-        model_type: str,
-        device: str = 'auto',
-        tensor_parallel_size: int = 1,
-        pipeline_parallel_size: int = 1,
-        max_model_len: Optional[int] = None,
-        lora_checkpoints: Optional[List[str]] = None,
-        dtype: str = 'auto',
-        seed: int = 0,
-        log_stats: bool = True,
-        weight_storage: str = 'auto',
-        gpu_memory_utilization: float = 0.9,
-        quantization: Optional[str] = None,
-        delete_existing_files: bool = True,
-    ):
-        """
-        Exports the Nemo checkpoint to vLLM and initializes the engine.
-
-        Args:
-            nemo_checkpoint (str): path to the nemo checkpoint.
-            model_dir (str): path to a temporary directory to store weights and the tokenizer model.
-                The temp dir may persist between subsequent export operations, in which case
-                converted weights may be reused to speed up the export.
-            model_type (str): type of the model, such as "llama", "mistral", "mixtral".
-                Needs to be compatible with transformers.AutoConfig.
-            device (str): type of the device to use by the vLLM engine.
-                Supported values are "auto", "cuda", "cpu", "neuron".
-            tensor_parallel_size (int): tensor parallelism.
-            pipeline_parallel_size (int): pipeline parallelism.
-                Values over 1 are not currently supported by vLLM.
-            max_model_len (int): model context length.
-            lora_checkpoints List[str]: paths to LoRA checkpoints.
-            dtype (str): data type for model weights and activations.
-                Possible choices: auto, half, float16, bfloat16, float, float32
-                "auto" will use FP16 precision for FP32 and FP16 models,
-                and BF16 precision for BF16 models.
-            seed (int): random seed value.
-            log_stats (bool): enables logging inference performance statistics by vLLM.
-            weight_storage (str): controls how converted weights are stored:
-                "file" - always write weights into a file inside 'model_dir',
-                "memory" - always do an in-memory conversion,
-                "cache" - reuse existing files if they are newer than the nemo checkpoint,
-                "auto" - use "cache" for multi-GPU runs and "memory" for single-GPU runs.
-            gpu_memory_utilization (float): The fraction of GPU memory to be used for the model
-                executor, which can range from 0 to 1.
-            quantization (str): quantization method that is used to quantize the model weights.
-                Possible choices are None (weights not quantized, default) and "fp8".
-            delete_existing_files (bool): if True, deletes all the files in model_dir.
-        """
-        prepare_directory_for_export(model_dir, delete_existing_files=delete_existing_files)
-
-        # Pouplate the basic configuration structures
-        device_config = DeviceConfig(device)
-
-        assert quantization in {None, 'fp8'}
-
-        model_config = NemoModelConfig(
-            nemo_checkpoint,
-            model_dir,
-            model_type,
-            tokenizer_mode='auto',
-            dtype=dtype,
-            seed=seed,
-            revision=None,
-            code_revision=None,
-            tokenizer_revision=None,
-            max_model_len=max_model_len,
-            quantization=quantization,
-            quantization_param_path=None,
-            enforce_eager=False,
-        )
-
-        if model_config.nemo_model_config.get("fp8", False):
-            LOGGER.warning(
-                "NeMo FP8 checkpoint detected, but exporting FP8 quantized engines is not supported for vLLM."
-            )
-
-        parallel_config = ParallelConfig(
-            pipeline_parallel_size=pipeline_parallel_size, tensor_parallel_size=tensor_parallel_size
-        )
-
-        # vllm/huggingface doesn't like the absense of config file. Place config in load dir.
-        if model_config.model and not os.path.exists(os.path.join(model_config.model, 'config.json')):
-            with open(os.path.join(model_config.model, 'config.json'), "w") as f:
-                json.dump(model_config.hf_text_config.to_dict(), f, indent=2)
-
-        # Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
-        if quantization is not None and weight_storage in {'auto', 'memory'}:
-            LOGGER.warning('Setting weight_storage = "file" for FP8 quantization')
-            weight_storage = 'file'
-
-        # See if we have an up-to-date safetensors file
-        safetensors_file = os.path.join(model_config.model, 'model.safetensors')
-        safetensors_file_valid = os.path.exists(safetensors_file) and os.path.getmtime(
-            safetensors_file
-        ) > os.path.getmtime(nemo_checkpoint)
-
-        # Decide how we're going to convert the weights
-        if weight_storage == 'auto':
-            if parallel_config.distributed_executor_backend is not None:
-                save_weights = not safetensors_file_valid
-                inmemory_weight_conversion = False
-            else:
-                save_weights = False
-                inmemory_weight_conversion = True
-
-        elif weight_storage == 'cache':
-            save_weights = not safetensors_file_valid
-            inmemory_weight_conversion = False
-
-        elif weight_storage == 'file':
-            save_weights = True
-            inmemory_weight_conversion = False
-
-        elif weight_storage == 'memory':
-            save_weights = False
-            inmemory_weight_conversion = True
-
-        else:
-            raise ValueError(f'Unsupported value for weight_storage: "{weight_storage}"')
-
-        # Convert the weights ahead-of-time, if needed
-        if save_weights:
-            NemoModelLoader.convert_and_store_nemo_weights(model_config, safetensors_file)
-        elif not inmemory_weight_conversion:
-            LOGGER.info(f'Using cached weights in {safetensors_file}')
-
-        # TODO: these values are the defaults from vllm.EngineArgs.
-        cache_config = CacheConfig(
-            block_size=16,
-            gpu_memory_utilization=gpu_memory_utilization,
-            swap_space=4,
-            cache_dtype='auto',
-            sliding_window=model_config.get_sliding_window(),
-        )
-
-        # TODO: these values are the defaults from vllm.EngineArgs.
-        scheduler_config = SchedulerConfig(
-            max_num_batched_tokens=None,
-            max_num_seqs=256,
-            # Note: max_model_len can be derived by model_config if the input value is None
-            max_model_len=model_config.max_model_len,
-            num_lookahead_slots=0,
-            delay_factor=0.0,
-            enable_chunked_prefill=False,
-            scheduler_cls=V1Scheduler,
-        )
-
-        load_config = LoadConfig(
-            load_format=NemoModelLoader if inmemory_weight_conversion else LoadFormat.SAFETENSORS,
-            download_dir=None,
-            model_loader_extra_config=None,
-        )
-
-        # Convert the LoRA checkpoints to vLLM compatible format and derive the configuration structure
-        lora_config = self._prepare_lora_checkpoints(
-            model_dir=model_dir, lora_checkpoints=lora_checkpoints, dtype=model_config.dtype
-        )
-
-        # Initialize the cluster and specify the executor class.
-        if parallel_config.distributed_executor_backend == "ray":
-            initialize_ray_cluster(parallel_config)
-            from vllm.v1.executor.ray_distributed_executor import RayDistributedExecutor
-
-            executor_class = RayDistributedExecutor
-
-        elif parallel_config.distributed_executor_backend == "mp":
-            from vllm.v1.executor.multiproc_executor import MultiprocExecutor
-
-            executor_class = MultiprocExecutor
-
-        else:
-            assert parallel_config.distributed_executor_backend == "uni" or parallel_config.world_size == 1
-
-            from vllm.v1.executor.abstract import UniProcExecutor
-
-            executor_class = UniProcExecutor
-
-        # Initialize the engine
-        self.engine = LLMEngine(
-            vllm_config=VllmConfig(
-                model_config=model_config,
-                cache_config=cache_config,
-                parallel_config=parallel_config,
-                scheduler_config=scheduler_config,
-                device_config=device_config,
-                load_config=load_config,
-                lora_config=lora_config,
-                observability_config=ObservabilityConfig(),
-            ),
-            executor_class=executor_class,
-            log_stats=log_stats,
-        )
-
-    def _prepare_lora_checkpoints(
-        self, model_dir: str, lora_checkpoints: Optional[List[str]], dtype: str
-    ) -> LoRAConfig:
-        self.lora_checkpoints = []
-
-        if not lora_checkpoints:
-            return None
-
-        index = 0
-        max_lora_rank = 0
-        for nemo_file in lora_checkpoints:
-            if not os.path.isfile(nemo_file):
-                raise FileNotFoundError(f"LoRA checkpoint file '{nemo_file} does not exist'")
-
-            hf_lora_dir = os.path.join(model_dir, f'lora_{index}')
-
-            LOGGER.info(f"Converting LoRA checkpoint '{nemo_file}' into '{hf_lora_dir}'...")
-
-            _, lora_config = convert_lora_nemo_to_canonical(nemo_file, hf_lora_dir, hf_format=True)
-            self.lora_checkpoints.append(hf_lora_dir)
-
-            rank = lora_config['peft']['lora_tuning']['adapter_dim']
-            max_lora_rank = max(max_lora_rank, rank)
-
-            index += 1
-
-        return LoRAConfig(max_lora_rank=max_lora_rank, max_loras=len(self.lora_checkpoints), lora_dtype=dtype)
-
-    def _add_request_to_engine(
-        self,
-        prompt: str,
-        max_output_len: int,
-        temperature: float = 1.0,
-        top_k: int = 1,
-        top_p: float = 0.0,
-        lora_uid: Optional[int] = None,
-    ) -> str:
-        if top_p <= 0.0:
-            top_p = 1.0
-
-        sampling_params = SamplingParams(
-            max_tokens=max_output_len, temperature=temperature, top_k=int(top_k), top_p=top_p
-        )
-
-        if lora_uid is not None and lora_uid >= 0 and lora_uid < len(self.lora_checkpoints):
-            lora_request = LoRARequest(
-                lora_name=f'LoRA_{lora_uid}', lora_int_id=lora_uid + 1, lora_local_path=self.lora_checkpoints[lora_uid]
-            )
-        else:
-            lora_request = None
-
-        request_id = str(self.request_id)
-        self.request_id += 1
-
-        self.engine.add_request(request_id, prompt, sampling_params, lora_request=lora_request)
-
-        return request_id
-
-    def _forward_regular(self, request_ids: List[str]):
-        responses = [None] * len(request_ids)
-        finished = [False] * len(request_ids)
-
-        while not all(finished):
-            request_outputs: List[RequestOutput] = self.engine.step()
-
-            for request_output in request_outputs:
-                if not request_output.finished:
-                    continue
-
-                try:
-                    request_index = request_ids.index(request_output.request_id)
-                except ValueError:
-                    continue
-
-                finished[request_index] = request_output.finished
-                output_text = request_output.outputs[-1].text
-                responses[request_index] = output_text
-
-        return [[response] for response in responses]
-
-    def _forward_streaming(self, request_ids: List[str]):
-        responses = [None] * len(request_ids)
-        finished = [False] * len(request_ids)
-
-        while not all(finished):
-            request_outputs: List[RequestOutput] = self.engine.step()
-
-            for request_output in request_outputs:
-                try:
-                    request_index = request_ids.index(request_output.request_id)
-                except ValueError:
-                    continue
-
-                finished[request_index] = request_output.finished
-                output_text = request_output.outputs[-1].text
-                responses[request_index] = output_text
-
-            yield [[response] for response in responses]
-
-    def _add_triton_request_to_engine(self, inputs: numpy.ndarray, index: int) -> str:
-        if 'lora_uids' in inputs:
-            lora_uid = int(numpy.char.decode(inputs['lora_uids'][index][0], encoding="utf-8"))
-        else:
-            lora_uid = None
-
-        return self._add_request_to_engine(
-            prompt=inputs['prompts'][index][0].decode('UTF-8'),
-            max_output_len=inputs['max_output_len'][index][0],
-            temperature=inputs['temperature'][index][0],
-            top_k=inputs['top_k'][index][0],
-            top_p=inputs['top_p'][index][0],
-            lora_uid=lora_uid,
-        )
-
-    @property
-    def get_triton_input(self):
-        inputs = (
-            Tensor(name="prompts", shape=(-1,), dtype=bytes),
-            Tensor(name="max_output_len", shape=(-1,), dtype=numpy.int_, optional=True),
-            Tensor(name="top_k", shape=(-1,), dtype=numpy.int_, optional=True),
-            Tensor(name="top_p", shape=(-1,), dtype=numpy.single, optional=True),
-            Tensor(name="temperature", shape=(-1,), dtype=numpy.single, optional=True),
-            Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
-            Tensor(name="output_generation_logits", shape=(-1,), dtype=numpy.bool_, optional=True),
-            Tensor(name="output_context_logits", shape=(-1,), dtype=numpy.bool_, optional=True),
-        )
-        return inputs
-
-    @property
-    def get_triton_output(self):
-        outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
-        return outputs
-
-    @batch
-    def triton_infer_fn(self, **inputs: numpy.ndarray):
-        """
-        This function is used to perform inference on a batch of prompts.
-        """
-        request_ids = []
-        num_requests = len(inputs["prompts"])
-        for index in range(num_requests):
-            request_id = self._add_triton_request_to_engine(inputs, index)
-            request_ids.append(request_id)
-
-        responses = self._forward_regular(request_ids)
-        responses = [r[0] for r in responses]
-
-        output_tensor = cast_output(responses, numpy.bytes_)
-        return {'outputs': output_tensor}
-
-    @batch
-    def triton_infer_fn_streaming(self, **inputs: numpy.ndarray):
-        """
-        This function is used to perform streaming inference.
-        """
-        request_ids = []
-        num_requests = len(inputs["prompts"])
-        for index in range(num_requests):
-            request_id = self._add_triton_request_to_engine(inputs, index)
-            request_ids.append(request_id)
-
-        for responses in self._forward_streaming(request_ids):
-            responses = [r[0] for r in responses]
-            output_tensor = cast_output(responses, numpy.bytes_)
-            yield {'outputs': output_tensor}
-
-    # Mimic the TensorRTLLM exporter's forward function, even though we don't support many of its features.
-    def forward(
-        self,
-        input_texts: List[str],
-        max_output_len: int = 64,
-        top_k: int = 1,
-        top_p: float = 0.0,
-        temperature: float = 1.0,
-        stop_words_list: Optional[List[str]] = None,
-        bad_words_list: Optional[List[str]] = None,
-        no_repeat_ngram_size: Optional[int] = None,
-        task_ids: Optional[List[str]] = None,
-        lora_uids: Optional[List[str]] = None,
-        prompt_embeddings_table=None,
-        prompt_embeddings_checkpoint_path: Optional[str] = None,
-        streaming: bool = False,
-        output_log_probs: bool = False,
-        output_generation_logits: bool = False,
-        output_context_logits: bool = False,
-    ) -> Union[List[List[str]], Iterable[List[List[str]]]]:
-        """
-        The forward function performs LLM evaluation on the provided array of prompts with other parameters shared,
-        and returns the generated texts. If 'streaming' is True, the output texts are returned incrementally
-        with a generator: one token appended to each output at a time. If 'streaming' is false, the final output texts
-        are returned as a single list of responses.
-        """
-
-        if stop_words_list is not None and stop_words_list != []:
-            raise NotImplementedError("stop_words_list is not supported")
-
-        if bad_words_list is not None and bad_words_list != []:
-            raise NotImplementedError("bad_words_list is not supported")
-
-        if no_repeat_ngram_size is not None:
-            raise NotImplementedError("no_repeat_ngram_size is not supported")
-
-        if task_ids is not None and task_ids != []:
-            raise NotImplementedError("task_ids is not supported")
-
-        if prompt_embeddings_table is not None:
-            raise NotImplementedError("prompt_embeddings_table is not supported")
-
-        if prompt_embeddings_checkpoint_path is not None:
-            raise NotImplementedError("prompt_embeddings_checkpoint_path is not supported")
-
-        if output_log_probs:
-            raise NotImplementedError("output_log_probs is not supported")
-
-        if output_generation_logits:
-            raise NotImplementedError("output_generation_logits is not supported")
-
-        if output_context_logits:
-            raise NotImplementedError("output_context_logits is not supported")
-
-        request_ids = []
-        for index in range(len(input_texts)):
-            prompt = input_texts[index]
-
-            if lora_uids is not None and index < len(lora_uids):
-                lora_uid = lora_uids[index]
-            else:
-                lora_uid = None
-
-            request_id = self._add_request_to_engine(
-                prompt=prompt,
-                max_output_len=max_output_len,
-                temperature=temperature,
-                top_k=top_k,
-                top_p=top_p,
-                lora_uid=lora_uid,
-            )
-            request_ids.append(request_id)
-
-        if streaming:
-            return self._forward_streaming(request_ids)
-        else:
-            return self._forward_regular(request_ids)
diff --git a/nemo/export/vllm_hf_exporter.py b/nemo/export/vllm_hf_exporter.py
deleted file mode 100755
index 9e15208a39bf..000000000000
--- a/nemo/export/vllm_hf_exporter.py
+++ /dev/null
@@ -1,132 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from typing import List
-
-import numpy as np
-from pytriton.decorators import batch, first_value
-from pytriton.model_config import Tensor
-from vllm import LLM, SamplingParams
-from vllm.lora.request import LoRARequest
-
-from nemo.deploy import ITritonDeployable
-from nemo.deploy.utils import cast_output, str_ndarray2list
-
-
-class vLLMHFExporter(ITritonDeployable):
-    """
-    The Exporter class uses vLLM APIs to convert a HF model to vLLM and makes the class,
-    deployable with Triton server.
-
-    Example:
-        from nemo.export import vLLMHFExporter
-        from nemo.deploy import DeployPyTriton
-
-        exporter = vLLMHFExporter()
-        exporter.export(model="/path/to/model/")
-
-        server = DeployPyTriton(
-            model=exporter,
-            triton_model_name='model'
-        )
-
-        server.deploy()
-        server.serve()
-        server.stop()
-    """
-
-    def __init__(self):
-        self.model = None
-        self.lora_models = None
-
-    def export(self, model, enable_lora: bool = False):
-        """
-        Exports the HF checkpoint to vLLM and initializes the engine.
-        Args:
-            model (str): model name or the path
-        """
-        self.model = LLM(model=model, enable_lora=enable_lora)
-
-    def add_lora_models(self, lora_model_name, lora_model):
-        if self.lora_models is None:
-            self.lora_models = {}
-        self.lora_models[lora_model_name] = lora_model
-
-    @property
-    def get_triton_input(self):
-        inputs = (
-            Tensor(name="prompts", shape=(-1,), dtype=bytes),
-            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
-            Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
-            Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
-            Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
-        )
-        return inputs
-
-    @property
-    def get_triton_output(self):
-        outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
-        return outputs
-
-    @batch
-    @first_value("max_output_len", "top_k", "top_p", "temperature")
-    def triton_infer_fn(self, **inputs: np.ndarray):
-        try:
-            infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
-            if "max_output_len" in inputs:
-                infer_input["max_output_len"] = inputs.pop("max_output_len")
-            if "top_k" in inputs:
-                infer_input["top_k"] = inputs.pop("top_k")
-            if "top_p" in inputs:
-                infer_input["top_p"] = inputs.pop("top_p")
-            if "temperature" in inputs:
-                infer_input["temperature"] = inputs.pop("temperature")
-
-            output_texts = self.forward(**infer_input)
-            output = cast_output(output_texts, np.bytes_)
-        except Exception as error:
-            err_msg = "An error occurred: {0}".format(str(error))
-            output = cast_output([err_msg], np.bytes_)
-
-        return {"outputs": output}
-
-    def forward(
-        self,
-        input_texts: List[str],
-        max_output_len: int = 64,
-        top_k: int = 1,
-        top_p: float = 0.1,
-        temperature: float = 1.0,
-        lora_model_name: str = None,
-    ):
-        assert self.model is not None, "Model is not initialized."
-
-        lora_request = None
-        if lora_model_name is not None:
-            if self.lora_models is None:
-                raise Exception("No lora models are available.")
-            assert lora_model_name in self.lora_models.keys(), "Lora model was not added before"
-            lora_request = LoRARequest(lora_model_name, 1, self.lora_models[lora_model_name])
-
-        sampling_params = SamplingParams(
-            max_tokens=max_output_len, temperature=temperature, top_k=int(top_k), top_p=top_p
-        )
-
-        request_output = self.model.generate(input_texts, sampling_params, lora_request=lora_request)
-        output = []
-        for o in request_output:
-            output.append(o.outputs[0].text)
-
-        return output
diff --git a/scripts/deploy/multimodal/deploy_triton.py b/scripts/deploy/multimodal/deploy_triton.py
deleted file mode 100755
index 4c996c72f8bc..000000000000
--- a/scripts/deploy/multimodal/deploy_triton.py
+++ /dev/null
@@ -1,237 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import os
-import sys
-from pathlib import Path
-
-from nemo.deploy import DeployPyTriton
-
-LOGGER = logging.getLogger("NeMo")
-
-multimodal_supported = True
-try:
-    from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-except Exception as e:
-    LOGGER.warning(f"Cannot import the TensorRTMMExporter exporter, it will not be available. {type(e).__name__}: {e}")
-    multimodal_supported = False
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Deploy nemo models to Triton",
-    )
-    # default modality is vision, can be changed to audio
-    parser.add_argument(
-        "-mod",
-        "--modality",
-        type=str,
-        required=False,
-        default="vision",
-        choices=["vision", "audio"],
-        help="Modality of the model",
-    )
-    parser.add_argument("-vc", "--visual_checkpoint", type=str, help="Source .nemo file for visual model")
-    parser.add_argument(
-        "-lc",
-        "--llm_checkpoint",
-        type=str,
-        required=False,
-        help="Source .nemo file for llm",
-    )
-    parser.add_argument(
-        "-mt",
-        "--model_type",
-        type=str,
-        required=True,
-        choices=["neva", "video-neva", "lita", "vila", "vita", "salm", "mllama"],
-        help="Type of the model that is supported.",
-    )
-    parser.add_argument(
-        "-lmt",
-        "--llm_model_type",
-        type=str,
-        required=True,
-        choices=["gptnext", "gpt", "llama", "falcon", "starcoder", "mixtral", "gemma", "mllama"],
-        help="Type of LLM. gptnext, gpt, llama, falcon, and starcoder are only supported."
-        " gptnext and gpt are the same and keeping it for backward compatibility",
-    )
-    parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
-    parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
-    parser.add_argument(
-        "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
-    )
-    parser.add_argument(
-        "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
-    )
-    parser.add_argument(
-        "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion"
-    )
-    parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment")
-    parser.add_argument(
-        "-dt",
-        "--dtype",
-        choices=["bfloat16", "float16"],
-        default="bfloat16",
-        type=str,
-        help="dtype of the model on TensorRT",
-    )
-    parser.add_argument("-mil", "--max_input_len", default=4096, type=int, help="Max input length of the model")
-    parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model")
-    parser.add_argument("-mbs", "--max_batch_size", default=1, type=int, help="Max batch size of the llm model")
-    parser.add_argument("-mml", "--max_multimodal_len", default=3072, type=int, help="Max length of multimodal input")
-    parser.add_argument(
-        "-vmb",
-        "--vision_max_batch_size",
-        default=1,
-        type=int,
-        help="Max batch size of the visual inputs, for lita/vita model with video inference, this should be set to 256",
-    )
-    parser.add_argument(
-        '--use_lora_plugin',
-        nargs='?',
-        const=None,
-        choices=['float16', 'float32', 'bfloat16'],
-        help="Activates the lora plugin which enables embedding sharing.",
-    )
-    parser.add_argument(
-        '--lora_target_modules',
-        nargs='+',
-        default=None,
-        choices=[
-            "attn_qkv",
-            "attn_q",
-            "attn_k",
-            "attn_v",
-            "attn_dense",
-            "mlp_h_to_4h",
-            "mlp_gate",
-            "mlp_4h_to_h",
-        ],
-        help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.",
-    )
-    parser.add_argument(
-        '--max_lora_rank',
-        type=int,
-        default=64,
-        help='maximum lora rank for different lora modules. '
-        'It is used to compute the workspace size of lora plugin.',
-    )
-    parser.add_argument("--lora_checkpoint_path", default=None, type=str, help="The checkpoint path of LoRA weights")
-    args = parser.parse_args(argv)
-    return args
-
-
-def get_trt_deployable(args):
-    if args.triton_model_repository is None:
-        trt_path = "/tmp/trt_model_dir/"
-        LOGGER.info(
-            "/tmp/trt_model_dir/ path will be used as the TensorRT folder. "
-            "Please set the --triton_model_repository parameter if you'd like to use a path that already "
-            "includes the TensorRT model files."
-        )
-        Path(trt_path).mkdir(parents=True, exist_ok=True)
-    else:
-        trt_path = args.triton_model_repository
-
-    if args.visual_checkpoint is None and args.triton_model_repository is None:
-        raise ValueError(
-            "The provided model repository is not a valid TensorRT model "
-            "directory. Please provide a --visual_checkpoint."
-        )
-
-    if args.visual_checkpoint is None and not os.path.isdir(args.triton_model_repository):
-        raise ValueError(
-            "The provided model repository is not a valid TensorRT model "
-            "directory. Please provide a --visual_checkpoint."
-        )
-
-    if args.visual_checkpoint is not None and args.model_type is None:
-        raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.")
-
-    exporter = TensorRTMMExporter(
-        model_dir=trt_path,
-        load_model=(args.visual_checkpoint is None),
-        modality=args.modality,
-    )
-
-    if args.visual_checkpoint is not None:
-        try:
-            LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT.")
-            exporter.export(
-                visual_checkpoint_path=args.visual_checkpoint,
-                llm_checkpoint_path=args.llm_checkpoint,
-                model_type=args.model_type,
-                llm_model_type=args.llm_model_type,
-                tensor_parallel_size=args.num_gpus,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-                vision_max_batch_size=args.vision_max_batch_size,
-                max_batch_size=args.max_batch_size,
-                max_multimodal_len=args.max_multimodal_len,
-                dtype=args.dtype,
-                use_lora_plugin=args.use_lora_plugin,
-                lora_target_modules=args.lora_target_modules,
-                max_lora_rank=args.max_lora_rank,
-                lora_checkpoint_path=args.lora_checkpoint_path,
-            )
-        except Exception as error:
-            raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
-
-    return exporter
-
-
-def nemo_deploy(argv):
-    args = get_args(argv)
-
-    loglevel = logging.INFO
-
-    LOGGER.setLevel(loglevel)
-    LOGGER.info("Logging level set to {}".format(loglevel))
-    LOGGER.info(args)
-
-    triton_deployable = get_trt_deployable(args)
-
-    try:
-        nm = DeployPyTriton(
-            model=triton_deployable,
-            triton_model_name=args.triton_model_name,
-            triton_model_version=args.triton_model_version,
-            max_batch_size=args.max_batch_size,
-            http_port=args.triton_port,
-            address=args.triton_http_address,
-        )
-
-        LOGGER.info("Triton deploy function will be called.")
-        nm.deploy()
-    except Exception as error:
-        LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-        return
-
-    try:
-        LOGGER.info("Model serving on Triton is will be started.")
-        nm.serve()
-    except Exception as error:
-        LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-        return
-
-    LOGGER.info("Model serving will be stopped.")
-    nm.stop()
-
-
-if __name__ == '__main__':
-    nemo_deploy(sys.argv[1:])
diff --git a/scripts/deploy/multimodal/query.py b/scripts/deploy/multimodal/query.py
deleted file mode 100644
index 3de08e0cbce9..000000000000
--- a/scripts/deploy/multimodal/query.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import sys
-
-from nemo.deploy.multimodal import NemoQueryMultimodal
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Query Triton Multimodal server",
-    )
-    parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server")
-    parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model")
-    parser.add_argument("-mt", "--model_type", required=True, type=str, help="Type of the triton model")
-    parser.add_argument("-int", "--input_text", required=True, type=str, help="Input text")
-    parser.add_argument("-im", "--input_media", required=True, type=str, help="File path of input media")
-    parser.add_argument("-bs", "--batch_size", default=1, type=int, help="Batch size")
-    parser.add_argument("-mol", "--max_output_len", default=128, type=int, help="Max output token length")
-    parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k")
-    parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p")
-    parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature")
-    parser.add_argument("-rp", "--repetition_penalty", default=1.0, type=float, help="repetition_penalty")
-    parser.add_argument("-nb", "--num_beams", default=1, type=int, help="num_beams")
-    parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server")
-    parser.add_argument(
-        "-lt",
-        "--lora_task_uids",
-        default=None,
-        type=str,
-        nargs="+",
-        help="The list of LoRA task uids; use -1 to disable the LoRA module",
-    )
-
-    args = parser.parse_args(argv)
-    return args
-
-
-if __name__ == '__main__':
-    args = get_args(sys.argv[1:])
-    nq = NemoQueryMultimodal(url=args.url, model_name=args.model_name, model_type=args.model_type)
-    output = nq.query(
-        input_text=args.input_text,
-        input_media=args.input_media,
-        batch_size=args.batch_size,
-        max_output_len=args.max_output_len,
-        top_k=args.top_k,
-        top_p=args.top_p,
-        temperature=args.temperature,
-        repetition_penalty=args.repetition_penalty,
-        num_beams=args.num_beams,
-        init_timeout=args.init_timeout,
-        lora_uids=args.lora_task_uids,
-    )
-    print(output)
diff --git a/scripts/deploy/nlp/benchmark_llm_inframework.py b/scripts/deploy/nlp/benchmark_llm_inframework.py
deleted file mode 100644
index 0a44985e1bc0..000000000000
--- a/scripts/deploy/nlp/benchmark_llm_inframework.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import sys
-import time
-from typing import Any, Dict
-
-import numpy as np
-
-from nemo.deploy.nlp import NemoQueryLLMPyTorch
-
-# Test prompts for benchmarking
-TEST_PROMPTS = [
-    "What is the capital of France?",
-    "Explain quantum computing in simple terms.",
-    "Write a short poem about artificial intelligence.",
-    "What are the main differences between Python and Java?",
-    "Describe the process of photosynthesis.",
-    "What is the meaning of life?",
-    "Explain the concept of blockchain technology.",
-    "Write a brief summary of the novel '1984' by George Orwell.",
-    "What are the key principles of machine learning?",
-    "Describe the water cycle in nature.",
-]
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description="Benchmarks Triton server running an in-framework Nemo model",
-    )
-    parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server")
-    parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model")
-    parser.add_argument("-n", "--num_queries", default=10, type=int, help="Number of queries to run")
-    parser.add_argument("-b", "--batch_size", default=1, type=int, help="Number of queries to send in a batch")
-    parser.add_argument("-mol", "--max_output_len", default=128, type=int, help="Max output token length")
-    parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k")
-    parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p")
-    parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature")
-    parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server")
-    parser.add_argument("-clp", "--compute_logprob", default=None, action='store_true', help="Returns log_probs")
-    parser.add_argument(
-        "-w", "--warmup", default=3, type=int, help="Number of warmup queries to run before benchmarking"
-    )
-
-    args = parser.parse_args(argv)
-    return args
-
-
-def run_benchmark(
-    url: str,
-    model_name: str,
-    num_queries: int,
-    batch_size: int,
-    max_output_len: int = 128,
-    top_k: int = 1,
-    top_p: float = 0.0,
-    temperature: float = 1.0,
-    compute_logprob: bool = None,
-    init_timeout: float = 60.0,
-    warmup: int = 3,
-) -> Dict[str, Any]:
-    """
-    Run a benchmark of the LLM deployment.
-
-    Args:
-        url: URL of the Triton server
-        model_name: Name of the model to query
-        num_queries: Number of queries to run for benchmarking
-        batch_size: Number of queries to send in a batch
-        max_output_len: Maximum output length
-        top_k: Top-k sampling parameter
-        top_p: Top-p sampling parameter
-        temperature: Temperature for sampling
-        compute_logprob: Whether to compute log probabilities
-        init_timeout: Initialization timeout
-        warmup: Number of warmup queries to run
-
-    Returns:
-        Dictionary containing benchmark results
-    """
-    nemo_query = NemoQueryLLMPyTorch(url, model_name)
-    latencies = []
-    outputs = []
-
-    # Warmup phase
-    print(f"Running {warmup} warmup queries...")
-    for _ in range(warmup):
-        nemo_query.query_llm(
-            prompts=[TEST_PROMPTS[0]],  # Use first prompt for warmup
-            max_length=max_output_len,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            compute_logprob=compute_logprob,
-            init_timeout=init_timeout,
-        )
-
-    # Benchmark phase
-    print(f"Running {num_queries} benchmark queries with batch size {batch_size}...")
-    num_batches = (num_queries + batch_size - 1) // batch_size
-
-    for batch_idx in range(num_batches):
-        start_idx = batch_idx * batch_size
-        end_idx = min((batch_idx + 1) * batch_size, num_queries)
-        current_batch_size = end_idx - start_idx
-
-        # Select prompts for this batch
-        batch_prompts = []
-        for i in range(current_batch_size):
-            prompt_idx = (start_idx + i) % len(TEST_PROMPTS)
-            batch_prompts.append(TEST_PROMPTS[prompt_idx])
-
-        start_time = time.time()
-        result = nemo_query.query_llm(
-            prompts=batch_prompts,
-            max_length=max_output_len,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            compute_logprob=compute_logprob,
-            init_timeout=init_timeout,
-        )
-        end_time = time.time()
-
-        # Calculate per-query latency
-        batch_latency = end_time - start_time
-        per_query_latency = batch_latency / current_batch_size
-
-        for i in range(current_batch_size):
-            latencies.append(per_query_latency)
-            outputs.append(result[i] if isinstance(result, list) else result)
-            print(f"Query {start_idx + i + 1}/{num_queries} completed in {per_query_latency:.2f} seconds")
-
-    # Calculate statistics
-    latencies = np.array(latencies)
-    stats = {
-        "mean_latency": np.mean(latencies),
-        "median_latency": np.median(latencies),
-        "p95_latency": np.percentile(latencies, 95),
-        "p99_latency": np.percentile(latencies, 99),
-        "min_latency": np.min(latencies),
-        "max_latency": np.max(latencies),
-        "std_latency": np.std(latencies),
-        "queries_per_second": 1.0 / np.mean(latencies),
-        "total_queries": num_queries,
-        "warmup_queries": warmup,
-        "batch_size": batch_size,
-    }
-
-    return stats
-
-
-def print_benchmark_results(stats: Dict[str, Any]) -> None:
-    """Print benchmark results in a formatted way."""
-    print("\nBenchmark Results:")
-    print("=" * 50)
-    print(f"Total Queries: {stats['total_queries']}")
-    print(f"Warmup Queries: {stats['warmup_queries']}")
-    print(f"Batch Size: {stats['batch_size']}")
-    print("\nLatency Statistics (seconds):")
-    print(f"Mean: {stats['mean_latency']:.3f}")
-    print(f"Median: {stats['median_latency']:.3f}")
-    print(f"95th Percentile: {stats['p95_latency']:.3f}")
-    print(f"99th Percentile: {stats['p99_latency']:.3f}")
-    print(f"Min: {stats['min_latency']:.3f}")
-    print(f"Max: {stats['max_latency']:.3f}")
-    print(f"Std Dev: {stats['std_latency']:.3f}")
-    print(f"\nThroughput: {stats['queries_per_second']:.2f} queries/second")
-
-
-def benchmark(argv):
-    args = get_args(argv)
-
-    stats = run_benchmark(
-        url=args.url,
-        model_name=args.model_name,
-        num_queries=args.num_queries,
-        batch_size=args.batch_size,
-        max_output_len=args.max_output_len,
-        top_k=args.top_k,
-        top_p=args.top_p,
-        temperature=args.temperature,
-        compute_logprob=args.compute_logprob,
-        init_timeout=args.init_timeout,
-        warmup=args.warmup,
-    )
-
-    print_benchmark_results(stats)
-
-
-if __name__ == '__main__':
-    benchmark(sys.argv[1:])
diff --git a/scripts/deploy/nlp/deploy_in_fw_oai_server_eval.py b/scripts/deploy/nlp/deploy_in_fw_oai_server_eval.py
deleted file mode 100644
index d2499e3337a0..000000000000
--- a/scripts/deploy/nlp/deploy_in_fw_oai_server_eval.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-from nemo.collections.llm.api import deploy
-
-# NOTE: This script is an example script to deploy a nemo2 model in-framework (i.e wo converting the model to any
-# other model) on PyTriton server by exposing the OpenAI API endpoints (v1/completions and v1/chat/completions).
-# The intended use case of this script is to run evaluations with NVIDIA LM-Evaluation-Harness.
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(description="NeMo2.0 Deployment")
-    parser.add_argument(
-        "--nemo_checkpoint",
-        type=str,
-        help="NeMo 2.0 checkpoint to be evaluated",
-    ),
-    parser.add_argument(
-        "--ngpus",
-        type=int,
-        default=1,
-        help="Num of gpus per node",
-    ),
-    parser.add_argument(
-        "--nnodes",
-        type=int,
-        default=1,
-        help="Num of nodes",
-    ),
-    parser.add_argument(
-        "--tensor_parallelism_size",
-        type=int,
-        default=1,
-        help="Tensor parallelism size to deploy the model",
-    ),
-    parser.add_argument(
-        "--pipeline_parallelism_size",
-        type=int,
-        default=1,
-        help="Pipeline parallelism size to deploy the model",
-    )
-    parser.add_argument(
-        "--context_parallel_size",
-        type=int,
-        default=1,
-        help="context parallelism size to deploy the model",
-    )
-    parser.add_argument(
-        "--expert_model_parallel_size",
-        type=int,
-        default=1,
-        help="Expert model parallelism size to deploy the model",
-    )
-    parser.add_argument(
-        "--expert_tensor_parallel_size",
-        type=int,
-        default=1,
-        help="Expert tensor parallelism size to deploy the model",
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        default=8,
-        help="Max batch size for the underlying Triton server",
-    )
-    parser.add_argument(
-        "--max_input_len",
-        type=int,
-        default=4096,
-        help="Max input length for the underlying Triton server",
-    )
-    return parser
-
-
-if __name__ == "__main__":
-    args = get_parser().parse_args()
-    deploy(
-        nemo_checkpoint=args.nemo_checkpoint,
-        num_gpus=args.ngpus,
-        num_nodes=args.nnodes,
-        fastapi_port=8886,
-        tensor_parallelism_size=args.tensor_parallelism_size,
-        pipeline_parallelism_size=args.pipeline_parallelism_size,
-        context_parallel_size=args.context_parallel_size,
-        expert_model_parallel_size=args.expert_model_parallel_size,
-        expert_tensor_parallel_size=args.expert_tensor_parallel_size,
-        max_batch_size=args.max_batch_size,
-        max_input_len=args.max_input_len,
-    )
diff --git a/scripts/deploy/nlp/deploy_inframework_hf_triton.py b/scripts/deploy/nlp/deploy_inframework_hf_triton.py
deleted file mode 100755
index 27c767b0dc50..000000000000
--- a/scripts/deploy/nlp/deploy_inframework_hf_triton.py
+++ /dev/null
@@ -1,232 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import logging
-import os
-import sys
-
-import torch
-import torch.distributed as dist
-
-from nemo.deploy import DeployPyTriton
-from nemo.deploy.nlp.hf_deployable import HuggingFaceLLMDeploy
-
-LOGGER = logging.getLogger("NeMo")
-
-
-def setup_torch_dist(rank, world_size):
-    """Sets up PyTorch distributed training environment.
-
-    Args:
-        rank (int): The rank of the current process
-        world_size (int): Total number of processes for distributed training
-    """
-
-    torch.cuda.set_device(rank)
-    # Initialize the process group
-    dist.init_process_group("nccl", rank=rank, world_size=world_size)
-
-
-def get_args(argv):
-    """Get command line arguments for deploying HuggingFace models to Triton.
-
-    Returns:
-        argparse.Namespace: Parsed command line arguments including:
-            - hf_model_id_path: Path to HuggingFace model
-            - task: Model task type (text-generation)
-            - device_map: Device mapping strategy
-            - tp_plan: Tensor parallelism plan
-            - trust_remote_code: Whether to trust remote code
-            - triton_model_name: Name for model in Triton
-            - triton_model_version: Model version number
-            - triton_port: Triton HTTP port
-            - triton_http_address: Triton HTTP address
-            - max_batch_size: Maximum inference batch size
-            - debug_mode: Enable debug logging
-    """
-
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description="Deploy HuggingFace models to Triton Inference Server",
-    )
-    parser.add_argument(
-        "-hp",
-        "--hf_model_id_path",
-        type=str,
-        help="Path to local HuggingFace " "model directory or model ID from HuggingFace " "Hub",
-    )
-    parser.add_argument(
-        "-t",
-        "--task",
-        nargs='?',
-        choices=['text-generation'],
-        default="text-generation",
-        type=str,
-        help="Task type for the HuggingFace model (currently only text-generation is supported)",
-    )
-    parser.add_argument(
-        "-dvm",
-        "--device_map",
-        nargs='?',
-        choices=['auto', 'balanced', 'balanced_low_0', 'sequential'],
-        default=None,
-        type=str,
-        help="Device mapping " "strategy for model placement " "(e.g. 'auto', 'sequential', etc)",
-    )
-    parser.add_argument(
-        "-tpp",
-        "--tp_plan",
-        nargs='?',
-        choices=['auto'],
-        default=None,
-        type=str,
-        help="Tensor parallelism plan for distributed inference",
-    )
-    parser.add_argument(
-        "-trc",
-        "--trust_remote_code",
-        default=False,
-        action='store_true',
-        help="Allow loading " "remote code from HuggingFace " "Hub",
-    )
-    parser.add_argument(
-        "-tmn", "--triton_model_name", required=True, type=str, help="Name to " "identify the model in " "Triton"
-    )
-    parser.add_argument(
-        "-tmv", "--triton_model_version", default=1, type=int, help="Version " "number for the model " "in Triton"
-    )
-    parser.add_argument(
-        "-trp", "--triton_port", default=8000, type=int, help="Port number for Triton server " "HTTP endpoint"
-    )
-    parser.add_argument(
-        "-tha",
-        "--triton_http_address",
-        default="0.0.0.0",
-        type=str,
-        help="Network interface " "address for Triton HTTP endpoint",
-    )
-    parser.add_argument(
-        "-mbs", "--max_batch_size", default=8, type=int, help="Maximum " "batch size for model inference"
-    )
-    parser.add_argument(
-        "-dm", "--debug_mode", default=False, action='store_true', help="Enable " "verbose debug logging"
-    )
-    args = parser.parse_args(argv)
-    return args
-
-
-def hf_deploy(argv):
-    """Deploy a HuggingFace model to Triton Inference Server.
-
-    This function handles the deployment workflow including:
-    - Parsing command line arguments
-    - Setting up distributed training if needed
-    - Initializing the HuggingFace model
-    - Starting the Triton server
-
-    Args:
-        argv: Command line arguments
-
-    Raises:
-        ValueError: If required arguments are missing or invalid
-    """
-
-    args = get_args(argv)
-
-    if args.debug_mode:
-        loglevel = logging.DEBUG
-    else:
-        loglevel = logging.INFO
-
-    LOGGER.setLevel(loglevel)
-    LOGGER.info("Logging level set to {}".format(loglevel))
-    LOGGER.info(args)
-
-    if args.hf_model_id_path is None:
-        raise ValueError("In-Framework deployment requires a Hugging Face model ID or path.")
-
-    if "RANK" in os.environ:
-        rank = int(os.environ["RANK"])
-        world_size = int(os.environ["WORLD_SIZE"])
-        if world_size > 1:
-            setup_torch_dist(rank, world_size)
-    else:
-        if args.device_map == "auto":
-            LOGGER.warning(
-                "device_map is set to auto and it is recommended that the script"
-                "is started with torchrun with a process per GPU. You might "
-                "see unexpected issues during the inference otherwise."
-            )
-
-        if args.tp_plan is not None:
-            raise ValueError("tp_plan is only available with torchrun.")
-
-    hf_deployable = HuggingFaceLLMDeploy(
-        hf_model_id_path=args.hf_model_id_path,
-        task=args.task,
-        trust_remote_code=args.trust_remote_code,
-        device_map=args.device_map,
-        tp_plan=args.tp_plan,
-    )
-
-    start_triton_server = True
-    if dist.is_initialized():
-        if dist.get_rank() > 0:
-            start_triton_server = False
-
-    if start_triton_server:
-        try:
-            nm = DeployPyTriton(
-                model=hf_deployable,
-                triton_model_name=args.triton_model_name,
-                triton_model_version=args.triton_model_version,
-                max_batch_size=args.max_batch_size,
-                http_port=args.triton_port,
-                address=args.triton_http_address,
-            )
-
-            LOGGER.info("Triton deploy function will be called.")
-            nm.deploy()
-        except Exception as error:
-            LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-            if dist.is_initialized():
-                dist.barrier()
-            return
-
-        try:
-            LOGGER.info("Model serving on Triton will be started.")
-            nm.serve()
-        except Exception as error:
-            LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-
-        if dist.is_initialized():
-            if dist.get_world_size() > 1:
-                torch.distributed.broadcast(torch.tensor([1], dtype=torch.long, device="cuda"), src=0)
-
-        LOGGER.info("Model serving will be stopped.")
-        nm.stop()
-    else:
-        if dist.is_initialized():
-            if dist.get_rank() > 0:
-                hf_deployable.generate_other_ranks()
-
-    if dist.is_initialized():
-        dist.barrier()
-        dist.destroy_process_group()
-
-
-if __name__ == '__main__':
-    hf_deploy(sys.argv[1:])
diff --git a/scripts/deploy/nlp/deploy_inframework_triton.py b/scripts/deploy/nlp/deploy_inframework_triton.py
deleted file mode 100755
index 5b175b1829f2..000000000000
--- a/scripts/deploy/nlp/deploy_inframework_triton.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import sys
-import torch
-
-from nemo.deploy import DeployPyTriton
-
-LOGGER = logging.getLogger("NeMo")
-
-megatron_llm_supported = True
-try:
-    from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployableNemo2
-except Exception as e:
-    LOGGER.warning(f"Cannot import MegatronLLMDeployable, it will not be available. {type(e).__name__}: {e}")
-    megatron_llm_supported = False
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Deploy nemo models to Triton",
-    )
-    parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
-    parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
-    parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
-    parser.add_argument(
-        "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
-    )
-    parser.add_argument(
-        "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
-    )
-    parser.add_argument("-ng", "--num_gpus", default=1, type=int, help="Number of GPUs for the deployment")
-    parser.add_argument("-nn", "--num_nodes", default=1, type=int, help="Number of GPUs for the deployment")
-    parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
-    parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size")
-    parser.add_argument("-cps", "--context_parallel_size", default=1, type=int, help="Pipeline parallelism size")
-    parser.add_argument(
-        "-emps",
-        "--expert_model_parallel_size",
-        default=1,
-        type=int,
-        help="Distributes MoE Experts across sub data parallel dimension.",
-    )
-    parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
-    parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
-    parser.add_argument(
-        "-fd",
-        '--enable_flash_decode',
-        default=False,
-        action='store_true',
-        help='Enable flash decoding',
-    )
-    parser.add_argument("-lc", "--legacy_ckpt", action="store_true", help="Load checkpoint saved with TE < 1.14")
-    args = parser.parse_args(argv)
-    return args
-
-
-def nemo_deploy(argv):
-    args = get_args(argv)
-
-    if args.debug_mode:
-        loglevel = logging.DEBUG
-    else:
-        loglevel = logging.INFO
-
-    LOGGER.setLevel(loglevel)
-    LOGGER.info("Logging level set to {}".format(loglevel))
-    LOGGER.info(args)
-
-    if not megatron_llm_supported:
-        raise ValueError("MegatronLLMDeployable is not supported in this environment.")
-
-    if args.nemo_checkpoint is None:
-        raise ValueError("In-Framework deployment requires a checkpoint folder.")
-
-    model = MegatronLLMDeployableNemo2(
-        nemo_checkpoint_filepath=args.nemo_checkpoint,
-        num_devices=args.num_gpus,
-        num_nodes=args.num_nodes,
-        tensor_model_parallel_size=args.tensor_parallelism_size,
-        pipeline_model_parallel_size=args.pipeline_parallelism_size,
-        context_parallel_size=args.context_parallel_size,
-        expert_model_parallel_size=args.expert_model_parallel_size,
-        max_batch_size=args.max_batch_size,
-        enable_flash_decode=args.enable_flash_decode,
-        legacy_ckpt=args.legacy_ckpt,
-    )
-
-    if torch.distributed.is_initialized():
-        if torch.distributed.get_rank() == 0:
-            try:
-                nm = DeployPyTriton(
-                    model=model,
-                    triton_model_name=args.triton_model_name,
-                    triton_model_version=args.triton_model_version,
-                    max_batch_size=args.max_batch_size,
-                    http_port=args.triton_port,
-                    address=args.triton_http_address,
-                )
-
-                LOGGER.info("Triton deploy function will be called.")
-                nm.deploy()
-            except Exception as error:
-                LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-                return
-
-            try:
-                LOGGER.info("Model serving on Triton will be started.")
-                nm.serve()
-            except Exception as error:
-                LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-                return
-
-            torch.distributed.broadcast(torch.tensor([1], dtype=torch.long, device="cuda"), src=0)
-
-            LOGGER.info("Model serving will be stopped.")
-            nm.stop()
-        elif torch.distributed.get_rank() > 0:
-            model.generate_other_ranks()
-
-    else:
-        LOGGER.info("Torch distributed wasn't initialized.")
-
-
-if __name__ == '__main__':
-    nemo_deploy(sys.argv[1:])
diff --git a/scripts/deploy/nlp/deploy_triton.py b/scripts/deploy/nlp/deploy_triton.py
deleted file mode 100755
index e0d506308ff6..000000000000
--- a/scripts/deploy/nlp/deploy_triton.py
+++ /dev/null
@@ -1,500 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import json
-import logging
-import os
-import sys
-from pathlib import Path
-from typing import Optional
-
-import uvicorn
-from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
-
-from nemo.deploy import DeployPyTriton
-
-LOGGER = logging.getLogger("NeMo")
-
-
-class UsageError(Exception):
-    pass
-
-
-megatron_llm_supported = True
-try:
-    from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable
-except Exception as e:
-    LOGGER.warning(f"Cannot import MegatronLLMDeployable, it will not be available. {type(e).__name__}: {e}")
-    megatron_llm_supported = False
-
-trt_llm_supported = True
-try:
-    from nemo.export.tensorrt_llm import TensorRTLLM
-except Exception as e:
-    LOGGER.warning(f"Cannot import the TensorRTLLM exporter, it will not be available. {type(e).__name__}: {e}")
-    trt_llm_supported = False
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Deploy nemo models to Triton",
-    )
-    parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
-    parser.add_argument("-hfp", "--hf_model_id_path", type=str, help="Huggingface model path or id")
-    parser.add_argument(
-        "-ptnc",
-        "--ptuning_nemo_checkpoint",
-        nargs='+',
-        type=str,
-        required=False,
-        help="Source .nemo file for prompt embeddings table",
-    )
-    parser.add_argument(
-        '-ti', '--task_ids', nargs='+', type=str, required=False, help='Unique task names for the prompt embedding.'
-    )
-    parser.add_argument(
-        "-mt",
-        "--model_type",
-        type=str,
-        required=False,
-        help="Type of the model. gptnext, gpt, llama, falcon, and starcoder are only supported."
-        " gptnext and gpt are the same and keeping it for backward compatibility",
-    )
-    parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
-    parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
-    parser.add_argument(
-        "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
-    )
-    parser.add_argument(
-        "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
-    )
-    parser.add_argument(
-        "-trt", "--triton_request_timeout", default=60, type=int, help="Timeout in seconds for Triton server"
-    )
-    parser.add_argument(
-        "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the trt-llm conversion"
-    )
-    parser.add_argument("-ng", "--num_gpus", default=None, type=int, help="Number of GPUs for the deployment")
-    parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
-    parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size")
-    parser.add_argument(
-        "-dt",
-        "--dtype",
-        choices=["bfloat16", "float16", "fp8", "int8"],
-        default="bfloat16",
-        type=str,
-        help="dtype of the model on TensorRT-LLM",
-    )
-    parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model")
-    parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model")
-    parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
-    parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens")
-    parser.add_argument("-msl", "--max_seq_len", default=None, type=int, help="Maximum number of sequence length")
-    parser.add_argument("-mp", "--multiple_profiles", default=False, action='store_true', help="Multiple profiles")
-    parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens")
-    parser.add_argument(
-        "-gap", "--gpt_attention_plugin", default="auto", type=str, help="dtype of gpt attention plugin"
-    )
-    parser.add_argument("-gp", "--gemm_plugin", default="auto", type=str, help="dtype of gpt plugin")
-    parser.add_argument(
-        "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size"
-    )
-    parser.add_argument(
-        "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Enable paged kv cache."
-    )
-    parser.add_argument(
-        "-drip",
-        "--disable_remove_input_padding",
-        default=False,
-        action='store_true',
-        help="Disables the remove input padding option.",
-    )
-    parser.add_argument(
-        "-upe",
-        "--use_parallel_embedding",
-        default=False,
-        action='store_true',
-        help='Use parallel embedding feature of TensorRT-LLM.',
-    )
-    parser.add_argument(
-        "-mbm",
-        '--multi_block_mode',
-        default=False,
-        action='store_true',
-        help='Split long kv sequence into multiple blocks (applied to generation MHA kernels). \
-                        It is beneifical when batchxnum_heads cannot fully utilize GPU. \
-                        Only available when using c++ runtime.',
-    )
-    parser.add_argument(
-        "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences."
-    )
-    parser.add_argument(
-        '--use_lora_plugin',
-        nargs='?',
-        const=None,
-        choices=['float16', 'float32', 'bfloat16'],
-        help="Activates the lora plugin which enables embedding sharing.",
-    )
-    parser.add_argument(
-        '--lora_target_modules',
-        nargs='+',
-        default=None,
-        choices=[
-            "attn_qkv",
-            "attn_q",
-            "attn_k",
-            "attn_v",
-            "attn_dense",
-            "mlp_h_to_4h",
-            "mlp_gate",
-            "mlp_4h_to_h",
-        ],
-        help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.",
-    )
-    parser.add_argument(
-        '--max_lora_rank',
-        type=int,
-        default=64,
-        help='maximum lora rank for different lora modules. '
-        'It is used to compute the workspace size of lora plugin.',
-    )
-    parser.add_argument(
-        "-lc", "--lora_ckpt", default=None, type=str, nargs="+", help="The checkpoint list of LoRA weights"
-    )
-    parser.add_argument(
-        "-ucr",
-        '--use_cpp_runtime',
-        default=False,
-        action='store_true',
-        help='Use TensorRT LLM C++ runtime',
-    )
-    parser.add_argument(
-        "-b",
-        '--backend',
-        nargs='?',
-        const=None,
-        default='TensorRT-LLM',
-        choices=['TensorRT-LLM', 'In-Framework'],
-        help="Different options to deploy nemo model.",
-    )
-    parser.add_argument(
-        "-srs",
-        "--start_rest_service",
-        default=False,
-        type=bool,
-        help="Starts the REST service for OpenAI API support",
-    )
-    parser.add_argument(
-        "-sha", "--service_http_address", default="0.0.0.0", type=str, help="HTTP address for the REST Service"
-    )
-    parser.add_argument("-sp", "--service_port", default=8080, type=int, help="Port for the REST Service")
-    parser.add_argument(
-        "-ofr",
-        "--openai_format_response",
-        default=False,
-        type=bool,
-        help="Return the response from PyTriton server in OpenAI compatible format",
-    )
-    parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
-    parser.add_argument(
-        "-fp8",
-        "--export_fp8_quantized",
-        default="auto",
-        type=str,
-        help="Enables exporting to a FP8-quantized TRT LLM checkpoint",
-    )
-    parser.add_argument(
-        "-kv_fp8",
-        "--use_fp8_kv_cache",
-        default="auto",
-        type=str,
-        help="Enables exporting with FP8-quantizatized KV-cache",
-    )
-    args = parser.parse_args(argv)
-
-    def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
-        s = s.lower()
-        true_strings = ["true", "1"]
-        false_strings = ["false", "0"]
-        if s in true_strings:
-            return True
-        if s in false_strings:
-            return False
-        if optional and s == 'auto':
-            return None
-        raise UsageError(f"Invalid boolean value for argument --{name}: '{s}'")
-
-    args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True)
-    args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True)
-    return args
-
-
-def store_args_to_json(args):
-    """
-    Stores user defined arg values relevant for REST API in config.json
-    Gets called only when args.start_rest_service is True.
-    """
-    args_dict = {
-        "triton_service_ip": args.triton_http_address,
-        "triton_service_port": args.triton_port,
-        "triton_request_timeout": args.triton_request_timeout,
-        "openai_format_response": args.openai_format_response,
-    }
-    with open("nemo/deploy/service/config.json", "w") as f:
-        json.dump(args_dict, f)
-
-
-def get_trtllm_deployable(args):
-    if args.triton_model_repository is None:
-        trt_llm_path = "/tmp/trt_llm_model_dir/"
-        LOGGER.info(
-            "/tmp/trt_llm_model_dir/ path will be used as the TensorRT LLM folder. "
-            "Please set the --triton_model_repository parameter if you'd like to use a path that already "
-            "includes the TensorRT LLM model files."
-        )
-        Path(trt_llm_path).mkdir(parents=True, exist_ok=True)
-    else:
-        trt_llm_path = args.triton_model_repository
-
-    if args.hf_model_id_path:
-        # Check if the path is an existing hf checkpoint
-        LOGGER.info(f"Checking if the model is available in the local cache: {args.hf_model_id_path}")
-        local_path = Path(args.hf_model_id_path)
-        model_available = local_path.exists() and (local_path / "config.json").exists()
-        if not model_available:
-            # Download the model from huggingface
-            # Download model, tokenizer and config from HF
-            LOGGER.info(f"Downloading model from HuggingFace: {args.hf_model_id_path}")
-            try:
-                hf_model_cache_dir = "/tmp/hf_model_dir/"
-                Path(hf_model_cache_dir).mkdir(parents=True, exist_ok=True)
-                # Create model specific directory
-                hf_model_path = os.path.join(hf_model_cache_dir, args.hf_model_id_path)
-                Path(hf_model_path).mkdir(parents=True, exist_ok=True)
-
-                # Download model weights in safetensor format
-                model = AutoModelForCausalLM.from_pretrained(
-                    args.hf_model_id_path, cache_dir=hf_model_path, torch_dtype="auto", use_safetensors=True
-                )
-                # Download tokenizer files and config
-                tokenizer = AutoTokenizer.from_pretrained(args.hf_model_id_path, cache_dir=hf_model_path)
-                config = AutoConfig.from_pretrained(args.hf_model_id_path, cache_dir=hf_model_path)
-
-                # Save model weights to model directory
-                model.save_pretrained(hf_model_path, safe_serialization=True)
-
-                # Save tokenizer files and config to model directory
-                tokenizer.save_pretrained(hf_model_path)
-                config.save_pretrained(hf_model_path)
-                args.hf_model_id_path = hf_model_path
-
-                LOGGER.info(f"Downloaded model, tokenizer and config to {args.hf_model_id_path}")
-            except Exception as e:
-                raise RuntimeError(f"Error downloading from HuggingFace: {str(e)}")
-
-    checkpoint_missing = args.nemo_checkpoint is None and args.hf_model_id_path is None
-    if checkpoint_missing and args.triton_model_repository is None:
-        raise ValueError(
-            "The provided model repository is not a valid TensorRT-LLM model "
-            "directory. Please provide a --nemo_checkpoint."
-        )
-
-    if checkpoint_missing and not os.path.isdir(args.triton_model_repository):
-        raise ValueError(
-            "The provided model repository is not a valid TensorRT-LLM model "
-            "directory. Please provide a --nemo_checkpoint."
-        )
-
-    if not checkpoint_missing and args.model_type is None:
-        raise ValueError("Model type is required to be defined if a nemo checkpoint is provided.")
-
-    ptuning_tables_files = []
-    if not args.ptuning_nemo_checkpoint is None:
-        if args.max_prompt_embedding_table_size is None:
-            raise ValueError("max_prompt_embedding_table_size parameter is needed for the prompt tuning table(s).")
-
-        for pt_checkpoint in args.ptuning_nemo_checkpoint:
-            ptuning_nemo_checkpoint_path = Path(pt_checkpoint)
-            if ptuning_nemo_checkpoint_path.exists():
-                if ptuning_nemo_checkpoint_path.is_file():
-                    ptuning_tables_files.append(pt_checkpoint)
-                else:
-                    raise IsADirectoryError("Could not read the prompt tuning tables from {0}".format(pt_checkpoint))
-            else:
-                raise FileNotFoundError("File or directory {0} does not exist.".format(pt_checkpoint))
-
-        if args.task_ids is not None:
-            if len(ptuning_tables_files) != len(args.task_ids):
-                raise RuntimeError(
-                    "Number of task ids and prompt embedding tables have to match. "
-                    "There are {0} tables and {1} task ids.".format(len(ptuning_tables_files), len(args.task_ids))
-                )
-
-    trt_llm_exporter = TensorRTLLM(
-        model_dir=trt_llm_path,
-        lora_ckpt_list=args.lora_ckpt,
-        load_model=(args.nemo_checkpoint is None and args.hf_model_id_path is None),
-        use_python_runtime=(not args.use_cpp_runtime),
-        multi_block_mode=args.multi_block_mode,
-    )
-
-    if args.nemo_checkpoint is not None:
-        try:
-            LOGGER.info("Export operation will be started to export the nemo checkpoint to TensorRT-LLM.")
-            trt_llm_exporter.export(
-                nemo_checkpoint_path=args.nemo_checkpoint,
-                model_type=args.model_type,
-                tensor_parallelism_size=args.tensor_parallelism_size,
-                pipeline_parallelism_size=args.pipeline_parallelism_size,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-                max_batch_size=args.max_batch_size,
-                max_num_tokens=args.max_num_tokens,
-                opt_num_tokens=args.opt_num_tokens,
-                max_seq_len=args.max_seq_len,
-                use_parallel_embedding=args.use_parallel_embedding,
-                max_prompt_embedding_table_size=args.max_prompt_embedding_table_size,
-                paged_kv_cache=(not args.no_paged_kv_cache),
-                remove_input_padding=(not args.disable_remove_input_padding),
-                dtype=args.dtype,
-                use_lora_plugin=args.use_lora_plugin,
-                lora_target_modules=args.lora_target_modules,
-                max_lora_rank=args.max_lora_rank,
-                multiple_profiles=args.multiple_profiles,
-                gpt_attention_plugin=args.gpt_attention_plugin,
-                gemm_plugin=args.gemm_plugin,
-                fp8_quantized=args.export_fp8_quantized,
-                fp8_kvcache=args.use_fp8_kv_cache,
-            )
-        except Exception as error:
-            raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
-    elif args.hf_model_id_path is not None:
-        LOGGER.info("Export operation will be started to export the hugging face checkpoint to TensorRT-LLM.")
-        try:
-            trt_llm_exporter.export_hf_model(
-                hf_model_path=args.hf_model_id_path,
-                max_batch_size=args.max_batch_size,
-                tensor_parallelism_size=args.tensor_parallelism_size,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-                dtype=args.dtype,
-                model_type=args.model_type,
-            )
-        except Exception as error:
-            raise RuntimeError("An error has occurred during the model export. Error message: " + str(error))
-
-    try:
-        for i, prompt_embeddings_checkpoint_path in enumerate(ptuning_tables_files):
-            if args.task_ids is not None:
-                task_id = args.task_ids[i]
-            else:
-                task_id = i
-
-            LOGGER.info(
-                "Adding prompt embedding table: {0} with task id: {1}.".format(
-                    prompt_embeddings_checkpoint_path, task_id
-                )
-            )
-            trt_llm_exporter.add_prompt_table(
-                task_name=str(task_id),
-                prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path,
-            )
-    except Exception as error:
-        raise RuntimeError(
-            "An error has occurred during adding the prompt embedding table(s). Error message: " + str(error)
-        )
-    return trt_llm_exporter
-
-
-def get_nemo_deployable(args):
-    if args.nemo_checkpoint is None:
-        raise ValueError("In-Framework deployment requires a .nemo checkpoint")
-
-    return MegatronLLMDeployable(args.nemo_checkpoint, args.num_gpus)
-
-
-def nemo_deploy(argv):
-    args = get_args(argv)
-
-    if args.debug_mode:
-        loglevel = logging.DEBUG
-    else:
-        loglevel = logging.INFO
-
-    LOGGER.setLevel(loglevel)
-    LOGGER.info("Logging level set to {}".format(loglevel))
-    LOGGER.info(args)
-
-    if args.start_rest_service:
-        if args.service_port == args.triton_port:
-            logging.error("REST service port and Triton server port cannot use the same port.")
-            return
-        # Store triton ip, port and other args relevant for REST API in config.json to be accessible by rest_model_api.py
-        store_args_to_json(args)
-
-    backend = args.backend.lower()
-    if backend == 'tensorrt-llm':
-        if not trt_llm_supported:
-            raise ValueError("TensorRT-LLM engine is not supported in this environment.")
-        triton_deployable = get_trtllm_deployable(args)
-    elif backend == 'in-framework':
-        if not megatron_llm_supported:
-            raise ValueError("MegatronLLMDeployable is not supported in this environment.")
-        triton_deployable = get_nemo_deployable(args)
-    else:
-        raise ValueError("Backend: {0} is not supported.".format(backend))
-
-    try:
-        nm = DeployPyTriton(
-            model=triton_deployable,
-            triton_model_name=args.triton_model_name,
-            triton_model_version=args.triton_model_version,
-            max_batch_size=args.max_batch_size,
-            http_port=args.triton_port,
-            address=args.triton_http_address,
-            streaming=args.enable_streaming,
-        )
-
-        LOGGER.info("Triton deploy function will be called.")
-        nm.deploy()
-        nm.run()
-    except Exception as error:
-        LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-        return
-
-    try:
-        LOGGER.info("Model serving on Triton is will be started.")
-        if args.start_rest_service:
-            try:
-                LOGGER.info("REST service will be started.")
-                uvicorn.run(
-                    'nemo.deploy.service.rest_model_api:app',
-                    host=args.service_http_address,
-                    port=args.service_port,
-                    reload=True,
-                )
-            except Exception as error:
-                logging.error("Error message has occurred during REST service start. Error message: " + str(error))
-        nm.serve()
-    except Exception as error:
-        LOGGER.error("Error message has occurred during deploy function. Error message: " + str(error))
-        return
-    LOGGER.info("Model serving will be stopped.")
-    nm.stop()
-
-
-if __name__ == '__main__':
-    nemo_deploy(sys.argv[1:])
diff --git a/scripts/deploy/nlp/deploy_vllm_triton.py b/scripts/deploy/nlp/deploy_vllm_triton.py
deleted file mode 100755
index 2e95bbd49183..000000000000
--- a/scripts/deploy/nlp/deploy_vllm_triton.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import os
-import sys
-import tempfile
-
-from nemo.deploy import DeployPyTriton
-
-# Configure the NeMo logger to look the same as vLLM
-logging.basicConfig(format="%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s", datefmt="%m-%d %H:%M:%S")
-LOGGER = logging.getLogger("NeMo")
-
-try:
-    from nemo.export.vllm_exporter import vLLMExporter
-except Exception as e:
-    LOGGER.error(f"Cannot import the vLLM exporter. {type(e).__name__}: {e}")
-    sys.exit(1)
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Export NeMo models to vLLM and deploy them on Triton",
-    )
-    parser.add_argument("-nc", "--nemo_checkpoint", type=str, help="Source .nemo file")
-    parser.add_argument(
-        "-mt",
-        "--model_type",
-        type=str,
-        required=True,
-        choices=["llama", "mistral", "mixtral", "starcoder2", "gemma"],
-        help="Type of the model",
-    )
-    parser.add_argument("-tmn", "--triton_model_name", required=True, type=str, help="Name for the service")
-    parser.add_argument("-tmv", "--triton_model_version", default=1, type=int, help="Version for the service")
-    parser.add_argument(
-        "-trp", "--triton_port", default=8000, type=int, help="Port for the Triton server to listen for requests"
-    )
-    parser.add_argument(
-        "-tha", "--triton_http_address", default="0.0.0.0", type=str, help="HTTP address for the Triton server"
-    )
-    parser.add_argument(
-        "-tmr", "--triton_model_repository", default=None, type=str, help="Folder for the vLLM conversion"
-    )
-    parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
-    parser.add_argument(
-        "-dt",
-        "--dtype",
-        choices=["bfloat16", "float16", "fp8", "int8"],
-        default="bfloat16",
-        type=str,
-        help="dtype of the model on vLLM",
-    )
-    parser.add_argument(
-        "-mml", "--max_model_len", default=512, type=int, help="Max input + ouptut length of the model"
-    )
-    parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
-    parser.add_argument(
-        "-lc", "--lora_ckpt", default=[], type=str, nargs="+", help="List of LoRA checkpoints in HF format"
-    )
-    parser.add_argument(
-        "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences."
-    )
-    parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
-    parser.add_argument(
-        '-ws',
-        '--weight_storage',
-        default='auto',
-        choices=['auto', 'cache', 'file', 'memory'],
-        help='Strategy for storing converted weights for vLLM: "file" - always write weights into a file, '
-        '"memory" - always do an in-memory conversion, "cache" - reuse existing files if they are '
-        'newer than the nemo checkpoint, "auto" - use "cache" for multi-GPU runs and "memory" '
-        'for single-GPU runs.',
-    )
-    parser.add_argument(
-        "-gmu",
-        '--gpu_memory_utilization',
-        default=0.9,
-        type=float,
-        help="GPU memory utilization percentage for vLLM.",
-    )
-    parser.add_argument(
-        "-q",
-        "--quantization",
-        choices=["fp8"],
-        help="Quantization method for vLLM.",
-    )
-    args = parser.parse_args(argv)
-    return args
-
-
-def get_vllm_deployable(args, model_dir):
-    exporter = vLLMExporter()
-    exporter.export(
-        nemo_checkpoint=args.nemo_checkpoint,
-        model_dir=model_dir,
-        model_type=args.model_type,
-        tensor_parallel_size=args.tensor_parallelism_size,
-        max_model_len=args.max_model_len,
-        lora_checkpoints=args.lora_ckpt,
-        dtype=args.dtype,
-        weight_storage=args.weight_storage,
-        gpu_memory_utilization=args.gpu_memory_utilization,
-        quantization=args.quantization,
-    )
-    return exporter
-
-
-def nemo_deploy(argv):
-    args = get_args(argv)
-
-    if args.debug_mode:
-        loglevel = logging.DEBUG
-    else:
-        loglevel = logging.INFO
-
-    LOGGER.setLevel(loglevel)
-    LOGGER.info("Logging level set to {}".format(loglevel))
-    LOGGER.info(args)
-
-    # If no model_dir was supplied, create a temporary directory.
-    # This directory should persist while the model is being served, becaue it may contain
-    # converted LoRA checkpoints, and those are accessed by vLLM at request time.
-    tempdir = None
-    model_dir = args.triton_model_repository
-    if model_dir is None:
-        tempdir = tempfile.TemporaryDirectory()
-        model_dir = tempdir.name
-        LOGGER.info(
-            f"{model_dir} will be used for the vLLM intermediate folder. "
-            + "Please set the --triton_model_repository parameter if you'd like to use a path that already "
-            + "includes the vLLM model files."
-        )
-    elif not os.path.exists(model_dir):
-        os.makedirs(model_dir)
-
-    try:
-        triton_deployable = get_vllm_deployable(args, model_dir=model_dir)
-
-        nm = DeployPyTriton(
-            model=triton_deployable,
-            triton_model_name=args.triton_model_name,
-            triton_model_version=args.triton_model_version,
-            max_batch_size=args.max_batch_size,
-            http_port=args.triton_port,
-            address=args.triton_http_address,
-            streaming=args.enable_streaming,
-        )
-
-        LOGGER.info("Starting the Triton server...")
-        nm.deploy()
-        nm.serve()
-
-        LOGGER.info("Stopping the Triton server...")
-        nm.stop()
-
-    except Exception as error:
-        LOGGER.error("An error has occurred while setting up or serving the model. Error message: " + str(error))
-        return
-
-    # Clean up the temporary directory
-    finally:
-        if tempdir is not None:
-            tempdir.cleanup()
-
-
-if __name__ == '__main__':
-    nemo_deploy(sys.argv[1:])
diff --git a/scripts/deploy/nlp/query.py b/scripts/deploy/nlp/query.py
deleted file mode 100644
index 5d70102c8295..000000000000
--- a/scripts/deploy/nlp/query.py
+++ /dev/null
@@ -1,247 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import sys
-import typing
-
-import numpy as np
-from pytriton.client import DecoupledModelClient, ModelClient
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Sends a single query to an LLM hosted on a Triton server.",
-    )
-    parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server")
-    parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model")
-    prompt_group = parser.add_mutually_exclusive_group(required=True)
-    prompt_group.add_argument("-p", "--prompt", required=False, type=str, help="Prompt")
-    prompt_group.add_argument("-pf", "--prompt_file", required=False, type=str, help="File to read the prompt from")
-    parser.add_argument("-swl", "--stop_words_list", type=str, help="Stop words list")
-    parser.add_argument("-bwl", "--bad_words_list", type=str, help="Bad words list")
-    parser.add_argument("-nrns", "--no_repeat_ngram_size", type=int, help="No repeat ngram size")
-    parser.add_argument("-mol", "--max_output_len", default=128, type=int, help="Max output token length")
-    parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k")
-    parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p")
-    parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature")
-    parser.add_argument("-ti", "--task_id", type=str, help="Task id for the prompt embedding tables")
-    parser.add_argument(
-        "-lt",
-        "--lora_task_uids",
-        default=None,
-        type=str,
-        nargs="+",
-        help="The list of LoRA task uids; use -1 to disable the LoRA module",
-    )
-    parser.add_argument(
-        "-es", '--enable_streaming', default=False, action='store_true', help="Enables streaming sentences."
-    )
-    parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server")
-
-    args = parser.parse_args(argv)
-    return args
-
-
-def str_list2numpy(str_list: typing.List[str]) -> np.ndarray:
-    str_ndarray = np.array(str_list)[..., np.newaxis]
-    return np.char.encode(str_ndarray, "utf-8")
-
-
-def query_llm(
-    url,
-    model_name,
-    prompts,
-    stop_words_list=None,
-    bad_words_list=None,
-    no_repeat_ngram_size=None,
-    max_output_len=128,
-    top_k=1,
-    top_p=0.0,
-    temperature=1.0,
-    random_seed=None,
-    task_id=None,
-    lora_uids=None,
-    init_timeout=60.0,
-):
-    prompts = str_list2numpy(prompts)
-    inputs = {"prompts": prompts}
-
-    if max_output_len is not None:
-        inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_)
-
-    if top_k is not None:
-        inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
-
-    if top_p is not None:
-        inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single)
-
-    if temperature is not None:
-        inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single)
-
-    if random_seed is not None:
-        inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.single)
-
-    if stop_words_list is not None:
-        stop_words_list = np.char.encode(stop_words_list, "utf-8")
-        inputs["stop_words_list"] = np.full((prompts.shape[0], len(stop_words_list)), stop_words_list)
-
-    if bad_words_list is not None:
-        bad_words_list = np.char.encode(bad_words_list, "utf-8")
-        inputs["bad_words_list"] = np.full((prompts.shape[0], len(bad_words_list)), bad_words_list)
-
-    if no_repeat_ngram_size is not None:
-        inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single)
-
-    if task_id is not None:
-        task_id = np.char.encode(task_id, "utf-8")
-        inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id)
-
-    if lora_uids is not None:
-        lora_uids = np.char.encode(lora_uids, "utf-8")
-        inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids)
-
-    with ModelClient(url, model_name, init_timeout_s=init_timeout) as client:
-        result_dict = client.infer_batch(**inputs)
-        output_type = client.model_config.outputs[0].dtype
-
-    if output_type == np.bytes_:
-        sentences = np.char.decode(result_dict["outputs"].astype("bytes"), "utf-8")
-        return sentences
-    else:
-        return result_dict["outputs"]
-
-
-def query_llm_streaming(
-    url,
-    model_name,
-    prompts,
-    stop_words_list=None,
-    bad_words_list=None,
-    no_repeat_ngram_size=None,
-    max_output_len=512,
-    top_k=1,
-    top_p=0.0,
-    temperature=1.0,
-    random_seed=None,
-    task_id=None,
-    lora_uids=None,
-    init_timeout=60.0,
-):
-    prompts = str_list2numpy(prompts)
-    inputs = {"prompts": prompts}
-
-    if max_output_len is not None:
-        inputs["max_output_len"] = np.full(prompts.shape, max_output_len, dtype=np.int_)
-
-    if top_k is not None:
-        inputs["top_k"] = np.full(prompts.shape, top_k, dtype=np.int_)
-
-    if top_p is not None:
-        inputs["top_p"] = np.full(prompts.shape, top_p, dtype=np.single)
-
-    if temperature is not None:
-        inputs["temperature"] = np.full(prompts.shape, temperature, dtype=np.single)
-
-    if random_seed is not None:
-        inputs["random_seed"] = np.full(prompts.shape, random_seed, dtype=np.int_)
-
-    if stop_words_list is not None:
-        stop_words_list = np.char.encode(stop_words_list, "utf-8")
-        inputs["stop_words_list"] = np.full((prompts.shape[0], len(stop_words_list)), stop_words_list)
-
-    if bad_words_list is not None:
-        bad_words_list = np.char.encode(bad_words_list, "utf-8")
-        inputs["bad_words_list"] = np.full((prompts.shape[0], len(bad_words_list)), bad_words_list)
-
-    if no_repeat_ngram_size is not None:
-        inputs["no_repeat_ngram_size"] = np.full(prompts.shape, no_repeat_ngram_size, dtype=np.single)
-
-    if task_id is not None:
-        task_id = np.char.encode(task_id, "utf-8")
-        inputs["task_id"] = np.full((prompts.shape[0], len([task_id])), task_id)
-
-    if lora_uids is not None:
-        lora_uids = np.char.encode(lora_uids, "utf-8")
-        inputs["lora_uids"] = np.full((prompts.shape[0], len(lora_uids)), lora_uids)
-
-    with DecoupledModelClient(url, model_name, init_timeout_s=init_timeout) as client:
-        for partial_result_dict in client.infer_batch(**inputs):
-            output_type = client.model_config.outputs[0].dtype
-            if output_type == np.bytes_:
-                sentences = np.char.decode(partial_result_dict["outputs"].astype("bytes"), "utf-8")
-                yield sentences
-            else:
-                yield partial_result_dict["outputs"]
-
-
-def query(argv):
-    args = get_args(argv)
-
-    if args.prompt_file is not None:
-        with open(args.prompt_file, "r") as f:
-            args.prompt = f.read()
-
-    if args.enable_streaming:
-        output_generator = query_llm_streaming(
-            url=args.url,
-            model_name=args.model_name,
-            prompts=[args.prompt],
-            stop_words_list=None if args.stop_words_list is None else [args.stop_words_list],
-            bad_words_list=None if args.bad_words_list is None else [args.bad_words_list],
-            no_repeat_ngram_size=args.no_repeat_ngram_size,
-            max_output_len=args.max_output_len,
-            top_k=args.top_k,
-            top_p=args.top_p,
-            temperature=args.temperature,
-            task_id=args.task_id,
-            lora_uids=args.lora_task_uids,
-            init_timeout=args.init_timeout,
-        )
-        # The query returns a generator that yields one array per model step,
-        # with the partial generated text in the last dimension. Print that partial text
-        # incrementally and compare it with all the text generated so far.
-        prev_output = ''
-        for output in output_generator:
-            cur_output = output[0][0]
-            if prev_output == '' or cur_output.startswith(prev_output):
-                print(cur_output[len(prev_output) :], end='', flush=True)
-            else:
-                print("WARN: Partial output mismatch, restarting output...")
-                print(cur_output, end='', flush=True)
-            prev_output = cur_output
-        print()
-
-    else:
-        outputs = query_llm(
-            url=args.url,
-            model_name=args.model_name,
-            prompts=[args.prompt],
-            stop_words_list=None if args.stop_words_list is None else [args.stop_words_list],
-            bad_words_list=None if args.bad_words_list is None else [args.bad_words_list],
-            no_repeat_ngram_size=args.no_repeat_ngram_size,
-            max_output_len=args.max_output_len,
-            top_k=args.top_k,
-            top_p=args.top_p,
-            temperature=args.temperature,
-            task_id=args.task_id,
-            lora_uids=args.lora_task_uids,
-            init_timeout=args.init_timeout,
-        )
-        print(outputs[0][0])
-
-
-if __name__ == '__main__':
-    query(sys.argv[1:])
diff --git a/scripts/deploy/nlp/query_inframework.py b/scripts/deploy/nlp/query_inframework.py
deleted file mode 100644
index f4ceece29ea5..000000000000
--- a/scripts/deploy/nlp/query_inframework.py
+++ /dev/null
@@ -1,95 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import sys
-import time
-
-from nemo.deploy.nlp import NemoQueryLLMPyTorch
-
-LOGGER = logging.getLogger("NeMo")
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Queries Triton server running an in-framework Nemo model",
-    )
-    parser.add_argument("-u", "--url", default="0.0.0.0", type=str, help="url for the triton server")
-    parser.add_argument("-mn", "--model_name", required=True, type=str, help="Name of the triton model")
-    prompt_group = parser.add_mutually_exclusive_group(required=True)
-    prompt_group.add_argument("-p", "--prompt", required=False, type=str, help="Prompt")
-    prompt_group.add_argument("-pf", "--prompt_file", required=False, type=str, help="File to read the prompt from")
-    parser.add_argument("-mol", "--max_output_len", default=128, type=int, help="Max output token length")
-    parser.add_argument("-tk", "--top_k", default=1, type=int, help="top_k")
-    parser.add_argument("-tpp", "--top_p", default=0.0, type=float, help="top_p")
-    parser.add_argument("-t", "--temperature", default=1.0, type=float, help="temperature")
-    parser.add_argument("-it", "--init_timeout", default=60.0, type=float, help="init timeout for the triton server")
-    parser.add_argument("-clp", "--compute_logprob", default=None, action='store_true', help="Returns log_probs")
-
-    args = parser.parse_args(argv)
-    return args
-
-
-def query_llm(
-    url,
-    model_name,
-    prompts,
-    max_output_len=128,
-    top_k=1,
-    top_p=0.0,
-    temperature=1.0,
-    compute_logprob=None,
-    init_timeout=60.0,
-):
-    start_time = time.time()
-    nemo_query = NemoQueryLLMPyTorch(url, model_name)
-    result = nemo_query.query_llm(
-        prompts=prompts,
-        max_length=max_output_len,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        compute_logprob=compute_logprob,
-        init_timeout=init_timeout,
-    )
-    end_time = time.time()
-    LOGGER.info(f"Query execution time: {end_time - start_time:.2f} seconds")
-    return result
-
-
-def query(argv):
-    args = get_args(argv)
-
-    if args.prompt_file is not None:
-        with open(args.prompt_file, "r") as f:
-            args.prompt = f.read()
-
-    outputs = query_llm(
-        url=args.url,
-        model_name=args.model_name,
-        prompts=[args.prompt],
-        max_output_len=args.max_output_len,
-        top_k=args.top_k,
-        top_p=args.top_p,
-        temperature=args.temperature,
-        compute_logprob=args.compute_logprob,
-        init_timeout=args.init_timeout,
-    )
-    print(outputs)
-
-
-if __name__ == '__main__':
-    query(sys.argv[1:])
diff --git a/scripts/deploy/nlp/query_inframework_hf.py b/scripts/deploy/nlp/query_inframework_hf.py
deleted file mode 100644
index 10d325337cd9..000000000000
--- a/scripts/deploy/nlp/query_inframework_hf.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import sys
-
-from nemo.deploy.nlp import NemoQueryLLMHF
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description="Query a HuggingFace model deployed on Triton Inference Server",
-    )
-    parser.add_argument(
-        "-u",
-        "--url",
-        default="0.0.0.0",
-        type=str,
-        help="URL of the Triton Inference Server (e.g. localhost or IP address)",
-    )
-    parser.add_argument(
-        "-mn", "--model_name", required=True, type=str, help="Name of the model as deployed on Triton server"
-    )
-    prompt_group = parser.add_mutually_exclusive_group(required=True)
-    prompt_group.add_argument("-p", "--prompt", required=False, type=str, help="Text prompt to send to the model")
-    prompt_group.add_argument(
-        "-pf", "--prompt_file", required=False, type=str, help="Path to file containing the prompt text"
-    )
-    parser.add_argument(
-        "-mol", "--max_output_len", default=128, type=int, help="Maximum number of tokens to generate in the response"
-    )
-    parser.add_argument(
-        "-tk", "--top_k", default=1, type=int, help="Number of highest probability tokens to consider for sampling"
-    )
-    parser.add_argument(
-        "-tpp", "--top_p", default=0.0, type=float, help="Cumulative probability threshold for token sampling"
-    )
-    parser.add_argument(
-        "-t",
-        "--temperature",
-        default=1.0,
-        type=float,
-        help="Temperature for controlling randomness in sampling (higher = more random)",
-    )
-    parser.add_argument(
-        "-it",
-        "--init_timeout",
-        default=60.0,
-        type=float,
-        help="Timeout in seconds when initializing connection to Triton server",
-    )
-    parser.add_argument(
-        "-ol", "--output_logits", default=False, action='store_true', help="Return raw logits from model output"
-    )
-    parser.add_argument(
-        "-os",
-        "--output_scores",
-        default=False,
-        action='store_true',
-        help="Return token probability scores from model output",
-    )
-
-    args = parser.parse_args(argv)
-    return args
-
-
-def query_llm(
-    url,
-    model_name,
-    prompts,
-    max_output_len=128,
-    top_k=1,
-    top_p=0.0,
-    temperature=1.0,
-    output_logits=False,
-    output_scores=False,
-    init_timeout=60.0,
-):
-    """Query a HuggingFace language model deployed on Triton Inference Server.
-
-    Args:
-        url (str): URL of the Triton Inference Server (e.g. localhost or IP address)
-        model_name (str): Name of the model as deployed on Triton server
-        prompts (List[str]): List of text prompts to send to the model
-        max_output_len (int, optional): Maximum number of tokens to generate in the response. Defaults to 128.
-        top_k (int, optional): Number of highest probability tokens to consider for sampling. Defaults to 1.
-        top_p (float, optional): Cumulative probability threshold for token sampling. Defaults to 0.0.
-        temperature (float, optional): Temperature for controlling randomness in sampling (higher = more random). Defaults to 1.0.
-        output_logits (bool, optional): Return raw logits from model output. Defaults to False.
-        output_scores (bool, optional): Return token probability scores from model output. Defaults to False.
-        init_timeout (float, optional): Timeout in seconds when initializing connection to Triton server. Defaults to 60.0.
-
-    Returns:
-        List[str]: Generated text responses for each input prompt
-    """
-
-    nemo_query = NemoQueryLLMHF(url, model_name)
-    return nemo_query.query_llm(
-        prompts=prompts,
-        max_length=max_output_len,
-        top_k=top_k,
-        top_p=top_p,
-        temperature=temperature,
-        output_logits=output_logits,
-        output_scores=output_scores,
-        init_timeout=init_timeout,
-    )
-
-
-def query(argv):
-    """Query a HuggingFace language model deployed on Triton Inference Server using command line arguments.
-
-    This function parses command line arguments and sends queries to a deployed model. It supports
-    reading prompts either directly from command line or from a file.
-
-    Args:
-        argv (List[str]): Command line arguments passed to the script, excluding the script name.
-            Expected arguments include:
-            - url: URL of Triton server
-            - model_name: Name of deployed model
-            - prompt: Text prompt or prompt_file: Path to file containing prompt
-            - max_output_len: Maximum tokens to generate
-            - top_k: Top-k sampling parameter
-            - top_p: Top-p sampling parameter
-            - temperature: Sampling temperature
-            - output_logits: Whether to return logits
-            - output_scores: Whether to return scores
-            - init_timeout: Connection timeout
-
-    Returns:
-        List[str]: Generated text responses from the model
-    """
-
-    args = get_args(argv)
-
-    if args.prompt_file is not None:
-        with open(args.prompt_file, "r") as f:
-            args.prompt = f.read()
-
-    outputs = query_llm(
-        url=args.url,
-        model_name=args.model_name,
-        prompts=[args.prompt],
-        max_output_len=args.max_output_len,
-        top_k=args.top_k,
-        top_p=args.top_p,
-        temperature=args.temperature,
-        output_logits=args.output_logits,
-        output_scores=args.output_scores,
-        init_timeout=args.init_timeout,
-    )
-    print(outputs)
-
-
-if __name__ == '__main__':
-    query(sys.argv[1:])
diff --git a/scripts/export.py b/scripts/export.py
deleted file mode 100644
index 6e0b9b72e15b..000000000000
--- a/scripts/export.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import sys
-
-import torch
-from lightning.pytorch import Trainer
-from omegaconf import OmegaConf
-
-import nemo
-from nemo.core import ModelPT
-from nemo.core.classes import Exportable
-from nemo.core.config.pytorch_lightning import TrainerConfig
-from nemo.utils import logging
-
-try:
-    from contextlib import nullcontext
-except ImportError:
-    # handle python < 3.7
-    from contextlib import suppress as nullcontext
-
-
-def get_args(argv):
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Export NeMo models to ONNX/Torchscript",
-    )
-    parser.add_argument("source", help="Source .nemo file")
-    parser.add_argument("out", help="Location to write result to")
-    parser.add_argument("--autocast", action="store_true", help="Use autocast when exporting")
-    parser.add_argument("--runtime-check", action="store_true", help="Runtime check of exported net result")
-    parser.add_argument("--verbose", default=None, help="Verbose level for logging, numeric")
-    parser.add_argument("--max-batch", type=int, default=None, help="Max batch size for model export")
-    parser.add_argument("--max-dim", type=int, default=None, help="Max dimension(s) for model export")
-    parser.add_argument("--onnx-opset", type=int, default=None, help="ONNX opset for model export")
-    parser.add_argument(
-        "--cache_support", action="store_true", help="enables caching inputs for the models support it."
-    )
-    parser.add_argument("--device", default="cuda", help="Device to export for")
-    parser.add_argument("--check-tolerance", type=float, default=0.01, help="tolerance for verification")
-    parser.add_argument(
-        "--export-config",
-        metavar="KEY=VALUE",
-        nargs='+',
-        help="Set a number of key-value pairs to model.export_config dictionary "
-        "(do not put spaces before or after the = sign). "
-        "Note that values are always treated as strings.",
-    )
-
-    args = parser.parse_args(argv)
-    return args
-
-
-def nemo_export(argv):
-    args = get_args(argv)
-    loglevel = logging.INFO
-    # assuming loglevel is bound to the string value obtained from the
-    # command line argument. Convert to upper case to allow the user to
-    # specify --log=DEBUG or --log=debug
-    if args.verbose is not None:
-        numeric_level = getattr(logging, args.verbose.upper(), None)
-        if not isinstance(numeric_level, int):
-            raise ValueError('Invalid log level: %s' % numeric_level)
-        loglevel = numeric_level
-    logging.setLevel(loglevel)
-    logging.info("Logging level set to {}".format(loglevel))
-
-    """Convert a .nemo saved model into .riva Riva input format."""
-    nemo_in = args.source
-    out = args.out
-
-    # Create a PL trainer object which is required for restoring Megatron models
-    cfg_trainer = TrainerConfig(
-        accelerator='gpu',
-        strategy="ddp",
-        num_nodes=1,
-        devices=1,
-        # Need to set the following two to False as ExpManager will take care of them differently.
-        logger=False,
-        enable_checkpointing=False,
-    )
-    cfg_trainer = OmegaConf.to_container(OmegaConf.create(cfg_trainer))
-    trainer = Trainer(**cfg_trainer)
-
-    logging.info("Restoring NeMo model from '{}'".format(nemo_in))
-    try:
-        with torch.inference_mode():
-            # Restore instance from .nemo file using generic model restore_from
-            model = ModelPT.restore_from(restore_path=nemo_in, trainer=trainer)
-    except Exception as e:
-        logging.error(
-            "Failed to restore model from NeMo file : {}. Please make sure you have the latest NeMo package installed with [all] dependencies.".format(
-                nemo_in
-            )
-        )
-        raise e
-
-    logging.info("Model {} restored from '{}'".format(model.__class__.__name__, nemo_in))
-
-    if not isinstance(model, Exportable):
-        logging.error("Your NeMo model class ({}) is not Exportable.".format(model.__class__.__name__))
-        sys.exit(1)
-
-    #
-    #  Add custom export parameters here
-    #
-    check_trace = args.runtime_check
-
-    in_args = {}
-    max_batch = 1
-    max_dim = None
-    if args.max_batch is not None:
-        in_args["max_batch"] = args.max_batch
-        max_batch = args.max_batch
-    if args.max_dim is not None:
-        in_args["max_dim"] = args.max_dim
-        max_dim = args.max_dim
-
-    if args.cache_support:
-        model.set_export_config({"cache_support": "True"})
-
-    if args.export_config:
-        kv = {}
-        for key_value in args.export_config:
-            lst = key_value.split("=")
-            if len(lst) != 2:
-                raise Exception("Use correct format for --export_config: k=v")
-            k, v = lst
-            kv[k] = v
-        model.set_export_config(kv)
-
-    try:
-        with torch.amp.autocast(args.device, enabled=args.autocast), torch.no_grad(), torch.inference_mode():
-            model.to(device=args.device).freeze()
-            model.eval()
-            input_example = None
-            if check_trace and len(in_args) > 0:
-                input_example = model.input_module.input_example(**in_args)
-                check_trace = [input_example]
-                for key, arg in in_args.items():
-                    in_args[key] = (arg + 1) // 2
-                input_example2 = model.input_module.input_example(**in_args)
-                check_trace.append(input_example2)
-                logging.info(f"Using additional check args: {in_args}")
-
-            _, descriptions = model.export(
-                out,
-                input_example=input_example,
-                check_trace=check_trace,
-                check_tolerance=args.check_tolerance,
-                onnx_opset_version=args.onnx_opset,
-                verbose=bool(args.verbose),
-            )
-
-    except Exception as e:
-        logging.error(
-            "Export failed. Please make sure your NeMo model class ({}) has working export() and that you have the latest NeMo package installed with [all] dependencies.".format(
-                model.__class__
-            )
-        )
-        raise e
-
-
-if __name__ == '__main__':
-    nemo_export(sys.argv[1:])
diff --git a/scripts/export/convert_nemo2_for_export.py b/scripts/export/convert_nemo2_for_export.py
deleted file mode 100644
index 5995cc7c0841..000000000000
--- a/scripts/export/convert_nemo2_for_export.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Convert a NeMo 2.0 checkpoint to NeMo 1.0 for TRTLLM export.
-Example to run this conversion script:
-```
-    python /opt/NeMo/scripts/scripts/export/convert_nemo2_for_export.py \
-     --input_path /path/to/nemo2/ckpt \
-     --output_path /path/to/output \
-     --tokenizer_type huggingface \
-     --tokenizer_name meta-llama/Llama-3.1-8B \
-     --symbolic_link=True
-```
-"""
-
-import os
-import shutil
-from argparse import ArgumentParser
-
-from omegaconf import OmegaConf
-
-from nemo.lightning import io
-
-
-def get_args():
-    parser = ArgumentParser()
-    parser.add_argument(
-        "--input_path",
-        type=str,
-        required=True,
-        help="Path to nemo 2.0 checkpoint",
-    )
-    parser.add_argument(
-        "--output_path",
-        type=str,
-        required=True,
-        help="Output path",
-    )
-    parser.add_argument(
-        "--tokenizer_type",
-        type=str,
-        default="huggingface",
-        help="Type of tokenizer",
-    )
-    parser.add_argument(
-        "--tokenizer_name",
-        type=str,
-        default="meta-llama/Meta-Llama-3.1-8B",
-        help="Name or path of tokenizer",
-    )
-    parser.add_argument(
-        "--symbolic_link",
-        type=bool,
-        default=True,
-        help="Whether to use symbiloc link for model weights",
-    )
-
-    args = parser.parse_args()
-    return args
-
-
-def main(args):
-    input_path = args.input_path
-    output_path = args.output_path
-    weight_path = os.path.join(output_path, "model_weights")
-
-    if os.path.exists(output_path):
-        shutil.rmtree(output_path)
-        print(f"Remove existing {output_path}")
-
-    os.makedirs(output_path, exist_ok=True)
-
-    config = io.load_context(input_path, subpath="model.config")
-
-    config_dict = {}
-    for k, v in config.__dict__.items():
-        if isinstance(v, (float, int, str, bool)):
-            config_dict[k] = v
-        elif k == "activation_func":
-            config_dict["activation"] = v.__name__
-
-    if config_dict.get("num_moe_experts") is None:
-        config_dict["num_moe_experts"] = 0
-        config_dict["moe_router_topk"] = 0
-    if config_dict["activation"] == "silu":
-        config_dict["activation"] = "fast-swiglu"
-
-    config_dict["mcore_gpt"] = True
-    config_dict["max_position_embeddings"] = config_dict.get("seq_length")
-    config_dict["tokenizer"] = {
-        "library": args.tokenizer_type,
-        "type": args.tokenizer_name,
-        "use_fast": True,
-    }
-
-    yaml_config = OmegaConf.create(config_dict)
-    OmegaConf.save(config=yaml_config, f=os.path.join(output_path, "model_config.yaml"))
-
-    if args.symbolic_link:
-        os.symlink(input_path, weight_path)
-    else:
-        os.makedirs(weight_path, exist_ok=True)
-        for file in os.listdir(input_path):
-            source_path = os.path.join(input_path, file)
-            target_path = os.path.join(weight_path, file)
-            shutil.copy(source_path, target_path)
-
-
-if __name__ == "__main__":
-    args = get_args()
-    main(args)
diff --git a/scripts/export/export_mm_to_trtllm.py b/scripts/export/export_mm_to_trtllm.py
deleted file mode 100644
index 755c919e708a..000000000000
--- a/scripts/export/export_mm_to_trtllm.py
+++ /dev/null
@@ -1,139 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the 'License');
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an 'AS IS' BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-This script exports multimodal model to TensorRT and do a local inference test.
-For multimodal model, it supports the following models:
-- NEVA
-- Video-NEVA
-- LITA
-- VILA
-- VITA
-- SALM
-"""
-
-import argparse
-import os
-
-from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description='Export multimodal model to TensorRT')
-    parser.add_argument('--output_dir', required=True, help='Directory to save the exported model')
-    parser.add_argument(
-        '--visual_checkpoint_path',
-        required=True,
-        help='Path to the visual model checkpoint or perception model checkpoint',
-    )
-    parser.add_argument('--llm_checkpoint_path', required=True, help='Source .nemo file for llm')
-    parser.add_argument(
-        '--modality',
-        default="vision",
-        choices=["vision", "audio"],
-        help="Modality of the model",
-    )
-    parser.add_argument(
-        '--model_type',
-        type=str,
-        required=True,
-        choices=["neva", "video-neva", "lita", "vila", "vita", "salm"],
-        help="Type of the model that is supported.",
-    )
-
-    parser.add_argument(
-        '--llm_model_type',
-        type=str,
-        required=True,
-        choices=["gptnext", "gpt", "llama", "falcon", "starcoder", "mixtral", "gemma"],
-        help="Type of LLM. gptnext, gpt, llama, falcon, and starcoder are only supported."
-        " gptnext and gpt are the same and keeping it for backward compatibility",
-    )
-
-    parser.add_argument('--tensor_parallel_size', type=int, default=1, help='tensor parallelism size')
-    parser.add_argument('--max_input_len', type=int, default=4096, help='Maximum input length')
-    parser.add_argument('--max_output_len', type=int, default=256, help='Maximum output length')
-    parser.add_argument('--max_batch_size', type=int, default=1, help='Maximum batch size')
-    parser.add_argument(
-        '--vision_max_batch_size',
-        type=int,
-        default=1,
-        help='Max batch size of the visual inputs, for lita/vita model with video inference, this should be set to 256',
-    )
-    parser.add_argument('--max_multimodal_len', type=int, default=3072, help='Maximum multimodal length')
-    parser.add_argument(
-        "--dtype",
-        choices=["bfloat16", "float16"],
-        default="bfloat16",
-        type=str,
-        help="dtype of the model on TensorRT",
-    )
-    parser.add_argument(
-        '--delete_existing_files', action='store_true', help='Delete existing files in the output directory'
-    )
-    parser.add_argument(
-        '--test_export_only', action='store_true', help='Only test the export without saving the model'
-    )
-    parser.add_argument('--input_text', help='Input text for inference')
-    parser.add_argument('--input_media', default=None, help='Input media file for inference')
-    parser.add_argument('--batch_size', type=int, default=1, help='Batch size for inference')
-    parser.add_argument('--max_output', type=int, default=128, help='Maximum output length for inference')
-    parser.add_argument('--top_k', type=int, default=1, help='Top k for sampling')
-    parser.add_argument('--top_p', type=float, default=0.0, help='Top p for sampling')
-    parser.add_argument("--temperature", default=1.0, type=float, help="temperature")
-    parser.add_argument("--repetition_penalty", default=1.0, type=float, help="repetition_penalty")
-    parser.add_argument("--num_beams", default=1, type=int, help="num_beams")
-
-    args = parser.parse_args()
-    return args
-
-
-def main(args):
-    exporter = TensorRTMMExporter(model_dir=args.output_dir, load_model=False, modality=args.modality)
-    exporter.export(
-        visual_checkpoint_path=args.visual_checkpoint_path,
-        llm_checkpoint_path=args.llm_checkpoint_path,
-        model_type=args.model_type,
-        llm_model_type=args.llm_model_type,
-        tensor_parallel_size=args.tensor_parallel_size,
-        max_input_len=args.max_input_len,
-        max_output_len=args.max_output_len,
-        max_batch_size=args.max_batch_size,
-        vision_max_batch_size=args.vision_max_batch_size,
-        max_multimodal_len=args.max_multimodal_len,
-        dtype=args.dtype,
-        delete_existing_files=args.delete_existing_files,
-        load_model=not args.test_export_only,
-    )
-    test_inference = not args.test_export_only
-    if test_inference:
-        assert args.input_media is not None, "Input media file is required for inference"
-        assert os.path.exists(args.input_media), f"Input media file {args.input_media} does not exist"
-        output = exporter.forward(
-            input_text=args.input_text,
-            input_media=args.input_media,
-            batch_size=args.batch_size,
-            max_output_len=args.max_output,
-            top_k=args.top_k,
-            top_p=args.top_p,
-            temperature=args.temperature,
-            repetition_penalty=args.repetition_penalty,
-            num_beams=args.num_beams,
-        )
-        print(output)
-
-
-if __name__ == '__main__':
-    args = parse_args()
-    main(args)
diff --git a/scripts/export/export_to_trt_llm.py b/scripts/export/export_to_trt_llm.py
deleted file mode 100644
index c1c6863aa300..000000000000
--- a/scripts/export/export_to_trt_llm.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import pprint
-from typing import Optional
-
-from nemo.export.tensorrt_llm import TensorRTLLM
-
-LOGGER = logging.getLogger("NeMo")
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description="Exports NeMo checkpoint to TensorRT-LLM engine",
-    )
-    parser.add_argument("-nc", "--nemo_checkpoint", required=True, type=str, help="Source model path")
-    parser.add_argument("-mt", "--model_type", type=str, help="Type of the TensorRT-LLM model.")
-    parser.add_argument(
-        "-mr", "--model_repository", required=True, default=None, type=str, help="Folder for the trt-llm model files"
-    )
-    parser.add_argument("-tps", "--tensor_parallelism_size", default=1, type=int, help="Tensor parallelism size")
-    parser.add_argument("-pps", "--pipeline_parallelism_size", default=1, type=int, help="Pipeline parallelism size")
-    parser.add_argument(
-        "-dt",
-        "--dtype",
-        choices=["bfloat16", "float16"],
-        help="Data type of the model on TensorRT-LLM",
-    )
-    parser.add_argument("-mil", "--max_input_len", default=256, type=int, help="Max input length of the model")
-    parser.add_argument("-mol", "--max_output_len", default=256, type=int, help="Max output length of the model")
-    parser.add_argument("-mbs", "--max_batch_size", default=8, type=int, help="Max batch size of the model")
-    parser.add_argument("-mnt", "--max_num_tokens", default=None, type=int, help="Max number of tokens")
-    parser.add_argument("-ont", "--opt_num_tokens", default=None, type=int, help="Optimum number of tokens")
-    parser.add_argument(
-        "-mpet", "--max_prompt_embedding_table_size", default=None, type=int, help="Max prompt embedding table size"
-    )
-    parser.add_argument(
-        "-upe",
-        "--use_parallel_embedding",
-        default=False,
-        action='store_true',
-        help="Use parallel embedding.",
-    )
-    parser.add_argument(
-        "-npkc", "--no_paged_kv_cache", default=False, action='store_true', help="Disable paged kv cache."
-    )
-    parser.add_argument(
-        "-drip",
-        "--disable_remove_input_padding",
-        default=False,
-        action='store_true',
-        help="Disables the remove input padding option.",
-    )
-    parser.add_argument(
-        "-mbm",
-        '--multi_block_mode',
-        default=False,
-        action='store_true',
-        help='Split long kv sequence into multiple blocks (applied to generation MHA kernels). \
-            It is beneifical when batchxnum_heads cannot fully utilize GPU. \
-            available when using c++ runtime.',
-    )
-    parser.add_argument(
-        '--use_lora_plugin',
-        nargs='?',
-        const=None,
-        choices=['float16', 'float32', 'bfloat16'],
-        help="Activates the lora plugin which enables embedding sharing.",
-    )
-    parser.add_argument(
-        '--lora_target_modules',
-        nargs='+',
-        default=None,
-        choices=[
-            "attn_qkv",
-            "attn_q",
-            "attn_k",
-            "attn_v",
-            "attn_dense",
-            "mlp_h_to_4h",
-            "mlp_gate",
-            "mlp_4h_to_h",
-        ],
-        help="Add lora in which modules. Only be activated when use_lora_plugin is enabled.",
-    )
-    parser.add_argument(
-        '--max_lora_rank',
-        type=int,
-        default=64,
-        help='maximum lora rank for different lora modules. '
-        'It is used to compute the workspace size of lora plugin.',
-    )
-    parser.add_argument("-dm", "--debug_mode", default=False, action='store_true', help="Enable debug mode")
-    parser.add_argument(
-        "--use_mcore_path",
-        action="store_true",
-        help="Use Megatron-Core implementation on exporting the model. If not set, use local NeMo codebase",
-    )
-    parser.add_argument(
-        "-fp8",
-        "--export_fp8_quantized",
-        default="auto",
-        type=str,
-        help="Enables exporting to a FP8-quantized TRT LLM checkpoint",
-    )
-    parser.add_argument(
-        "-kv_fp8",
-        "--use_fp8_kv_cache",
-        default="auto",
-        type=str,
-        help="Enables exporting with FP8-quantizatized KV-cache",
-    )
-    args = parser.parse_args()
-
-    def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
-        s = s.lower()
-        true_strings = ["true", "1"]
-        false_strings = ["false", "0"]
-        if s in true_strings:
-            return True
-        if s in false_strings:
-            return False
-        if optional and s == 'auto':
-            return None
-        raise argparse.ArgumentTypeError(f"Invalid boolean value for argument --{name}: '{s}'")
-
-    args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True)
-    args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True)
-    return args
-
-
-def nemo_export_trt_llm():
-    args = get_args()
-
-    loglevel = logging.DEBUG if args.debug_mode else logging.INFO
-    LOGGER.setLevel(loglevel)
-    LOGGER.info(f"Logging level set to {loglevel}")
-    LOGGER.info(pprint.pformat(vars(args)))
-
-    trt_llm_exporter = TensorRTLLM(
-        model_dir=args.model_repository, load_model=False, multi_block_mode=args.multi_block_mode
-    )
-
-    LOGGER.info("Export to TensorRT-LLM function is called.")
-    trt_llm_exporter.export(
-        nemo_checkpoint_path=args.nemo_checkpoint,
-        model_type=args.model_type,
-        tensor_parallelism_size=args.tensor_parallelism_size,
-        pipeline_parallelism_size=args.pipeline_parallelism_size,
-        max_input_len=args.max_input_len,
-        max_output_len=args.max_output_len,
-        max_batch_size=args.max_batch_size,
-        max_num_tokens=args.max_num_tokens,
-        opt_num_tokens=args.opt_num_tokens,
-        max_prompt_embedding_table_size=args.max_prompt_embedding_table_size,
-        use_parallel_embedding=args.use_parallel_embedding,
-        paged_kv_cache=not args.no_paged_kv_cache,
-        remove_input_padding=not args.disable_remove_input_padding,
-        dtype=args.dtype,
-        use_lora_plugin=args.use_lora_plugin,
-        lora_target_modules=args.lora_target_modules,
-        max_lora_rank=args.max_lora_rank,
-        fp8_quantized=args.export_fp8_quantized,
-        fp8_kvcache=args.use_fp8_kv_cache,
-        load_model=False,
-        use_mcore_path=args.use_mcore_path,
-    )
-
-    LOGGER.info("Export is successful.")
-
-
-if __name__ == '__main__':
-    nemo_export_trt_llm()
diff --git a/scripts/export/setup_vllm_venv.sh b/scripts/export/setup_vllm_venv.sh
deleted file mode 100755
index 7538bd026439..000000000000
--- a/scripts/export/setup_vllm_venv.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-# This script sets up a Python virtual environment
-# with all the requirements for running vLLM.
-set -ex
-
-VENV_DIR="${1:-/opt/venv}"
-
-echo "Creating virtual environment in ${VENV_DIR}..."
-
-pip install virtualenv
-
-virtualenv ${VENV_DIR}
-
-${VENV_DIR}/bin/pip install \
-    -r /opt/NeMo/requirements/requirements_vllm.txt \
-    -r /opt/NeMo/requirements/requirements_deploy.txt
diff --git a/tests/deploy/__init__.py b/tests/deploy/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py
deleted file mode 100644
index bbe2e63986d6..000000000000
--- a/tests/deploy/nemo_deploy.py
+++ /dev/null
@@ -1,591 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import argparse
-import json
-import shutil
-import time
-from pathlib import Path
-
-import torch
-
-from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable
-
-run_export_tests = True
-try:
-    from nemo.deploy import DeployPyTriton
-    from nemo.deploy.nlp import NemoQueryLLM, NemoQueryLLMPyTorch
-    from nemo.export.tensorrt_llm import TensorRTLLM
-except Exception as e:
-    run_export_tests = False
-
-
-def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path=None):
-    # lambada dataset based accuracy test, which includes more than 5000 sentences.
-    # Use generated last token with original text's last token for accuracy comparison.
-    # If the generated last token start with the original token, trtllm_correct make an increment.
-    # It generates a CSV file for text comparison detail.
-
-    if test_data_path is None:
-        raise Exception("test_data_path cannot be None.")
-
-    trtllm_correct = 0
-    trtllm_deployed_correct = 0
-    trtllm_correct_relaxed = 0
-    trtllm_deployed_correct_relaxed = 0
-    all_expected_outputs = []
-    all_trtllm_outputs = []
-
-    with open(test_data_path, 'r') as file:
-        records = json.load(file)
-
-        eval_start = time.perf_counter()
-        for record in records:
-            prompt = record["text_before_last_word"]
-            expected_output = record["last_word"].strip().lower()
-            trtllm_output = model.forward(
-                input_texts=[prompt],
-                max_output_len=1,
-                top_k=1,
-                top_p=0,
-                temperature=0.1,
-                task_ids=task_ids,
-                lora_uids=lora_uids,
-            )
-            trtllm_output = trtllm_output[0][0].strip().lower()
-
-            all_expected_outputs.append(expected_output)
-            all_trtllm_outputs.append(trtllm_output)
-
-            if expected_output == trtllm_output:
-                trtllm_correct += 1
-
-            if (
-                expected_output == trtllm_output
-                or trtllm_output.startswith(expected_output)
-                or expected_output.startswith(trtllm_output)
-            ):
-                if len(trtllm_output) == 1 and len(expected_output) > 1:
-                    continue
-                trtllm_correct_relaxed += 1
-
-            if nq is not None:
-                trtllm_deployed_output = nq.query_llm(
-                    prompts=[prompt],
-                    max_output_len=1,
-                    top_k=1,
-                    top_p=0,
-                    temperature=0.1,
-                    task_id=task_ids,
-                )
-                trtllm_deployed_output = trtllm_deployed_output[0][0].strip().lower()
-
-                if expected_output == trtllm_deployed_output:
-                    trtllm_deployed_correct += 1
-
-                if (
-                    expected_output == trtllm_deployed_output
-                    or trtllm_deployed_output.startswith(expected_output)
-                    or expected_output.startswith(trtllm_deployed_output)
-                ):
-                    if len(trtllm_deployed_output) == 1 and len(expected_output) > 1:
-                        continue
-                    trtllm_deployed_correct_relaxed += 1
-        eval_end = time.perf_counter()
-
-    trtllm_accuracy = trtllm_correct / len(all_expected_outputs)
-    trtllm_accuracy_relaxed = trtllm_correct_relaxed / len(all_expected_outputs)
-
-    trtllm_deployed_accuracy = trtllm_deployed_correct / len(all_expected_outputs)
-    trtllm_deployed_accuracy_relaxed = trtllm_deployed_correct_relaxed / len(all_expected_outputs)
-
-    evaluation_time = eval_end - eval_start
-
-    return (
-        trtllm_accuracy,
-        trtllm_accuracy_relaxed,
-        trtllm_deployed_accuracy,
-        trtllm_deployed_accuracy_relaxed,
-        evaluation_time,
-    )
-
-
-def run_in_framework_inference(
-    model_name,
-    prompt,
-    checkpoint_path,
-    n_gpu=1,
-    max_batch_size=None,
-    max_input_len=None,
-    max_output_len=None,
-):
-    model = MegatronLLMDeployable(checkpoint_path, n_gpu)
-    nm = DeployPyTriton(
-        model=model,
-        triton_model_name=model_name,
-        http_port=8000,
-    )
-    nm.deploy()
-    nm.run()
-    nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name=model_name)
-
-    output_deployed = nq.query_llm(
-        prompts=prompt,
-    )
-
-    print("Output: ", output_deployed)
-
-    nm.stop()
-
-    return None, None, None, None, None
-
-
-def run_trt_llm_inference(
-    model_name,
-    model_type,
-    prompt,
-    checkpoint_path,
-    trt_llm_model_dir,
-    n_gpu=1,
-    max_batch_size=8,
-    use_embedding_sharing=False,
-    max_input_len=128,
-    max_output_len=128,
-    max_num_tokens=None,
-    ptuning=False,
-    p_tuning_checkpoint=None,
-    lora=False,
-    lora_checkpoint=None,
-    tp_size=None,
-    pp_size=None,
-    top_k=1,
-    top_p=0.0,
-    temperature=1.0,
-    run_accuracy=False,
-    debug=True,
-    streaming=False,
-    stop_words_list=None,
-    test_deployment=False,
-    test_data_path=None,
-    save_engine=False,
-):
-    if Path(checkpoint_path).exists():
-        if n_gpu > torch.cuda.device_count():
-            print(
-                "Path: {0} and model: {1} with {2} gpus won't be tested since available # of gpus = {3}".format(
-                    checkpoint_path, model_name, n_gpu, torch.cuda.device_count()
-                )
-            )
-            return None, None, None, None, None
-
-        Path(trt_llm_model_dir).mkdir(parents=True, exist_ok=True)
-
-        if debug:
-            print("")
-            print("")
-            print(
-                "################################################## NEW TEST ##################################################"
-            )
-            print("")
-
-            print("Path: {0} and model: {1} with {2} gpus will be tested".format(checkpoint_path, model_name, n_gpu))
-
-        prompt_embeddings_checkpoint_path = None
-        task_ids = None
-        max_prompt_embedding_table_size = 0
-
-        if ptuning:
-            if Path(p_tuning_checkpoint).exists():
-                prompt_embeddings_checkpoint_path = p_tuning_checkpoint
-                max_prompt_embedding_table_size = 8192
-                task_ids = ["0"]
-                if debug:
-                    print("---- PTuning enabled.")
-            else:
-                print("---- PTuning could not be enabled and skipping the test.")
-                return None, None, None, None, None
-
-        lora_ckpt_list = None
-        lora_uids = None
-        use_lora_plugin = None
-        lora_target_modules = None
-
-        if lora:
-            if Path(lora_checkpoint).exists():
-                lora_ckpt_list = [lora_checkpoint]
-                lora_uids = ["0", "-1", "0"]
-                use_lora_plugin = "bfloat16"
-                lora_target_modules = ["attn_qkv"]
-                if debug:
-                    print("---- LoRA enabled.")
-            else:
-                print("---- LoRA could not be enabled and skipping the test.")
-                return None, None, None, None, None
-
-        trt_llm_exporter = TensorRTLLM(trt_llm_model_dir, lora_ckpt_list, load_model=False)
-
-        trt_llm_exporter.export(
-            nemo_checkpoint_path=checkpoint_path,
-            model_type=model_type,
-            tensor_parallelism_size=tp_size,
-            pipeline_parallelism_size=pp_size,
-            max_input_len=max_input_len,
-            max_output_len=max_output_len,
-            max_batch_size=max_batch_size,
-            max_prompt_embedding_table_size=max_prompt_embedding_table_size,
-            use_lora_plugin=use_lora_plugin,
-            lora_target_modules=lora_target_modules,
-            max_num_tokens=max_num_tokens,
-            opt_num_tokens=60,
-            use_embedding_sharing=use_embedding_sharing,
-        )
-
-        if ptuning:
-            trt_llm_exporter.add_prompt_table(
-                task_name="0",
-                prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path,
-            )
-
-        output = trt_llm_exporter.forward(
-            input_texts=prompt,
-            max_output_len=max_output_len,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            task_ids=task_ids,
-            lora_uids=lora_uids,
-            streaming=streaming,
-            stop_words_list=stop_words_list,
-        )
-
-        if not use_lora_plugin and not ptuning:
-            test_cpp_runtime(
-                engine_path=trt_llm_model_dir,
-                prompt=prompt,
-                max_output_len=max_output_len,
-                debug=True,
-            )
-
-        nq = None
-        nm = None
-        output_deployed = ""
-        if test_deployment:
-            nm = DeployPyTriton(
-                model=trt_llm_exporter,
-                triton_model_name=model_name,
-                http_port=8000,
-            )
-            nm.deploy()
-            nm.run()
-            nq = NemoQueryLLM(url="localhost:8000", model_name=model_name)
-
-            output_deployed = nq.query_llm(
-                prompts=prompt,
-                max_output_len=max_output_len,
-                top_k=1,
-                top_p=0.0,
-                temperature=1.0,
-                lora_uids=lora_uids,
-            )
-
-        if debug:
-            print("")
-            print("--- Prompt: ", prompt)
-            print("")
-            print("--- Output: ", output)
-            print("")
-            print("")
-            print("--- Output deployed: ", output_deployed)
-            print("")
-
-        if run_accuracy:
-            print("Start model accuracy testing ...")
-            result = get_accuracy_with_lambada(trt_llm_exporter, nq, task_ids, lora_uids, test_data_path)
-            if test_deployment:
-                nm.stop()
-
-            if not save_engine:
-                shutil.rmtree(trt_llm_model_dir)
-            return result
-
-        if test_deployment:
-            nm.stop()
-
-        if not save_engine:
-            shutil.rmtree(trt_llm_model_dir)
-
-        return None, None, None, None, None
-    else:
-        raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path))
-
-
-def test_cpp_runtime(
-    engine_path,
-    prompt,
-    max_output_len,
-    debug,
-):
-    trt_llm_exporter = TensorRTLLM(engine_path, load_model=True)
-    output = trt_llm_exporter.forward(
-        input_texts=prompt,
-        max_output_len=max_output_len,
-        top_k=1,
-        top_p=0.0,
-        temperature=1.0,
-    )
-
-    if debug:
-        print("")
-        print("--- Output deployed with cpp runtime: ", output)
-        print("")
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Deploy nemo models to Triton and benchmark the models",
-    )
-    parser.add_argument(
-        "--model_name",
-        type=str,
-        required=True,
-    )
-    parser.add_argument(
-        "--model_type",
-        type=str,
-        required=False,
-    )
-    parser.add_argument(
-        "--min_gpus",
-        type=int,
-        default=1,
-    )
-    parser.add_argument(
-        "--max_gpus",
-        type=int,
-    )
-    parser.add_argument(
-        "--checkpoint_dir",
-        type=str,
-        default="/tmp/nemo_checkpoint/",
-        required=False,
-    )
-    parser.add_argument(
-        "--trt_llm_model_dir",
-        type=str,
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        default=8,
-    )
-    parser.add_argument(
-        "--max_input_len",
-        type=int,
-        default=256,
-    )
-    parser.add_argument(
-        "--max_output_len",
-        type=int,
-        default=128,
-    )
-    parser.add_argument(
-        "--max_num_tokens",
-        type=int,
-    )
-    parser.add_argument(
-        "--p_tuning_checkpoint",
-        type=str,
-    )
-    parser.add_argument(
-        "--ptuning",
-        default=False,
-        action='store_true',
-    )
-    parser.add_argument(
-        "--lora_checkpoint",
-        type=str,
-    )
-    parser.add_argument(
-        "--lora",
-        default=False,
-        action='store_true',
-    )
-    parser.add_argument(
-        "--tp_size",
-        type=int,
-        default=1,
-    )
-    parser.add_argument(
-        "--pp_size",
-        type=int,
-        default=1,
-    )
-    parser.add_argument(
-        "--top_k",
-        type=int,
-        default=1,
-    )
-    parser.add_argument(
-        "--top_p",
-        type=float,
-        default=0.0,
-    )
-    parser.add_argument(
-        "--temperature",
-        type=float,
-        default=1.0,
-    )
-    parser.add_argument(
-        "--run_accuracy",
-        type=str,
-        default="False",
-    )
-    parser.add_argument("--streaming", default=False, action="store_true")
-    parser.add_argument(
-        "--test_deployment",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--debug",
-        default=False,
-        action='store_true',
-    )
-    parser.add_argument(
-        "--ci_upload_test_results_to_cloud",
-        default=False,
-        action='store_true',
-    )
-    parser.add_argument(
-        "--test_data_path",
-        type=str,
-        default=None,
-    )
-    parser.add_argument(
-        "-b",
-        '--backend',
-        nargs='?',
-        const=None,
-        default='TensorRT-LLM',
-        choices=['TensorRT-LLM', 'vLLM', 'In-Framework'],
-        help="Different options to deploy nemo model.",
-    )
-    parser.add_argument(
-        "--save_engine",
-        type=str,
-        default="False",
-    )
-
-    return parser.parse_args()
-
-
-def run_inference_tests(args):
-    if args.test_deployment == "True":
-        args.test_deployment = True
-    else:
-        args.test_deployment = False
-
-    if args.save_engine == "True":
-        args.save_engine = True
-    else:
-        args.save_engine = False
-
-    if args.run_accuracy == "True":
-        args.run_accuracy = True
-    else:
-        args.run_accuracy = False
-
-    if args.run_accuracy:
-        if args.test_data_path is None:
-            raise Exception("test_data_path param cannot be None.")
-
-    result_dic = {}
-
-    prompt_template = ["The capital of France is", "Largest animal in the sea is"]
-    n_gpus = args.min_gpus
-    if args.max_gpus is None:
-        args.max_gpus = args.min_gpus
-
-    while n_gpus <= args.max_gpus:
-        if args.backend.lower() == "tensorrt-llm":
-            result_dic[n_gpus] = run_trt_llm_inference(
-                model_name=args.model_name,
-                model_type=args.model_type,
-                prompt=prompt_template,
-                checkpoint_path=args.checkpoint_dir,
-                trt_llm_model_dir=args.trt_llm_model_dir,
-                n_gpu=n_gpus,
-                max_batch_size=args.max_batch_size,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-                max_num_tokens=args.max_num_tokens,
-                ptuning=args.ptuning,
-                p_tuning_checkpoint=args.p_tuning_checkpoint,
-                lora=args.lora,
-                lora_checkpoint=args.lora_checkpoint,
-                tp_size=args.tp_size,
-                pp_size=args.pp_size,
-                top_k=args.top_k,
-                top_p=args.top_p,
-                temperature=args.temperature,
-                run_accuracy=args.run_accuracy,
-                debug=args.debug,
-                streaming=args.streaming,
-                test_deployment=args.test_deployment,
-                test_data_path=args.test_data_path,
-                save_engine=args.save_engine,
-            )
-        else:
-            result_dic[n_gpus] = run_in_framework_inference(
-                model_name=args.model_name,
-                prompt=prompt_template,
-                checkpoint_path=args.checkpoint_dir,
-                n_gpu=n_gpus,
-                max_batch_size=args.max_batch_size,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-            )
-
-        n_gpus = n_gpus * 2
-
-    test_result = "PASS"
-    print_separator = False
-    print("============= Test Summary ============")
-    for i, results in result_dic.items():
-        if not results[0] is None and not results[1] is None:
-            if print_separator:
-                print("---------------------------------------")
-            print(
-                "Number of GPUS:                  {}\n"
-                "Model Accuracy:                  {:.4f}\n"
-                "Relaxed Model Accuracy:          {:.4f}\n"
-                "Deployed Model Accuracy:         {:.4f}\n"
-                "Deployed Relaxed Model Accuracy: {:.4f}\n"
-                "Evaluation Time [s]:             {:.2f}".format(i, *results)
-            )
-            print_separator = True
-            if results[1] < 0.5:
-                test_result = "FAIL"
-
-    print("=======================================")
-    print("TEST: " + test_result)
-    if test_result == "FAIL":
-        raise Exception("Model accuracy is below 0.5")
-
-
-if __name__ == '__main__':
-    args = get_args()
-    run_inference_tests(args)
diff --git a/tests/deploy/test_deploy_base.py b/tests/deploy/test_deploy_base.py
deleted file mode 100755
index bfe831666631..000000000000
--- a/tests/deploy/test_deploy_base.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from nemo.deploy.deploy_base import DeployBase
-
-
-class MockDeployable(DeployBase):
-    def deploy(self):
-        pass
-
-    def serve(self):
-        pass
-
-    def run(self):
-        pass
-
-    def stop(self):
-        pass
-
-
-class MockTritonDeployable:
-    pass
-
-
-@pytest.fixture
-def mock_model():
-    return MagicMock()
-
-
-@pytest.fixture
-def deploy_base(mock_model):
-    return MockDeployable(
-        triton_model_name="test_model",
-        model=mock_model,
-        max_batch_size=128,
-        http_port=8000,
-        grpc_port=8001,
-    )
-
-
-def test_initialization_with_model(deploy_base, mock_model):
-    assert deploy_base.triton_model_name == "test_model"
-    assert deploy_base.model == mock_model
-    assert deploy_base.max_batch_size == 128
-    assert deploy_base.http_port == 8000
-    assert deploy_base.grpc_port == 8001
-    assert deploy_base.address == "0.0.0.0"
-    assert deploy_base.allow_grpc is True
-    assert deploy_base.allow_http is True
-    assert deploy_base.streaming is False
-
-
-def test_initialization_with_checkpoint():
-    with patch('nemo.deploy.deploy_base.ModelPT') as mock_model_pt:
-        mock_model_pt.restore_from.return_value = MagicMock()
-        deploy_base = MockDeployable(
-            triton_model_name="test_model",
-            checkpoint_path="test.ckpt",
-        )
-        assert deploy_base.checkpoint_path == "test.ckpt"
-
-
-def test_initialization_without_model_or_checkpoint():
-    with pytest.raises(Exception) as exc_info:
-        MockDeployable(triton_model_name="test_model")
-    assert "Either checkpoint_path or model should be provided" in str(exc_info.value)
-
-
-def test_get_module_and_class():
-    module, class_name = DeployBase.get_module_and_class("nemo.models.test_model.TestModel")
-    assert module == "nemo.models.test_model"
-    assert class_name == "TestModel"
-
-
-def test_is_model_deployable_valid(deploy_base):
-    deploy_base.model = MockTritonDeployable()
-    with patch('nemo.deploy.deploy_base.ITritonDeployable', MockTritonDeployable):
-        assert deploy_base._is_model_deployable() is True
-
-
-def test_is_model_deployable_invalid(deploy_base):
-    deploy_base.model = MagicMock()
-    with patch('nemo.deploy.deploy_base.ITritonDeployable', MockTritonDeployable):
-        with pytest.raises(Exception) as exc_info:
-            deploy_base._is_model_deployable()
-        assert "This model is not deployable to Triton" in str(exc_info.value)
diff --git a/tests/deploy/test_deploy_pytriton.py b/tests/deploy/test_deploy_pytriton.py
deleted file mode 100755
index 52b1c677a99e..000000000000
--- a/tests/deploy/test_deploy_pytriton.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from nemo.deploy import ITritonDeployable
-from nemo.deploy.deploy_pytriton import DeployPyTriton
-
-
-class MockModel(ITritonDeployable):
-    def triton_infer_fn(self, *args, **kwargs):
-        return {"output": "test output"}
-
-    def triton_infer_fn_streaming(self, *args, **kwargs):
-        yield {"output": "test output"}
-
-    def get_triton_input(self):
-        return [{"name": "input", "dtype": "string", "shape": (-1,)}]
-
-    def get_triton_output(self):
-        return [{"name": "output", "dtype": "string", "shape": (-1,)}]
-
-
-@pytest.fixture
-def mock_model():
-    return MockModel()
-
-
-@pytest.fixture
-def deploy_pytriton(mock_model):
-    return DeployPyTriton(triton_model_name="test_model", model=mock_model, http_port=8000, grpc_port=8001)
-
-
-@patch('nemo.deploy.deploy_pytriton.Triton')
-def test_deploy_success(mock_triton, deploy_pytriton):
-    deploy_pytriton.deploy()
-    assert deploy_pytriton.triton is not None
-    mock_triton.return_value.bind.assert_called_once()
-
-
-@patch('nemo.deploy.deploy_pytriton.Triton')
-def test_deploy_streaming_success(mock_triton):
-    deploy = DeployPyTriton(triton_model_name="test_model", model=MockModel(), streaming=True)
-    deploy.deploy()
-    assert deploy.triton is not None
-    mock_triton.return_value.bind.assert_called_once()
-
-
-@patch('nemo.deploy.deploy_pytriton.Triton')
-def test_deploy_failure(mock_triton, deploy_pytriton):
-    mock_triton.side_effect = Exception("Deployment failed")
-    deploy_pytriton.deploy()
-    assert deploy_pytriton.triton is None
-
-
-def test_serve_success(deploy_pytriton):
-    deploy_pytriton.triton = MagicMock()
-    deploy_pytriton.serve()
-    deploy_pytriton.triton.serve.assert_called_once()
-
-
-def test_serve_failure(deploy_pytriton):
-    deploy_pytriton.triton = None
-    with pytest.raises(Exception, match="deploy should be called first."):
-        deploy_pytriton.serve()
-
-
-def test_run_success(deploy_pytriton):
-    deploy_pytriton.triton = MagicMock()
-    deploy_pytriton.run()
-    deploy_pytriton.triton.run.assert_called_once()
-
-
-def test_run_failure(deploy_pytriton):
-    deploy_pytriton.triton = None
-    with pytest.raises(Exception, match="deploy should be called first."):
-        deploy_pytriton.run()
-
-
-def test_stop_success(deploy_pytriton):
-    deploy_pytriton.triton = MagicMock()
-    deploy_pytriton.stop()
-    deploy_pytriton.triton.stop.assert_called_once()
-
-
-def test_stop_failure(deploy_pytriton):
-    deploy_pytriton.triton = None
-    with pytest.raises(Exception, match="deploy should be called first."):
-        deploy_pytriton.stop()
diff --git a/tests/deploy/test_deploy_query.py b/tests/deploy/test_deploy_query.py
deleted file mode 100755
index 8ac858e51a2f..000000000000
--- a/tests/deploy/test_deploy_query.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import numpy as np
-from pytriton.decorators import batch
-from pytriton.model_config import Tensor
-
-from nemo.deploy import DeployPyTriton, ITritonDeployable
-from nemo.deploy.nlp import NemoQueryLLM
-from nemo.deploy.utils import cast_output, str_ndarray2list
-
-
-class MockModel(ITritonDeployable):
-
-    @property
-    def get_triton_input(self):
-        inputs = (
-            Tensor(name="prompts", shape=(-1,), dtype=bytes),
-            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
-            Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False),
-            Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False),
-        )
-        return inputs
-
-    @property
-    def get_triton_output(self):
-        outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
-        return outputs
-
-    @batch
-    def triton_infer_fn(self, **inputs: np.ndarray):
-        infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
-        if "max_output_len" in inputs:
-            infer_input["max_output_len"] = inputs.pop("max_output_len")[0][0]
-
-        output_dict = dict()
-        output_dict["outputs"] = cast_output("I am good, how about you?", np.bytes_)
-        return output_dict
-
-
-def test_nemo_deploy_query():
-    model_name = "mock_model"
-    model = MockModel()
-    nm = DeployPyTriton(
-        model=model,
-        triton_model_name=model_name,
-        max_batch_size=32,
-        http_port=9002,
-        grpc_port=8001,
-        address="0.0.0.0",
-        allow_grpc=True,
-        allow_http=True,
-        streaming=False,
-    )
-    nm.deploy()
-    nm.run()
-
-    nq = NemoQueryLLM(url="localhost:9002", model_name=model_name)
-    output_deployed = nq.query_llm(
-        prompts=["Hey, how is it going?"],
-        max_output_len=20,
-    )
-    nm.stop()
-
-    assert output_deployed is not None, "Output cannot be none."
-    assert output_deployed == "I am good, how about you?", "Output cannot be none."
diff --git a/tests/deploy/test_deploy_utils.py b/tests/deploy/test_deploy_utils.py
deleted file mode 100644
index 109af88c90c4..000000000000
--- a/tests/deploy/test_deploy_utils.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import tarfile
-import tempfile
-import typing
-
-import numpy as np
-import pytest
-import torch
-from PIL import Image
-from pytriton.model_config import Tensor
-
-from nemo.deploy.utils import (
-    NEMO1,
-    NEMO2,
-    broadcast_list,
-    cast_output,
-    ndarray2img,
-    nemo_checkpoint_version,
-    str_list2numpy,
-    str_ndarray2list,
-    typedict2tensor,
-)
-
-
-class TestTypedict2Tensor:
-    class SampleTypedict:
-        int_field: int
-        float_field: float
-        bool_field: bool
-        str_field: str
-        int_list: typing.List[int]
-        float_list: typing.List[float]
-        bool_list: typing.List[bool]
-        str_list: typing.List[str]
-
-    def test_typedict2tensor_basic(self):
-        tensors = typedict2tensor(self.SampleTypedict)
-        assert len(tensors) == 8
-        assert all(isinstance(t, Tensor) for t in tensors)
-
-        # Check int field
-        int_tensor = next(t for t in tensors if t.name == "int_field")
-        assert int_tensor.dtype == np.int32
-        assert int_tensor.shape == (1,)
-
-        # Check float field
-        float_tensor = next(t for t in tensors if t.name == "float_field")
-        assert float_tensor.dtype == np.float32
-        assert float_tensor.shape == (1,)
-
-        # Check bool field
-        bool_tensor = next(t for t in tensors if t.name == "bool_field")
-        assert bool_tensor.dtype == np.bool_
-        assert bool_tensor.shape == (1,)
-
-        # Check str field
-        str_tensor = next(t for t in tensors if t.name == "str_field")
-        assert str_tensor.dtype == bytes
-        assert str_tensor.shape == (1,)
-
-    def test_typedict2tensor_with_overwrite(self):
-        overwrite_kwargs = {"optional": True}
-        tensors = typedict2tensor(self.SampleTypedict, overwrite_kwargs=overwrite_kwargs)
-        assert all(t.optional for t in tensors)
-
-    def test_typedict2tensor_list_types(self):
-        tensors = typedict2tensor(self.SampleTypedict)
-
-        # Check int list
-        int_list_tensor = next(t for t in tensors if t.name == "int_list")
-        assert int_list_tensor.dtype == np.int32
-        assert int_list_tensor.shape == (1,)
-
-        # Check float list
-        float_list_tensor = next(t for t in tensors if t.name == "float_list")
-        assert float_list_tensor.dtype == np.float32
-        assert float_list_tensor.shape == (1,)
-
-        # Check bool list
-        bool_list_tensor = next(t for t in tensors if t.name == "bool_list")
-        assert bool_list_tensor.dtype == np.bool_
-        assert bool_list_tensor.shape == (1,)
-
-        # Check str list
-        str_list_tensor = next(t for t in tensors if t.name == "str_list")
-        assert str_list_tensor.dtype == bytes
-        assert str_list_tensor.shape == (1,)
-
-
-class TestNemoCheckpointVersion:
-    def test_nemo2_checkpoint_dir(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Create NEMO 2.0 structure
-            os.makedirs(os.path.join(tmpdir, "context"))
-            os.makedirs(os.path.join(tmpdir, "weights"))
-            assert nemo_checkpoint_version(tmpdir) == NEMO2
-
-    def test_nemo1_checkpoint_dir(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Create NEMO 1.0 structure (no context/weights dirs)
-            assert nemo_checkpoint_version(tmpdir) == NEMO1
-
-    def test_nemo2_checkpoint_tar(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tar_path = os.path.join(tmpdir, "checkpoint.tar")
-            with tarfile.open(tar_path, "w") as tar:
-                # Create NEMO 2.0 structure in tar
-                context_info = tarfile.TarInfo("context")
-                context_info.type = tarfile.DIRTYPE
-                tar.addfile(context_info)
-
-                weights_info = tarfile.TarInfo("weights")
-                weights_info.type = tarfile.DIRTYPE
-                tar.addfile(weights_info)
-
-            assert nemo_checkpoint_version(tar_path) == NEMO2
-
-    def test_nemo1_checkpoint_tar(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            tar_path = os.path.join(tmpdir, "checkpoint.tar")
-            with tarfile.open(tar_path, "w") as tar:
-                # Create empty tar (NEMO 1.0)
-                pass
-
-            assert nemo_checkpoint_version(tar_path) == NEMO1
-
-
-class TestStringConversions:
-    def test_str_list2numpy(self):
-        input_list = ["hello", "world", "test"]
-        result = str_list2numpy(input_list)
-        assert isinstance(result, np.ndarray)
-        assert result.shape == (3, 1)
-        assert all(isinstance(x, bytes) for x in result.flatten())
-
-    def test_str_ndarray2list(self):
-        input_array = np.array([b"hello", b"world", b"test"]).reshape(3, 1)
-        result = str_ndarray2list(input_array)
-        assert isinstance(result, list)
-        assert result == ["hello", "world", "test"]
-
-    def test_str_conversion_roundtrip(self):
-        input_list = ["hello", "world", "test"]
-        numpy_array = str_list2numpy(input_list)
-        output_list = str_ndarray2list(numpy_array)
-        assert input_list == output_list
-
-
-class TestImageConversions:
-    def test_ndarray2img(self):
-        # Create a test image array
-        img_array = np.random.randint(0, 255, size=(2, 100, 100, 3), dtype=np.uint8)
-        result = ndarray2img(img_array)
-
-        assert isinstance(result, list)
-        assert len(result) == 2
-        assert all(isinstance(img, Image.Image) for img in result)
-        assert all(img.size == (100, 100) for img in result)
-
-
-class TestCastOutput:
-    def test_cast_tensor(self):
-        input_tensor = torch.tensor([1, 2, 3])
-        result = cast_output(input_tensor, np.int32)
-        assert isinstance(result, np.ndarray)
-        assert result.dtype == np.int32
-        assert result.shape == (3, 1)
-
-    def test_cast_numpy(self):
-        input_array = np.array([1, 2, 3])
-        result = cast_output(input_array, np.float32)
-        assert isinstance(result, np.ndarray)
-        assert result.dtype == np.float32
-        assert result.shape == (3, 1)
-
-    def test_cast_string(self):
-        input_list = ["hello", "world"]
-        result = cast_output(input_list, bytes)
-        assert isinstance(result, np.ndarray)
-        assert result.shape == (2, 1)
-
-    def test_cast_1d_to_2d(self):
-        input_array = np.array([1, 2, 3])
-        result = cast_output(input_array, np.int32)
-        assert result.ndim == 2
-        assert result.shape == (3, 1)
-
-
-class TestBroadcastList:
-    def test_broadcast_list_no_distributed(self):
-        with pytest.raises(RuntimeError, match="Distributed environment is not initialized"):
-            broadcast_list(["test"])
-
-    def test_broadcast_list_distributed(self, monkeypatch):
-        # Mock distributed environment
-        monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True)
-        monkeypatch.setattr(torch.distributed, "get_rank", lambda: 0)
-
-        # Mock broadcast_object_list
-        def mock_broadcast_object_list(object_list, src, group=None):
-            if src == 0:
-                object_list[0] = ["test"]
-
-        monkeypatch.setattr(torch.distributed, "broadcast_object_list", mock_broadcast_object_list)
-
-        result = broadcast_list(["test"])
-        assert result == ["test"]
diff --git a/tests/deploy/test_deployment_service.py b/tests/deploy/test_deployment_service.py
deleted file mode 100644
index 56fe31f6df4d..000000000000
--- a/tests/deploy/test_deployment_service.py
+++ /dev/null
@@ -1,319 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import json
-import os
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-import pytest
-from fastapi.testclient import TestClient
-
-from nemo.deploy.service.fastapi_interface_to_pytriton import (
-    ChatCompletionRequest,
-    CompletionRequest,
-    TritonSettings,
-    _helper_fun,
-    app,
-    convert_numpy,
-    dict_to_str,
-    query_llm_async,
-)
-from nemo.deploy.service.rest_model_api import CompletionRequest as RestCompletionRequest
-from nemo.deploy.service.rest_model_api import TritonSettings as RestTritonSettings
-from nemo.deploy.service.rest_model_api import app as rest_app
-
-
-@pytest.fixture
-def client():
-    return TestClient(app)
-
-
-@pytest.fixture
-def mock_triton_settings():
-    with patch('nemo.deploy.service.fastapi_interface_to_pytriton.TritonSettings') as mock:
-        instance = mock.return_value
-        instance.triton_service_port = 8000
-        instance.triton_service_ip = "localhost"
-        yield instance
-
-
-@pytest.fixture
-def rest_client():
-    return TestClient(rest_app)
-
-
-@pytest.fixture
-def mock_rest_triton_settings():
-    with patch('nemo.deploy.service.rest_model_api.TritonSettings') as mock:
-        instance = mock.return_value
-        instance.triton_service_port = 8080
-        instance.triton_service_ip = "localhost"
-        instance.triton_request_timeout = 60
-        instance.openai_format_response = False
-        instance.output_generation_logits = False
-        yield instance
-
-
-class TestTritonSettings:
-    def test_default_values(self):
-        with patch.dict(os.environ, {}, clear=True):
-            settings = TritonSettings()
-            assert settings.triton_service_port == 8000
-            assert settings.triton_service_ip == "0.0.0.0"
-
-    def test_custom_values(self):
-        with patch.dict(os.environ, {'TRITON_PORT': '9000', 'TRITON_HTTP_ADDRESS': '127.0.0.1'}, clear=True):
-            settings = TritonSettings()
-            assert settings.triton_service_port == 9000
-            assert settings.triton_service_ip == "127.0.0.1"
-
-
-class TestCompletionRequest:
-    def test_default_completions_values(self):
-        request = CompletionRequest(model="test_model", prompt="test prompt")
-        assert request.model == "test_model"
-        assert request.prompt == "test prompt"
-        assert request.max_tokens == 512
-        assert request.temperature == 1.0
-        assert request.top_p == 0.0
-        assert request.top_k == 0
-        assert request.logprobs is None
-        assert request.echo is False
-
-    def test_default_chat_values(self):
-        request = ChatCompletionRequest(model="test_model", messages=[{"role": "user", "content": "test message"}])
-        assert request.model == "test_model"
-        assert request.messages == [{"role": "user", "content": "test message"}]
-        assert request.max_tokens == 512
-        assert request.temperature == 1.0
-        assert request.top_p == 0.0
-        assert request.top_k == 0
-
-    def test_greedy_params(self):
-        request = CompletionRequest(model="test_model", prompt="test prompt", temperature=0.0, top_p=0.0)
-        assert request.top_k == 1
-
-
-class TestHealthEndpoints:
-    def test_health_check(self, client):
-        response = client.get("/v1/health")
-        assert response.status_code == 200
-        assert response.json() == {"status": "ok"}
-
-
-class TestUtilityFunctions:
-    def test_convert_numpy(self):
-        # Test with numpy array
-        arr = np.array([1, 2, 3])
-        assert convert_numpy(arr) == [1, 2, 3]
-
-        # Test with nested dictionary
-        nested = {"a": np.array([1, 2]), "b": {"c": np.array([3, 4])}}
-        assert convert_numpy(nested) == {"a": [1, 2], "b": {"c": [3, 4]}}
-
-        # Test with list
-        lst = [np.array([1, 2]), np.array([3, 4])]
-        assert convert_numpy(lst) == [[1, 2], [3, 4]]
-
-    def test_dict_to_str(self):
-        test_dict = {"key": "value", "number": 42}
-        result = dict_to_str(test_dict)
-        assert isinstance(result, str)
-        assert json.loads(result) == test_dict
-
-
-class TestLLMQueryFunctions:
-    def test_helper_fun(self):
-        mock_nq = MagicMock()
-        mock_nq.query_llm.return_value = {"test": "response"}
-
-        with patch('nemo.deploy.service.fastapi_interface_to_pytriton.NemoQueryLLMPyTorch', return_value=mock_nq):
-            result = _helper_fun(
-                url="http://test",
-                model="test_model",
-                prompts=["test prompt"],
-                temperature=0.7,
-                top_k=10,
-                top_p=0.9,
-                compute_logprob=True,
-                max_length=100,
-                apply_chat_template=False,
-                echo=False,
-                n_top_logprobs=0,
-            )
-            assert result == {"test": "response"}
-            mock_nq.query_llm.assert_called_once()
-
-    def test_query_llm_async(self):
-        mock_result = {"test": "response"}
-        with patch('nemo.deploy.service.fastapi_interface_to_pytriton._helper_fun', return_value=mock_result):
-            # Create an event loop and run the async function
-            import asyncio
-
-            loop = asyncio.get_event_loop()
-            result = loop.run_until_complete(
-                query_llm_async(
-                    url="http://test",
-                    model="test_model",
-                    prompts=["test prompt"],
-                    temperature=0.7,
-                    top_k=10,
-                    top_p=0.9,
-                    compute_logprob=True,
-                    max_length=100,
-                    apply_chat_template=False,
-                    echo=False,
-                    n_top_logprobs=0,
-                )
-            )
-            assert result == mock_result
-
-
-class TestAPIEndpoints:
-    def test_completions_v1(self, client):
-        mock_output = {
-            "choices": [
-                {
-                    "text": [["test response"]],
-                    "logprobs": {"token_logprobs": [[1.0, 2.0]], "top_logprobs": [[{"a": 0.5}, {"b": 0.5}]]},
-                }
-            ]
-        }
-
-        with patch('nemo.deploy.service.fastapi_interface_to_pytriton.query_llm_async', return_value=mock_output):
-            response = client.post(
-                "/v1/completions/", json={"model": "test_model", "prompt": "test prompt", "logprobs": 1}
-            )
-            assert response.status_code == 200
-            data = response.json()
-            assert data["choices"][0]["text"] == "test response"
-            assert "logprobs" in data["choices"][0]
-
-    def test_chat_completions_v1(self, client):
-        mock_output = {"choices": [{"text": [["test response"]]}]}
-
-        with patch('nemo.deploy.service.fastapi_interface_to_pytriton.query_llm_async', return_value=mock_output):
-            response = client.post(
-                "/v1/chat/completions/",
-                json={"model": "test_model", "messages": [{"role": "user", "content": "test message"}]},
-            )
-            assert response.status_code == 200
-            data = response.json()
-            assert data["choices"][0]["message"]["role"] == "assistant"
-            assert data["choices"][0]["message"]["content"] == "test response"
-
-
-class TestRestTritonSettings:
-    def test_default_values(self):
-        with patch.dict(os.environ, {}, clear=True):
-            settings = RestTritonSettings()
-            assert settings.triton_service_port == 8080
-            assert settings.triton_service_ip == "0.0.0.0"
-            assert settings.triton_request_timeout == 60
-            assert settings.openai_format_response is False
-            assert settings.output_generation_logits is False
-
-    def test_custom_values(self):
-        with patch.dict(
-            os.environ,
-            {
-                'TRITON_PORT': '9000',
-                'TRITON_HTTP_ADDRESS': '127.0.0.1',
-                'TRITON_REQUEST_TIMEOUT': '120',
-                'OPENAI_FORMAT_RESPONSE': 'True',
-                'OUTPUT_GENERATION_LOGITS': 'True',
-            },
-            clear=True,
-        ):
-            settings = RestTritonSettings()
-            assert settings.triton_service_port == 9000
-            assert settings.triton_service_ip == "127.0.0.1"
-            assert settings.triton_request_timeout == 120
-            assert settings.openai_format_response is True
-            assert settings.output_generation_logits is True
-
-
-class TestRestCompletionRequest:
-    def test_default_values(self):
-        request = RestCompletionRequest(model="test_model", prompt="test prompt")
-        assert request.model == "test_model"
-        assert request.prompt == "test prompt"
-        assert request.max_tokens == 512
-        assert request.temperature == 1.0
-        assert request.top_p == 0.0
-        assert request.top_k == 1
-        assert request.stream is False
-        assert request.stop is None
-        assert request.frequency_penalty == 1.0
-
-
-class TestRestHealthEndpoints:
-    def test_health_check(self, rest_client):
-        response = rest_client.get("/v1/health")
-        assert response.status_code == 200
-        assert response.json() == {"status": "ok"}
-
-    def test_triton_health_success(self, rest_client):
-        with patch('requests.get') as mock_get:
-            mock_response = MagicMock()
-            mock_response.status_code = 200
-            mock_get.return_value = mock_response
-
-            response = rest_client.get("/v1/triton_health")
-            assert response.status_code == 200
-            assert response.json() == {"status": "Triton server is reachable and ready"}
-
-
-class TestRestCompletionsEndpoint:
-    def test_completions_success(self, rest_client):
-        mock_output = [["test response"]]
-        with patch('nemo.deploy.service.rest_model_api.NemoQueryLLM') as mock_llm:
-            mock_instance = mock_llm.return_value
-            mock_instance.query_llm.return_value = mock_output
-
-            response = rest_client.post(
-                "/v1/completions/",
-                json={
-                    "model": "test_model",
-                    "prompt": "test prompt",
-                    "max_tokens": 100,
-                    "temperature": 0.7,
-                    "top_p": 0.9,
-                    "top_k": 10,
-                },
-            )
-            assert response.status_code == 200
-            assert response.json() == {"output": "test response"}
-
-    def test_completions_standard_format(self, rest_client, mock_rest_triton_settings):
-        mock_output = [["test response"]]
-        mock_rest_triton_settings.openai_format_response = False
-
-        with patch('nemo.deploy.service.rest_model_api.NemoQueryLLM') as mock_llm:
-            mock_instance = mock_llm.return_value
-            mock_instance.query_llm.return_value = mock_output
-
-            response = rest_client.post("/v1/completions/", json={"model": "test_model", "prompt": "test prompt"})
-            assert response.status_code == 200
-            assert response.json() == {"output": "test response"}
-
-    def test_completions_error_handling(self, rest_client):
-        with patch('nemo.deploy.service.rest_model_api.NemoQueryLLM') as mock_llm:
-            mock_instance = mock_llm.return_value
-            mock_instance.query_llm.side_effect = Exception("Test error")
-
-            response = rest_client.post("/v1/completions/", json={"model": "test_model", "prompt": "test prompt"})
-            assert response.status_code == 200
-            assert response.json() == {"error": "An exception occurred"}
diff --git a/tests/deploy/test_hf_deployable.py b/tests/deploy/test_hf_deployable.py
deleted file mode 100755
index fb865abb1772..000000000000
--- a/tests/deploy/test_hf_deployable.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-import pytest
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from nemo.deploy.nlp.hf_deployable import HuggingFaceLLMDeploy
-
-
-@pytest.fixture
-def mock_model():
-    model = MagicMock(spec=AutoModelForCausalLM)
-    model.generate = MagicMock()
-    model.generate.return_value = torch.tensor([[1, 2, 3]])
-    model.cuda = MagicMock(return_value=model)
-    return model
-
-
-@pytest.fixture
-def mock_tokenizer():
-    tokenizer = MagicMock(spec=AutoTokenizer)
-    tokenizer.pad_token = "[PAD]"
-    tokenizer.eos_token = "[EOS]"
-    tokenizer.batch_decode = MagicMock(return_value=["Generated text"])
-    tokenizer.return_value = {"input_ids": torch.tensor([[1, 2, 3]]), "attention_mask": torch.tensor([[1, 1, 1]])}
-    return tokenizer
-
-
-@pytest.fixture
-def mock_peft_model():
-    with patch("nemo.deploy.nlp.hf_deployable.PeftModel") as mock:
-        mock.from_pretrained.return_value = MagicMock()
-        yield mock
-
-
-@pytest.fixture
-def mock_distributed():
-    with patch("torch.distributed") as mock:
-        mock.is_initialized.return_value = True
-        mock.get_world_size.return_value = 2
-        mock.get_rank.return_value = 1
-        mock.broadcast = MagicMock(return_value=torch.tensor([0]))
-        yield mock
-
-
-@pytest.fixture
-def mock_torch_cuda():
-    with patch('torch.cuda.is_available', return_value=False):
-        with patch('torch.Tensor.cuda', return_value=torch.tensor([[1, 2, 3]])):
-            yield
-
-
-class MockRequest:
-    def __init__(self, data):
-        self.data = data
-        self.span = None
-
-    def __getitem__(self, key):
-        return self.data[key]
-
-    def keys(self):
-        return self.data.keys()
-
-    def values(self):
-        return self.data.values()
-
-
-class TestHuggingFaceLLMDeploy:
-
-    def test_initialization_invalid_task(self):
-        with pytest.raises(AssertionError):
-            HuggingFaceLLMDeploy(hf_model_id_path="test/model", task="invalid-task")
-
-    def test_initialization_no_model(self):
-        with pytest.raises(ValueError):
-            HuggingFaceLLMDeploy(task="text-generation")
-
-    def test_initialization_with_model_and_tokenizer(self):
-        model = MagicMock(spec=AutoModelForCausalLM)
-        tokenizer = MagicMock(spec=AutoTokenizer)
-        deployer = HuggingFaceLLMDeploy(model=model, tokenizer=tokenizer, task="text-generation")
-        assert deployer.model == model
-        assert deployer.tokenizer == tokenizer
-        assert deployer.task == "text-generation"
-
-    def test_initialization_with_model_path(self, mock_model, mock_tokenizer):
-        with (
-            patch("transformers.AutoModelForCausalLM.from_pretrained", return_value=mock_model),
-            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
-        ):
-            deployer = HuggingFaceLLMDeploy(hf_model_id_path="test/model", task="text-generation")
-            assert deployer.model == mock_model
-            assert deployer.tokenizer == mock_tokenizer
-
-    def test_initialization_with_peft_model(self, mock_model, mock_tokenizer, mock_peft_model):
-        with (
-            patch("transformers.AutoModelForCausalLM.from_pretrained", return_value=mock_model),
-            patch("transformers.AutoTokenizer.from_pretrained", return_value=mock_tokenizer),
-        ):
-            deployer = HuggingFaceLLMDeploy(
-                hf_model_id_path="test/model", hf_peft_model_id_path="test/peft_model", task="text-generation"
-            )
-            assert deployer.model == mock_peft_model.from_pretrained.return_value
-
-    def test_triton_input_output_config(self):
-        deployer = HuggingFaceLLMDeploy(model=MagicMock(), tokenizer=MagicMock(), task="text-generation")
-
-        inputs = deployer.get_triton_input
-        outputs = deployer.get_triton_output
-
-        assert len(inputs) == 10  # Verify number of input tensors
-        assert len(outputs) == 3  # Verify number of output tensors
-
-        # Verify required input tensor names
-        assert any(tensor.name == "prompts" for tensor in inputs)
-        assert any(tensor.name == "max_length" for tensor in inputs)
-
-        # Verify output tensor names
-        assert any(tensor.name == "sentences" for tensor in outputs)
-        assert any(tensor.name == "logits" for tensor in outputs)
-        assert any(tensor.name == "scores" for tensor in outputs)
-
-    def test_generate_without_model(self):
-        deployer = HuggingFaceLLMDeploy(model=MagicMock(), tokenizer=MagicMock(), task="text-generation")
-        deployer.model = None
-        with pytest.raises(RuntimeError):
-            deployer.generate(text_inputs=["test prompt"])
-
-    def test_generate_with_model(self, mock_model, mock_tokenizer, mock_torch_cuda):
-        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
-        output = deployer.generate(text_inputs=["test prompt"])
-        assert output == ["Generated text"]
-        mock_model.generate.assert_called_once()
-        mock_tokenizer.batch_decode.assert_called_once()
-
-    def test_generate_with_output_logits_and_scores(self, mock_model, mock_tokenizer, mock_torch_cuda):
-        mock_model.generate.return_value = {
-            "sequences": torch.tensor([[1, 2, 3]]),
-            "logits": torch.tensor([1.0]),
-            "scores": torch.tensor([0.5]),
-        }
-        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
-        output = deployer.generate(
-            text_inputs=["test prompt"], output_logits=True, output_scores=True, return_dict_in_generate=True
-        )
-        assert isinstance(output, dict)
-        assert "sentences" in output
-        assert "logits" in output
-        assert "scores" in output
-
-    def test_triton_infer_fn(self, mock_model, mock_tokenizer):
-        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
-        request_data = {
-            "prompts": np.array(["test prompt"]),
-            "temperature": np.array([[1.0]]),
-            "top_k": np.array([[1]]),
-            "top_p": np.array([[0.0]]),
-            "max_length": np.array([[10]]),
-            "output_logits": np.array([[False]]),
-            "output_scores": np.array([[False]]),
-        }
-        requests = [MockRequest(request_data)]
-        output = deployer.triton_infer_fn(requests)
-        assert "sentences" in output[0]
-        assert isinstance(output[0]["sentences"], np.ndarray)
-
-    def test_triton_infer_fn_with_error(self, mock_model, mock_tokenizer):
-        deployer = HuggingFaceLLMDeploy(model=mock_model, tokenizer=mock_tokenizer, task="text-generation")
-        mock_model.generate.side_effect = Exception("Test error")
-        request_data = {
-            "prompts": np.array(["test prompt"]),
-            "temperature": np.array([[1.0]]),
-            "top_k": np.array([[1]]),
-            "top_p": np.array([[0.0]]),
-            "max_length": np.array([[10]]),
-            "output_logits": np.array([[False]]),
-            "output_scores": np.array([[False]]),
-        }
-        requests = [MockRequest(request_data)]
-        output = deployer.triton_infer_fn(requests)
-        assert "sentences" in output[0]
-        assert "An error occurred" in str(output[0]["sentences"][0])
diff --git a/tests/deploy/test_hf_import.py b/tests/deploy/test_hf_import.py
deleted file mode 100644
index 8ca69994d9d0..000000000000
--- a/tests/deploy/test_hf_import.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-
-import pytest
-import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
-
-from nemo.deploy.nlp.hf_deployable import HuggingFaceLLMDeploy
-from nemo.deploy.utils import broadcast_list
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_hf_generate():
-    """Tests HF deployable class's generate function."""
-
-    hf_deployable = HuggingFaceLLMDeploy(
-        hf_model_id_path="/home/TestData/llm/models/llama3.2-1B-hf/",
-        task="text-generation",
-        trust_remote_code=True,
-        device_map=None,
-        tp_plan=None,
-    )
-
-    output = hf_deployable.generate(
-        text_inputs=["What is the color of a banana? ", "Tell me a joke."],
-        max_length=32,
-        do_sample=True,
-    )
-
-    assert len(output) == 2, "Output should have to be a list."
-    assert len(output[0]) > 0, "First list in the output should have more than 0 elements."
-    assert len(output[1]) > 0, "Second list in the output should have more than 0 elements."
-
-    # Test output_logits and output_scores
-    output = hf_deployable.generate(
-        text_inputs=["What is the color of a banana? ", "Tell me a joke."],
-        max_length=32,
-        do_sample=True,
-        output_logits=True,
-        output_scores=True,
-        return_dict_in_generate=True,
-    )
-    assert "logits" in output, "Output should have logits."
-    assert "scores" in output, "Output should have scores."
-    assert "sentences" in output, "Output should have sentences."
-    assert len(output["sentences"]) == 2, "Output should have 2 sentences."
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-@pytest.mark.skip(reason="will be enabled later.")
-def test_hf_multigpu_generate():
-    """Tests HF deployable class's generate function with multiple GPUs."""
-
-    mp.spawn(_run_generate, nprocs=2)
-
-
-def _run_generate(rank):
-    """Code to run generate in each rank."""
-
-    os.environ['WORLD_SIZE'] = '2'
-    os.environ['MASTER_ADDR'] = 'localhost'
-    os.environ['MASTER_PORT'] = '12355'
-
-    if rank == 0:
-        os.environ['RANK'] = str(rank)
-        dist.init_process_group("nccl", rank=rank, world_size=2)
-        _hf_generate_ranks()
-        dist.destroy_process_group()
-    else:
-        os.environ['RANK'] = str(rank)
-        dist.init_process_group("nccl", rank=rank, world_size=2)
-        _hf_generate_ranks()
-        dist.destroy_process_group()
-
-
-def _hf_generate_ranks():
-    """Generate by Ranks"""
-
-    torch.cuda.set_device(dist.get_rank())
-
-    hf_deployable = HuggingFaceLLMDeploy(
-        hf_model_id_path="/home/TestData/llm/models/llama3.2-1B-hf/",
-        task="text-generation",
-        trust_remote_code=True,
-        device_map=None,
-        tp_plan=None,
-    )
-
-    if dist.get_rank() == 0:
-        temperature = 1.0
-        top_k = 1
-        top_p = 0.0
-        num_tokens_to_generate = 32
-        output_logits = False
-        output_scores = False
-
-        prompts = ["What is the color of a banana? ", "Tell me a joke."]
-
-        dist.broadcast(torch.tensor([0], dtype=torch.long, device="cuda"), src=0)
-        broadcast_list(prompts, src=0)
-        broadcast_list(
-            data=[
-                temperature,
-                top_k,
-                top_p,
-                num_tokens_to_generate,
-                output_logits,
-                output_scores,
-            ],
-            src=0,
-        )
-
-        output = hf_deployable.generate(
-            text_inputs=prompts,
-            max_length=num_tokens_to_generate,
-            do_sample=True,
-            temperature=temperature,
-            top_k=top_k,
-            top_p=top_p,
-            output_logits=output_logits,
-            output_scores=output_scores,
-        )
-        dist.broadcast(torch.tensor([1], dtype=torch.long, device="cuda"), src=0)
-    else:
-        hf_deployable.generate_other_ranks()
-
-    dist.barrier()
-
-    if dist.get_rank() == 0:
-        assert len(output) == 2, "Output should have to be a lists."
-        assert len(output[0]) > 0, "First list in the output should have more than 0 elements."
-        assert len(output[1]) > 0, "Second list in the output should have more than 0 elements."
diff --git a/tests/deploy/test_megatronllm_deployable.py b/tests/deploy/test_megatronllm_deployable.py
deleted file mode 100644
index a48e1d1cb2a1..000000000000
--- a/tests/deploy/test_megatronllm_deployable.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-from megatron.core.inference.common_inference_params import CommonInferenceParams
-
-from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployableNemo2
-
-
-@pytest.fixture
-def mock_model_and_tokenizer():
-    """Fixture to mock the model and tokenizer setup."""
-    with patch('nemo.collections.llm.inference.setup_mcore_engine') as mock_setup:
-        mock_engine = MagicMock()
-        mock_model = MagicMock()
-        mock_tokenizer = MagicMock()
-        mock_tokenizer.tokenizer.tokenizer = MagicMock()
-        mock_tokenizer.tokenizer.tokenizer.chat_template = "{{messages}}"
-        mock_tokenizer.tokenizer.tokenizer.bos_token = ""
-        mock_tokenizer.tokenizer.tokenizer.eos_token = ""
-        mock_setup.return_value = (mock_engine, mock_model, mock_tokenizer)
-        yield mock_setup
-
-
-@pytest.fixture
-def deployable(mock_model_and_tokenizer):
-    """Fixture to create a deployable instance with mocked dependencies."""
-    return MegatronLLMDeployableNemo2(
-        nemo_checkpoint_filepath="dummy.nemo",
-        num_devices=1,
-        num_nodes=1,
-        tensor_model_parallel_size=1,
-        pipeline_model_parallel_size=1,
-        context_parallel_size=1,
-        expert_model_parallel_size=1,
-        params_dtype="bfloat16",
-        inference_batch_times_seqlen_threshold=1000,
-        inference_max_seq_length=4096,
-        max_batch_size=32,
-        random_seed=42,
-        enable_flash_decode=True,
-        legacy_ckpt=False,
-    )
-
-
-@pytest.mark.run_only_on("GPU")
-def test_initialization(deployable, mock_model_and_tokenizer):
-    """Test initialization of the deployable class."""
-    assert deployable.nemo_checkpoint_filepath == "dummy.nemo"
-    mock_model_and_tokenizer.assert_called_once()
-
-
-@pytest.mark.run_only_on("GPU")
-def test_generate(deployable):
-    """Test text generation functionality."""
-    prompts = ["Hello", "World"]
-    inference_params = CommonInferenceParams(
-        temperature=1.0,
-        top_k=1,
-        top_p=0.0,
-        num_tokens_to_generate=256,
-        return_log_probs=False,
-    )
-
-    # Mock the generate method
-    with patch.object(deployable.mcore_engine, 'generate') as mock_generate:
-        mock_result = MagicMock()
-        mock_result.generated_text = "Generated text"
-        mock_generate.return_value = [mock_result]
-
-        results = deployable.generate(prompts, inference_params)
-        assert len(results) == 1
-        mock_generate.assert_called_once()
-
-
-@pytest.mark.run_only_on("GPU")
-def test_apply_chat_template(deployable):
-    """Test chat template application."""
-    messages = [{"role": "user", "content": "Hello"}]
-    template = deployable.apply_chat_template(messages)
-    assert isinstance(template, str)
-    assert messages[0]["content"] in template
-
-
-@pytest.mark.run_only_on("GPU")
-def test_remove_eos_token(deployable):
-    """Test EOS token removal."""
-    texts = ["Hello", "World", "Test"]
-    cleaned_texts = deployable.remove_eos_token(texts)
-    assert cleaned_texts == ["Hello", "World", "Test"]
-
-
-@pytest.mark.run_only_on("GPU")
-def test_str_to_dict(deployable):
-    """Test string to dictionary conversion."""
-    json_str = '{"key": "value"}'
-    result = deployable.str_to_dict(json_str)
-    assert isinstance(result, dict)
-    assert result["key"] == "value"
-
-
-@pytest.mark.run_only_on("GPU")
-def test_triton_input_output(deployable):
-    """Test Triton input and output tensor definitions."""
-    inputs = deployable.get_triton_input
-    outputs = deployable.get_triton_output
-
-    assert len(inputs) == 11  # Number of input tensors
-    assert len(outputs) == 3  # Number of output tensors
-
-    # Check input tensor names
-    input_names = [tensor.name for tensor in inputs]
-    assert "prompts" in input_names
-    assert "max_length" in input_names
-    assert "max_batch_size" in input_names
-    assert "top_k" in input_names
-    assert "top_p" in input_names
-    assert "temperature" in input_names
-    assert "random_seed" in input_names
-    assert "compute_logprob" in input_names
-    assert "apply_chat_template" in input_names
-    assert "n_top_logprobs" in input_names
-    assert "echo" in input_names
-
-    # Check output tensor names
-    output_names = [tensor.name for tensor in outputs]
-    assert "sentences" in output_names
-    assert "log_probs" in output_names
-    assert "top_logprobs" in output_names
diff --git a/tests/deploy/test_query_llm.py b/tests/deploy/test_query_llm.py
deleted file mode 100755
index fb7e3b647a33..000000000000
--- a/tests/deploy/test_query_llm.py
+++ /dev/null
@@ -1,200 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-import pytest
-
-from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMBase, NemoQueryLLMHF, NemoQueryLLMPyTorch
-
-
-class TestNemoQueryLLMBase:
-    def test_base_initialization(self):
-        url = "localhost:8000"
-        model_name = "test-model"
-        query = NemoQueryLLMBase(url=url, model_name=model_name)
-        assert query.url == url
-        assert query.model_name == model_name
-
-
-class TestNemoQueryLLMPyTorch:
-    @pytest.fixture
-    def query(self):
-        return NemoQueryLLMPyTorch(url="localhost:8000", model_name="test-model")
-
-    def test_initialization(self, query):
-        assert isinstance(query, NemoQueryLLMBase)
-        assert query.url == "localhost:8000"
-        assert query.model_name == "test-model"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_basic(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {"sentences": np.array([b"test response"])}
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test basic query
-        response = query.query_llm(prompts=["test prompt"], max_length=100, temperature=0.7, top_k=1, top_p=0.9)
-
-        assert isinstance(response, dict)
-        assert "choices" in response
-        assert response["choices"][0]["text"] == "test response"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_with_logprobs(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {
-            "sentences": np.array([b"test response"]),
-            "log_probs": np.array([0.1, 0.2, 0.3]),
-        }
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test query with logprobs
-        response = query.query_llm(prompts=["test prompt"], max_length=100, compute_logprob=True)
-
-        assert "logprobs" in response["choices"][0]
-        assert "token_logprobs" in response["choices"][0]["logprobs"]
-
-
-class TestNemoQueryLLMHF:
-    @pytest.fixture
-    def query(self):
-        return NemoQueryLLMHF(url="localhost:8000", model_name="test-model")
-
-    def test_initialization(self, query):
-        assert isinstance(query, NemoQueryLLMBase)
-        assert query.url == "localhost:8000"
-        assert query.model_name == "test-model"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_basic(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {"sentences": np.array([b"test response"])}
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test basic query
-        response = query.query_llm(prompts=["test prompt"], max_length=100, temperature=0.7, top_k=1, top_p=0.9)
-
-        assert isinstance(response, dict)
-        assert "choices" in response
-        assert response["choices"][0]["text"] == "test response"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_with_logits(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {
-            "sentences": np.array([b"test response"]),
-            "logits": np.array([[0.1, 0.2, 0.3]]),
-        }
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test query with logits
-        response = query.query_llm(prompts=["test prompt"], max_length=100, output_logits=True)
-
-        assert "logits" in response
-
-
-class TestNemoQueryLLM:
-    @pytest.fixture
-    def query(self):
-        return NemoQueryLLM(url="localhost:8000", model_name="test-model")
-
-    def test_initialization(self, query):
-        assert isinstance(query, NemoQueryLLMBase)
-        assert query.url == "localhost:8000"
-        assert query.model_name == "test-model"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_basic(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {"outputs": np.array([b"test response"])}
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test basic query
-        response = query.query_llm(prompts=["test prompt"], max_output_len=100, temperature=0.7, top_k=1, top_p=0.9)
-
-        assert isinstance(response[0], str)
-        assert response[0] == "test response"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_openai_format(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {"outputs": np.array([b"test response"])}
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test query with OpenAI format
-        response = query.query_llm(prompts=["test prompt"], max_output_len=100, openai_format_response=True)
-
-        assert isinstance(response, dict)
-        assert "choices" in response
-        assert response["choices"][0]["text"] == "test response"
-
-    @patch('nemo.deploy.nlp.query_llm.DecoupledModelClient')
-    def test_query_llm_streaming(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = [
-            {"outputs": np.array([b"test"])},
-            {"outputs": np.array([b" response"])},
-        ]
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test streaming query
-        responses = list(query.query_llm_streaming(prompts=["test prompt"], max_output_len=100))
-
-        assert len(responses) == 2
-        assert responses[0] == "test"
-        assert responses[1] == " response"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_with_stop_words(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {"outputs": np.array([b"test response"])}
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test query with stop words
-        response = query.query_llm(prompts=["test prompt"], max_output_len=100, stop_words_list=["stop"])
-
-        assert isinstance(response[0], str)
-        assert response[0] == "test response"
-
-    @patch('nemo.deploy.nlp.query_llm.ModelClient')
-    def test_query_llm_with_bad_words(self, mock_client, query):
-        # Setup mock
-        mock_instance = MagicMock()
-        mock_client.return_value.__enter__.return_value = mock_instance
-        mock_instance.infer_batch.return_value = {"outputs": np.array([b"test response"])}
-        mock_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-
-        # Test query with bad words
-        response = query.query_llm(prompts=["test prompt"], max_output_len=100, bad_words_list=["bad"])
-
-        assert isinstance(response[0], str)
-        assert response[0] == "test response"
diff --git a/tests/deploy/test_query_multimodal.py b/tests/deploy/test_query_multimodal.py
deleted file mode 100644
index a37768f592e0..000000000000
--- a/tests/deploy/test_query_multimodal.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import tempfile
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-import pytest
-from PIL import Image
-
-from nemo.deploy.multimodal.query_multimodal import NemoQueryMultimodal
-
-
-class TestNemoQueryMultimodal:
-    @pytest.fixture
-    def query_multimodal(self):
-        return NemoQueryMultimodal(url="localhost", model_name="test_model", model_type="neva")
-
-    @pytest.fixture
-    def mock_image(self):
-        # Create a temporary image file
-        with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as tmp:
-            img = Image.new('RGB', (100, 100), color='red')
-            img.save(tmp.name)
-            return tmp.name
-
-    @pytest.fixture
-    def mock_video(self):
-        # Create a temporary video file
-        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as tmp:
-            # Just create an empty file for testing
-            return tmp.name
-
-    @pytest.fixture
-    def mock_audio(self):
-        # Create a temporary audio file
-        with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
-            # Just create an empty file for testing
-            return tmp.name
-
-    def test_init(self):
-        nq = NemoQueryMultimodal(url="localhost", model_name="test_model", model_type="neva")
-        assert nq.url == "localhost"
-        assert nq.model_name == "test_model"
-        assert nq.model_type == "neva"
-
-    def test_setup_media_image_local(self, query_multimodal, mock_image):
-        result = query_multimodal.setup_media(mock_image)
-        assert isinstance(result, np.ndarray)
-        assert result.shape[0] == 1  # Batch dimension
-        os.unlink(mock_image)
-
-    @patch('requests.get')
-    def test_setup_media_image_url(self, mock_get, query_multimodal):
-        # Mock the response from requests.get
-        mock_response = MagicMock()
-        mock_response.content = b"fake_image_data"
-        mock_get.return_value = mock_response
-
-        # Mock Image.open
-        with patch('PIL.Image.open') as mock_image_open:
-            mock_image = MagicMock()
-            mock_image.convert.return_value = mock_image
-            mock_image_open.return_value = mock_image
-
-            result = query_multimodal.setup_media("http://example.com/image.jpg")
-            assert isinstance(result, np.ndarray)
-            assert result.shape[0] == 1
-
-    def test_frame_len(self, query_multimodal):
-        # Test with frames less than max_frames
-        frames = [np.zeros((100, 100, 3)) for _ in range(100)]
-        assert query_multimodal.frame_len(frames) == 100
-
-        # Test with frames more than max_frames
-        frames = [np.zeros((100, 100, 3)) for _ in range(300)]
-        result = query_multimodal.frame_len(frames)
-        assert result <= 256  # Should be less than or equal to max_frames
-
-    def test_get_subsampled_frames(self, query_multimodal):
-        frames = [np.zeros((100, 100, 3)) for _ in range(10)]
-        subsample_len = 5
-        result = query_multimodal.get_subsampled_frames(frames, subsample_len)
-        assert len(result) == subsample_len
-
-    @patch('nemo.deploy.multimodal.query_multimodal.ModelClient')
-    def test_query(self, mock_model_client, query_multimodal, mock_image):
-        # Mock the ModelClient context manager
-        mock_client_instance = MagicMock()
-        mock_client_instance.infer_batch.return_value = {"outputs": np.array(["test response"])}
-        mock_client_instance.model_config.outputs = [MagicMock(dtype=np.bytes_)]
-        mock_model_client.return_value.__enter__.return_value = mock_client_instance
-
-        result = query_multimodal.query(
-            input_text="test prompt",
-            input_media=mock_image,
-            max_output_len=30,
-            top_k=1,
-            top_p=0.0,
-            temperature=1.0,
-        )
-
-        assert isinstance(result, np.ndarray)
-        assert result[0] == "test response"
-        os.unlink(mock_image)
-
-    @patch('nemo.deploy.multimodal.query_multimodal.VideoReader')
-    def test_setup_media_video(self, mock_video_reader, mock_video):
-        nq = NemoQueryMultimodal(url="localhost", model_name="test_model", model_type="video-neva")
-
-        # Mock VideoReader
-        mock_frames = [MagicMock(asnumpy=lambda: np.zeros((100, 100, 3))) for _ in range(10)]
-        mock_video_reader.return_value = mock_frames
-
-        result = nq.setup_media(mock_video)
-        assert isinstance(result, np.ndarray)
-        os.unlink(mock_video)
-
-    @patch('soundfile.read')
-    def test_setup_media_audio(self, mock_sf_read, mock_audio):
-        nq = NemoQueryMultimodal(url="localhost", model_name="test_model", model_type="salm")
-
-        # Mock soundfile.read
-        mock_sf_read.return_value = (np.zeros(1000), 16000)
-
-        result = nq.setup_media(mock_audio)
-        assert isinstance(result, dict)
-        assert "input_signal" in result
-        assert "input_signal_length" in result
-        os.unlink(mock_audio)
diff --git a/tests/deploy/test_triton_deployable.py b/tests/deploy/test_triton_deployable.py
deleted file mode 100644
index 323184e7c983..000000000000
--- a/tests/deploy/test_triton_deployable.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import numpy as np
-import pytest
-from nemo.deploy.triton_deployable import ITritonDeployable
-
-
-class MockTritonDeployable(ITritonDeployable):
-    def __init__(self):
-        self.input_shape = (1, 10)
-        self.output_shape = (1, 5)
-
-    def get_triton_input(self):
-        return {"input": {"shape": self.input_shape, "dtype": np.float32}}
-
-    def get_triton_output(self):
-        return {"output": {"shape": self.output_shape, "dtype": np.float32}}
-
-    def triton_infer_fn(self, **inputs: np.ndarray):
-        input_data = inputs["input"]
-        return {"output": np.ones(self.output_shape) * np.mean(input_data)}
-
-
-@pytest.fixture
-def mock_deployable():
-    return MockTritonDeployable()
-
-
-def test_get_triton_input(mock_deployable):
-    """Test that get_triton_input returns the correct input specification."""
-    input_spec = mock_deployable.get_triton_input()
-
-    assert "input" in input_spec
-    assert input_spec["input"]["shape"] == (1, 10)
-    assert input_spec["input"]["dtype"] == np.float32
-
-
-def test_get_triton_output(mock_deployable):
-    """Test that get_triton_output returns the correct output specification."""
-    output_spec = mock_deployable.get_triton_output()
-
-    assert "output" in output_spec
-    assert output_spec["output"]["shape"] == (1, 5)
-    assert output_spec["output"]["dtype"] == np.float32
-
-
-def test_triton_infer_fn(mock_deployable):
-    """Test that triton_infer_fn processes inputs correctly."""
-    # Create test input
-    test_input = np.random.rand(1, 10).astype(np.float32)
-    input_mean = np.mean(test_input)
-
-    # Run inference
-    result = mock_deployable.triton_infer_fn(input=test_input)
-
-    # Check output
-    assert "output" in result
-    assert result["output"].shape == (1, 5)
-    assert np.allclose(result["output"], input_mean)
-
-
-def test_abstract_class_instantiation():
-    """Test that ITritonDeployable cannot be instantiated directly."""
-    with pytest.raises(TypeError):
-        ITritonDeployable()
diff --git a/tests/export/__init__.py b/tests/export/__init__.py
deleted file mode 100644
index 341a77c5bc66..000000000000
--- a/tests/export/__init__.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
diff --git a/tests/export/multimodal/test_build.py b/tests/export/multimodal/test_build.py
deleted file mode 100644
index e107b8bfd3d4..000000000000
--- a/tests/export/multimodal/test_build.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import tempfile
-import unittest
-from unittest.mock import MagicMock, patch
-
-import pytest
-import torch
-
-
-@pytest.mark.run_only_on('GPU')
-class TestBuild(unittest.TestCase):
-
-    @pytest.mark.run_only_on('GPU')
-    def setUp(self):
-        self.temp_dir = tempfile.mkdtemp()
-        self.mock_config = {
-            "mm_cfg": {
-                "vision_encoder": {
-                    "from_pretrained": "test_model",
-                    "hidden_size": 768,
-                },
-                "mm_mlp_adapter_type": "linear",
-                "hidden_size": 4096,
-            }
-        }
-        self.mock_weights = {
-            "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector.weight": torch.randn(
-                4096, 768
-            ),
-            "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector.bias": torch.randn(4096),
-        }
-
-    @pytest.mark.run_only_on('GPU')
-    def tearDown(self):
-        # Clean up temporary directory
-        if os.path.exists(self.temp_dir):
-            for root, dirs, files in os.walk(self.temp_dir, topdown=False):
-                for name in files:
-                    os.remove(os.path.join(root, name))
-                for name in dirs:
-                    os.rmdir(os.path.join(root, name))
-            os.rmdir(self.temp_dir)
-
-    @pytest.mark.run_only_on('GPU')
-    @patch('nemo.export.multimodal.build.TensorRTLLM')
-    def test_build_trtllm_engine(self, mock_trtllm):
-        # Test basic functionality
-        mock_exporter = MagicMock()
-        mock_trtllm.return_value = mock_exporter
-
-        from nemo.export.multimodal.build import build_trtllm_engine
-
-        build_trtllm_engine(
-            model_dir=self.temp_dir,
-            visual_checkpoint_path="test_path",
-            model_type="neva",
-            tensor_parallelism_size=1,
-            max_input_len=256,
-            max_output_len=256,
-            max_batch_size=1,
-            max_multimodal_len=1024,
-            dtype="bfloat16",
-        )
-
-        mock_exporter.export.assert_called_once()
-
-    @pytest.mark.run_only_on('GPU')
-    @patch('nemo.export.multimodal.build.MLLaMAForCausalLM')
-    @patch('nemo.export.multimodal.build.build_trtllm')
-    def test_build_mllama_trtllm_engine(self, mock_build_trtllm, mock_mllama):
-        # Test basic functionality
-        mock_model = MagicMock()
-        mock_mllama.from_hugging_face.return_value = mock_model
-        mock_build_trtllm.return_value = MagicMock()
-
-        from nemo.export.multimodal.build import build_mllama_trtllm_engine
-
-        build_mllama_trtllm_engine(
-            model_dir=self.temp_dir,
-            hf_model_path="test_path",
-            tensor_parallelism_size=1,
-            max_input_len=256,
-            max_output_len=256,
-            max_batch_size=1,
-            max_multimodal_len=1024,
-            dtype="bfloat16",
-        )
-
-        mock_mllama.from_hugging_face.assert_called_once()
-        mock_build_trtllm.assert_called_once()
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/export/multimodal/test_converter.py b/tests/export/multimodal/test_converter.py
deleted file mode 100755
index eaabc832bb93..000000000000
--- a/tests/export/multimodal/test_converter.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import pytest
-import torch
-
-from nemo.export.multimodal.converter import split_gate_weight, split_kv_weight, split_qkv_weight
-
-
-class TestMultimodalConverter:
-    @pytest.fixture
-    def model_config(self):
-        # Create a simple test config
-        config = type(
-            'TestConfig',
-            (),
-            {'hidden_size': 128, 'num_attention_heads': 4, 'num_query_groups': 2, 'kv_channels': None},
-        )()
-        return config
-
-    def test_split_qkv_weight(self, model_config):
-        # Create a test QKV weight tensor
-        batch_size = model_config.num_attention_heads + 2 * model_config.num_query_groups
-        qkv_weight = torch.randn(
-            batch_size, model_config.hidden_size // model_config.num_attention_heads, model_config.hidden_size
-        )
-
-        result = split_qkv_weight(qkv_weight, model_config)
-
-        assert len(result) == 3
-        assert result[0][0] == 'q_proj'
-        assert result[1][0] == 'k_proj'
-        assert result[2][0] == 'v_proj'
-
-        # Check shapes
-        assert result[0][1].shape == (
-            model_config.num_attention_heads,
-            model_config.hidden_size // model_config.num_attention_heads,
-            model_config.hidden_size,
-        )
-        assert result[1][1].shape == (
-            model_config.num_query_groups,
-            model_config.hidden_size // model_config.num_attention_heads,
-            model_config.hidden_size,
-        )
-        assert result[2][1].shape == (
-            model_config.num_query_groups,
-            model_config.hidden_size // model_config.num_attention_heads,
-            model_config.hidden_size,
-        )
-
-    def test_split_kv_weight(self, model_config):
-        # Create a test KV weight tensor
-        batch_size = 2 * model_config.num_query_groups
-        kv_weight = torch.randn(
-            batch_size, model_config.hidden_size // model_config.num_attention_heads, model_config.hidden_size
-        )
-
-        result = split_kv_weight(kv_weight, model_config)
-
-        assert len(result) == 2
-        assert result[0][0] == 'k_proj'
-        assert result[1][0] == 'v_proj'
-
-        # Check shapes
-        assert result[0][1].shape == (
-            model_config.num_query_groups,
-            model_config.hidden_size // model_config.num_attention_heads,
-            model_config.hidden_size,
-        )
-        assert result[1][1].shape == (
-            model_config.num_query_groups,
-            model_config.hidden_size // model_config.num_attention_heads,
-            model_config.hidden_size,
-        )
-
-    def test_split_gate_weight(self):
-        # Create a test gate weight tensor
-        gate_weight = torch.randn(200, 100)  # Example dimensions
-
-        result = split_gate_weight(gate_weight)
-
-        assert len(result) == 2
-        assert result[0][0] == 'gate_proj'
-        assert result[1][0] == 'up_proj'
-
-        # Check shapes
-        assert result[0][1].shape == (100, 100)
-        assert result[1][1].shape == (100, 100)
diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py
deleted file mode 100644
index cd97b8f3d1df..000000000000
--- a/tests/export/nemo_export.py
+++ /dev/null
@@ -1,910 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import json
-import logging
-import shutil
-import time
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Dict, List, Optional, Tuple
-
-import torch
-
-LOGGER = logging.getLogger("NeMo")
-
-triton_supported = True
-try:
-    from nemo.deploy import DeployPyTriton
-    from nemo.deploy.nlp import NemoQueryLLM
-except Exception as e:
-    LOGGER.warning(f"Cannot import Triton, deployment will not be available. {type(e).__name__}: {e}")
-    triton_supported = False
-
-in_framework_supported = True
-try:
-    from megatron.core.inference.common_inference_params import CommonInferenceParams
-
-    from nemo.deploy.nlp import NemoQueryLLMPyTorch
-    from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeploy, MegatronLLMDeployableNemo2
-except Exception as e:
-    LOGGER.warning(
-        "Cannot import MegatronLLMDeploy* classes, or NemoQueryLLMPyTorch, or CommonInferenceParams, "
-        f"in-framework inference will not be available. Reason: {type(e).__name__}: {e}"
-    )
-    in_framework_supported = False
-
-trt_llm_supported = True
-try:
-    from nemo.export.tensorrt_llm import TensorRTLLM
-except Exception as e:
-    LOGGER.warning(f"Cannot import the TensorRTLLM exporter, it will not be available. {type(e).__name__}: {e}")
-    trt_llm_supported = False
-
-vllm_supported = True
-try:
-    from nemo.export.vllm_exporter import vLLMExporter
-except Exception as e:
-    LOGGER.warning(f"Cannot import the vLLM exporter, it will not be available. {type(e).__name__}: {e}")
-    vllm_supported = False
-
-
-class UsageError(Exception):
-    pass
-
-
-@dataclass
-class FunctionalResult:
-    regular_pass: Optional[bool] = None
-    deployed_pass: Optional[bool] = None
-
-
-@dataclass
-class AccuracyResult:
-    accuracy: float
-    accuracy_relaxed: float
-    deployed_accuracy: float
-    deployed_accuracy_relaxed: float
-    evaluation_time: float
-
-
-def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path):
-    # lambada dataset based accuracy test, which includes more than 5000 sentences.
-    # Use generated last token with original text's last token for accuracy comparison.
-    # If the generated last token start with the original token, trtllm_correct make an increment.
-    # It generates a CSV file for text comparison detail.
-
-    correct_answers = 0
-    correct_answers_deployed = 0
-    correct_answers_relaxed = 0
-    correct_answers_deployed_relaxed = 0
-    all_expected_outputs = []
-    all_actual_outputs = []
-
-    with open(test_data_path, 'r') as file:
-        records = json.load(file)
-
-        eval_start = time.monotonic()
-        for record in records:
-            prompt = record["text_before_last_word"]
-            expected_output = record["last_word"].strip().lower()
-            all_expected_outputs.append(expected_output)
-            if model is not None:
-                if in_framework_supported and isinstance(model, MegatronLLMDeployableNemo2):
-                    model_output = model.generate(
-                        prompts=[prompt],
-                        inference_params=CommonInferenceParams(
-                            temperature=0.1,
-                            top_k=1,
-                            top_p=0.0,
-                            num_tokens_to_generate=1,
-                            return_log_probs=False,
-                        ),
-                    )
-                    model_output = model_output[0].generated_text  # Index [0] as a single prompt is used
-                else:
-                    model_output = model.forward(
-                        input_texts=[prompt],
-                        max_output_len=1,
-                        top_k=1,
-                        top_p=0.0,
-                        temperature=0.1,
-                        task_ids=task_ids,
-                        lora_uids=lora_uids,
-                    )
-                    model_output = model_output[0][0].strip().lower()
-                all_actual_outputs.append(model_output)
-
-                if expected_output == model_output:
-                    correct_answers += 1
-
-                if (
-                    expected_output == model_output
-                    or model_output.startswith(expected_output)
-                    or expected_output.startswith(model_output)
-                ):
-                    if len(model_output) == 1 and len(expected_output) > 1:
-                        continue
-                    correct_answers_relaxed += 1
-
-            if nq is not None:
-                if in_framework_supported and isinstance(nq, NemoQueryLLMPyTorch):
-                    deployed_output = nq.query_llm(
-                        prompts=[prompt],
-                        max_length=1,
-                        top_k=1,
-                        top_p=0.0,
-                        temperature=0.1,
-                    )
-                    # Accessing [0][0] of "text" is to get a raw string entry from a NumPy array
-                    # for a single prompt (batch size = 1) and stripping prefix if needed:
-                    deployed_output = deployed_output["choices"][0]["text"][0][0][0:].strip().lower()
-                else:
-                    deployed_output = nq.query_llm(
-                        prompts=[prompt],
-                        max_output_len=1,
-                        top_k=1,
-                        top_p=0.0,
-                        temperature=0.1,
-                        task_id=task_ids,
-                    )
-                    deployed_output = deployed_output[0][0].strip().lower()
-
-                if expected_output == deployed_output:
-                    correct_answers_deployed += 1
-
-                if (
-                    expected_output == deployed_output
-                    or deployed_output.startswith(expected_output)
-                    or expected_output.startswith(deployed_output)
-                ):
-                    if len(deployed_output) == 1 and len(expected_output) > 1:
-                        continue
-                    correct_answers_deployed_relaxed += 1
-        eval_end = time.monotonic()
-
-    return AccuracyResult(
-        accuracy=correct_answers / len(all_expected_outputs),
-        accuracy_relaxed=correct_answers_relaxed / len(all_expected_outputs),
-        deployed_accuracy=correct_answers_deployed / len(all_expected_outputs),
-        deployed_accuracy_relaxed=correct_answers_deployed_relaxed / len(all_expected_outputs),
-        evaluation_time=eval_end - eval_start,
-    )
-
-
-# Tests if the model outputs contain the expected keywords.
-def check_model_outputs(streaming: bool, model_outputs, expected_outputs: List[str]) -> bool:
-
-    # In streaming mode, we get a list of lists of lists, and we only care about the last item in that list
-    if streaming:
-        if len(model_outputs) == 0:
-            return False
-        model_outputs = model_outputs[-1]
-
-    # See if we have the right number of final answers.
-    if len(model_outputs) != len(expected_outputs):
-        return False
-
-    # Check the presence of keywords in the final answers.
-    for i in range(len(model_outputs)):
-        if expected_outputs[i] not in model_outputs[i][0]:
-            return False
-
-    return True
-
-
-def run_inference(
-    model_name,
-    model_type,
-    prompts,
-    expected_outputs,
-    checkpoint_path,
-    model_dir,
-    use_vllm,
-    use_huggingface,
-    max_batch_size=8,
-    use_embedding_sharing=False,
-    max_input_len=128,
-    max_output_len=128,
-    max_num_tokens=None,
-    use_parallel_embedding=False,
-    ptuning=False,
-    p_tuning_checkpoint=None,
-    lora=False,
-    lora_checkpoint=None,
-    tp_size=1,
-    pp_size=1,
-    top_k=1,
-    top_p=0.0,
-    temperature=1.0,
-    run_accuracy=False,
-    debug=True,
-    streaming=False,
-    stop_words_list=None,
-    test_cpp_runtime=False,
-    test_deployment=False,
-    test_data_path=None,
-    save_engine=False,
-    fp8_quantized=False,
-    fp8_kvcache=False,
-    trt_llm_export_kwargs=None,
-    vllm_export_kwargs=None,
-) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]:
-    if trt_llm_export_kwargs is None:
-        trt_llm_export_kwargs = {}
-
-    if vllm_export_kwargs is None:
-        vllm_export_kwargs = {}
-
-    if Path(checkpoint_path).exists():
-        if tp_size > torch.cuda.device_count():
-            print(
-                "Path: {0} and model: {1} with {2} tps won't be tested since available # of gpus = {3}".format(
-                    checkpoint_path, model_name, tp_size, torch.cuda.device_count()
-                )
-            )
-            return (None, None)
-
-        Path(model_dir).mkdir(parents=True, exist_ok=True)
-
-        if debug:
-            print("")
-            print("")
-            print(
-                "################################################## NEW TEST ##################################################"
-            )
-            print("")
-
-            print("Path: {0} and model: {1} with {2} tps will be tested".format(checkpoint_path, model_name, tp_size))
-
-        prompt_embeddings_checkpoint_path = None
-        task_ids = None
-        max_prompt_embedding_table_size = 0
-
-        if ptuning:
-            if Path(p_tuning_checkpoint).exists():
-                prompt_embeddings_checkpoint_path = p_tuning_checkpoint
-                max_prompt_embedding_table_size = 8192
-                task_ids = ["0"]
-                if debug:
-                    print("---- PTuning enabled.")
-            else:
-                print("---- PTuning could not be enabled and skipping the test.")
-                return (None, None)
-
-        lora_ckpt_list = None
-        lora_uids = None
-        use_lora_plugin = None
-        lora_target_modules = None
-
-        if lora:
-            if Path(lora_checkpoint).exists():
-                lora_ckpt_list = [lora_checkpoint]
-                lora_uids = ["0", "-1", "0"]
-                use_lora_plugin = "bfloat16"
-                lora_target_modules = ["attn_qkv"]
-                if debug:
-                    print("---- LoRA enabled.")
-            else:
-                print("---- LoRA could not be enabled and skipping the test.")
-                return (None, None)
-
-        if use_vllm:
-            exporter = vLLMExporter()
-
-            exporter.export(
-                nemo_checkpoint=checkpoint_path,
-                model_dir=model_dir,
-                model_type=model_type,
-                tensor_parallel_size=tp_size,
-                pipeline_parallel_size=pp_size,
-                max_model_len=max_input_len + max_output_len,
-                gpu_memory_utilization=args.gpu_memory_utilization,
-                **vllm_export_kwargs,
-            )
-        else:
-            exporter = TensorRTLLM(model_dir, lora_ckpt_list, load_model=False)
-            if use_huggingface:
-                exporter.export_hf_model(
-                    hf_model_path=checkpoint_path,
-                    max_batch_size=max_batch_size,
-                    tensor_parallelism_size=tp_size,
-                    max_input_len=max_input_len,
-                    max_num_tokens=max_num_tokens,
-                    model_type=model_type,
-                )
-            else:
-                exporter.export(
-                    nemo_checkpoint_path=checkpoint_path,
-                    model_type=model_type,
-                    tensor_parallelism_size=tp_size,
-                    pipeline_parallelism_size=pp_size,
-                    max_input_len=max_input_len,
-                    max_seq_len=(max_input_len + max_output_len),
-                    max_batch_size=max_batch_size,
-                    use_parallel_embedding=use_parallel_embedding,
-                    max_prompt_embedding_table_size=max_prompt_embedding_table_size,
-                    use_lora_plugin=use_lora_plugin,
-                    lora_target_modules=lora_target_modules,
-                    max_num_tokens=max_num_tokens,
-                    use_embedding_sharing=use_embedding_sharing,
-                    fp8_quantized=fp8_quantized,
-                    fp8_kvcache=fp8_kvcache,
-                    **trt_llm_export_kwargs,
-                )
-
-        if ptuning:
-            exporter.add_prompt_table(
-                task_name="0",
-                prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path,
-            )
-
-        output = exporter.forward(
-            input_texts=prompts,
-            max_output_len=max_output_len,
-            top_k=top_k,
-            top_p=top_p,
-            temperature=temperature,
-            task_ids=task_ids,
-            lora_uids=lora_uids,
-            streaming=streaming,
-            stop_words_list=stop_words_list,
-        )
-
-        # Unwrap the generator if needed
-        output = list(output)
-
-        functional_result = FunctionalResult()
-
-        # Check non-deployed funcitonal correctness
-        if args.functional_test:
-            functional_result.regular_pass = True
-            if not check_model_outputs(streaming, output, expected_outputs):
-                LOGGER.warning("Model outputs don't match the expected result.")
-                functional_result.regular_pass = False
-
-        output_cpp = ""
-        if test_cpp_runtime and not use_lora_plugin and not ptuning and not use_vllm:
-            # This may cause OOM for large models as it creates 2nd instance of a model
-            exporter_cpp = TensorRTLLM(
-                model_dir,
-                load_model=True,
-                use_python_runtime=False,
-            )
-
-            output_cpp = exporter_cpp.forward(
-                input_texts=prompts,
-                max_output_len=max_output_len,
-                top_k=top_k,
-                top_p=top_p,
-                temperature=temperature,
-            )
-
-        nq = None
-        nm = None
-        output_deployed = ""
-        if test_deployment:
-            nm = DeployPyTriton(
-                model=exporter,
-                triton_model_name=model_name,
-                http_port=8000,
-            )
-            nm.deploy()
-            nm.run()
-            nq = NemoQueryLLM(url="localhost:8000", model_name=model_name)
-
-            output_deployed = nq.query_llm(
-                prompts=prompts,
-                max_output_len=max_output_len,
-                top_k=1,
-                top_p=0.0,
-                temperature=1.0,
-                lora_uids=lora_uids,
-            )
-
-            # Unwrap the generator if needed
-            output_deployed = list(output_deployed)
-
-            # Check deployed funcitonal correctness
-            if args.functional_test:
-                functional_result.deployed_pass = True
-                if not check_model_outputs(streaming, output_deployed, expected_outputs):
-                    LOGGER.warning("Deployed model outputs don't match the expected result.")
-                    functional_result.deployed_pass = False
-
-        if debug or functional_result.regular_pass == False or functional_result.deployed_pass == False:
-            print("")
-            print("--- Prompt: ", prompts)
-            print("")
-            print("--- Expected keywords: ", expected_outputs)
-            print("")
-            print("--- Output: ", output)
-            print("")
-            print("--- Output deployed: ", output_deployed)
-            print("")
-            print("")
-            print("--- Output with C++ runtime: ", output_cpp)
-            print("")
-
-        accuracy_result = None
-        if run_accuracy:
-            print("Start model accuracy testing ...")
-            accuracy_result = get_accuracy_with_lambada(exporter, nq, task_ids, lora_uids, test_data_path)
-
-        if test_deployment:
-            nm.stop()
-
-        if not save_engine and model_dir:
-            shutil.rmtree(model_dir)
-
-        return (functional_result, accuracy_result)
-    else:
-        raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path))
-
-
-def run_in_framework_inference(
-    model_name,
-    prompts,
-    checkpoint_path,
-    num_gpus=1,
-    max_output_len=128,
-    top_k=1,
-    top_p=0.0,
-    temperature=1.0,
-    run_accuracy=False,
-    debug=True,
-    test_data_path=None,
-    enable_flash_decode=True,
-    legacy_ckpt=False,
-) -> Tuple[Optional[FunctionalResult], Optional[AccuracyResult]]:
-    if Path(checkpoint_path).exists():
-        if debug:
-            print("")
-            print("")
-            print(
-                "################################################## NEW TEST ##################################################"
-            )
-            print("")
-
-            print("Path: {0} and model: {1} will be tested".format(checkpoint_path, model_name))
-
-        deployed_model = MegatronLLMDeploy.get_deployable(
-            checkpoint_path, num_gpus, enable_flash_decode=enable_flash_decode, legacy_ckpt=legacy_ckpt
-        )
-
-        nm = DeployPyTriton(
-            model=deployed_model,
-            triton_model_name=model_name,
-            http_port=8000,
-        )
-        nm.deploy()
-        nm.run()
-        nq = NemoQueryLLMPyTorch(url="localhost:8000", model_name=model_name)
-
-        output_deployed = nq.query_llm(
-            prompts=prompts, top_k=top_k, top_p=top_p, temperature=temperature, max_length=max_output_len
-        )
-        output_deployed = output_deployed["choices"][0]["text"]
-
-        # Unwrap the generator if needed
-        output_deployed = list(output_deployed)
-        print("\n --------- Output: ", output_deployed)
-
-        accuracy_result = None
-        if run_accuracy:
-            print("Start model accuracy testing ...")
-            # This script is not written with torch.distributed support in mind, so running non-deployed in-framework models on multiple devices will not work
-            accuracy_result = get_accuracy_with_lambada(deployed_model, nq, None, None, test_data_path)
-
-        nm.stop()
-
-        return (None, accuracy_result)
-    else:
-        raise Exception("Checkpoint {0} could not be found.".format(checkpoint_path))
-
-
-def get_args():
-    parser = argparse.ArgumentParser(
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-        description=f"Deploy nemo models to Triton and benchmark the models",
-    )
-    parser.add_argument(
-        "--model_name",
-        type=str,
-        required=True,
-    )
-    parser.add_argument(
-        "--model_type",
-        type=str,
-        required=False,
-    )
-    parser.add_argument(
-        "--min_tps",
-        type=int,
-        default=1,
-        required=True,
-    )
-    parser.add_argument(
-        "--max_tps",
-        type=int,
-    )
-    parser.add_argument(
-        "--pps",
-        type=int,
-        default=1,
-    )
-    parser.add_argument(
-        "--checkpoint_dir",
-        type=str,
-        default="/tmp/nemo_checkpoint/",
-        required=False,
-    )
-    parser.add_argument(
-        "--model_dir",
-        type=str,
-    )
-    parser.add_argument(
-        "--max_batch_size",
-        type=int,
-        default=8,
-    )
-    parser.add_argument(
-        "--max_input_len",
-        type=int,
-        default=256,
-    )
-    parser.add_argument(
-        "--max_output_len",
-        type=int,
-        default=128,
-    )
-    parser.add_argument(
-        "--max_num_tokens",
-        type=int,
-    )
-    parser.add_argument(
-        "--use_parallel_embedding",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--p_tuning_checkpoint",
-        type=str,
-    )
-    parser.add_argument(
-        "--ptuning",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--lora_checkpoint",
-        type=str,
-    )
-    parser.add_argument(
-        "--lora",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--top_k",
-        type=int,
-        default=1,
-    )
-    parser.add_argument(
-        "--top_p",
-        type=float,
-        default=0.0,
-    )
-    parser.add_argument(
-        "--temperature",
-        type=float,
-        default=1.0,
-    )
-    parser.add_argument(
-        "--run_accuracy",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--accuracy_threshold",
-        type=float,
-        default=0.5,
-    )
-    parser.add_argument("--streaming", default=False, action="store_true")
-    parser.add_argument(
-        "--test_cpp_runtime",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--test_deployment",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--functional_test",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--debug",
-        default=False,
-        action='store_true',
-    )
-    parser.add_argument(
-        "--test_data_path",
-        type=str,
-        default=None,
-    )
-    parser.add_argument(
-        "--save_engine",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--use_vllm",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--use_huggingface",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--enable_flash_decode",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--in_framework",
-        type=str,
-        default="False",
-    )
-    parser.add_argument(
-        "--legacy_ckpt",
-        type=str,
-        default="False",
-        help="Load checkpoint saved with TE < 1.14 (only for in-framework inference)",
-    )
-    parser.add_argument(
-        "-gmu",
-        '--gpu_memory_utilization',
-        default=0.95,  # 0.95 is needed to run Mixtral-8x7B on 2x48GB GPUs
-        type=float,
-        help="GPU memory utilization percentage for vLLM.",
-    )
-    parser.add_argument(
-        "-fp8",
-        "--export_fp8_quantized",
-        default="auto",
-        type=str,
-        help="Enables exporting to a FP8-quantized TRT LLM checkpoint",
-    )
-    parser.add_argument(
-        "-kv_fp8",
-        "--use_fp8_kv_cache",
-        default="auto",
-        type=str,
-        help="Enables exporting with FP8-quantizatized KV-cache",
-    )
-    parser.add_argument(
-        "--trt_llm_export_kwargs",
-        default={},
-        type=json.loads,
-        help="Extra keyword arguments passed to TensorRTLLM.export",
-    )
-    parser.add_argument(
-        "--vllm_export_kwargs",
-        default={},
-        type=json.loads,
-        help="Extra keyword arguments passed to vLLMExporter.export",
-    )
-
-    args = parser.parse_args()
-
-    def str_to_bool(name: str, s: str, optional: bool = False) -> Optional[bool]:
-        s = s.lower()
-        true_strings = ["true", "1"]
-        false_strings = ["false", "0"]
-        if s == '':
-            return False
-        if s in true_strings:
-            return True
-        if s in false_strings:
-            return False
-        if optional and s == 'auto':
-            return None
-        raise UsageError(f"Invalid boolean value for argument --{name}: '{s}'")
-
-    args.model_type = None if str(args.model_type).lower() == "none" else args.model_type
-    args.test_cpp_runtime = str_to_bool("test_cpp_runtime", args.test_cpp_runtime)
-    args.test_deployment = str_to_bool("test_deployment", args.test_deployment)
-    args.functional_test = str_to_bool("functional_test", args.functional_test)
-    args.save_engine = str_to_bool("save_engine", args.save_engine)
-    args.run_accuracy = str_to_bool("run_accuracy", args.run_accuracy)
-    args.use_vllm = str_to_bool("use_vllm", args.use_vllm)
-    args.use_huggingface = str_to_bool("use_huggingface", args.use_huggingface)
-    args.enable_flash_decode = str_to_bool("enable_flash_decode", args.enable_flash_decode)
-    args.lora = str_to_bool("lora", args.lora)
-    args.ptuning = str_to_bool("ptuning", args.ptuning)
-    args.use_parallel_embedding = str_to_bool("use_parallel_embedding", args.use_parallel_embedding)
-    args.in_framework = str_to_bool("in_framework", args.in_framework)
-    args.export_fp8_quantized = str_to_bool("export_fp8_quantized", args.export_fp8_quantized, optional=True)
-    args.use_fp8_kv_cache = str_to_bool("use_fp8_kv_cache", args.use_fp8_kv_cache, optional=True)
-    args.legacy_ckpt = str_to_bool("legacy_ckpt", args.legacy_ckpt)
-
-    return args
-
-
-def run_inference_tests(args):
-    if not args.use_vllm and not args.in_framework and not trt_llm_supported:
-        raise UsageError("TensorRT-LLM engine is not supported in this environment.")
-
-    if args.use_vllm and not vllm_supported:
-        raise UsageError("vLLM engine is not supported in this environment.")
-
-    if args.in_framework and not in_framework_supported:
-        raise UsageError("In-framework inference is not supported in this environment.")
-
-    if args.use_vllm and (args.ptuning or args.lora):
-        raise UsageError("The vLLM integration currently does not support P-tuning or LoRA.")
-
-    if args.test_deployment and not triton_supported:
-        raise UsageError("Deployment tests are not available because Triton is not supported in this environment.")
-
-    if args.run_accuracy and args.test_data_path is None:
-        raise UsageError("Accuracy testing requires the --test_data_path argument.")
-
-    if args.max_tps is None:
-        args.max_tps = args.min_tps
-
-    if args.use_vllm and args.min_tps != args.max_tps:
-        raise UsageError(
-            "vLLM doesn't support changing tensor parallel group size without relaunching the process. "
-            "Use the same value for --min_tps and --max_tps."
-        )
-
-    if args.debug:
-        LOGGER.setLevel(logging.DEBUG)
-
-    result_dic: Dict[int, Tuple[FunctionalResult, Optional[AccuracyResult]]] = {}
-
-    if not args.in_framework and args.model_dir is None:
-        raise Exception("When using custom checkpoints, --model_dir is required.")
-
-    prompts = ["The capital of France is", "Largest animal in the sea is"]
-    expected_outputs = ["Paris", "blue whale"]
-    tps = args.min_tps
-
-    while tps <= args.max_tps:
-        if args.in_framework:
-            result_dic[tps] = run_in_framework_inference(
-                model_name=args.model_name,
-                prompts=prompts,
-                checkpoint_path=args.checkpoint_dir,
-                num_gpus=tps,
-                max_output_len=args.max_output_len,
-                top_k=args.top_k,
-                top_p=args.top_p,
-                temperature=args.temperature,
-                run_accuracy=args.run_accuracy,
-                debug=args.debug,
-                test_data_path=args.test_data_path,
-                enable_flash_decode=args.enable_flash_decode,
-                legacy_ckpt=args.legacy_ckpt,
-            )
-        else:
-            result_dic[tps] = run_inference(
-                model_name=args.model_name,
-                model_type=args.model_type,
-                prompts=prompts,
-                expected_outputs=expected_outputs,
-                checkpoint_path=args.checkpoint_dir,
-                model_dir=args.model_dir,
-                use_vllm=args.use_vllm,
-                use_huggingface=args.use_huggingface,
-                tp_size=tps,
-                pp_size=args.pps,
-                max_batch_size=args.max_batch_size,
-                max_input_len=args.max_input_len,
-                max_output_len=args.max_output_len,
-                max_num_tokens=args.max_num_tokens,
-                use_parallel_embedding=args.use_parallel_embedding,
-                ptuning=args.ptuning,
-                p_tuning_checkpoint=args.p_tuning_checkpoint,
-                lora=args.lora,
-                lora_checkpoint=args.lora_checkpoint,
-                top_k=args.top_k,
-                top_p=args.top_p,
-                temperature=args.temperature,
-                run_accuracy=args.run_accuracy,
-                debug=args.debug,
-                streaming=args.streaming,
-                test_deployment=args.test_deployment,
-                test_cpp_runtime=args.test_cpp_runtime,
-                test_data_path=args.test_data_path,
-                save_engine=args.save_engine,
-                fp8_quantized=args.export_fp8_quantized,
-                fp8_kvcache=args.use_fp8_kv_cache,
-                trt_llm_export_kwargs=args.trt_llm_export_kwargs,
-                vllm_export_kwargs=args.vllm_export_kwargs,
-            )
-
-        tps = tps * 2
-
-    functional_test_result = "PASS"
-    accuracy_test_result = "PASS"
-    print_separator = False
-    print("============= Test Summary ============")
-    # in-framework tests will only return deployed model accuracy results for tps > 1
-    deployed_tests_only = args.in_framework and args.max_tps > 1
-    for num_tps, results in result_dic.items():
-        functional_result, accuracy_result = results
-
-        if print_separator:
-            print("---------------------------------------")
-        print_separator = True
-
-        def optional_bool_to_pass_fail(b: Optional[bool]):
-            if b is None:
-                return "N/A"
-            return "PASS" if b else "FAIL"
-
-        print(f"Tensor Parallelism:              {num_tps}")
-
-        if args.functional_test and functional_result is not None:
-            print(f"Functional Test:                 {optional_bool_to_pass_fail(functional_result.regular_pass)}")
-            print(f"Deployed Functional Test:        {optional_bool_to_pass_fail(functional_result.deployed_pass)}")
-
-            if functional_result.regular_pass == False:
-                functional_test_result = "FAIL"
-            if functional_result.deployed_pass == False:
-                functional_test_result = "FAIL"
-
-        if args.run_accuracy and accuracy_result is not None:
-            print(f"Model Accuracy:                  {accuracy_result.accuracy:.4f}")
-            print(f"Relaxed Model Accuracy:          {accuracy_result.accuracy_relaxed:.4f}")
-            print(f"Deployed Model Accuracy:         {accuracy_result.deployed_accuracy:.4f}")
-            print(f"Deployed Relaxed Model Accuracy: {accuracy_result.deployed_accuracy_relaxed:.4f}")
-            print(f"Evaluation Time [s]:             {accuracy_result.evaluation_time:.2f}")
-            if (deployed_tests_only and accuracy_result.deployed_accuracy_relaxed < args.accuracy_threshold) or (
-                not deployed_tests_only and accuracy_result.accuracy_relaxed < args.accuracy_threshold
-            ):
-                accuracy_test_result = "FAIL"
-
-    print("=======================================")
-    if args.functional_test:
-        print(f"Functional: {functional_test_result}")
-    if args.run_accuracy:
-        print(f"Acccuracy: {accuracy_test_result}")
-
-    if functional_test_result == "FAIL":
-        raise Exception("Functional test failed")
-
-    if accuracy_test_result == "FAIL":
-        raise Exception(f"Model accuracy is below {args.accuracy_threshold}")
-
-
-if __name__ == '__main__':
-    try:
-        args = get_args()
-        run_inference_tests(args)
-    except UsageError as e:
-        LOGGER.error(f"{e}")
-        raise e
-    except argparse.ArgumentError as e:
-        LOGGER.error(f"{e}")
-        raise e
diff --git a/tests/export/test_export_onnx.py b/tests/export/test_export_onnx.py
deleted file mode 100644
index 95a6c8f95062..000000000000
--- a/tests/export/test_export_onnx.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-
-import tensorrt as trt
-
-from nemo.collections.llm.gpt.model.hf_llama_embedding import get_llama_bidirectional_hf_model
-from nemo.export.onnx_llm_exporter import OnnxLLMExporter
-from nemo.utils import logging
-
-
-def get_args():
-    parser = argparse.ArgumentParser(description='Test ONNX and TensorRT export for LLM embedding models.')
-    parser.add_argument('--hf_model_path', type=str, required=True, help="Hugging Face model id or path.")
-    parser.add_argument('--pooling_strategy', type=str, default="avg", help="Pooling strategy for the model.")
-    parser.add_argument("--normalize", default=False, action="store_true", help="Normalize the embeddings or not.")
-    parser.add_argument('--onnx_export_path', type=str, default="/tmp/onnx_model/", help="Path to store ONNX model.")
-    parser.add_argument('--onnx_opset', type=int, default=17, help="ONNX version to use for export.")
-    parser.add_argument('--trt_model_path', type=str, default="/tmp/trt_model/", help="Path to store TensorRT model.")
-    parser.add_argument(
-        "--trt_version_compatible",
-        default=False,
-        action="store_true",
-        help="Whether to generate version compatible TensorRT models.",
-    )
-
-    return parser.parse_args()
-
-
-def export_onnx_trt(args):
-    # Base Llama model needs to be adapted to turn it into an embedding model.
-    model, tokenizer = get_llama_bidirectional_hf_model(
-        model_name_or_path=args.hf_model_path,
-        normalize=args.normalize,
-        pooling_mode=args.pooling_strategy,
-        trust_remote_code=True,
-    )
-
-    input_names = ["input_ids", "attention_mask", "dimensions"]  # ONNX specific arguments, input names in this case.
-    dynamic_axes_input = {
-        "input_ids": {0: "batch_size", 1: "seq_length"},
-        "attention_mask": {0: "batch_size", 1: "seq_length"},
-        "dimensions": {0: "batch_size"},
-    }
-
-    output_names = ["embeddings"]  # ONNX specific arguments, output names in this case.
-    dynamic_axes_output = {"embeddings": {0: "batch_size", 1: "embedding_dim"}}
-
-    # Initialize ONNX exporter.
-    onnx_exporter = OnnxLLMExporter(
-        onnx_model_dir=args.onnx_export_path,
-        model=model,
-        tokenizer=tokenizer,
-    )
-
-    # Export ONNX model.
-    onnx_exporter.export(
-        input_names=input_names,
-        output_names=output_names,
-        opset=args.onnx_opset,
-        dynamic_axes_input=dynamic_axes_input,
-        dynamic_axes_output=dynamic_axes_output,
-        export_dtype="fp32",
-    )
-
-    # Input profiles for TensorRT.
-    input_profiles = [
-        {
-            "input_ids": [[1, 3], [16, 128], [64, 256]],
-            "attention_mask": [[1, 3], [16, 128], [64, 256]],
-            "dimensions": [[1], [16], [64]],
-        }
-    ]
-
-    # TensorRT builder flags.
-    trt_builder_flags = None
-    if args.trt_version_compatible:
-        trt_builder_flags = [trt.BuilderFlag.VERSION_COMPATIBLE]
-
-    # Model specific layers to override the precision to fp32.
-    override_layers_to_fp32 = [
-        "/model/norm/",
-        "/pooling_module",
-        "/ReduceL2",
-        "/Div",
-    ]
-    # Model specific operation wheter to override layernorm precision or not.
-    override_layernorm_precision_to_fp32 = True
-    profiling_verbosity = "layer_names_only"
-
-    # Export ONNX to TensorRT.
-    onnx_exporter.export_onnx_to_trt(
-        trt_model_dir=args.trt_model_path,
-        profiles=input_profiles,
-        override_layernorm_precision_to_fp32=override_layernorm_precision_to_fp32,
-        override_layers_to_fp32=override_layers_to_fp32,
-        profiling_verbosity=profiling_verbosity,
-        trt_builder_flags=trt_builder_flags,
-    )
-
-    assert os.path.exists(args.trt_model_path)
-    assert os.path.exists(args.onnx_export_path)
-
-    prompt = ["hello", "world"]
-
-    prompt = onnx_exporter.get_tokenizer(prompt)
-    prompt["dimensions"] = [[2]]
-
-    output = onnx_exporter.forward(prompt)
-    if output is None:
-        logging.warning(f"Output is None because ONNX runtime is not installed.")
-
-
-if __name__ == '__main__':
-    export_onnx_trt(get_args())
diff --git a/tests/export/test_mock_import.py b/tests/export/test_mock_import.py
deleted file mode 100644
index dcc227c97251..000000000000
--- a/tests/export/test_mock_import.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from nemo.export.utils._mock_import import _mock_import
-
-
-def test_mock_import_existing_module():
-    """Test mocking an existing module."""
-    import math as math_org
-
-    with _mock_import("math"):
-        import math
-
-        assert math is math_org
-
-
-def test_mock_import_non_existing_module():
-    """Test mocking a non-existing module."""
-    with _mock_import("non.existing.module"):
-        import non.existing.module
-
-    with pytest.raises(ModuleNotFoundError):
-        import non.existing.module
diff --git a/tests/export/test_model_loading.py b/tests/export/test_model_loading.py
deleted file mode 100644
index 48f8bcace406..000000000000
--- a/tests/export/test_model_loading.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import shutil
-from pathlib import Path
-from unittest.mock import MagicMock, patch
-import pytest
-
-from nemo.collections import llm
-
-HF_PATH = "/home/TestData/nlp/megatron_llama/llama-ci-hf"
-OUTPUT_PATH = '/tmp/imported_nemo2'
-
-dummy_module = MagicMock()
-dummy_module.torch_to_numpy = lambda torch_tensor: torch_tensor.detach().cpu().numpy()
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_model_loading() -> None:
-    """
-    Test if model loading works for tensorrt_llm export.
-    """
-
-    model = llm.LlamaModel(config=llm.Llama2Config7B)
-    nemo_path = llm.import_ckpt(model, 'hf://' + HF_PATH, output_path=Path(OUTPUT_PATH))
-
-    assert nemo_path.exists()
-    assert (nemo_path / 'weights').exists()
-    assert (nemo_path / 'context').exists()
-
-    export_path = Path('/tmp/trtllm_exported_model')
-    export_path.mkdir(parents=True, exist_ok=True)
-    export_path_mcore = export_path / 'mcore_export'
-    export_path_local = export_path / 'local_export'
-
-    with patch.dict(
-        'sys.modules',
-        {
-            'tensorrt_llm': dummy_module,
-            'tensorrt_llm._utils': dummy_module,
-        },
-    ):
-        from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_nemo_model
-
-        load_nemo_model(nemo_path, export_path_local, False)
-        load_nemo_model(nemo_path, export_path_mcore, True)
-
-    shutil.rmtree(OUTPUT_PATH, ignore_errors=True)
diff --git a/tests/export/test_onnx_llm_exporter.py b/tests/export/test_onnx_llm_exporter.py
deleted file mode 100644
index 6cc0e9d2292f..000000000000
--- a/tests/export/test_onnx_llm_exporter.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from unittest.mock import MagicMock
-
-import pytest
-import torch
-
-from nemo.export.onnx_llm_exporter import OnnxLLMExporter
-
-
-class DummyModel(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.linear = torch.nn.Linear(10, 5)
-
-    def forward(self, inputs):
-        return self.linear(inputs['input_ids'])
-
-
-class TestOnnxLLMExporter:
-    @pytest.fixture
-    def temp_dir(self, tmp_path):
-        return str(tmp_path / "onnx_model")
-
-    @pytest.fixture
-    def dummy_tokenizer(self):
-        tokenizer = MagicMock()
-        tokenizer.save_pretrained = MagicMock()
-        return tokenizer
-
-    @pytest.fixture
-    def dummy_model(self):
-        return DummyModel()
-
-    def test_init_with_model_and_tokenizer(self, temp_dir, dummy_model, dummy_tokenizer):
-        exporter = OnnxLLMExporter(
-            onnx_model_dir=temp_dir, model=dummy_model, tokenizer=dummy_tokenizer, load_runtime=False
-        )
-        assert exporter.model == dummy_model
-        assert exporter.tokenizer == dummy_tokenizer
-        assert exporter.onnx_model_dir == temp_dir
-
-    def test_init_with_model_and_model_path_raises_error(self, temp_dir, dummy_model):
-        with pytest.raises(ValueError, match="A model was also passed but it will be overridden"):
-            OnnxLLMExporter(
-                onnx_model_dir=temp_dir, model=dummy_model, model_name_or_path="some/path", load_runtime=False
-            )
diff --git a/tests/export/test_quantizer.py b/tests/export/test_quantizer.py
deleted file mode 100644
index 0740e20e4b07..000000000000
--- a/tests/export/test_quantizer.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-from omegaconf import DictConfig
-
-from nemo.export.quantize.quantizer import QUANT_CFG_CHOICES, Quantizer
-
-
-@pytest.fixture
-def basic_quantization_config():
-    return DictConfig(
-        {'algorithm': 'int8', 'decoder_type': 'llama', 'awq_block_size': 128, 'sq_alpha': 0.5, 'enable_kv_cache': True}
-    )
-
-
-@pytest.fixture
-def basic_export_config():
-    return DictConfig(
-        {
-            'dtype': '16',
-            'decoder_type': 'llama',
-            'inference_tensor_parallel': 1,
-            'inference_pipeline_parallel': 1,
-            'save_path': '/tmp/model.qnemo',
-        }
-    )
-
-
-class TestQuantizer:
-    def test_init_valid_configs(self, basic_quantization_config, basic_export_config):
-        quantizer = Quantizer(basic_quantization_config, basic_export_config)
-        assert quantizer.quantization_config == basic_quantization_config
-        assert quantizer.export_config == basic_export_config
-        assert quantizer.quant_cfg == QUANT_CFG_CHOICES['int8']
-
-    def test_init_invalid_algorithm(self, basic_quantization_config, basic_export_config):
-        basic_quantization_config.algorithm = 'invalid_algo'
-        with pytest.raises(AssertionError):
-            Quantizer(basic_quantization_config, basic_export_config)
-
-    def test_init_invalid_dtype(self, basic_quantization_config, basic_export_config):
-        basic_export_config.dtype = '32'
-        with pytest.raises(AssertionError):
-            Quantizer(basic_quantization_config, basic_export_config)
-
-    def test_null_algorithm(self, basic_quantization_config, basic_export_config):
-        basic_quantization_config.algorithm = None
-        quantizer = Quantizer(basic_quantization_config, basic_export_config)
-        assert quantizer.quant_cfg is None
-
-    @patch('nemo.export.quantize.quantizer.dist')
-    def test_quantize_method(self, mock_dist, basic_quantization_config, basic_export_config):
-        mock_dist.get_rank.return_value = 0
-
-        # Create mock model and forward loop
-        mock_model = MagicMock()
-        mock_forward_loop = MagicMock()
-
-        quantizer = Quantizer(basic_quantization_config, basic_export_config)
-
-        with patch('modelopt.torch.quantization.quantize') as mock_quantize:
-            with patch('modelopt.torch.quantization.print_quant_summary'):
-                quantizer.quantize(mock_model, mock_forward_loop)
-
-                # Verify quantize was called with correct arguments
-                mock_quantize.assert_called_once_with(mock_model, QUANT_CFG_CHOICES['int8'], mock_forward_loop)
-
-    @patch('nemo.export.quantize.quantizer.dist')
-    def test_modify_model_config(self, mock_dist):
-        mock_config = DictConfig({'sequence_parallel': True})
-        modified_config = Quantizer.modify_model_config(mock_config)
-
-        assert modified_config.sequence_parallel is False
-        assert modified_config.name == 'modelopt'
-        assert modified_config.apply_rope_fusion is False
-
-    @patch('nemo.export.quantize.quantizer.dist')
-    @patch('nemo.export.quantize.quantizer.export_tensorrt_llm_checkpoint')
-    def test_export_method(self, mock_export, mock_dist, basic_quantization_config, basic_export_config):
-        mock_dist.get_rank.return_value = 0
-        mock_model = MagicMock()
-        mock_model.cfg.megatron_amp_O2 = False
-        mock_model.trainer.num_nodes = 1
-
-        quantizer = Quantizer(basic_quantization_config, basic_export_config)
-
-        with patch('nemo.export.quantize.quantizer.save_artifacts'):
-            quantizer.export(mock_model)
-
-            # Verify export was called with correct arguments
-            mock_export.assert_called_once()
-            call_args = mock_export.call_args[1]
-            assert call_args['decoder_type'] == 'llama'
-            assert call_args['inference_tensor_parallel'] == 1
-            assert call_args['inference_pipeline_parallel'] == 1
diff --git a/tests/export/test_sentencepiece_tokenizer.py b/tests/export/test_sentencepiece_tokenizer.py
deleted file mode 100644
index 343e0bba9460..000000000000
--- a/tests/export/test_sentencepiece_tokenizer.py
+++ /dev/null
@@ -1,192 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import tempfile
-import unittest
-from unittest.mock import MagicMock
-
-import numpy as np
-import sentencepiece
-import torch
-
-from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
-
-
-class TestSentencePieceTokenizer(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        # Create a temporary directory for test files
-        cls.test_dir = tempfile.mkdtemp()
-
-        # Create a simple sentencepiece model for testing
-        with open(os.path.join(cls.test_dir, "test.txt"), "w") as f:
-            f.write("Hello world\nThis is a test\n")
-
-        # Train a simple sentencepiece model
-        sentencepiece.SentencePieceTrainer.Train(
-            f'--input={os.path.join(cls.test_dir, "test.txt")} '
-            f'--model_prefix={os.path.join(cls.test_dir, "test_model")} '
-            '--vocab_size=55 --model_type=bpe'
-        )
-
-        cls.model_path = os.path.join(cls.test_dir, "test_model.model")
-
-    @classmethod
-    def tearDownClass(cls):
-        # Clean up temporary files
-        import shutil
-
-        shutil.rmtree(cls.test_dir)
-
-    def setUp(self):
-        self.tokenizer = SentencePieceTokenizer(model_path=self.model_path)
-
-    def test_initialization(self):
-        # Test initialization with model path
-        tokenizer = SentencePieceTokenizer(model_path=self.model_path)
-        self.assertIsNotNone(tokenizer.tokenizer)
-        self.assertEqual(tokenizer.original_vocab_size, tokenizer.vocab_size)
-
-        # Test initialization with invalid model path
-        with self.assertRaises(ValueError):
-            SentencePieceTokenizer(model_path="nonexistent.model")
-
-        # Test initialization with both model_path and tokenizer
-        mock_tokenizer = MagicMock()
-        with self.assertRaises(ValueError):
-            SentencePieceTokenizer(model_path=self.model_path, tokenizer=mock_tokenizer)
-
-        # Test initialization with neither model_path nor tokenizer
-        with self.assertRaises(ValueError):
-            SentencePieceTokenizer()
-
-    def test_text_to_tokens(self):
-        text = "Hello world"
-        tokens = self.tokenizer.text_to_tokens(text)
-        self.assertIsInstance(tokens, list)
-        self.assertTrue(all(isinstance(t, str) for t in tokens))
-
-    def test_encode(self):
-        text = "Hello world"
-        ids = self.tokenizer.encode(text)
-        self.assertIsInstance(ids, list)
-        self.assertTrue(all(isinstance(i, int) for i in ids))
-
-    def test_tokens_to_text(self):
-        text = "Hello world"
-        tokens = self.tokenizer.text_to_tokens(text)
-        reconstructed_text = self.tokenizer.tokens_to_text(tokens)
-        self.assertIsInstance(reconstructed_text, str)
-        self.assertNotEqual(reconstructed_text, "")  # Should not be empty
-
-    def test_batch_decode(self):
-        text = "Hello world"
-        ids = self.tokenizer.encode(text)
-
-        # Test with list
-        decoded_text = self.tokenizer.batch_decode(ids)
-        self.assertIsInstance(decoded_text, str)
-
-        # Test with numpy array
-        ids_np = np.array(ids)
-        decoded_text_np = self.tokenizer.batch_decode(ids_np)
-        self.assertIsInstance(decoded_text_np, str)
-
-        # Test with torch tensor
-        ids_torch = torch.tensor(ids)
-        decoded_text_torch = self.tokenizer.batch_decode(ids_torch)
-        self.assertIsInstance(decoded_text_torch, str)
-
-    def test_token_to_id(self):
-        text = "Hello"
-        tokens = self.tokenizer.text_to_tokens(text)
-        token_id = self.tokenizer.token_to_id(tokens[0])
-        self.assertIsInstance(token_id, int)
-
-    def test_ids_to_tokens(self):
-        text = "Hello world"
-        ids = self.tokenizer.encode(text)
-        tokens = self.tokenizer.ids_to_tokens(ids)
-        self.assertIsInstance(tokens, list)
-        self.assertTrue(all(isinstance(t, str) for t in tokens))
-
-    def test_tokens_to_ids(self):
-        text = "Hello"
-        tokens = self.tokenizer.text_to_tokens(text)
-        ids = self.tokenizer.tokens_to_ids(tokens)
-        self.assertIsInstance(ids, list)
-        self.assertTrue(all(isinstance(i, int) for i in ids))
-
-    def test_legacy_mode(self):
-        special_tokens = ["[PAD]", "[BOS]", "[EOS]"]
-        tokenizer = SentencePieceTokenizer(model_path=self.model_path, special_tokens=special_tokens, legacy=True)
-
-        # Test adding special tokens
-        self.assertGreater(tokenizer.vocab_size, tokenizer.original_vocab_size)
-
-        # Test special token encoding
-        text = "Hello [PAD] world"
-        tokens = tokenizer.text_to_tokens(text)
-        self.assertIn("[PAD]", tokens)
-
-        # Test special token decoding
-        ids = tokenizer.encode(text)
-        decoded_text = tokenizer.batch_decode(ids)
-        self.assertIn("[PAD]", decoded_text)
-
-    def test_properties(self):
-        # Test pad_id property
-        self.assertIsInstance(self.tokenizer.pad_id, int)
-
-        # Test bos_token_id property
-        self.assertIsInstance(self.tokenizer.bos_token_id, int)
-
-        # Test eos_token_id property
-        self.assertIsInstance(self.tokenizer.eos_token_id, int)
-
-        # Test unk_id property
-        self.assertIsInstance(self.tokenizer.unk_id, int)
-
-    def test_vocab_property(self):
-        vocab = self.tokenizer.vocab
-        self.assertIsInstance(vocab, list)
-        self.assertTrue(all(isinstance(t, str) for t in vocab))
-
-    def test_convert_ids_to_tokens(self):
-        text = "Hello world"
-        ids = self.tokenizer.encode(text)
-        tokens = self.tokenizer.convert_ids_to_tokens(ids)
-        self.assertIsInstance(tokens, list)
-        self.assertTrue(all(isinstance(t, str) for t in tokens))
-
-    def test_convert_tokens_to_string(self):
-        text = "Hello world"
-        tokens = self.tokenizer.text_to_tokens(text)
-        string = self.tokenizer.convert_tokens_to_string(tokens)
-        self.assertIsInstance(string, str)
-
-    def test_len(self):
-        self.assertEqual(len(self.tokenizer), self.tokenizer.vocab_size)
-
-    def test_is_fast(self):
-        self.assertTrue(self.tokenizer.is_fast)
-
-    def test_get_added_vocab(self):
-        self.assertIsNone(self.tokenizer.get_added_vocab())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/export/test_tarutils.py b/tests/export/test_tarutils.py
deleted file mode 100644
index fdd15252f5b9..000000000000
--- a/tests/export/test_tarutils.py
+++ /dev/null
@@ -1,125 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import tarfile
-import tempfile
-from pathlib import Path
-
-import pytest
-
-from nemo.export.tarutils import TarPath
-
-
-@pytest.fixture
-def sample_tar():
-    # Create a temporary directory and tar file with sample content
-    with tempfile.TemporaryDirectory() as temp_dir:
-        # Create some test files
-        test_dir = Path(temp_dir) / "test_dir"
-        test_dir.mkdir()
-
-        (test_dir / "file1.txt").write_text("content1")
-        (test_dir / "file2.txt").write_text("content2")
-        (test_dir / "subdir").mkdir()
-        (test_dir / "subdir" / "file3.txt").write_text("content3")
-
-        # Create tar file
-        tar_path = Path(temp_dir) / "test.tar"
-        with tarfile.open(tar_path, "w") as tar:
-            tar.add(test_dir, arcname=".")
-
-        yield str(tar_path)
-
-
-def test_tar_path_initialization(sample_tar):
-    # Test initialization with string path
-    with TarPath(sample_tar) as path:
-        assert isinstance(path, TarPath)
-        assert path.exists()
-
-    # Test initialization with tarfile object
-    with tarfile.open(sample_tar, "r") as tar:
-        path = TarPath(tar)
-        assert isinstance(path, TarPath)
-        assert path.exists()
-
-
-def test_path_operations(sample_tar):
-    with TarPath(sample_tar) as root:
-        # Test path division
-        file_path = root / "file1.txt"
-        assert str(file_path) == f"{sample_tar}/file1.txt"
-
-        # Test nested path division
-        subdir_path = root / "subdir" / "file3.txt"
-        assert str(subdir_path) == f"{sample_tar}/subdir/file3.txt"
-
-        # Test name property
-        assert file_path.name == "file1.txt"
-        assert subdir_path.name == "file3.txt"
-
-        # Test suffix property
-        assert file_path.suffix == ".txt"
-        assert (root / "subdir").suffix == ""
-
-
-def test_file_operations(sample_tar):
-    with TarPath(sample_tar) as root:
-        # Test file existence
-        assert (root / "file1.txt").exists()
-        assert (root / "file1.txt").is_file()
-
-        # Test directory existence
-        assert (root / "subdir").exists()
-        assert (root / "subdir").is_dir()
-
-        # Test non-existent path
-        assert not (root / "nonexistent.txt").exists()
-
-        # Test file reading
-        with (root / "file1.txt").open("r") as f:
-            content = f.read()
-            assert content == b"content1"
-
-
-def test_directory_operations(sample_tar):
-    with TarPath(sample_tar) as root:
-        # Test iterdir
-        entries = list(root.iterdir())
-        assert len(entries) == 5  # file1.txt, file2.txt, subdir, ., file3.txt
-
-        # Test glob
-        txt_files = list(root.glob("*.txt"))
-        assert len(txt_files) == 3
-        assert all(f.suffix == ".txt" for f in txt_files)
-
-        # Test rglob
-        all_txt_files = list(root.rglob("*.txt"))
-        assert len(all_txt_files) == 3
-        assert all(f.suffix == ".txt" for f in all_txt_files)
-
-
-def test_error_handling(sample_tar):
-    with TarPath(sample_tar) as root:
-        # Test opening non-existent file
-        with pytest.raises(FileNotFoundError):
-            (root / "nonexistent.txt").open("r")
-
-        # Test invalid mode
-        with pytest.raises(NotImplementedError):
-            (root / "file1.txt").open("w")
-
-        # Test invalid initialization
-        with pytest.raises(ValueError):
-            TarPath(123)  # Invalid type
diff --git a/tests/export/test_tensorrt_lazy_compiler.py b/tests/export/test_tensorrt_lazy_compiler.py
deleted file mode 100755
index 39c00440fc43..000000000000
--- a/tests/export/test_tensorrt_lazy_compiler.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import tempfile
-import unittest
-from unittest.mock import MagicMock, patch
-
-import pytest
-import torch.nn as nn
-
-
-@pytest.mark.run_only_on('GPU')
-class SimpleModel(nn.Module):
-    @pytest.mark.run_only_on('GPU')
-    def __init__(self):
-        super().__init__()
-        self.conv = nn.Conv2d(3, 64, kernel_size=3, padding=1)
-        self.relu = nn.ReLU()
-
-    @pytest.mark.run_only_on('GPU')
-    def forward(self, x):
-        return self.relu(self.conv(x))
-
-
-@pytest.mark.run_only_on('GPU')
-class TestTensorRTLazyCompiler(unittest.TestCase):
-
-    @pytest.mark.run_only_on('GPU')
-    def setUp(self):
-        self.model = SimpleModel()
-        self.temp_dir = tempfile.mkdtemp()
-        self.plan_path = os.path.join(self.temp_dir, "test_model.plan")
-
-    @pytest.mark.run_only_on('GPU')
-    def tearDown(self):
-        if os.path.exists(self.plan_path):
-            os.remove(self.plan_path)
-        os.rmdir(self.temp_dir)
-
-    @pytest.mark.run_only_on('GPU')
-    def test_get_profile_shapes(self):
-        from nemo.export.tensorrt_lazy_compiler import get_profile_shapes
-
-        input_shape = [1, 3, 224, 224]
-        dynamic_batchsize = [1, 4, 8]
-
-        min_shape, opt_shape, max_shape = get_profile_shapes(input_shape, dynamic_batchsize)
-
-        self.assertEqual(min_shape, [1, 3, 224, 224])
-        self.assertEqual(opt_shape, [4, 3, 224, 224])
-        self.assertEqual(max_shape, [8, 3, 224, 224])
-
-        # Test with None dynamic_batchsize
-        min_shape, opt_shape, max_shape = get_profile_shapes(input_shape, None)
-        self.assertEqual(min_shape, input_shape)
-        self.assertEqual(opt_shape, input_shape)
-        self.assertEqual(max_shape, input_shape)
-
-    @pytest.mark.run_only_on('GPU')
-    def test_get_dynamic_axes(self):
-        from nemo.export.tensorrt_lazy_compiler import get_dynamic_axes
-
-        profiles = [{"input": [[1, 3, 224, 224], [4, 3, 224, 224], [8, 3, 224, 224]]}]
-
-        dynamic_axes = get_dynamic_axes(profiles)
-        self.assertEqual(dynamic_axes, {"input": [0]})
-
-        # Test with empty profiles
-        dynamic_axes = get_dynamic_axes([])
-        self.assertEqual(dynamic_axes, {})
-
-    @pytest.mark.run_only_on('GPU')
-    @patch('nemo.export.tensorrt_lazy_compiler.trt_imported', True)
-    @patch('nemo.export.tensorrt_lazy_compiler.polygraphy_imported', True)
-    @patch('torch.cuda.is_available', return_value=True)
-    def test_trt_compile_basic(self, mock_cuda_available):
-        from nemo.export.tensorrt_lazy_compiler import trt_compile
-
-        # Test basic compilation
-        compiled_model = trt_compile(
-            self.model,
-            self.plan_path,
-            args={"method": "onnx", "precision": "fp16", "build_args": {"builder_optimization_level": 5}},
-        )
-
-        self.assertEqual(compiled_model, self.model)
-        self.assertTrue(hasattr(compiled_model, '_trt_compiler'))
-
-    @pytest.mark.run_only_on('GPU')
-    @patch('nemo.export.tensorrt_lazy_compiler.trt_imported', False)
-    def test_trt_compile_no_tensorrt(self):
-        from nemo.export.tensorrt_lazy_compiler import trt_compile
-
-        # Test when TensorRT is not available
-        compiled_model = trt_compile(self.model, self.plan_path)
-        self.assertEqual(compiled_model, self.model)
-        self.assertFalse(hasattr(compiled_model, '_trt_compiler'))
-
-    @pytest.mark.run_only_on('GPU')
-    def test_trt_compiler_initialization(self):
-        from nemo.export.tensorrt_lazy_compiler import TrtCompiler
-
-        compiler = TrtCompiler(
-            self.model,
-            self.plan_path,
-            precision="fp16",
-            method="onnx",
-            input_names=["x"],
-            output_names=["output"],
-            logger=MagicMock(),
-        )
-
-        self.assertEqual(compiler.plan_path, self.plan_path)
-        self.assertEqual(compiler.precision, "fp16")
-        self.assertEqual(compiler.method, "onnx")
-        self.assertEqual(compiler.input_names, ["x"])
-        self.assertEqual(compiler.output_names, ["output"])
-
-    @pytest.mark.run_only_on('GPU')
-    def test_trt_compiler_invalid_precision(self):
-        from nemo.export.tensorrt_lazy_compiler import TrtCompiler
-
-        with self.assertRaises(ValueError):
-            TrtCompiler(self.model, self.plan_path, precision="invalid_precision")
-
-    @pytest.mark.run_only_on('GPU')
-    def test_trt_compiler_invalid_method(self):
-        from nemo.export.tensorrt_lazy_compiler import TrtCompiler
-
-        with self.assertRaises(ValueError):
-            TrtCompiler(self.model, self.plan_path, method="invalid_method")
-
-    @pytest.mark.run_only_on('GPU')
-    @patch('nemo.export.tensorrt_lazy_compiler.trt_imported', True)
-    @patch('nemo.export.tensorrt_lazy_compiler.polygraphy_imported', True)
-    @patch('torch.cuda.is_available', return_value=True)
-    def test_trt_compile_with_submodule(self, mock_cuda_available):
-        from nemo.export.tensorrt_lazy_compiler import trt_compile
-
-        class NestedModel(nn.Module):
-            def __init__(self):
-                super().__init__()
-                self.submodule = SimpleModel()
-
-        model = NestedModel()
-        compiled_model = trt_compile(model, self.plan_path, submodule=["submodule"])
-
-        self.assertEqual(compiled_model, model)
-        self.assertTrue(hasattr(model.submodule, '_trt_compiler'))
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/tests/export/test_tensorrt_llm.py b/tests/export/test_tensorrt_llm.py
deleted file mode 100644
index 3d20fc6f3cdf..000000000000
--- a/tests/export/test_tensorrt_llm.py
+++ /dev/null
@@ -1,233 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import re
-
-import pytest
-import torch
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_get_nemo_to_trtllm_conversion_dict_on_nemo_model():
-    try:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    dummy_state = object()
-    model_state_dict = {
-        'model.embedding.word_embeddings.weight': dummy_state,
-        'model.decoder.layers.0.self_attention.linear_proj.weight': dummy_state,
-    }
-    nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict)
-
-    # Check that every key starts with 'model.' and not 'model..' by using a regex
-    # This pattern ensures:
-    #   - The key starts with 'model.'
-    #   - Immediately after 'model.', there must be at least one character that is NOT a '.'
-    #     (preventing the 'model..' scenario)
-    pattern = re.compile(r'^model\.[^.].*')
-    for key in nemo_model_conversion_dict.keys():
-        assert pattern.match(key), f"Key '{key}' does not properly start with 'model.'"
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_get_nemo_to_trtllm_conversion_dict_on_mcore_model():
-    try:
-        from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT
-
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    dummy_state = object()
-    model_state_dict = {
-        'embedding.word_embeddings.weight': dummy_state,
-        'decoder.layers.0.self_attention.linear_proj.weight': dummy_state,
-    }
-    nemo_model_conversion_dict = TensorRTLLM.get_nemo_to_trtllm_conversion_dict(model_state_dict)
-
-    # This is essentially a no-op
-    assert nemo_model_conversion_dict == DEFAULT_CONVERSION_DICT
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_tensorrt_llm_initialization():
-    try:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    # Test basic initialization
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-    assert trt_llm.model_dir == model_dir
-    assert trt_llm.engine_dir == os.path.join(model_dir, "trtllm_engine")
-    assert trt_llm.model is None
-    assert trt_llm.tokenizer is None
-    assert trt_llm.config is None
-
-    # Test initialization with lora checkpoints
-    lora_ckpt_list = ["/path/to/lora1", "/path/to/lora2"]
-    trt_llm = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
-    assert trt_llm.lora_ckpt_list == lora_ckpt_list
-
-    # Test initialization with python runtime options
-    trt_llm = TensorRTLLM(
-        model_dir=model_dir,
-        use_python_runtime=False,
-        enable_chunked_context=False,
-        max_tokens_in_paged_kv_cache=None,
-        load_model=False,
-    )
-    assert trt_llm.use_python_runtime is False
-    assert trt_llm.enable_chunked_context is False
-    assert trt_llm.max_tokens_in_paged_kv_cache is None
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_tensorrt_llm_supported_models():
-    try:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Test supported models list
-    supported_models = trt_llm.get_supported_models_list
-    assert isinstance(supported_models, list)
-    assert len(supported_models) > 0
-    assert all(isinstance(model, str) for model in supported_models)
-
-    # Test HF model mapping
-    hf_mapping = trt_llm.get_supported_hf_model_mapping
-    assert isinstance(hf_mapping, dict)
-    assert len(hf_mapping) > 0
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_tensorrt_llm_input_dtype():
-    try:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    from megatron.core.export.data_type import DataType
-
-    # Test different storage dtypes
-    test_cases = [
-        (torch.float32, DataType.float32),
-        (torch.float16, DataType.float16),
-        (torch.bfloat16, DataType.bfloat16),
-    ]
-
-    for storage_dtype, expected_dtype in test_cases:
-        input_dtype = trt_llm.get_input_dtype(storage_dtype)
-        assert input_dtype == expected_dtype, f"Expected {expected_dtype} for {storage_dtype}, got {input_dtype}"
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_tensorrt_llm_hidden_size():
-    try:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Test hidden size property
-    hidden_size = trt_llm.get_hidden_size
-    if hidden_size is not None:
-        assert isinstance(hidden_size, int)
-        assert hidden_size > 0
-    else:
-        assert hidden_size is None
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_tensorrt_llm_triton_io():
-    try:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Test Triton input configuration
-    triton_input = trt_llm.get_triton_input
-    assert isinstance(triton_input, tuple)
-    assert triton_input[0].name == "prompts"
-    assert triton_input[1].name == "max_output_len"
-    assert triton_input[2].name == "top_k"
-    assert triton_input[3].name == "top_p"
-    assert triton_input[4].name == "temperature"
-    assert triton_input[5].name == "random_seed"
-    assert triton_input[6].name == "stop_words_list"
-    assert triton_input[7].name == "bad_words_list"
-    assert triton_input[8].name == "no_repeat_ngram_size"
-
-    # Test Triton output configuration
-    triton_output = trt_llm.get_triton_output
-    assert isinstance(triton_output, tuple)
-    assert triton_output[0].name == "outputs"
-    assert triton_output[1].name == "generation_logits"
-    assert triton_output[2].name == "context_logits"
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_tensorrt_llm_pad_logits():
-    try:
-        from nemo.export.tensorrt_llm import TensorRTLLM
-    except ImportError:
-        pytest.skip("Could not import TRTLLM helpers. tensorrt_llm is likely not installed")
-        return
-
-    model_dir = "/tmp/test_model_dir"
-    trt_llm = TensorRTLLM(model_dir=model_dir, load_model=False)
-
-    # Create a sample logits tensor
-    batch_size = 2
-    seq_len = 3
-    vocab_size = 1000
-    logits = torch.randn(batch_size, seq_len, vocab_size)
-
-    # Test padding logits
-    padded_logits = trt_llm._pad_logits(logits)
-    assert isinstance(padded_logits, torch.Tensor)
-    assert padded_logits.shape[0] == batch_size
-    assert padded_logits.shape[1] == seq_len
-    assert padded_logits.shape[2] >= vocab_size  # Should be padded to a multiple of 8
diff --git a/tests/export/test_tensorrt_mm_exporter.py b/tests/export/test_tensorrt_mm_exporter.py
deleted file mode 100644
index cd9c9eb2db07..000000000000
--- a/tests/export/test_tensorrt_mm_exporter.py
+++ /dev/null
@@ -1,224 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from unittest.mock import Mock, patch
-
-import numpy as np
-import pytest
-
-
-@pytest.fixture
-def model_dir(tmp_path):
-    return str(tmp_path / "model_dir")
-
-
-@pytest.fixture
-def mock_runner():
-    runner = Mock()
-    runner.model_type = "neva"
-    runner.load_test_media = Mock(return_value=np.zeros((1, 224, 224, 3)))
-    runner.run = Mock(return_value="Test response")
-    return runner
-
-
-class TestTensorRTMMExporter:
-
-    @pytest.mark.run_only_on('GPU')
-    def test_init(self, model_dir):
-        # Test basic initialization
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        assert exporter.model_dir == model_dir
-        assert exporter.runner is None
-        assert exporter.modality == "vision"
-
-    @pytest.mark.run_only_on('GPU')
-    def test_init_invalid_modality(self, model_dir):
-        # Test initialization with invalid modality
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        with pytest.raises(AssertionError):
-            TensorRTMMExporter(model_dir, modality="invalid")
-
-    @pytest.mark.run_only_on('GPU')
-    @patch("nemo.export.tensorrt_mm_exporter.build_mllama_engine")
-    def test_export_mllama(self, mock_build, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path", model_type="mllama", tensor_parallel_size=1, load_model=False
-        )
-        mock_build.assert_called_once()
-
-    @pytest.mark.run_only_on('GPU')
-    @patch("nemo.export.tensorrt_mm_exporter.build_trtllm_engine")
-    @patch("nemo.export.tensorrt_mm_exporter.build_visual_engine")
-    def test_export_neva(self, mock_visual, mock_trtllm, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.export(
-            visual_checkpoint_path="dummy/path", model_type="neva", tensor_parallel_size=1, load_model=False
-        )
-        mock_trtllm.assert_called_once()
-        mock_visual.assert_called_once()
-
-    @pytest.mark.run_only_on('GPU')
-    def test_forward_without_loading(self, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        with pytest.raises(Exception) as exc_info:
-            exporter.forward("test prompt", "test_image.jpg")
-        assert "should be exported and" in str(exc_info.value)
-
-    @pytest.mark.run_only_on('GPU')
-    def test_forward(self, model_dir, mock_runner):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.runner = mock_runner
-
-        result = exporter.forward(
-            input_text="What's in this image?", input_media="test_image.jpg", batch_size=1, max_output_len=30
-        )
-
-        assert result == "Test response"
-        mock_runner.load_test_media.assert_called_once()
-        mock_runner.run.assert_called_once()
-
-    @pytest.mark.run_only_on('GPU')
-    def test_get_triton_input(self, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        inputs = exporter.get_triton_input
-
-        # Verify we have the expected number of inputs
-        assert len(inputs) == 10  # 1 text input + 1 media input + 8 optional parameters
-
-        # Verify the first input is for text
-        assert inputs[0].name == "input_text"
-        assert inputs[0].dtype == bytes
-
-    @pytest.mark.run_only_on('GPU')
-    def test_get_triton_output(self, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        outputs = exporter.get_triton_output
-
-        assert len(outputs) == 1
-        assert outputs[0].name == "outputs"
-        assert outputs[0].dtype == bytes
-
-    @pytest.mark.run_only_on('GPU')
-    def test_forward_with_all_params(self, model_dir, mock_runner):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        exporter.runner = mock_runner
-
-        result = exporter.forward(
-            input_text="What's in this image?",
-            input_media="test_image.jpg",
-            batch_size=2,
-            max_output_len=50,
-            top_k=5,
-            top_p=0.9,
-            temperature=0.7,
-            repetition_penalty=1.2,
-            num_beams=4,
-            lora_uids=["lora1", "lora2"],
-        )
-
-        assert result == "Test response"
-        mock_runner.load_test_media.assert_called_once()
-        mock_runner.run.assert_called_once_with(
-            "What's in this image?",
-            mock_runner.load_test_media.return_value,
-            50,
-            2,
-            5,
-            0.9,
-            0.7,
-            1.2,
-            4,
-            ["lora1", "lora2"],
-        )
-
-    @pytest.mark.run_only_on('GPU')
-    def test_get_input_media_tensors_vision(self, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False, modality="vision")
-        tensors = exporter.get_input_media_tensors()
-
-        assert len(tensors) == 1
-        assert tensors[0].name == "input_media"
-        assert tensors[0].shape == (-1, -1, -1, 3)
-        assert tensors[0].dtype == np.uint8
-
-    @pytest.mark.run_only_on('GPU')
-    def test_get_input_media_tensors_audio(self, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False, modality="audio")
-        tensors = exporter.get_input_media_tensors()
-
-        assert len(tensors) == 2
-        assert tensors[0].name == "input_signal"
-        assert tensors[0].shape == (-1,)
-        assert tensors[0].dtype == np.single
-        assert tensors[1].name == "input_signal_length"
-        assert tensors[1].shape == (1,)
-        assert tensors[1].dtype == np.intc
-
-    @pytest.mark.run_only_on('GPU')
-    def test_export_with_invalid_model_type(self, model_dir):
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        with pytest.raises(Exception):
-            exporter.export(
-                visual_checkpoint_path="dummy/path",
-                model_type="invalid_model_type",
-                tensor_parallel_size=1,
-                load_model=False,
-            )
-
-    @pytest.mark.run_only_on('GPU')
-    def test_export_with_existing_files(self, model_dir):
-        import os
-
-        from nemo.export.tensorrt_mm_exporter import TensorRTMMExporter
-
-        # Create some files in the model directory
-        os.makedirs(model_dir, exist_ok=True)
-        with open(os.path.join(model_dir, "test.txt"), "w") as f:
-            f.write("test")
-
-        exporter = TensorRTMMExporter(model_dir, load_model=False)
-        with pytest.raises(Exception) as exc_info:
-            exporter.export(
-                visual_checkpoint_path="dummy/path",
-                model_type="neva",
-                tensor_parallel_size=1,
-                load_model=False,
-                delete_existing_files=False,
-            )
-        assert "There are files in this folder" in str(exc_info.value)
diff --git a/tests/export/test_tiktoken_tokenizer.py b/tests/export/test_tiktoken_tokenizer.py
deleted file mode 100644
index 1d265c210146..000000000000
--- a/tests/export/test_tiktoken_tokenizer.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import base64
-import json
-import tempfile
-from pathlib import Path
-
-import pytest
-
-from nemo.export.tiktoken_tokenizer import TiktokenTokenizer, reload_mergeable_ranks
-
-
-@pytest.fixture
-def sample_vocab_file():
-    # Create a temporary vocab file for testing
-    vocab_data = [
-        {"rank": i, "token_bytes": base64.b64encode(bytes([i])).decode('utf-8'), "token_str": f"token_{i}"}
-        for i in range(256)
-    ]
-    # Add a few merged tokens
-    vocab_data.extend(
-        [
-            {"rank": 256, "token_bytes": base64.b64encode(b"Hello").decode('utf-8'), "token_str": "Hello"},
-            {"rank": 257, "token_bytes": base64.b64encode(b"World").decode('utf-8'), "token_str": "World"},
-        ]
-    )
-
-    with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
-        json.dump(vocab_data, f)
-        temp_path = f.name
-
-    yield temp_path
-    Path(temp_path).unlink()  # Cleanup after tests
-
-
-def test_reload_mergeable_ranks(sample_vocab_file):
-    ranks = reload_mergeable_ranks(sample_vocab_file)
-    assert len(ranks) == 258  # 256 base tokens + 2 merged tokens
-    assert ranks[b"Hello"] == 256
-    assert ranks[b"World"] == 257
-
-
-def test_tokenizer_initialization(sample_vocab_file):
-    tokenizer = TiktokenTokenizer(sample_vocab_file)
-    assert tokenizer.bos_token_id == 1  # 
-    assert tokenizer.eos_token_id == 2  # 
-    assert tokenizer.pad_id == 2  # same as eos_token_id
-
-
-def test_encode_decode(sample_vocab_file):
-    tokenizer = TiktokenTokenizer(sample_vocab_file)
-    text = "Hello World"
-    tokens = tokenizer.encode(text)
-    decoded_text = tokenizer.decode(tokens)
-    assert isinstance(tokens, list)
-    assert all(isinstance(t, int) for t in tokens)
-    assert isinstance(decoded_text, str)
-
-
-def test_batch_decode(sample_vocab_file):
-    tokenizer = TiktokenTokenizer(sample_vocab_file)
-    tokens = [[1000, 1001, 1002]]  # Example token IDs above num_special_tokens
-    decoded_text = tokenizer.batch_decode(tokens)
-    assert isinstance(decoded_text, str)
-
-
-def test_special_token_handling(sample_vocab_file):
-    tokenizer = TiktokenTokenizer(sample_vocab_file)
-    # Test that special tokens are properly filtered during decoding
-    tokens = [tokenizer.bos_token_id, 1000, 1001, tokenizer.eos_token_id]
-    decoded_text = tokenizer.decode(tokens)
-    assert decoded_text != ""  # Should decode the non-special tokens
-
-
-def test_empty_decode(sample_vocab_file):
-    tokenizer = TiktokenTokenizer(sample_vocab_file)
-    # Test decoding with only special tokens
-    tokens = [tokenizer.bos_token_id, tokenizer.eos_token_id]
-    decoded_text = tokenizer.decode(tokens)
-    assert decoded_text == ""  # Should return empty string
-
-
-def test_batch_decode_numpy_tensor(sample_vocab_file):
-    import numpy as np
-    import torch
-
-    tokenizer = TiktokenTokenizer(sample_vocab_file)
-    np_tokens = np.array([[1000, 1001, 1002]])
-    torch_tokens = torch.tensor([[1000, 1001, 1002]])
-
-    np_decoded = tokenizer.batch_decode(np_tokens)
-    torch_decoded = tokenizer.batch_decode(torch_tokens)
-
-    assert isinstance(np_decoded, str)
-    assert isinstance(torch_decoded, str)
-    assert np_decoded == torch_decoded
diff --git a/tests/export/test_trt_compile.py b/tests/export/test_trt_compile.py
deleted file mode 100644
index 324deba2deb7..000000000000
--- a/tests/export/test_trt_compile.py
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import annotations
-
-import tempfile
-import unittest
-from typing import List
-
-import torch
-
-TEST_CASE_1 = ["fp32"]
-TEST_CASE_2 = ["fp16"]
-
-
-class ListAdd(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, x: List[torch.Tensor], y: torch.Tensor, z: torch.Tensor, bs: float = 0.1):
-        y1 = y.clone()
-        x1 = x.copy()
-        z1 = z + y
-        for xi in x:
-            y1 = y1 + xi + bs
-        return x1, [y1, z1], y1 + z1
-
-
-@unittest.skip
-class TestTRTCompile(unittest.TestCase):
-
-    def setUp(self):
-        self.gpu_device = torch.cuda.current_device()
-
-    def tearDown(self):
-        current_device = torch.cuda.current_device()
-        if current_device != self.gpu_device:
-            torch.cuda.set_device(self.gpu_device)
-
-    def test_torch_trt(self):
-
-        model = torch.nn.Sequential(*[torch.nn.PReLU(), torch.nn.PReLU()])
-        data1 = model.state_dict()
-        data1["0.weight"] = torch.tensor([0.1])
-        data1["1.weight"] = torch.tensor([0.2])
-        model.load_state_dict(data1)
-        model.cuda()
-        x = torch.randn(1, 16).to("cuda")
-
-        with tempfile.TemporaryDirectory() as tempdir:
-            args = {
-                "method": "torch_trt",
-                "dynamic_batchsize": [1, 4, 8],
-            }
-            input_example = (x,)
-            output_example = model(*input_example)
-            trt_compile(
-                model,
-                f"{tempdir}/test_lists",
-                args=args,
-            )
-            self.assertIsNone(model._trt_compiler.engine)
-            trt_output = model(*input_example)
-            # Check that lazy TRT build succeeded
-            self.assertIsNotNone(model._trt_compiler.engine)
-            torch.testing.assert_close(trt_output, output_example, rtol=0.01, atol=0.01)
-
-    def test_profiles(self):
-        model = ListAdd().cuda()
-
-        with torch.no_grad(), tempfile.TemporaryDirectory() as tmpdir:
-            args = {
-                "export_args": {
-                    "dynamo": False,
-                },
-                "input_profiles": [
-                    {
-                        "x_0": [[1, 8], [2, 16], [2, 32]],
-                        "x_1": [[1, 8], [2, 16], [2, 32]],
-                        "x_2": [[1, 8], [2, 16], [2, 32]],
-                        "y": [[1, 8], [2, 16], [2, 32]],
-                        "z": [[1, 8], [1, 16], [1, 32]],
-                    }
-                ],
-                "output_lists": [[-1], [2], []],
-            }
-            x = torch.randn(1, 16).to("cuda")
-            y = torch.randn(1, 16).to("cuda")
-            z = torch.randn(1, 16).to("cuda")
-            input_example = ([x, y, z], y.clone(), z.clone())
-            output_example = model(*input_example)
-            trt_compile(
-                model,
-                f"{tmpdir}/test_dynamo_trt",
-                args=args,
-            )
-            self.assertIsNone(model._trt_compiler.engine)
-            trt_output = model(*input_example)
-            # Check that lazy TRT build succeeded
-            self.assertIsNotNone(model._trt_compiler.engine)
-            torch.testing.assert_close(trt_output, output_example, rtol=0.01, atol=0.01)
-
-    def test_lists(self):
-        model = ListAdd().cuda()
-
-        with torch.no_grad(), tempfile.TemporaryDirectory() as tmpdir:
-            args = {
-                "export_args": {
-                    "dynamo": True,
-                },
-                "output_lists": [[-1], [2], []],
-            }
-            x = torch.randn(1, 16).to("cuda")
-            y = torch.randn(1, 16).to("cuda")
-            z = torch.randn(1, 16).to("cuda")
-            input_example = ([x, y, z], y.clone(), z.clone())
-            output_example = model(*input_example)
-            trt_compile(
-                model,
-                f"{tmpdir}/test_lists",
-                args=args,
-            )
-            self.assertIsNone(model._trt_compiler.engine)
-            trt_output = model(*input_example)
-            # Check that lazy TRT build succeeded
-            self.assertIsNotNone(model._trt_compiler.engine)
-            torch.testing.assert_close(trt_output, output_example, rtol=0.01, atol=0.01)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/export/test_vllm_hf_exporter.py b/tests/export/test_vllm_hf_exporter.py
deleted file mode 100644
index 5b72c8ae2458..000000000000
--- a/tests/export/test_vllm_hf_exporter.py
+++ /dev/null
@@ -1,180 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-import pytest
-
-
-@pytest.fixture
-def exporter():
-    from nemo.export.vllm_hf_exporter import vLLMHFExporter
-
-    return vLLMHFExporter()
-
-
-@pytest.fixture
-def mock_llm():
-    with patch('nemo.export.vllm_hf_exporter.LLM') as mock:
-        mock_instance = MagicMock()
-        mock.return_value = mock_instance
-        yield mock_instance
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_init(exporter):
-    """Test initialization of vLLMHFExporter"""
-    assert exporter.model is None
-    assert exporter.lora_models is None
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_export(exporter, mock_llm):
-    """Test export method"""
-    model_path = "/path/to/model"
-    exporter.export(model=model_path)
-
-    assert exporter.model is not None
-    mock_llm.assert_called_once_with(model=model_path, enable_lora=False)
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_export_with_lora(exporter, mock_llm):
-    """Test export method with LoRA enabled"""
-    model_path = "/path/to/model"
-    exporter.export(model=model_path, enable_lora=True)
-
-    assert exporter.model is not None
-    mock_llm.assert_called_once_with(model=model_path, enable_lora=True)
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_add_lora_models(exporter):
-    """Test adding LoRA models"""
-    lora_name = "test_lora"
-    lora_model = "path/to/lora"
-
-    exporter.add_lora_models(lora_name, lora_model)
-
-    assert exporter.lora_models is not None
-    assert lora_name in exporter.lora_models
-    assert exporter.lora_models[lora_name] == lora_model
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_get_triton_input(exporter):
-    """Test triton input configuration"""
-    inputs = exporter.get_triton_input
-
-    # Check that we have all expected inputs
-    input_names = [tensor.name for tensor in inputs]
-    assert "prompts" in input_names
-    assert "max_output_len" in input_names
-    assert "top_k" in input_names
-    assert "top_p" in input_names
-    assert "temperature" in input_names
-
-    # Check data types
-    for tensor in inputs:
-        if tensor.name == "prompts":
-            assert tensor.dtype == bytes
-        elif tensor.name == "max_output_len":
-            assert tensor.dtype == np.int_
-        elif tensor.name in ["top_k"]:
-            assert tensor.dtype == np.int_
-        elif tensor.name in ["top_p", "temperature"]:
-            assert tensor.dtype == np.single
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_get_triton_output(exporter):
-    """Test triton output configuration"""
-    outputs = exporter.get_triton_output
-
-    assert len(outputs) == 1
-    assert outputs[0].name == "outputs"
-    assert outputs[0].dtype == bytes
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_forward_without_model(exporter):
-    """Test forward method without initialized model"""
-    with pytest.raises(AssertionError, match="Model is not initialized"):
-        exporter.forward(["test prompt"])
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_forward_with_lora_not_added(exporter, mock_llm):
-    """Test forward method with non-existent LoRA model"""
-    exporter.export(model="/path/to/model")
-
-    with pytest.raises(Exception, match="No lora models are available"):
-        exporter.forward(["test prompt"], lora_model_name="non_existent_lora")
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_forward_with_invalid_lora(exporter, mock_llm):
-    """Test forward method with invalid LoRA model name"""
-    exporter.export(model="/path/to/model")
-    exporter.add_lora_models("valid_lora", "path/to/lora")
-
-    with pytest.raises(AssertionError, match="Lora model was not added before"):
-        exporter.forward(["test prompt"], lora_model_name="invalid_lora")
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_triton_infer_fn(exporter, mock_llm):
-    """Test triton inference function"""
-    exporter.export(model="/path/to/model")
-    mock_llm.generate.return_value = [MagicMock(outputs=[MagicMock(text="test output")])]
-
-    inputs = {
-        "prompts": np.array([b"test prompt"]),
-        "max_output_len": np.array([64]),
-        "top_k": np.array([1]),
-        "top_p": np.array([0.1]),
-        "temperature": np.array([1.0]),
-    }
-
-    result = exporter.triton_infer_fn(**inputs)
-
-    assert "outputs" in result
-    assert isinstance(result["outputs"], np.ndarray)
-    assert result["outputs"].dtype == np.bytes_
-
-
-@pytest.mark.skip(reason="Need to enable virtual environment for vLLM")
-@pytest.mark.run_only_on('GPU')
-def test_triton_infer_fn_error_handling(exporter):
-    """Test triton inference function error handling"""
-    inputs = {"prompts": np.array([b"test prompt"])}
-
-    result = exporter.triton_infer_fn(**inputs)
-
-    assert "outputs" in result
-    assert isinstance(result["outputs"], np.ndarray)
-    assert result["outputs"].dtype == np.bytes_
-    assert b"An error occurred" in result["outputs"][0]
diff --git a/tests/export/trt_llm/__init__.py b/tests/export/trt_llm/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/export/trt_llm/converter/__init__.py b/tests/export/trt_llm/converter/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/tests/export/trt_llm/converter/test_converter_utils.py b/tests/export/trt_llm/converter/test_converter_utils.py
deleted file mode 100755
index eb861a452630..000000000000
--- a/tests/export/trt_llm/converter/test_converter_utils.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-import pytest
-import torch
-
-
-@pytest.mark.run_only_on('GPU')
-def test_any_word_in_key():
-    # Test positive cases
-    from nemo.export.trt_llm.converter.utils import any_word_in_key
-
-    assert any_word_in_key("model.layer1.attention.dense.weight", ["attention", "mlp"]) == True
-    assert any_word_in_key("model.layer1.mlp.weight", ["attention", "mlp"]) == True
-
-    # Test negative cases
-    assert any_word_in_key("model.layer1.other.weight", ["attention", "mlp"]) == False
-    assert any_word_in_key("", ["attention", "mlp"]) == False
-
-
-@pytest.mark.run_only_on('GPU')
-def test_get_trt_llm_keyname():
-    # Test final layernorm case
-    from nemo.export.trt_llm.converter.utils import get_trt_llm_keyname
-
-    assert get_trt_llm_keyname("final_layernorm.weight") == "transformer.ln_f.weight"
-
-    # Test layer cases
-    assert get_trt_llm_keyname("layers.1.attention.dense.weight") == "transformer.layers.1.attention.dense.weight"
-    assert get_trt_llm_keyname("layers.2.mlp.linear_fc2.weight") == "transformer.layers.2.mlp.proj.weight"
-
-
-@pytest.mark.run_only_on('GPU')
-def test_is_scaling_factor():
-    from nemo.export.trt_llm.converter.utils import is_scaling_factor
-
-    assert is_scaling_factor("model.layer1.scale_fwd.weight") == True
-    assert is_scaling_factor("model.layer1.weight") == False
-    assert is_scaling_factor("") == False
-
-
-@pytest.mark.run_only_on('GPU')
-def test_get_scaling_factor_keys():
-    from nemo.export.trt_llm.converter.utils import get_scaling_factor_keys
-
-    key = "layers.1.mlp.dense_h_to_4h.scale_fwd"
-    keys, gate_keys = get_scaling_factor_keys(key)
-
-    # Check main keys
-    assert keys[0].endswith(".weights_scaling_factor")
-    assert keys[1].endswith(".activation_scaling_factor")
-
-    # Check gate keys
-    assert gate_keys[0].endswith(".activation_scaling_factor")
-    assert gate_keys[1].endswith(".weights_scaling_factor")
-
-
-@pytest.mark.run_only_on('GPU')
-def test_split():
-    # Test numpy array splitting
-    from nemo.export.trt_llm.converter.utils import split
-
-    arr = np.array([1, 2, 3, 4])
-    assert np.array_equal(split(arr, tp_size=2, idx=0), np.array([1, 2]))
-    assert np.array_equal(split(arr, tp_size=2, idx=1), np.array([3, 4]))
-
-    # Test torch tensor splitting
-    tensor = torch.tensor([1, 2, 3, 4])
-    assert torch.equal(split(tensor, tp_size=2, idx=0), torch.tensor([1, 2]))
-    assert torch.equal(split(tensor, tp_size=2, idx=1), torch.tensor([3, 4]))
-
-    # Test no split case
-    assert np.array_equal(split(arr, tp_size=1, idx=0), arr)
-
-
-@pytest.mark.run_only_on('GPU')
-def test_generate_int8():
-    # Create test weights and activation ranges
-    from nemo.export.trt_llm.converter.utils import generate_int8
-
-    weights = np.random.randn(4, 4).astype(np.float32)
-    act_range = {"w": torch.tensor(2.0), "x": torch.tensor(3.0), "y": torch.tensor(4.0)}
-
-    result = generate_int8(weights, act_range)
-
-    # Check that all expected keys are present
-    expected_keys = [
-        "weight.int8",
-        "weight.int8.col",
-        "scale_x_orig_quant",
-        "scale_w_quant_orig",
-        "scale_w_quant_orig.col",
-        "scale_y_accum_quant",
-        "scale_y_accum_quant.col",
-        "scale_y_quant_orig",
-    ]
-    assert all(key in result for key in expected_keys)
-
-    # Check that int8 weights are in correct range
-    assert np.all(result["weight.int8"] >= -127)
-    assert np.all(result["weight.int8"] <= 127)
-    assert np.all(result["weight.int8.col"] >= -127)
-    assert np.all(result["weight.int8.col"] <= 127)
-
-
-if __name__ == "__main__":
-    pytest.main([__file__])
diff --git a/tests/export/trt_llm/converter/test_model_converter.py b/tests/export/trt_llm/converter/test_model_converter.py
deleted file mode 100644
index 5e0cf3997f1e..000000000000
--- a/tests/export/trt_llm/converter/test_model_converter.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import pytest
-import torch
-
-
-@pytest.mark.run_only_on('GPU')
-def test_determine_quantization_settings():
-    # Test with default NeMo config (no fp8)
-    from nemo.export.trt_llm.converter.model_converter import determine_quantization_settings
-
-    nemo_config = {'fp8': False}
-    fp8_quant, fp8_kv = determine_quantization_settings(nemo_config)
-    assert not fp8_quant
-    assert not fp8_kv
-
-    # Test with NeMo config having fp8=True
-    nemo_config = {'fp8': True}
-    fp8_quant, fp8_kv = determine_quantization_settings(nemo_config)
-    assert fp8_quant
-    assert fp8_kv
-
-    # Test with override parameters
-    fp8_quant, fp8_kv = determine_quantization_settings(nemo_config, fp8_quantized=False, fp8_kvcache=True)
-    assert not fp8_quant
-    assert fp8_kv
-
-
-@pytest.mark.run_only_on('GPU')
-def test_prompt_convert_task_templates():
-    # Test with task templates
-    from nemo.export.trt_llm.converter.model_converter import prompt_convert
-
-    prompt_config = {
-        'task_templates': [
-            {'taskname': 'task1'},
-            {'taskname': 'task2'},
-        ]
-    }
-
-    # Create mock weights
-    prompt_weights = {
-        'prompt_table': {
-            'prompt_table.task1.prompt_embeddings.weight': torch.ones(2, 4),
-            'prompt_table.task2.prompt_embeddings.weight': torch.ones(3, 4),
-        }
-    }
-
-    result = prompt_convert(prompt_config, prompt_weights)
-    assert isinstance(result, torch.Tensor)
-    assert result.shape == (2, 3, 4)  # (num_tasks, max_length, embedding_dim)
-
-
-@pytest.mark.run_only_on('GPU')
-def test_prompt_convert_direct_embeddings():
-    # Test with direct embeddings
-    from nemo.export.trt_llm.converter.model_converter import prompt_convert
-
-    prompt_config = {}
-    prompt_weights = {'prompt_embeddings_weights': torch.ones(2, 3, 4)}
-
-    result = prompt_convert(prompt_config, prompt_weights)
-    assert isinstance(result, torch.Tensor)
-    assert result.shape == (2, 3, 4)
diff --git a/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py b/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py
deleted file mode 100644
index 2c614fe54fe3..000000000000
--- a/tests/export/trt_llm/converter/test_model_to_trt_llm_ckpt.py
+++ /dev/null
@@ -1,117 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import pytest
-
-
-@pytest.mark.run_only_on('GPU')
-def test_rename_key():
-    # Test basic self_attention replacement
-    from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import rename_key
-
-    assert rename_key("self_attention.weight") == "attention.weight"
-
-    # Test layernorm replacements
-    assert rename_key("attention.linear_qkv.layer_norm_weight") == "input_layernorm.weight"
-    assert rename_key("attention.linear_qkv.layer_norm_bias") == "input_layernorm.bias"
-    assert rename_key("mlp.linear_fc1.layer_norm_weight") == "post_attention_layernorm.weight"
-    assert rename_key("mlp.linear_fc1.layer_norm_bias") == "post_attention_layernorm.bias"
-
-    # Test key with no replacements needed
-    assert rename_key("some_other_key") == "some_other_key"
-
-
-@pytest.mark.run_only_on('GPU')
-def test_rename_key_dist_ckpt():
-    # Test key with layers
-    from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import rename_key_dist_ckpt
-
-    assert rename_key_dist_ckpt("layers.linear_qkv.weight", 0) == "layers.0.linear_qkv.weight"
-    assert rename_key_dist_ckpt("layers.self_attention.weight", 1) == "layers.1.attention.weight"
-
-    # Test key without layers
-    assert rename_key_dist_ckpt("embedding.weight", 0) == "embedding.weight"
-
-
-@pytest.mark.run_only_on('GPU')
-def test_get_layer_prefix():
-    # Test for mcore model
-    from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import get_layer_prefix
-
-    layer_names_mcore = [
-        "model.decoder.layers.0.self_attention.weight",
-        "optimizer.state",
-        "model.decoder.layers.1.self_attention.bias",
-    ]
-    model_prefix, transformer_prefix = get_layer_prefix(layer_names_mcore, is_mcore=True)
-    assert model_prefix == "model."
-    assert transformer_prefix == "model.decoder."
-
-    # Test for non-mcore model
-    layer_names_non_mcore = [
-        "model.encoder.layers.0.self_attention.weight",
-        "optimizer.state",
-        "model.encoder.layers.1.self_attention.bias",
-    ]
-    model_prefix, transformer_prefix = get_layer_prefix(layer_names_non_mcore, is_mcore=False)
-    assert model_prefix == "model."
-    assert transformer_prefix == "model.encoder."
-
-
-@pytest.mark.run_only_on('GPU')
-def test_rename_layer_num():
-    # Test basic layer number replacement
-    from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import rename_layer_num
-
-    assert rename_layer_num("model.layers.0.attention.weight", 1) == "model.layers.1.attention.weight"
-    assert rename_layer_num("decoder.layers.5.mlp.weight", 2) == "decoder.layers.2.mlp.weight"
-
-    # Test with multiple numeric components
-    assert rename_layer_num("model.layers.0.attention.head.8.weight", 3) == "model.layers.3.attention.head.8.weight"
-
-
-@pytest.mark.run_only_on('GPU')
-def test_get_layer_num():
-    from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import get_layer_num
-
-    assert get_layer_num("model.layers.0.attention.weight") == 0
-    assert get_layer_num("decoder.layers.5.mlp.weight") == 5
-
-    with pytest.raises(ValueError):
-        get_layer_num("model.attention.weight")  # No layers component
-
-
-@pytest.mark.run_only_on('GPU')
-def test_is_scaling_factor():
-    from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import is_scaling_factor
-
-    assert is_scaling_factor("layer.extra_state.weight") == True
-    assert is_scaling_factor("layer.weight") == False
-    assert is_scaling_factor("extra_state") == True
-
-
-@pytest.mark.run_only_on('GPU')
-def test_create_export_dir(tmp_path):
-    from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import create_export_dir
-
-    # Test creating new directory
-    export_dir = tmp_path / "new_export_dir"
-    created_dir = create_export_dir(export_dir)
-    assert created_dir.exists()
-    assert created_dir.is_dir()
-
-    # Test with existing directory
-    existing_dir = create_export_dir(export_dir)
-    assert existing_dir == export_dir
diff --git a/tests/export/trt_llm/test_tensorrt_llm_export.py b/tests/export/trt_llm/test_tensorrt_llm_export.py
deleted file mode 100755
index e52dca64534d..000000000000
--- a/tests/export/trt_llm/test_tensorrt_llm_export.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import shutil
-
-import pytest
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-@pytest.mark.parametrize("tensor_parallelism_size,pipeline_parallelism_size", [(2, 1), (1, 2)])
-def test_nemo2_convert_to_safe_tensors(tensor_parallelism_size, pipeline_parallelism_size):
-    """
-    Test safe tensor exporter. This tests the whole nemo export until engine building.
-    """
-    from pathlib import Path
-
-    from nemo.export.tensorrt_llm import TensorRTLLM
-
-    trt_llm_exporter = TensorRTLLM(model_dir="/tmp/safe_tensor_test/")
-    trt_llm_exporter.convert_to_safe_tensors(
-        nemo_checkpoint_path="/home/TestData/llm/models/llama32_1b_nemo2",
-        model_type="llama",
-        delete_existing_files=True,
-        tensor_parallelism_size=tensor_parallelism_size,
-        pipeline_parallelism_size=pipeline_parallelism_size,
-        gpus_per_node=2,
-        use_parallel_embedding=False,
-        use_embedding_sharing=False,
-        dtype="bfloat16",
-    )
-
-    assert Path("/tmp/safe_tensor_test/").exists(), "Safe tensors were not generated."
-    assert Path("/tmp/safe_tensor_test/rank0.safetensors").exists(), "Safe tensors for rank0 were not generated."
-    if pipeline_parallelism_size == 1 and tensor_parallelism_size == 2:
-        assert Path("/tmp/safe_tensor_test/rank1.safetensors").exists(), "Safe tensors for rank1 were not generated."
-    assert Path("/tmp/safe_tensor_test/config.json").exists(), "config.yaml was not generated."
-
-    shutil.rmtree("/tmp/safe_tensor_test/")
-
-
-@pytest.mark.run_only_on('GPU')
-@pytest.mark.unit
-def test_nemo2_convert_to_export():
-    """
-    Test safe tensor exporter. This tests the whole nemo export until engine building.
-    """
-    from pathlib import Path
-
-    from nemo.export.tensorrt_llm import TensorRTLLM
-
-    trt_llm_exporter = TensorRTLLM(model_dir="/tmp/safe_tensor_test_2/")
-    trt_llm_exporter.export(
-        nemo_checkpoint_path="/home/TestData/llm/models/llama32_1b_nemo2",
-        model_type="llama",
-        delete_existing_files=True,
-        tensor_parallelism_size=1,
-        pipeline_parallelism_size=1,
-        gpus_per_node=None,
-        max_input_len=1024,
-        max_output_len=256,
-        max_batch_size=4,
-        max_prompt_embedding_table_size=None,
-        use_parallel_embedding=False,
-        use_embedding_sharing=False,
-        paged_kv_cache=True,
-        remove_input_padding=True,
-        paged_context_fmha=False,
-        dtype=None,
-        load_model=True,
-        use_lora_plugin=None,
-        lora_target_modules=None,
-        max_lora_rank=64,
-        max_num_tokens=None,
-        opt_num_tokens=None,
-        max_seq_len=512,
-        multiple_profiles=False,
-        gpt_attention_plugin="auto",
-        gemm_plugin="auto",
-        use_mcore_path=True,
-        reduce_fusion=True,
-        fp8_quantized=None,
-        fp8_kvcache=None,
-        gather_context_logits=True,
-        gather_generation_logits=True,
-        build_rank=None,
-    )
-
-    output = trt_llm_exporter.forward(
-        input_texts=["Tell me the capitol of France "],
-        max_output_len=16,
-        top_k=1,
-        top_p=0.0,
-        temperature=0.1,
-        stop_words_list=None,
-        bad_words_list=None,
-        no_repeat_ngram_size=None,
-        task_ids=None,
-        lora_uids=None,
-        prompt_embeddings_table=None,
-        prompt_embeddings_checkpoint_path=None,
-        streaming=False,
-        output_log_probs=False,
-        output_context_logits=False,
-        output_generation_logits=False,
-    )
-
-    print(output)
-
-    assert Path("/tmp/safe_tensor_test_2/trtllm_engine/").exists(), "Safe tensors were not generated."
-    assert Path(
-        "/tmp/safe_tensor_test_2/trtllm_engine/rank0.engine"
-    ).exists(), "Safe tensors for rank0 were not generated."
-    assert Path("/tmp/safe_tensor_test_2/trtllm_engine/config.json").exists(), "config.yaml was not generated."
-
-    shutil.rmtree("/tmp/safe_tensor_test_2/")
diff --git a/tests/export/utils/test_exp_utils.py b/tests/export/utils/test_exp_utils.py
deleted file mode 100644
index 66892ab7d402..000000000000
--- a/tests/export/utils/test_exp_utils.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import shutil
-import tempfile
-from unittest.mock import MagicMock
-
-import pytest
-import torch
-
-
-class TestUtils:
-    @pytest.fixture
-    def temp_dir(self):
-        # Create a temporary directory
-        temp_dir = tempfile.mkdtemp()
-        yield temp_dir
-        # Cleanup after test
-        shutil.rmtree(temp_dir)
-
-    @pytest.mark.run_only_on('GPU')
-    def test_is_nemo2_checkpoint(self, temp_dir):
-        from nemo.export.utils.utils import is_nemo2_checkpoint
-
-        # Test with non-existent path
-        assert not is_nemo2_checkpoint("/non/existent/path")
-
-        # Test with directory without context folder
-        os.makedirs(os.path.join(temp_dir, "no_context"))
-        assert not is_nemo2_checkpoint(os.path.join(temp_dir, "no_context"))
-
-        # Test with valid NeMo 2.0 checkpoint
-        os.makedirs(os.path.join(temp_dir, "valid_ckpt", "context"))
-        assert is_nemo2_checkpoint(os.path.join(temp_dir, "valid_ckpt"))
-
-    @pytest.mark.run_only_on('GPU')
-    def test_prepare_directory_for_export(self, temp_dir):
-        from nemo.export.utils.utils import prepare_directory_for_export
-
-        # Test creating new directory
-        model_dir = os.path.join(temp_dir, "new_dir")
-        prepare_directory_for_export(model_dir, delete_existing_files=False)
-        assert os.path.exists(model_dir)
-        assert os.path.isdir(model_dir)
-
-        # Test with existing empty directory
-        prepare_directory_for_export(model_dir, delete_existing_files=False)
-        assert os.path.exists(model_dir)
-
-        # Test with existing non-empty directory
-        with open(os.path.join(model_dir, "test.txt"), "w") as f:
-            f.write("test")
-
-        with pytest.raises(RuntimeError):
-            prepare_directory_for_export(model_dir, delete_existing_files=False)
-
-        # Test with delete_existing_files=True
-        prepare_directory_for_export(model_dir, delete_existing_files=True)
-        assert os.path.exists(model_dir)
-        assert not os.path.exists(os.path.join(model_dir, "test.txt"))
-
-        # Test with subdir
-        prepare_directory_for_export(model_dir, delete_existing_files=False, subdir="subdir")
-        assert os.path.exists(os.path.join(model_dir, "subdir"))
-
-    @pytest.mark.run_only_on('GPU')
-    def test_is_nemo_tarfile(self, temp_dir):
-        from nemo.export.utils.utils import is_nemo_tarfile
-
-        # Test with non-existent file
-        assert not is_nemo_tarfile("/non/existent/file.nemo")
-
-        # Test with non-nemo file
-        test_file = os.path.join(temp_dir, "test.txt")
-        with open(test_file, "w") as f:
-            f.write("test")
-        assert not is_nemo_tarfile(test_file)
-
-        # Test with .nemo file
-        nemo_file = os.path.join(temp_dir, "test.nemo")
-        with open(nemo_file, "w") as f:
-            f.write("test")
-        assert is_nemo_tarfile(nemo_file)
-
-    @pytest.mark.run_only_on('GPU')
-    def test_torch_dtype_from_precision(self):
-        from nemo.export.utils.utils import torch_dtype_from_precision
-
-        # Test with megatron_amp_O2=False
-        assert torch_dtype_from_precision("bf16", megatron_amp_O2=False) == torch.float32
-
-        # Test with different precision values
-        assert torch_dtype_from_precision("bf16") == torch.bfloat16
-        assert torch_dtype_from_precision("bf16-mixed") == torch.bfloat16
-        assert torch_dtype_from_precision(16) == torch.float16
-        assert torch_dtype_from_precision("16") == torch.float16
-        assert torch_dtype_from_precision("16-mixed") == torch.float16
-        assert torch_dtype_from_precision(32) == torch.float32
-        assert torch_dtype_from_precision("32") == torch.float32
-        assert torch_dtype_from_precision("32-true") == torch.float32
-
-        # Test with invalid precision
-        with pytest.raises(ValueError):
-            torch_dtype_from_precision("invalid")
-
-    @pytest.mark.run_only_on('GPU')
-    def test_get_example_inputs(self):
-        from nemo.export.utils.utils import get_example_inputs
-
-        # Mock tokenizer
-        mock_tokenizer = MagicMock()
-        mock_tokenizer.return_value = {
-            "input_ids": torch.tensor([[1, 2, 3], [4, 5, 6]]),
-            "attention_mask": torch.tensor([[1, 1, 1], [1, 1, 1]]),
-        }
-
-        result = get_example_inputs(mock_tokenizer)
-
-        # Verify tokenizer was called with correct arguments
-        mock_tokenizer.assert_called_once_with(
-            ["example query one", "example query two"],
-            ["example passage one", "example passage two"],
-            return_tensors="pt",
-        )
-
-        # Verify result structure
-        assert isinstance(result, dict)
-        assert "input_ids" in result
-        assert "attention_mask" in result
-        assert isinstance(result["input_ids"], torch.Tensor)
-        assert isinstance(result["attention_mask"], torch.Tensor)
diff --git a/tests/export/utils/test_lora_converter.py b/tests/export/utils/test_lora_converter.py
deleted file mode 100644
index c49825bc2d58..000000000000
--- a/tests/export/utils/test_lora_converter.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import pytest
-import torch
-
-
-@pytest.mark.run_only_on('GPU')
-def test_replace_number_add_offset():
-    from nemo.export.utils.lora_converter import replace_number_add_offset
-
-    # Test with no offset
-    key = "layers.0.self_attention.lora_kqv_adapter.linear_in.weight"
-    assert replace_number_add_offset(key, 0) == key
-
-    # Test with positive offset
-    assert replace_number_add_offset(key, 1) == "layers.1.self_attention.lora_kqv_adapter.linear_in.weight"
-
-    # Test with negative offset
-    assert replace_number_add_offset(key, -1) == "layers.-1.self_attention.lora_kqv_adapter.linear_in.weight"
-
-    # Test with key that doesn't contain layer number
-    key = "embedding.word_embeddings.weight"
-    assert replace_number_add_offset(key, 1) == key
-
-
-@pytest.mark.run_only_on('GPU')
-def test_rename_qkv_keys():
-    from nemo.export.utils.lora_converter import rename_qkv_keys
-
-    key = "layers.0.self_attention.lora_kqv_adapter.linear_in.weight"
-    new_keys = rename_qkv_keys(key)
-
-    assert len(new_keys) == 3
-    assert new_keys[0] == "layers.0.self_attention.lora_unfused_kqv_adapter.q_adapter.linear_in.weight"
-    assert new_keys[1] == "layers.0.self_attention.lora_unfused_kqv_adapter.k_adapter.linear_in.weight"
-    assert new_keys[2] == "layers.0.self_attention.lora_unfused_kqv_adapter.v_adapter.linear_in.weight"
-
-
-@pytest.mark.run_only_on('GPU')
-def test_reformat_module_names_to_hf():
-    from nemo.export.utils.lora_converter import reformat_module_names_to_hf
-
-    # Create sample tensors with NeMo-style names
-    tensors = {
-        "q_adapter.linear_in.weight": torch.randn(10, 10),
-        "k_adapter.linear_out.weight": torch.randn(10, 10),
-        "v_adapter.linear_in.weight": torch.randn(10, 10),
-        "lora_dense_attention_adapter.linear_out.weight": torch.randn(10, 10),
-        "lora_4htoh_adapter.linear_in.weight": torch.randn(10, 10),
-        "gate_adapter.linear_out.weight": torch.randn(10, 10),
-        "up_adapter.linear_in.weight": torch.randn(10, 10),
-    }
-
-    new_tensors, module_names = reformat_module_names_to_hf(tensors)
-
-    # Check that all tensors were converted
-    assert len(new_tensors) == len(tensors)
-
-    # Check that module names were correctly identified
-    expected_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "gate_proj", "up_proj"]
-    assert set(module_names) == set(expected_modules)
-
-    # Check some specific conversions
-    assert "base_model.q_proj.lora_A.weight" in new_tensors
-    assert "base_model.k_proj.lora_B.weight" in new_tensors
-    assert "base_model.v_proj.lora_A.weight" in new_tensors
-
-
-@pytest.mark.run_only_on('GPU')
-def test_convert_lora_weights_to_canonical():
-    from nemo.export.utils.lora_converter import convert_lora_weights_to_canonical
-
-    # Create a sample config
-    config = {
-        "hidden_size": 512,
-        "num_attention_heads": 8,
-        "num_query_groups": 4,
-        "peft": {"lora_tuning": {"adapter_dim": 16}},
-    }
-
-    # Create sample fused QKV weights
-    lora_weights = {
-        "layers.0.self_attention.lora_kqv_adapter.linear_in.weight": torch.randn(16, 1024),
-        "layers.0.self_attention.lora_kqv_adapter.linear_out.weight": torch.randn(1024, 16),
-        "layers.0.lora_hto4h_adapter.linear_in.weight": torch.randn(16, 1024),
-        "layers.0.lora_hto4h_adapter.linear_out.weight": torch.randn(2048, 16),
-    }
-
-    converted_weights = convert_lora_weights_to_canonical(config, lora_weights)
-
-    # Check that QKV weights were unfused
-    assert "layers.0.self_attention.lora_unfused_kqv_adapter.q_adapter.linear_in.weight" in converted_weights
-    assert "layers.0.self_attention.lora_unfused_kqv_adapter.k_adapter.linear_in.weight" in converted_weights
-    assert "layers.0.self_attention.lora_unfused_kqv_adapter.v_adapter.linear_in.weight" in converted_weights
-
-    # Check that H-to-4H weights were unfused
-    assert "layers.0.lora_unfused_hto4h_adapter.gate_adapter.linear_in.weight" in converted_weights
-    assert "layers.0.lora_unfused_hto4h_adapter.up_adapter.linear_in.weight" in converted_weights
diff --git a/tests/export/utils/test_model_loader.py b/tests/export/utils/test_model_loader.py
deleted file mode 100644
index a7ef30dc4317..000000000000
--- a/tests/export/utils/test_model_loader.py
+++ /dev/null
@@ -1,127 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import json
-import shutil
-from pathlib import Path
-from unittest.mock import MagicMock, patch
-
-import numpy as np
-import pytest
-import torch
-
-from nemo.export.utils.model_loader import (
-    TarFileSystemReader,
-    load_model_weights,
-    load_sharded_metadata_zarr,
-    nemo_to_path,
-    nemo_weights_directory,
-)
-
-
-@pytest.fixture
-def mock_checkpoint_dir(tmp_path):
-    # Create a temporary directory structure mimicking a NeMo checkpoint
-    weights_dir = tmp_path / "model_weights"
-    weights_dir.mkdir()
-
-    # Create metadata.json
-    metadata = {"sharded_backend": "torch_dist"}
-    with open(weights_dir / "metadata.json", "w") as f:
-        json.dump(metadata, f)
-
-    return tmp_path
-
-
-def test_nemo_to_path():
-    # Test directory path
-    dir_path = "/path/to/checkpoint"
-    with patch("os.path.isdir", return_value=True):
-        result = nemo_to_path(dir_path)
-        assert isinstance(result, Path)
-        assert str(result) == dir_path
-
-
-def test_tar_file_system_reader():
-    path = Path("/some/path")
-    reader = TarFileSystemReader(path)
-    assert reader.path == path
-
-
-@patch("zarr.open")
-def test_load_sharded_metadata_zarr(mock_zarr_open):
-    checkpoint_dir = MagicMock()
-
-    # Mock directory structure
-    subdir = MagicMock()
-    subdir.name = "test_tensor"
-    subdir.is_dir.return_value = True
-    subdir.__truediv__.return_value.exists.return_value = True
-    checkpoint_dir.iterdir.return_value = [subdir]
-
-    # Mock zarr array
-    mock_array = MagicMock()
-    mock_array.dtype.name = "float32"
-    mock_array.__getitem__.return_value = np.array([1.0])
-    mock_zarr_open.return_value = mock_array
-
-    state_dict = load_sharded_metadata_zarr(checkpoint_dir)
-    assert "test_tensor" in state_dict
-    assert isinstance(state_dict["test_tensor"], torch.Tensor)
-
-
-def test_nemo_weights_directory(mock_checkpoint_dir):
-    # Test model_weights directory
-    result = nemo_weights_directory(mock_checkpoint_dir)
-    assert result == mock_checkpoint_dir / "model_weights"
-
-    # Test weights directory
-    shutil.rmtree(mock_checkpoint_dir / "model_weights")
-    weights_dir = mock_checkpoint_dir / "weights"
-    weights_dir.mkdir()
-    result = nemo_weights_directory(mock_checkpoint_dir)
-    assert result == weights_dir
-
-    # Test fallback to checkpoint directory
-    weights_dir.rmdir()
-    result = nemo_weights_directory(mock_checkpoint_dir)
-    assert result == mock_checkpoint_dir
-
-
-@patch("nemo.export.utils.model_loader.load_sharded_metadata_zarr")
-@patch("nemo.export.utils.model_loader.load_sharded_metadata_torch_dist")
-def test_load_model_weights(mock_torch_dist, mock_zarr, mock_checkpoint_dir):
-    # Test torch_dist backend
-    load_model_weights(mock_checkpoint_dir)
-    mock_torch_dist.assert_called_once()
-    mock_zarr.assert_not_called()
-
-    # Test zarr backend
-    mock_torch_dist.reset_mock()
-    metadata = {"sharded_backend": "zarr"}
-    with open(mock_checkpoint_dir / "model_weights" / "metadata.json", "w") as f:
-        json.dump(metadata, f)
-
-    load_model_weights(mock_checkpoint_dir)
-    mock_zarr.assert_called_once()
-    mock_torch_dist.assert_not_called()
-
-    # Test unsupported backend
-    metadata = {"sharded_backend": "unsupported"}
-    with open(mock_checkpoint_dir / "model_weights" / "metadata.json", "w") as f:
-        json.dump(metadata, f)
-
-    with pytest.raises(NotImplementedError):
-        load_model_weights(mock_checkpoint_dir)
diff --git a/tests/functional_tests/L0_Unit_Tests_CPU_Export_Deploy.sh b/tests/functional_tests/L0_Unit_Tests_CPU_Export_Deploy.sh
deleted file mode 100644
index a3176d34b99f..000000000000
--- a/tests/functional_tests/L0_Unit_Tests_CPU_Export_Deploy.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-CUDA_VISIBLE_DEVICES="" NEMO_NUMBA_MINVER=0.53 coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/deploy tests/export -m "not pleasefixme" --cpu --with_downloads --relax_numba_compat
diff --git a/tests/functional_tests/L0_Unit_Tests_GPU_Export_Deploy.sh b/tests/functional_tests/L0_Unit_Tests_GPU_Export_Deploy.sh
deleted file mode 100644
index ab05fc74c90a..000000000000
--- a/tests/functional_tests/L0_Unit_Tests_GPU_Export_Deploy.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-NEMO_NUMBA_MINVER=0.53 CUDA_VISIBLE_DEVICES=0,1 coverage run -a --data-file=/workspace/.coverage --source=/workspace/ -m pytest tests/deploy tests/export -m "not pleasefixme" --with_downloads
diff --git a/tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_In_Framework.sh b/tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_In_Framework.sh
deleted file mode 100644
index 74c5ce8d24c7..000000000000
--- a/tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_In_Framework.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/collections/llm/test_hf_import.py \
-  --hf_model /home/TestData/nlp/megatron_llama/llama-ci-hf \
-  --output_path /tmp/nemo2_ckpt
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo scripts/deploy/nlp/deploy_inframework_triton.py \
-  --nemo_checkpoint /tmp/nemo2_ckpt \
-  --triton_model_name llama \
-  --num_gpus 2 \
-  --tensor_parallelism_size 2 &
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo scripts/deploy/nlp/query_inframework.py \
-  --model_name llama \
-  --prompt "What is the color of a banana?" \
-  --max_output_len 20

From 7b55883b220d06f26c5d7f11dd813c4361d22f00 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 13:12:21 -0800
Subject: [PATCH 03/15] remove exportDeploy tests

Signed-off-by: Pablo Garay 
---
 .../functional_tests/L2_NEMO_2_LoRA_Export.sh | 16 ---------
 .../L2_NeMo_2_Export_HF_TRT_LLM.sh            | 23 ------------
 .../L2_NeMo_2_Export_In_Framework.sh          | 31 ----------------
 .../L2_NeMo_2_Export_Qnemo_TRT_LLM.sh         | 36 -------------------
 .../L2_NeMo_2_Export_TRT_LLM.sh               | 21 -----------
 .../L2_NeMo_2_PTQ_Unified_Export.sh           | 17 ---------
 .../L2_NeMo_2_export_ckpt_Llama2_FP8_nemo.sh  | 17 ---------
 .../L2_NeMo_2_vLLM_Export_Llama.sh            | 28 ---------------
 .../L2_NeMo_2_vLLM_Export_Mixtral.sh          | 28 ---------------
 .../L2_ONNX_TRT_LLM_Embedding_Export.sh       | 15 --------
 10 files changed, 232 deletions(-)
 delete mode 100644 tests/functional_tests/L2_NEMO_2_LoRA_Export.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_Export_HF_TRT_LLM.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_Export_In_Framework.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_Export_Qnemo_TRT_LLM.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_Export_TRT_LLM.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_PTQ_Unified_Export.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_export_ckpt_Llama2_FP8_nemo.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_vLLM_Export_Llama.sh
 delete mode 100644 tests/functional_tests/L2_NeMo_2_vLLM_Export_Mixtral.sh
 delete mode 100644 tests/functional_tests/L2_ONNX_TRT_LLM_Embedding_Export.sh

diff --git a/tests/functional_tests/L2_NEMO_2_LoRA_Export.sh b/tests/functional_tests/L2_NEMO_2_LoRA_Export.sh
deleted file mode 100644
index f76f32f54d40..000000000000
--- a/tests/functional_tests/L2_NEMO_2_LoRA_Export.sh
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/collections/llm/peft/lora_export.py \
-    --lora_checkpoint_path=/home/TestData/nemo2_ckpt/llama_lora_ci_checkpoint_v4/ \
-    --output_path=/tmp/nemo2_lora_merge/$RUN_ID
diff --git a/tests/functional_tests/L2_NeMo_2_Export_HF_TRT_LLM.sh b/tests/functional_tests/L2_NeMo_2_Export_HF_TRT_LLM.sh
deleted file mode 100644
index 854ec17c256f..000000000000
--- a/tests/functional_tests/L2_NeMo_2_Export_HF_TRT_LLM.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/export/nemo_export.py \
-  --model_name test \
-  --model_dir /tmp/trt_llm_model_dir/ \
-  --model_type LlamaForCausalLM \
-  --use_huggingface True \
-  --checkpoint_dir /home/TestData/llm/models/llama3.2-1B-hf/ \
-  --min_tps 1 \
-  --test_deployment True \
-  --debug
diff --git a/tests/functional_tests/L2_NeMo_2_Export_In_Framework.sh b/tests/functional_tests/L2_NeMo_2_Export_In_Framework.sh
deleted file mode 100644
index d7f63cef7790..000000000000
--- a/tests/functional_tests/L2_NeMo_2_Export_In_Framework.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/collections/llm/test_hf_import.py \
-  --hf_model /home/TestData/nlp/megatron_llama/llama-ci-hf \
-  --output_path /tmp/nemo2_ckpt
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/setup/data/create_sample_lambada.py \
-  --output_file /tmp/lambada.json
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/export/nemo_export.py \
-  --model_name test \
-  --model_type llama \
-  --checkpoint_dir /tmp/nemo2_ckpt \
-  --min_tps 1 \
-  --in_framework True \
-  --test_deployment True \
-  --run_accuracy True \
-  --test_data_path /tmp/lambada.json \
-  --accuracy_threshold 0.0 \
-  --debug
diff --git a/tests/functional_tests/L2_NeMo_2_Export_Qnemo_TRT_LLM.sh b/tests/functional_tests/L2_NeMo_2_Export_Qnemo_TRT_LLM.sh
deleted file mode 100644
index 8847141380ba..000000000000
--- a/tests/functional_tests/L2_NeMo_2_Export_Qnemo_TRT_LLM.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/setup/models/create_hf_model.py \
-  --model_name_or_path /home/TestData/hf/Llama-2-7b-hf \
-  --output_dir /tmp/llama_tiny_hf \
-  --config_updates "{\"num_hidden_layers\": 2, \"hidden_size\": 512, \"intermediate_size\": 384, \"num_attention_heads\": 8, \"num_key_value_heads\": 8}"
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/collections/llm/test_hf_import.py \
-  --hf_model /tmp/llama_tiny_hf \
-  --output_path /tmp/nemo2_ckpt
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo scripts/llm/ptq.py \
-  -nc /tmp/nemo2_ckpt \
-  -algo int8_sq \
-  -out /tmp/nemo2_ptq \
-  --export_format trtllm
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/export/nemo_export.py \
-  --model_name test \
-  --model_dir /tmp/trt_llm_model_dir/ \
-  --checkpoint_dir /tmp/nemo2_ptq \
-  --min_tps 1 \
-  --test_deployment True \
-  --debug
diff --git a/tests/functional_tests/L2_NeMo_2_Export_TRT_LLM.sh b/tests/functional_tests/L2_NeMo_2_Export_TRT_LLM.sh
deleted file mode 100644
index 51bf95d9a6be..000000000000
--- a/tests/functional_tests/L2_NeMo_2_Export_TRT_LLM.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/export/nemo_export.py \
-  --model_name test \
-  --model_dir /tmp/llama32_1b_nemo2_trt \
-  --model_type llama \
-  --checkpoint_dir /home/TestData/llm/models/llama32_1b_nemo2 \
-  --min_tps 1 \
-  --test_deployment True \
-  --debug
diff --git a/tests/functional_tests/L2_NeMo_2_PTQ_Unified_Export.sh b/tests/functional_tests/L2_NeMo_2_PTQ_Unified_Export.sh
deleted file mode 100644
index 23a2c54365ef..000000000000
--- a/tests/functional_tests/L2_NeMo_2_PTQ_Unified_Export.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/collections/llm/test_hf_import.py --hf_model /home/TestData/nlp/megatron_llama/llama-ci-hf --output_path /tmp/nemo2_ckpt
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo scripts/llm/ptq.py -nc /tmp/nemo2_ckpt -algo fp8 -out /tmp/nemo2_unified_ptq --export_format hf --legacy_ckpt --generate_sample
diff --git a/tests/functional_tests/L2_NeMo_2_export_ckpt_Llama2_FP8_nemo.sh b/tests/functional_tests/L2_NeMo_2_export_ckpt_Llama2_FP8_nemo.sh
deleted file mode 100644
index 09dd8d292337..000000000000
--- a/tests/functional_tests/L2_NeMo_2_export_ckpt_Llama2_FP8_nemo.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo scripts/llm/ptq.py -nc /home/TestData/llm/models/llama32_1b_nemo2 -algo fp8 -out /tmp/nemo2_ptq_ckpt --export_format nemo --legacy_ckpt
-
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo $(which nemo) llm export path="/tmp/nemo2_ptq_ckpt" target="hf" output_path="/tmp/nemo2_hf_ckpt" overwrite=false -y
diff --git a/tests/functional_tests/L2_NeMo_2_vLLM_Export_Llama.sh b/tests/functional_tests/L2_NeMo_2_vLLM_Export_Llama.sh
deleted file mode 100644
index a5d6ce78fe64..000000000000
--- a/tests/functional_tests/L2_NeMo_2_vLLM_Export_Llama.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/setup/models/create_hf_model.py \
-  --model_name_or_path /home/TestData/nlp/megatron_llama/llama-ci-hf \
-  --output_dir /tmp/llama_head64 \
-  --config_updates "{\"hidden_size\": 512, \"num_attention_heads\": 4, \"num_key_value_heads\": 4, \"intermediate_size\": 1024, \"head_dim\": 128, \"num_hidden_layers\": 2, \"torch_dtype\": \"float16\" }" &&
-  coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/collections/llm/test_hf_import.py --hf_model /tmp/llama_head64 --output_path /tmp/nemo2_ckpt &&
-  /opt/venv/bin/coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/export/nemo_export.py \
-    --min_tps 1 \
-    --max_tps 1 \
-    --use_vllm True \
-    --model_type llama \
-    --max_output_len 128 \
-    --test_deployment True \
-    --model_name nemo2_ckpt \
-    --model_dir /tmp/vllm_from_nemo2 \
-    --checkpoint_dir /tmp/nemo2_ckpt
diff --git a/tests/functional_tests/L2_NeMo_2_vLLM_Export_Mixtral.sh b/tests/functional_tests/L2_NeMo_2_vLLM_Export_Mixtral.sh
deleted file mode 100644
index b3b3522ab081..000000000000
--- a/tests/functional_tests/L2_NeMo_2_vLLM_Export_Mixtral.sh
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/setup/models/create_hf_model.py \
-  --model_name_or_path /home/TestData/hf/Mixtral-8x7B-Instruct-v0.1 \
-  --output_dir /tmp/mixtral_tiny_hf \
-  --config_updates "{\"num_hidden_layers\": 2, \"hidden_size\": 128, \"intermediate_size\": 448, \"num_attention_heads\": 4, \"num_key_value_heads\": 2, \"head_dim\": 32, \"num_local_experts\": 4}" &&
-  coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/collections/llm/test_hf_import.py --hf_model /tmp/mixtral_tiny_hf --model MixtralModel --config MixtralConfig8x7B --output_path /tmp/mixtral_tiny_nemo2 &&
-  /opt/venv/bin/coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/export/nemo_export.py \
-    --min_tps 1 \
-    --max_tps 1 \
-    --use_vllm True \
-    --model_type mixtral \
-    --max_output_len 128 \
-    --test_deployment True \
-    --model_name nemo2_ckpt \
-    --model_dir /tmp/vllm_from_nemo2 \
-    --checkpoint_dir /tmp/mixtral_tiny_nemo2
diff --git a/tests/functional_tests/L2_ONNX_TRT_LLM_Embedding_Export.sh b/tests/functional_tests/L2_ONNX_TRT_LLM_Embedding_Export.sh
deleted file mode 100644
index 3bde39233e8c..000000000000
--- a/tests/functional_tests/L2_ONNX_TRT_LLM_Embedding_Export.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo tests/export/test_export_onnx.py \
-    --hf_model_path /home/TestData/llm/models/llama-3.2-nv-embedqa-1b-v2
\ No newline at end of file

From 1e8825c8d6be31af2245a66c26fab865248dec55 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 13:14:34 -0800
Subject: [PATCH 04/15] remove references

Signed-off-by: Pablo Garay 
---
 docker/common/install_dep.sh                  |  3 +-
 examples/llm/finetune/automodel_vllm.py       | 19 ++++++++++--
 .../common/video_tokenizers/README.md         |  6 ++++
 .../common/video_tokenizers/cosmos_trt_run.py |  5 ++++
 nemo/deploy/__init__.py                       | 23 ++++++++++++++
 nemo/deploy/deploy_pytriton.py                | 30 +++++++++++++++++++
 nemo/deploy/nlp/__init__.py                   | 30 +++++++++++++++++++
 nemo/export/__init__.py                       | 23 ++++++++++++++
 nemo/export/tensorrt_lazy_compiler.py         | 28 +++++++++++++++++
 nemo/export/trt_llm/__init__.py               | 19 ++++++++++++
 .../trt_llm/nemo_ckpt_loader/__init__.py      | 19 ++++++++++++
 .../trt_llm/nemo_ckpt_loader/nemo_file.py     | 28 +++++++++++++++++
 nemo/export/vllm_hf_exporter.py               | 30 +++++++++++++++++++
 requirements/requirements_deploy.txt          |  8 -----
 requirements/requirements_vllm.txt            |  5 ++--
 setup.py                                      | 11 +------
 16 files changed, 263 insertions(+), 24 deletions(-)
 create mode 100644 nemo/deploy/__init__.py
 create mode 100644 nemo/deploy/deploy_pytriton.py
 create mode 100644 nemo/deploy/nlp/__init__.py
 create mode 100644 nemo/export/__init__.py
 create mode 100644 nemo/export/tensorrt_lazy_compiler.py
 create mode 100644 nemo/export/trt_llm/__init__.py
 create mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
 create mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
 create mode 100644 nemo/export/vllm_hf_exporter.py
 delete mode 100644 requirements/requirements_deploy.txt

diff --git a/docker/common/install_dep.sh b/docker/common/install_dep.sh
index 87e607de3594..8675714b7168 100755
--- a/docker/common/install_dep.sh
+++ b/docker/common/install_dep.sh
@@ -279,8 +279,7 @@ vllm() {
       $INSTALL_DIR/venv/bin/pip install --no-cache-dir setuptools coverage
       $INSTALL_DIR/venv/bin/pip wheel --no-cache-dir --no-build-isolation \
         --wheel-dir $WHEELS_DIR/ \
-        -r $CURR/requirements/requirements_vllm.txt \
-        -r $CURR/requirements/requirements_deploy.txt
+        -r $CURR/requirements/requirements_vllm.txt
     fi
   }
 
diff --git a/examples/llm/finetune/automodel_vllm.py b/examples/llm/finetune/automodel_vllm.py
index c918dc663aab..8314c4968949 100644
--- a/examples/llm/finetune/automodel_vllm.py
+++ b/examples/llm/finetune/automodel_vllm.py
@@ -12,12 +12,27 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""
+Example: Export vLLM HF model with optional LoRA and Triton deployment.
+
+Note: This example requires the Export-Deploy repository:
+  pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git
+
+Additionally, vLLM must be installed or activated in your environment:
+  source /opt/venv/bin/activate
+"""
+
 import argparse
 
 try:
     from nemo.export.vllm_hf_exporter import vLLMHFExporter
-except ImportError:
-    raise Exception("vLLM must be installed or activated in your environment:\n" "  source /opt/venv/bin/activate")
+except ImportError as e:
+    raise ImportError(
+        "This example requires the Export-Deploy repository.\n"
+        "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git\n"
+        "Additionally, vLLM must be installed or activated:\n"
+        "  source /opt/venv/bin/activate"
+    ) from e
 
 from nemo.deploy import DeployPyTriton
 from nemo.deploy.nlp import NemoQueryLLM
diff --git a/nemo/collections/common/video_tokenizers/README.md b/nemo/collections/common/video_tokenizers/README.md
index 72e94cdb0208..02c8d04ee5b4 100644
--- a/nemo/collections/common/video_tokenizers/README.md
+++ b/nemo/collections/common/video_tokenizers/README.md
@@ -28,6 +28,12 @@ Please see the official [NVIDIA Cosmos repository](https://github.com/NVIDIA/Cos
 for the complete list of supported tokenizers.
 
 ### Acceleration with TensorRT
+
+**Note:** TensorRT acceleration requires the Export-Deploy repository:
+```bash
+pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git
+```
+
 To use these tokenizers with TensorRT and acheive up to 3X speedup during tokenization,
 users can define a lightweight wrapper model and then pass this wrapper model to `trt_compile`
 ```python
diff --git a/nemo/collections/common/video_tokenizers/cosmos_trt_run.py b/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
index c046d91b41cc..6602256aabf0 100644
--- a/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
+++ b/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
@@ -12,6 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+"""
+Note: This script requires the Export-Deploy repository for TensorRT compilation.
+Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git
+"""
+
 import argparse
 import os
 import shutil
diff --git a/nemo/deploy/__init__.py b/nemo/deploy/__init__.py
new file mode 100644
index 000000000000..0bd6c3d82b44
--- /dev/null
+++ b/nemo/deploy/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+warnings.warn(
+    "The 'nemo.deploy' module has been moved to a separate repository. "
+    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py
new file mode 100644
index 000000000000..b5f973158168
--- /dev/null
+++ b/nemo/deploy/deploy_pytriton.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Stub module for PyTriton deployment.
+
+This module has been moved to the Export-Deploy repository.
+"""
+
+
+class DeployPyTriton:
+    """Stub class that raises an error directing users to the Export-Deploy repository."""
+
+    def __init__(self, *args, **kwargs):
+        raise ImportError(
+            "The 'DeployPyTriton' class has been moved to a separate repository. "
+            "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
+            "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
+        )
+
diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py
new file mode 100644
index 000000000000..308c134c4b33
--- /dev/null
+++ b/nemo/deploy/nlp/__init__.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Stub module for NLP deployment.
+
+This module has been moved to the Export-Deploy repository.
+"""
+
+
+class NemoQueryLLM:
+    """Stub class that raises an error directing users to the Export-Deploy repository."""
+
+    def __init__(self, *args, **kwargs):
+        raise ImportError(
+            "The 'NemoQueryLLM' class has been moved to a separate repository. "
+            "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
+            "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
+        )
+
diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
new file mode 100644
index 000000000000..f5be0ea6e2bc
--- /dev/null
+++ b/nemo/export/__init__.py
@@ -0,0 +1,23 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import warnings
+
+warnings.warn(
+    "The 'nemo.export' module has been moved to a separate repository. "
+    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy",
+    DeprecationWarning,
+    stacklevel=2,
+)
+
diff --git a/nemo/export/tensorrt_lazy_compiler.py b/nemo/export/tensorrt_lazy_compiler.py
new file mode 100644
index 000000000000..651a2ed4aefe
--- /dev/null
+++ b/nemo/export/tensorrt_lazy_compiler.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Stub module for TensorRT lazy compiler.
+
+This module has been moved to the Export-Deploy repository.
+"""
+
+
+def trt_compile(*args, **kwargs):
+    """Stub function that raises an error directing users to the Export-Deploy repository."""
+    raise ImportError(
+        "The 'trt_compile' function has been moved to a separate repository. "
+        "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
+        "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
+    )
+
diff --git a/nemo/export/trt_llm/__init__.py b/nemo/export/trt_llm/__init__.py
new file mode 100644
index 000000000000..67052757bd00
--- /dev/null
+++ b/nemo/export/trt_llm/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+raise ImportError(
+    "The 'nemo.export.trt_llm' module has been moved to a separate repository. "
+    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy"
+)
+
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
new file mode 100644
index 000000000000..bd9b221bbb79
--- /dev/null
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+raise ImportError(
+    "The 'nemo.export.trt_llm.nemo_ckpt_loader' module has been moved to a separate repository. "
+    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy"
+)
+
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
new file mode 100644
index 000000000000..b3661ee4f4a0
--- /dev/null
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Stub module for load_distributed_model_weights and related functions.
+
+This module has been moved to the Export-Deploy repository.
+"""
+
+
+def load_distributed_model_weights(*args, **kwargs):
+    """Stub function that raises an error directing users to the Export-Deploy repository."""
+    raise ImportError(
+        "The 'load_distributed_model_weights' function has been moved to a separate repository. "
+        "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
+        "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
+    )
+
diff --git a/nemo/export/vllm_hf_exporter.py b/nemo/export/vllm_hf_exporter.py
new file mode 100644
index 000000000000..d70df465e3a9
--- /dev/null
+++ b/nemo/export/vllm_hf_exporter.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Stub module for vLLM HF exporter.
+
+This module has been moved to the Export-Deploy repository.
+"""
+
+
+class vLLMHFExporter:
+    """Stub class that raises an error directing users to the Export-Deploy repository."""
+
+    def __init__(self, *args, **kwargs):
+        raise ImportError(
+            "The 'vLLMHFExporter' class has been moved to a separate repository. "
+            "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
+            "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
+        )
+
diff --git a/requirements/requirements_deploy.txt b/requirements/requirements_deploy.txt
deleted file mode 100644
index 46c26f8915b6..000000000000
--- a/requirements/requirements_deploy.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-accelerate
-fastapi
-nvidia-pytriton ; platform_system != 'Darwin'
-nvtx
-pydantic-settings
-tensorstore<0.1.72; platform_system != 'Darwin'
-uvicorn
-zarr>=2.18.2,<3.0.0
diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt
index 607119038932..f6edafaa9dce 100644
--- a/requirements/requirements_vllm.txt
+++ b/requirements/requirements_vllm.txt
@@ -1,7 +1,8 @@
 # Minimal set of NeMo requirements to run vLLM export & deployment in /opt/venv in a NeMo container
+# Note: vLLM export & deployment functionality has been moved to the Export-Deploy repository:
+# https://github.com/NVIDIA-NeMo/Export-Deploy
 braceexpand
-# datasets and pandas import are triggered by hydra.utils.instantiate in nemo/export/vllm/model_config.py.
-# TODO: remove those dependencies by switching to local nemo.export tokenizers.
+# datasets and pandas imports are triggered by hydra.utils.instantiate
 datasets
 faiss-cpu
 fiddle
diff --git a/setup.py b/setup.py
index 63cf819ea201..b8502d71ce48 100644
--- a/setup.py
+++ b/setup.py
@@ -82,11 +82,10 @@ def req_file(filename, folder="requirements"):
     'slu': req_file("requirements_slu.txt"),
     'multimodal-only': req_file("requirements_multimodal.txt"),
     'audio': req_file("requirements_audio.txt"),
-    'deploy': req_file("requirements_deploy.txt"),
 }
 
 
-extras_require['all'] = list(chain(val for key, val in extras_require.items() if key != 'deploy'))
+extras_require['all'] = list(chain(val for key, val in extras_require.items()))
 
 # Add lightning requirements as needed
 extras_require['common'] = extras_require['common-only']
@@ -147,14 +146,6 @@ def req_file(filename, folder="requirements"):
         extras_require['asr'],
     )
 )
-extras_require['deploy'] = list(
-    chain(
-        extras_require['nlp'],
-        extras_require['multimodal'],
-        extras_require['tts'],
-        extras_require['deploy'],
-    )
-)
 
 
 ###############################################################################

From 60c161b850ae4d6c3d93b7d55a9921e3ce6c7612 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 13:20:24 -0800
Subject: [PATCH 05/15] lintfix

Signed-off-by: Pablo Garay 
---
 nemo/collections/common/video_tokenizers/cosmos_trt_run.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/nemo/collections/common/video_tokenizers/cosmos_trt_run.py b/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
index 6602256aabf0..c4f1bf73dde1 100644
--- a/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
+++ b/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
@@ -50,14 +50,18 @@
 
 
 def main():
+    """Export and run tokenizer in TensorRT."""
     model = CausalVideoTokenizer.from_pretrained(args.tokenizer_name, use_pytorch=True, dtype="float")
 
     class VaeWrapper(torch.nn.Module):
+        """Wrapper class for VAE model to enable TensorRT compilation."""
+
         def __init__(self, vae):
             super().__init__()
             self.vae = vae
 
         def forward(self, input_tensor):
+            """Forward pass through the VAE autoencoder."""
             output_tensor = self.vae.autoencode(input_tensor)
             return output_tensor
 
@@ -91,7 +95,7 @@ def forward(self, input_tensor):
     )
 
     input_tensor = torch.randn(max_shape).to('cuda').to(torch.float)
-    output = model_wrapper(input_tensor)
+    _ = model_wrapper(input_tensor)  # Warmup call to ensure TensorRT engine is compiled
 
 
 if __name__ == '__main__':

From b87afcda733e730bdbd4b14fb773020b5c833602 Mon Sep 17 00:00:00 2001
From: Taejin Park 
Date: Thu, 6 Nov 2025 08:33:30 -0800
Subject: [PATCH 06/15] Fixing lines for multispeaker pipeline (#15030)

* Fixing lines for multispeaker pipeline

Signed-off-by: taejinp 

* Removing unused imports

Signed-off-by: taejinp 

* Apply isort and black reformatting

Signed-off-by: tango4j 

* Making changes for HF Space deployment

Signed-off-by: taejinp 

* Apply isort and black reformatting

Signed-off-by: chtruong814 

* Updated multispk trans utils.

Signed-off-by: taejinp 

---------

Signed-off-by: taejinp 
Signed-off-by: tango4j 
Signed-off-by: chtruong814 
Co-authored-by: tango4j 
Co-authored-by: chtruong814 
Signed-off-by: Pablo Garay 
---
 .../asr/models/sortformer_diar_models.py      |   7 +-
 .../parts/utils/multispk_transcribe_utils.py  | 159 +++++++++---------
 2 files changed, 84 insertions(+), 82 deletions(-)

diff --git a/nemo/collections/asr/models/sortformer_diar_models.py b/nemo/collections/asr/models/sortformer_diar_models.py
index 75035dba0617..0a1eece7dbb5 100644
--- a/nemo/collections/asr/models/sortformer_diar_models.py
+++ b/nemo/collections/asr/models/sortformer_diar_models.py
@@ -112,8 +112,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None):
             self.sortformer_modules.encoder_proj = None
         self._init_loss_weights()
 
-        self.eps = 1e-3
-        self.negative_init_val = -99
+        self.eps = self._cfg.get("eps", 1e-3)
+        self.negative_init_val = self._cfg.get("negative_init_val", -99)
         self.loss = instantiate(self._cfg.loss)
 
         self.async_streaming = self._cfg.get("async_streaming", False)
@@ -832,6 +832,7 @@ def _get_aux_train_evaluations(self, preds, targets, target_lens) -> dict:
         Returns:
             (dict): A dictionary containing the following training metrics.
         """
+        targets = targets.to(preds.dtype)
         if preds.shape[1] < targets.shape[1]:
             logging.info(
                 f"WARNING! preds has less frames than targets ({preds.shape[1]} < {targets.shape[1]}). "
@@ -904,6 +905,7 @@ def _get_aux_validation_evaluations(self, preds, targets, target_lens) -> dict:
         Returns:
             val_metrics (dict): A dictionary containing the following validation metrics
         """
+        targets = targets.to(preds.dtype)
         if preds.shape[1] < targets.shape[1]:
             logging.info(
                 f"WARNING! preds has less frames than targets ({preds.shape[1]} < {targets.shape[1]}). "
@@ -1035,6 +1037,7 @@ def _get_aux_test_batch_evaluations(self, batch_idx: int, preds, targets, target
             target_lens (torch.Tensor): Lengths of target sequences.
                 Shape: (batch_size,)
         """
+        targets = targets.to(preds.dtype)
         if preds.shape[1] < targets.shape[1]:
             logging.info(
                 f"WARNING! preds has less frames than targets ({preds.shape[1]} < {targets.shape[1]}). "
diff --git a/nemo/collections/asr/parts/utils/multispk_transcribe_utils.py b/nemo/collections/asr/parts/utils/multispk_transcribe_utils.py
index 4168e49c9734..7b88361930d8 100644
--- a/nemo/collections/asr/parts/utils/multispk_transcribe_utils.py
+++ b/nemo/collections/asr/parts/utils/multispk_transcribe_utils.py
@@ -29,16 +29,10 @@
 from nemo.collections.asr.data.audio_to_diar_label import extract_frame_info_from_rttm, get_frame_targets_from_rttm
 from nemo.collections.asr.models.sortformer_diar_models import SortformerEncLabelModel
 from nemo.collections.asr.modules.sortformer_modules import StreamingSortformerState
-from nemo.collections.asr.parts.utils.diarization_utils import (
-    OnlineEvaluation,
-    get_color_palette,
-    print_sentences,
-    read_seglst,
-    write_txt,
-)
+from nemo.collections.asr.parts.utils.diarization_utils import get_color_palette, print_sentences, write_txt
 from nemo.collections.asr.parts.utils.rnnt_utils import Hypothesis
 from nemo.collections.asr.parts.utils.speaker_utils import audio_rttm_map as get_audio_rttm_map
-from nemo.collections.asr.parts.utils.speaker_utils import get_uniqname_from_filepath, rttm_to_labels
+from nemo.collections.asr.parts.utils.speaker_utils import get_uniqname_from_filepath
 from nemo.utils import logging
 
 
@@ -100,7 +94,7 @@ def add_delay_for_real_time(
         loop_end_time (float): The loop end time in seconds.
         loop_start_time (float): The loop start time in seconds.
     """
-    time_diff = max(0, (time.time() - session_start_time) - feat_frame_count * cfg.feat_len_sec)
+    time_diff = max(0, (time.time() - session_start_time) - feat_frame_count * cfg.get("feat_len_sec", 0.01))
     eta_min_sec = format_time(time.time() - session_start_time)
     logging.info(
         f"[   REAL TIME MODE   ] min:sec - {eta_min_sec} "
@@ -109,9 +103,9 @@ def add_delay_for_real_time(
     time.sleep(
         max(
             0,
-            (chunk_audio.shape[-1] - cfg.discarded_frames) * cfg.feat_len_sec
+            (chunk_audio.shape[-1] - cfg.get("discarded_frames", 8)) * cfg.get("feat_len_sec", 0.01)
             - (loop_end_time - loop_start_time)
-            - time_diff * cfg.finetune_realtime_ratio,
+            - time_diff * cfg.get("finetune_realtime_ratio", 0.01),
         )
     )
 
@@ -136,7 +130,7 @@ def write_seglst_file(seglst_dict_list: List[Dict[str, Any]], output_path: str):
     """
     if len(seglst_dict_list) == 0:
         raise ValueError("seglst_dict_list is empty. No transcriptions were generated.")
-    with open(output_path, 'w') as f:
+    with open(output_path, 'w', encoding='utf-8') as f:
         f.write(json.dumps(seglst_dict_list, indent=4) + '\n')
     logging.info(f"Saved the transcriptions of the streaming inference in\n:{output_path}")
 
@@ -165,7 +159,7 @@ def get_multi_talker_samples_from_manifest(cfg, manifest_file: str, feat_per_sec
             if 'duration' not in item:
                 raise KeyError(f"Line {line_num}: 'duration' missing")
             samples.append(item)
-            if cfg.spk_supervision == "rttm":
+            if cfg.get("spk_supervision", "diar") == "rttm":
                 rttm_path = samples[-1]['rttm_filepath']
                 if not rttm_path:
                     raise ValueError(f"Line {line_num}: rttm_filepath required when spk_supervision='rttm'")
@@ -231,6 +225,7 @@ def get_new_sentence_dict(
     end_time: float,
     text: str,
     session_id: Optional[str] = None,
+    decimal: int = 3,
 ) -> dict:
     """
     Get a new SegLST style sentence dictionary variable.
@@ -245,10 +240,15 @@ def get_new_sentence_dict(
     Returns:
         Dict[str, Any]: A new SegLST style sentence dictionary variable.
     """
+    # Convert tensor timestamps to Python floats; both values are rounded to `decimal` places below.
+    if isinstance(start_time, torch.Tensor):
+        start_time = start_time.item()
+    if isinstance(end_time, torch.Tensor):
+        end_time = end_time.item()
     return {
         'speaker': speaker,
-        'start_time': start_time,
-        'end_time': end_time,
+        'start_time': round(start_time, decimal),
+        'end_time': round(end_time, decimal),
         'words': text.lstrip(),
         'session_id': session_id,
     }
@@ -285,7 +285,7 @@ def fix_frame_time_step(cfg: Any, new_tokens: List[str], new_words: List[str], f
         elif len(frame_inds_seq) < len(new_tokens):
             deficit = len(new_tokens) - len(frame_inds_seq)
             frame_inds_seq = [frame_inds_seq[0]] * deficit + frame_inds_seq
-        if cfg.log:
+        if cfg.get("log", True):
             logging.warning(
                 f"Length of new token sequence ({len(new_tokens)}) does not match"
                 f"the length of frame indices sequence ({len(frame_inds_seq)}). Skipping this chunk."
@@ -306,10 +306,12 @@ def get_simulated_softmax(cfg, speaker_sigmoid: torch.Tensor) -> torch.Tensor:
     """
     if speaker_sigmoid.ndim != 1:
         raise ValueError(f"Expected 1D tensor for speaker_sigmoid, got shape {speaker_sigmoid.shape}")
-    if speaker_sigmoid.shape[0] < cfg.max_num_of_spks:
-        raise ValueError(f"speaker_sigmoid size {speaker_sigmoid.shape[0]} < max_num_of_spks {cfg.max_num_of_spks}")
+    if speaker_sigmoid.shape[0] < cfg.get("max_num_of_spks", 4):
+        raise ValueError(
+            f"speaker_sigmoid size {speaker_sigmoid.shape[0]} < max_num_of_spks {cfg.get('max_num_of_spks', 4)}"
+        )
 
-    speaker_sigmoid = torch.clamp(speaker_sigmoid, min=cfg.min_sigmoid_val, max=1)
+    speaker_sigmoid = torch.clamp(speaker_sigmoid, min=cfg.get("min_sigmoid_val", 1e-2), max=1)
     sigmoid_sum = speaker_sigmoid.sum()
     if sigmoid_sum == 0:
         logging.warning("speaker_sigmoid sum is zero, returning uniform distribution")
@@ -317,7 +319,7 @@ def get_simulated_softmax(cfg, speaker_sigmoid: torch.Tensor) -> torch.Tensor:
     else:
         speaker_softmax = speaker_sigmoid / sigmoid_sum
     speaker_softmax = speaker_softmax.cpu()
-    speaker_softmax[cfg.max_num_of_spks :] = 0.0
+    speaker_softmax[cfg.get("max_num_of_spks", 4) :] = 0.0
     return speaker_softmax
 
 
@@ -357,11 +359,11 @@ def get_word_dict_content_offline(
             frame_end = frame_stt + 1
 
     # Get the speaker based on the frame-wise softmax probabilities.
-    stt_p, end_p = max((frame_stt + cfg.left_frame_shift), 0), (frame_end + cfg.right_frame_shift)
+    stt_p, end_p = max((frame_stt + cfg.get("left_frame_shift", -1)), 0), (frame_end + cfg.get("right_frame_shift", 0))
     speaker_sigmoid = diar_pred_out[stt_p:end_p, :].mean(dim=0)
     speaker_softmax = get_simulated_softmax(cfg, speaker_sigmoid)
 
-    speaker_softmax[cfg.max_num_of_spks :] = 0.0
+    speaker_softmax[cfg.get("max_num_of_spks", 4) :] = 0.0
     spk_id = speaker_softmax.argmax().item()
     stt_sec, end_sec = frame_stt * frame_len, frame_end * frame_len
     word_dict = {
@@ -424,11 +426,11 @@ def get_word_dict_content_online(
             frame_end = frame_stt + 1
 
     # Get the speaker based on the frame-wise softmax probabilities.
-    stt_p, end_p = max((frame_stt + cfg.left_frame_shift), 0), (frame_end + cfg.right_frame_shift)
+    stt_p, end_p = max((frame_stt + cfg.get("left_frame_shift", -1)), 0), (frame_end + cfg.get("right_frame_shift", 0))
     speaker_sigmoid = diar_pred_out_stream[stt_p:end_p, :].mean(dim=0)
     speaker_softmax = get_simulated_softmax(cfg, speaker_sigmoid)
 
-    speaker_softmax[cfg.max_num_of_spks :] = 0.0
+    speaker_softmax[cfg.get("max_num_of_spks", 4) :] = 0.0
     spk_id = speaker_softmax.argmax().item()
     stt_sec, end_sec = frame_stt * frame_len, frame_end * frame_len
     word_dict = {
@@ -516,15 +518,25 @@ def __init__(
     ):
         # Required configs, models and datasets for inference
         self.cfg = cfg
-        if self.cfg.manifest_file:
-            self.test_manifest_dict = get_audio_rttm_map(self.cfg.manifest_file)
-        elif self.cfg.audio_file is not None:
-            uniq_id = get_uniqname_from_filepath(filepath=self.cfg.audio_file)
+        if not self.cfg.get("deploy_mode", False):
+            if self.cfg.manifest_file:
+                self.test_manifest_dict = get_audio_rttm_map(self.cfg.manifest_file)
+            elif self.cfg.audio_file is not None:
+                uniq_id = get_uniqname_from_filepath(filepath=self.cfg.audio_file)
+                self.test_manifest_dict = {
+                    uniq_id: {'audio_filepath': self.cfg.audio_file, 'seglst_filepath': None, 'rttm_filepath': None}
+                }
+            else:
+                raise ValueError("One of the audio_file and manifest_file should be non-empty!")
+        else:
             self.test_manifest_dict = {
-                uniq_id: {'audio_filepath': self.cfg.audio_file, 'seglst_filepath': None, 'rttm_filepath': None}
+                "streaming_session": {
+                    'audio_filepath': 'streaming_session.wav',
+                    'seglst_filepath': None,
+                    'rttm_filepath': None,
+                }
             }
-        else:
-            raise ValueError("One of the audio_file and manifest_file should be non-empty!")
+        self.transcribed_speaker_texts = [None] * len(self.test_manifest_dict)
 
         self.asr_model = asr_model
         self.diar_model = diar_model
@@ -534,9 +546,10 @@ def __init__(
         self._sentence_render_length = int(self._fix_prev_words_count + cfg.update_prev_words_sentence)
         self._frame_len_sec = 0.08
         self._initial_steps = cfg.ignored_initial_frame_steps
+        self._word_and_ts_seq = {}
         self._stt_words = []
-        self._init_evaluator()
         self._frame_hop_length = self.asr_model.encoder.streaming_cfg.valid_out_len
+        self._init_transcript_sessions()
 
         # Multi-instance configs
         self._max_num_of_spks = cfg.get("max_num_of_spks", 4)
@@ -562,12 +575,11 @@ def __init__(
         )
         self.n_active_speakers_per_stream = self.cfg.max_num_of_spks
 
-    def _init_evaluator(self):
+    def _init_transcript_sessions(self):
         """
-        Initialize the evaluator for the offline STT and speaker diarization.
+        Initialize the word and time-stamp sequence for each session.
         """
-        self.online_evaluators, self._word_and_ts_seq = [], {}
-        for _, (uniq_id, data_dict) in enumerate(self.test_manifest_dict.items()):
+        for uniq_id in self.test_manifest_dict.keys():
             uniq_id = uniq_id.split(".")[0]  # Make sure there is no "." in the uniq_id
             self._word_and_ts_seq[uniq_id] = {
                 "words": [],
@@ -585,26 +597,6 @@ def _init_evaluator(self):
                 "sentence_memory": {},
             }
 
-            if 'seglst_filepath' in data_dict and data_dict['seglst_filepath'] is not None:
-                ref_seglst = read_seglst(data_dict['seglst_filepath'])
-            else:
-                ref_seglst = None
-
-            if 'rttm_filepath' in data_dict and data_dict['rttm_filepath'] is not None:
-                ref_rttm_labels = rttm_to_labels(data_dict['rttm_filepath'])
-            else:
-                ref_rttm_labels = None
-
-            eval_instance = OnlineEvaluation(
-                ref_seglst=ref_seglst,
-                ref_rttm_labels=ref_rttm_labels,
-                hyp_seglst=None,
-                collar=0.25,
-                ignore_overlap=False,
-                verbose=True,
-            )
-            self.online_evaluators.append(eval_instance)
-
     def _get_offset_sentence(self, session_trans_dict: Dict[str, Any], offset: int) -> Dict[str, Any]:
         """
         For the very first word in a session, get the offset sentence.
@@ -698,7 +690,6 @@ def merge_transcript_and_speakers(
             transcribed_speaker_texts (List[str]): List of transcribed speaker texts.
             self._word_and_ts_seq (Dict[str, Dict[str, Any]]): Dictionary of word-level dictionaries with uniq_id as key.
         """
-        transcribed_speaker_texts = [None] * len(test_manifest_dict)
 
         for idx, (uniq_id, _) in enumerate(test_manifest_dict.items()):
             uniq_id = uniq_id.split(".")[0]  # Make sure there is no "." in the uniq_id
@@ -716,16 +707,17 @@ def merge_transcript_and_speakers(
                         sentence_render_length=self._sentence_render_length,
                     )
                     if self.cfg.generate_realtime_scripts:
-                        transcribed_speaker_texts[idx] = print_sentences(
+                        self.transcribed_speaker_texts[idx] = print_sentences(
                             sentences=self._word_and_ts_seq[uniq_id]["sentences"],
                             color_palette=get_color_palette(),
                             params=self.cfg,
                         )
-                        write_txt(
-                            f'{self.cfg.print_path}'.replace(".sh", f"_{idx}.sh"),
-                            transcribed_speaker_texts[idx].strip(),
-                        )
-        return transcribed_speaker_texts, self._word_and_ts_seq
+                        if not self.cfg.get("deploy_mode", False):
+                            write_txt(
+                                f'{self.cfg.print_path}'.replace(".sh", f"_{idx}.sh"),
+                                self.transcribed_speaker_texts[idx].strip(),
+                            )
+        return self.transcribed_speaker_texts, self._word_and_ts_seq
 
     def get_frame_and_words_offline(
         self,
@@ -920,7 +912,6 @@ def generate_seglst_dicts_from_serial_streaming(self, samples: List[Dict[str, An
         Args:
             samples (List[Dict[str, Any]]): List of samples.
         """
-        # for _, word_ts_and_seq in enumerate(self._word_and_ts_seq):
         for sample in samples:
             uniq_id = get_uniqname_from_filepath(sample['audio_filepath']).split('.')[0]
             word_ts_and_seq_dict = self._word_and_ts_seq[uniq_id]
@@ -934,6 +925,7 @@ def generate_seglst_dicts_from_serial_streaming(self, samples: List[Dict[str, An
                     session_id=session_id,
                 )
                 self.instance_manager.seglst_dict_list.append(seglst_dict)
+        return self.instance_manager.seglst_dict_list
 
     def generate_seglst_dicts_from_parallel_streaming(self, samples: List[Dict[str, Any]]):
         """
@@ -960,6 +952,7 @@ def generate_seglst_dicts_from_parallel_streaming(self, samples: List[Dict[str,
             ]
             seglsts = sorted(seglsts, key=lambda x: x['start_time'])
             self.instance_manager.seglst_dict_list.extend(seglsts)
+        return self.instance_manager.seglst_dict_list
 
     def _find_active_speakers(self, diar_preds: torch.Tensor, n_active_speakers_per_stream: int) -> List[List[int]]:
         """
@@ -1040,10 +1033,12 @@ def mask_features(
         mask = mask.unsqueeze(-1).repeat(1, 1, 8).flatten(1, 2)
 
         if mask.shape[1] > chunk_audio.shape[2]:
-            logging.warning(f"Mask shape {mask.shape} is greater than chunk_audio shape {chunk_audio.shape}")
+            if self.cfg.get("log", False):
+                logging.warning(f"Mask shape {mask.shape} is greater than chunk_audio shape {chunk_audio.shape}")
             mask = mask[:, : chunk_audio.shape[2]]
         elif mask.shape[1] < chunk_audio.shape[2]:
-            logging.warning(f"Mask shape {mask.shape} is less than chunk_audio shape {chunk_audio.shape}")
+            if self.cfg.get("log", False):
+                logging.warning(f"Mask shape {mask.shape} is less than chunk_audio shape {chunk_audio.shape}")
             mask = torch.nn.functional.pad(mask, (chunk_audio.shape[2] - mask.shape[1], 0), mode='constant', value=0)
 
         masked_chunk_audio = chunk_audio * mask.unsqueeze(1)
@@ -1164,7 +1159,6 @@ def perform_serial_streaming_stt_spk(
             _, new_chunk_preds = self.get_diar_pred_out_stream(step_num)
             diar_pred_out_stream = new_chunk_preds
 
-        transcribed_speaker_texts = [None] * len(self.test_manifest_dict)
         for idx, (uniq_id, _) in enumerate(self.test_manifest_dict.items()):
             if not (len(previous_hypotheses[idx].text) == 0 and step_num <= self._initial_steps):
                 # Get the word-level dictionaries for each word in the chunk
@@ -1180,16 +1174,17 @@ def perform_serial_streaming_stt_spk(
                         session_trans_dict=self._word_and_ts_seq[uniq_id],
                         sentence_render_length=self._sentence_render_length,
                     )
-                    if self.cfg.generate_realtime_scripts:
-                        transcribed_speaker_texts[idx] = print_sentences(
+                    if self.cfg.get("generate_realtime_scripts", True):
+                        self.transcribed_speaker_texts[idx] = print_sentences(
                             sentences=self._word_and_ts_seq[uniq_id]["sentences"],
                             color_palette=get_color_palette(),
                             params=self.cfg,
                         )
-                        write_txt(
-                            f'{self.cfg.print_path}'.replace(".sh", f"_{idx}.sh"),
-                            transcribed_speaker_texts[idx].strip(),
-                        )
+                        if not self.cfg.get("deploy_mode", False):
+                            write_txt(
+                                f'{self.cfg.get("print_path", "./print_script.sh")}'.replace(".sh", f"_{idx}.sh"),
+                                self.transcribed_speaker_texts[idx].strip(),
+                            )
 
         for batch_idx in range(chunk_audio.shape[0]):
             self.instance_manager.update_asr_state(
@@ -1201,6 +1196,7 @@ def perform_serial_streaming_stt_spk(
                 previous_hypotheses=previous_hypotheses[batch_idx],
                 previous_pred_out=asr_pred_out_stream[batch_idx],
             )
+        return self.transcribed_speaker_texts
 
     @measure_eta
     def perform_parallel_streaming_stt_spk(
@@ -1349,15 +1345,18 @@ def perform_parallel_streaming_stt_spk(
         self.instance_manager.update_seglsts(offset=self._offset_chunk_start_time)
         self._offset_chunk_start_time += self._nframes_per_chunk * self._frame_len_sec
 
-        if self.cfg.generate_realtime_scripts:
-            for session_idx in self.cfg.print_sample_indices:
+        if self.cfg.get("generate_realtime_scripts", True):
+            for session_idx in self.cfg.get("print_sample_indices", [0]):
                 asr_state = self.instance_manager.batch_asr_states[session_idx]
-                transcribed_speaker_texts = print_sentences(
+                self.transcribed_speaker_texts[session_idx] = print_sentences(
                     sentences=asr_state.seglsts, color_palette=get_color_palette(), params=self.cfg
                 )
-                write_txt(
-                    f'{self.cfg.print_path.replace(".sh", f"_{session_idx}.sh")}', transcribed_speaker_texts.strip()
-                )
+                if not self.cfg.get("deploy_mode", False):
+                    write_txt(
+                        f'{self.cfg.get("print_path", "./print_script.sh").replace(".sh", f"_{session_idx}.sh")}',
+                        self.transcribed_speaker_texts[session_idx].strip(),
+                    )
+        return self.transcribed_speaker_texts
 
 
 class MultiTalkerInstanceManager:
@@ -1778,7 +1777,7 @@ def to(self, device: torch.device):
         Args:
             device (torch.device): The device to move the ASR and Diar states to.
         """
-        for batch_idx in range(self.batch_size):
+        for batch_idx in range(len(self.batch_asr_states)):
             self.batch_asr_states[batch_idx].to(device)
         self.diar_states.to(device)
 

From 1aa0460855319dd73446f77ba8dbdee09d287b19 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 15:18:27 -0800
Subject: [PATCH 07/15] remove ExportDeploy & references

Signed-off-by: Pablo Garay 
---
 docs/source/nlp/quantization.rst              |  42 ++---
 examples/llm/finetune/automodel_vllm.py       | 158 ------------------
 .../common/video_tokenizers/README.md         |  51 +-----
 .../common/video_tokenizers/cosmos_trt_run.py | 102 -----------
 nemo/collections/llm/api.py                   |   4 +-
 nemo/collections/llm/gpt/model/deepseek.py    |  34 ----
 nemo/collections/llm/gpt/model/llama.py       |  41 -----
 .../llm/modelopt/quantization/quantizer.py    |   6 +-
 nemo/collections/vlm/api.py                   |   4 +-
 .../vlm/llama4/model/llama4_omni.py           |  37 ----
 nemo/collections/vlm/mllama/model/mllama.py   |  35 ----
 nemo/collections/vlm/qwen2vl/model/qwen2vl.py |  34 ----
 nemo/deploy/__init__.py                       |  23 ---
 nemo/deploy/deploy_pytriton.py                |  30 ----
 nemo/deploy/nlp/__init__.py                   |  30 ----
 nemo/export/__init__.py                       |  23 ---
 nemo/export/tensorrt_lazy_compiler.py         |  28 ----
 nemo/export/trt_llm/__init__.py               |  19 ---
 .../trt_llm/nemo_ckpt_loader/__init__.py      |  19 ---
 .../trt_llm/nemo_ckpt_loader/nemo_file.py     |  28 ----
 nemo/export/vllm_hf_exporter.py               |  30 ----
 requirements/requirements_vllm.txt            |   1 +
 .../functional_tests/L2_NeMo_2_VLLM_VISION.sh |  30 ----
 23 files changed, 21 insertions(+), 788 deletions(-)
 delete mode 100644 examples/llm/finetune/automodel_vllm.py
 delete mode 100644 nemo/collections/common/video_tokenizers/cosmos_trt_run.py
 delete mode 100644 nemo/deploy/__init__.py
 delete mode 100644 nemo/deploy/deploy_pytriton.py
 delete mode 100644 nemo/deploy/nlp/__init__.py
 delete mode 100644 nemo/export/__init__.py
 delete mode 100644 nemo/export/tensorrt_lazy_compiler.py
 delete mode 100644 nemo/export/trt_llm/__init__.py
 delete mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
 delete mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
 delete mode 100644 nemo/export/vllm_hf_exporter.py
 delete mode 100644 tests/functional_tests/L2_NeMo_2_VLLM_VISION.sh

diff --git a/docs/source/nlp/quantization.rst b/docs/source/nlp/quantization.rst
index 0579e24e306a..2006dfebdd64 100644
--- a/docs/source/nlp/quantization.rst
+++ b/docs/source/nlp/quantization.rst
@@ -19,7 +19,7 @@ The quantization process consists of the following steps:
 2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
 3. Producing an output directory or .qnemo tarball with model config (json), quantized weights (safetensors) and tokenizer config (yaml).
 
-Loading models requires using an ModelOpt spec defined in `nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec `_ module. Typically the calibration step is lightweight and uses a small dataset to obtain appropriate statistics for scaling tensors. The output directory produced (or a .qnemo tarball) is ready to be used to build a serving engine with the Nvidia TensorRT-LLM library. The engine build step is also available in NeMo project in ``nemo.deploy`` and ``nemo.export`` modules.
+Loading models requires using a ModelOpt spec defined in `nemo.collections.nlp.models.language_modeling.megatron.gpt_layer_modelopt_spec `_ module. Typically the calibration step is lightweight and uses a small dataset to obtain appropriate statistics for scaling tensors. The output directory produced (or a .qnemo tarball) is ready to be used to build a serving engine with the Nvidia TensorRT-LLM library. The engine build step is also available in the `Export-Deploy repository <https://github.com/NVIDIA-NeMo/Export-Deploy>`_.
 
 Quantization algorithm can also be conveniently set to ``"null"`` to perform only the weights export step using default precision for TensorRT-LLM deployment. This is useful to obtain baseline performance and accuracy results for comparison.
 
@@ -103,19 +103,11 @@ The output directory stores the following files:
     ├── tokenizer.model
     └── tokenizer_config.yaml
 
-The TensorRT-LLM engine can be conveniently built and run using ``TensorRTLLM`` class available in ``nemo.export`` submodule:
+.. note::
+   The export and deployment functionality has been moved to a separate repository.
+   Install with: ``pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git``
 
-.. code-block:: python
-
-    from nemo.export.tensorrt_llm import TensorRTLLM
-    trt_llm_exporter = TensorRTLLM(model_dir="/path/to/trt_llm_engine_folder")
-    trt_llm_exporter.export(
-        nemo_checkpoint_path="llama3-70b-base-fp8-qnemo",
-        model_type="llama",
-    )
-    trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
-
-Alternatively, it can also be built directly using ``trtllm-build`` command, see `TensorRT-LLM documentation `_:
+The TensorRT-LLM engine can be built directly using the ``trtllm-build`` command, see `TensorRT-LLM documentation `_:
 
 .. code-block:: bash
 
@@ -129,7 +121,7 @@ Alternatively, it can also be built directly using ``trtllm-build`` command, see
 
 Known issues
 ^^^^^^^^^^^^
-* Currently with ``nemo.export`` module building TensorRT-LLM engines for quantized "qnemo" models is limited to single-node deployments.
+* Building TensorRT-LLM engines for quantized "qnemo" models is limited to single-node deployments.
 
 
 Quantization-Aware Training (QAT)
@@ -183,25 +175,11 @@ Note that you may tweak the QAT trainer steps and learning rate if needed to ach
 NeMo checkpoints trained in FP8 with `NVIDIA Transformer Engine `_
 ----------------------------------------------------------------------------------------------------------------
 
-If you have an FP8-quantized checkpoint, produced during pre-training or fine-tuning with Transformer Engine, you can convert it to a FP8 TensorRT-LLM engine directly using ``nemo.export``.
-The API is the same as with regular ``.nemo`` and ``.qnemo`` checkpoints:
-
-.. code-block:: python
-
-    from nemo.export.tensorrt_llm import TensorRTLLM
-    trt_llm_exporter = TensorRTLLM(model_dir="/path/to/trt_llm_engine_folder")
-    trt_llm_exporter.export(
-        nemo_checkpoint_path="/path/to/llama2-7b-base-fp8.nemo",
-        model_type="llama",
-    )
-    trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
-
-The export settings for quantization can be adjusted via ``trt_llm_exporter.export`` arguments:
-
-* ``fp8_quantized: Optional[bool] = None``: manually enables/disables FP8 quantization
-* ``fp8_kvcache: Optional[bool] = None``: manually enables/disables FP8 quantization for KV-cache
+If you have an FP8-quantized checkpoint, produced during pre-training or fine-tuning with Transformer Engine, you can convert it to an FP8 TensorRT-LLM engine directly using the Export-Deploy repository.
 
-By default quantization settings are auto-detected from the NeMo checkpoint.
+.. note::
+   Export and deployment functionality is available in the Export-Deploy repository.
+   See: https://github.com/NVIDIA-NeMo/Export-Deploy
 
 
 References
diff --git a/examples/llm/finetune/automodel_vllm.py b/examples/llm/finetune/automodel_vllm.py
deleted file mode 100644
index 8314c4968949..000000000000
--- a/examples/llm/finetune/automodel_vllm.py
+++ /dev/null
@@ -1,158 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Example: Export vLLM HF model with optional LoRA and Triton deployment.
-
-Note: This example requires the Export-Deploy repository:
-  pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git
-
-Additionally, vLLM must be installed or activated in your environment:
-  source /opt/venv/bin/activate
-"""
-
-import argparse
-
-try:
-    from nemo.export.vllm_hf_exporter import vLLMHFExporter
-except ImportError as e:
-    raise ImportError(
-        "This example requires the Export-Deploy repository.\n"
-        "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git\n"
-        "Additionally, vLLM must be installed or activated:\n"
-        "  source /opt/venv/bin/activate"
-    ) from e
-
-from nemo.deploy import DeployPyTriton
-from nemo.deploy.nlp import NemoQueryLLM
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Export a vLLM HF model, optionally apply LoRA, and optionally deploy to Triton."
-    )
-
-    # 1) Base model
-    parser.add_argument(
-        "--model",
-        required=True,
-        help="Local path or HuggingFace name of the base model",
-    )
-
-    # 2) Optional LoRA
-    parser.add_argument(
-        "--lora-model",
-        help="Local path of a LoRA adapter to apply (optional)",
-    )
-    parser.add_argument(
-        "--lora-name",
-        default="lora_adapter",
-        help="Logical name for the LoRA adapter (default: %(default)s)",
-    )
-
-    # 3) Optional Triton deploy
-    parser.add_argument(
-        "--deploy",
-        action="store_true",
-        help="Deploy to Triton if set",
-    )
-    parser.add_argument(
-        "--triton-model-name",
-        help="Triton model name (required with --deploy)",
-    )
-    parser.add_argument(
-        "--triton-model-version",
-        type=int,
-        default=1,
-        help="Triton model version (default: %(default)s)",
-    )
-    parser.add_argument(
-        "--max-batch-size",
-        type=int,
-        default=64,
-        help="Triton max batch size (default: %(default)s)",
-    )
-    parser.add_argument(
-        "--http-port",
-        type=int,
-        default=8000,
-        help="Triton HTTP port (default: %(default)s)",
-    )
-    parser.add_argument(
-        "--address",
-        default="0.0.0.0",
-        help="Triton bind address (default: %(default)s)",
-    )
-
-    args = parser.parse_args()
-
-    enable_lora = bool(args.lora_model)
-
-    # Export base model (with LoRA enabled if requested)
-    exporter = vLLMHFExporter()
-    exporter.export(model=args.model, enable_lora=enable_lora)
-
-    # Attach LoRA adapter if provided
-    if enable_lora:
-        exporter.add_lora_models(
-            lora_model_name=args.lora_name,
-            lora_model=args.lora_model,
-        )
-
-    # If not deploying, just do a local forward
-    if not args.deploy:
-        output = exporter.forward(
-            input_texts=["How are you doing?"],
-            lora_model_name=(args.lora_name if enable_lora else None),
-        )
-        print("Local forward output:", output)
-        return
-
-    # Validate Triton args
-    if not args.triton_model_name:
-        parser.error("--triton-model-name is required when --deploy is set")
-
-    # Deploy to Triton
-    server = DeployPyTriton(
-        model=exporter,
-        triton_model_name=args.triton_model_name,
-        triton_model_version=args.triton_model_version,
-        max_batch_size=args.max_batch_size,
-        http_port=args.http_port,
-        address=args.address,
-    )
-
-    try:
-        server.deploy()
-        server.run()
-
-        # Query the deployed model
-        client = NemoQueryLLM(
-            url=f"localhost:{args.http_port}",
-            model_name=args.triton_model_name,
-        )
-        resp = client.query_llm(
-            prompts=["How are you doing?"],
-            max_output_len=128,
-            top_k=1,
-            top_p=0.2,
-            temperature=1.0,
-        )
-        print("Deployed Triton output:", resp)
-    finally:
-        server.stop()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/nemo/collections/common/video_tokenizers/README.md b/nemo/collections/common/video_tokenizers/README.md
index 02c8d04ee5b4..2c75371cb0bf 100644
--- a/nemo/collections/common/video_tokenizers/README.md
+++ b/nemo/collections/common/video_tokenizers/README.md
@@ -29,58 +29,13 @@ for the complete list of supported tokenizers.
 
 ### Acceleration with TensorRT
 
-**Note:** TensorRT acceleration requires the Export-Deploy repository:
+**Note:** TensorRT acceleration functionality has been moved to the Export-Deploy repository:
 ```bash
 pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git
 ```
 
-To use these tokenizers with TensorRT and acheive up to 3X speedup during tokenization,
-users can define a lightweight wrapper model and then pass this wrapper model to `trt_compile`
-```python
-import torch
-from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer
-from nemo.export.tensorrt_lazy_compiler import trt_compile
-
-class VaeWrapper(torch.nn.Module):
-    def __init__(self, vae):
-        super().__init__()
-        self.vae = vae
-
-    def forward(self, input_tensor):
-        output_tensor = self.vae.autoencode(input_tensor)
-        return output_tensor
-
-model = CausalVideoTokenizer.from_pretrained(
-    "Cosmos-Tokenizer-DV4x8x8", 
-    use_pytorch=True, 
-    dtype="float"
-)
-model_wrapper = VaeWrapper(model)
-
-input_tensor = torch.randn(1, 3, 9, 512, 512).to('cuda').to(torch.float)
-opt_shape = min_shape = max_shape = input_tensor.shape
-
-path_to_engine_outputs="./trt_outputs"
-trt_compile(
-    model_wrapper,
-    path_to_engine_outputs,
-    args={
-        "precision": "bf16",
-        "input_profiles": [
-            {"input_tensor": [min_shape, opt_shape, max_shape]},
-        ],
-    },
-)
-
-output = model_wrapper(input_tensor)
-```
-Note that the `trt_compile` function requires 
-providing `min_shape`, `opt_shape` and `max_shape`
-as arguments (in this example all are set to the input tensor shape for simplicity) which enables inputs with dynamic shapes after compilation.
-For more information about TensorRT and dynamic shapes please review the [Torch-Tensorrt documentation](https://pytorch.org/TensorRT/user_guide/dynamic_shapes.html)
-
-The file `cosmos_trt_run.py` provides a stand-alone script to tokenize tensors with a TensorRT-accelerated
-Cosmos tokenizer.
+For TensorRT acceleration examples and documentation, please refer to:
+https://github.com/NVIDIA-NeMo/Export-Deploy
 
 # Examples
 1. Multimodal autoregressive model dataset preparation using the [discrete cosmos tokenizer](../../../../nemo/collections/multimodal_autoregressive/data/README.md)
diff --git a/nemo/collections/common/video_tokenizers/cosmos_trt_run.py b/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
deleted file mode 100644
index c4f1bf73dde1..000000000000
--- a/nemo/collections/common/video_tokenizers/cosmos_trt_run.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""
-Note: This script requires the Export-Deploy repository for TensorRT compilation.
-Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git
-"""
-
-import argparse
-import os
-import shutil
-
-import torch
-
-from nemo.collections.common.video_tokenizers.cosmos_tokenizer import CausalVideoTokenizer
-from nemo.export.tensorrt_lazy_compiler import trt_compile
-
-parser = argparse.ArgumentParser(description="Export and run tokenizer in TensorRT")
-parser.add_argument(
-    "--tokenizer_name",
-    type=str,
-    default="Cosmos-Tokenizer-CV4x8x8",
-    help="Tokenizer name or path",
-)
-parser.add_argument(
-    "--engine_path",
-    type=str,
-    default="outputs",
-    help="Path to TensorRT engine",
-)
-parser.add_argument("--min_shape", type=int, nargs='+', help="min input shape for inference")
-parser.add_argument("--opt_shape", type=int, nargs='+', help="opt input shape for inference")
-parser.add_argument(
-    "--max_shape", type=int, nargs='+', default=[1, 3, 9, 512, 512], help="max input shape for inference"
-)
-parser.add_argument("--clean", action="store_true", help="Clean all files in engine_path before export")
-
-args = parser.parse_args()
-
-
-def main():
-    """Export and run tokenizer in TensorRT."""
-    model = CausalVideoTokenizer.from_pretrained(args.tokenizer_name, use_pytorch=True, dtype="float")
-
-    class VaeWrapper(torch.nn.Module):
-        """Wrapper class for VAE model to enable TensorRT compilation."""
-
-        def __init__(self, vae):
-            super().__init__()
-            self.vae = vae
-
-        def forward(self, input_tensor):
-            """Forward pass through the VAE autoencoder."""
-            output_tensor = self.vae.autoencode(input_tensor)
-            return output_tensor
-
-    model_wrapper = VaeWrapper(model)
-
-    if args.clean and os.path.exists(args.engine_path):
-        print(f"Remove existing {args.engine_path}")
-        shutil.rmtree(args.engine_path)
-
-    os.makedirs(args.engine_path, exist_ok=True)
-
-    min_shape = args.min_shape
-    opt_shape = args.opt_shape
-    max_shape = args.max_shape
-
-    if opt_shape is None:
-        opt_shape = max_shape
-    if min_shape is None:
-        min_shape = opt_shape
-
-    output_path = os.path.join(args.engine_path, "auto_encoder")
-    trt_compile(
-        model_wrapper,
-        output_path,
-        args={
-            "precision": "bf16",
-            "input_profiles": [
-                {"input_tensor": [min_shape, opt_shape, max_shape]},
-            ],
-        },
-    )
-
-    input_tensor = torch.randn(max_shape).to('cuda').to(torch.float)
-    _ = model_wrapper(input_tensor)  # Warmup call to ensure TensorRT engine is compiled
-
-
-if __name__ == '__main__':
-    main()
diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py
index 29ad1f6deabb..a95653ad4582 100644
--- a/nemo/collections/llm/api.py
+++ b/nemo/collections/llm/api.py
@@ -485,8 +485,8 @@ def ptq(
     """
     Applies Post-Training Quantization (PTQ) for a model using the specified quantization and export configs. It runs
     calibration for a small dataset to collect scaling factors low-precision GEMMs used by desired quantization method.
-    By default, this function produces TensorRT-LLM checkpoint ready for deployment using nemo.export and nemo.deploy
-    modules or direcly using TensorRT-LLM library.
+    By default, this function produces TensorRT-LLM checkpoint ready for deployment using the Export-Deploy repository
+    (https://github.com/NVIDIA-NeMo/Export-Deploy) or directly using TensorRT-LLM library.
 
     The function can be used through the NeMo CLI in the following way:
     ```bash
diff --git a/nemo/collections/llm/gpt/model/deepseek.py b/nemo/collections/llm/gpt/model/deepseek.py
index 5723ee7d73bc..bb5124682e04 100644
--- a/nemo/collections/llm/gpt/model/deepseek.py
+++ b/nemo/collections/llm/gpt/model/deepseek.py
@@ -35,7 +35,6 @@
     gpt_data_step,
     torch_dtype_from_dict_config,
 )
-from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import TransformFns, _ModelState
 from nemo.lightning.pytorch.optim import OptimizerModule
@@ -495,39 +494,6 @@ def _detect_hf_deepseek_version(self, source_config: Dict[str, Any]) -> str:
         )
         return target_model_name
 
-    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
-        """
-        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
-        so that it is consistent with the key names you would get from loading the checkpoint into a model.
-        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
-
-        Args:
-            path (Path): The path from which the model will be loaded.
-
-        Returns
-        -------
-            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
-        """
-        model_yaml = path / "context" / "model.yaml"
-        if not model_yaml.exists():
-            raise FileNotFoundError("model.yaml is not found in the context folder of the checkpoint.")
-        with open(model_yaml, 'r') as stream:
-            config = yaml.safe_load(stream)
-
-        dist_ckpt_folder = path / "weights"
-        state_dict = {}
-        for k, v in load_distributed_model_weights(dist_ckpt_folder, True).items():
-            if '_extra_state' in k:
-                continue
-            new_k = k.replace("module.", "")
-            if '.experts.experts.' in k:
-                # split experts into multiple tensors
-                for i in range(v.size(0)):
-                    state_dict[new_k.replace(".experts.experts.", ".experts.") + str(i)] = v[i]
-            else:
-                state_dict[new_k] = v
-        return state_dict, config['config']
-
     def apply(self, output_path: Path, target_model_name=None) -> Path:
         logging.info("Loading DeepSeek NeMo checkpoint. This may take a while...")
         source, source_config = self.ckpt_load(self)
diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py
index 1c1a4654a6d2..160d69240f4c 100644
--- a/nemo/collections/llm/gpt/model/llama.py
+++ b/nemo/collections/llm/gpt/model/llama.py
@@ -27,7 +27,6 @@
 from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel, torch_dtype_from_mcore_config
 from nemo.collections.llm.gpt.model.llama4_utils import get_llama4_layer_spec
 from nemo.collections.llm.utils import Config
-from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import OptimizerModule, io, teardown
 from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME
 from nemo.lightning.io.pl import ckpt_to_weights_subdir
@@ -1057,46 +1056,6 @@ def create_llama4_config(self, source):
         )
         return config
 
-    def ckpt_load(self, path: Path) -> Tuple[Dict, Any]:
-        """
-        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
-        so that it is consistent with the key names you would get from loading the checkpoint into a model.
-        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
-
-        Args:
-            path (Path): The path from which the model will be loaded.
-
-        Returns
-        -------
-            Tuple[Dict, Any]: The loaded state dict and the yaml config object.
-        """
-        model_yaml = path / "context" / "model.yaml"
-        if not model_yaml.exists():
-            raise FileNotFoundError("model.yaml is not found in the context folder of the checkpoint.")
-        with open(model_yaml, 'r') as stream:
-            config = yaml.safe_load(stream)
-
-        dist_ckpt_folder = path / "weights"
-        state_dict = {}
-
-        dict_to_obj = lambda d: (
-            type('Config', (), {kk: dict_to_obj(vv) for kk, vv in d.items()}) if isinstance(d, dict) else d
-        )
-        config_obj = dict_to_obj(config['config'])
-        langauge_layers = config_obj.num_layers
-        distributed_model_weights = load_distributed_model_weights(dist_ckpt_folder, True).items()
-        for k, v in distributed_model_weights:
-            if '_extra_state' in k:
-                continue
-            new_k = k.replace("module.", "")
-            if 'layers' in new_k and v.size(0) == langauge_layers:
-                # Only split layers
-                for i in range(v.size(0)):
-                    state_dict[new_k.replace('layers', f'layers.{str(i)}')] = v[i]
-            state_dict[new_k] = v
-
-        return state_dict, config_obj
-
     def _modify_llama4_source_state(self, state_dict, source_config):
         """
         For MoE layer, we transpose the gate_up_proj and down_proj to match HF implementation.
diff --git a/nemo/collections/llm/modelopt/quantization/quantizer.py b/nemo/collections/llm/modelopt/quantization/quantizer.py
index 1aea3d661778..6ea74de8e0a0 100644
--- a/nemo/collections/llm/modelopt/quantization/quantizer.py
+++ b/nemo/collections/llm/modelopt/quantization/quantizer.py
@@ -113,7 +113,8 @@ class Quantizer:
         3. Producing an output directory with a quantized checkpoint and a tokenizer
 
     By default, the output directory produced is intended to be consumed by TensorRT-LLM toolbox
-    for efficient inference. This can be achieved using nemo.export.tensorrt_llm module.
+    for efficient inference. This can be achieved using the Export-Deploy repository
+    (https://github.com/NVIDIA-NeMo/Export-Deploy).
     This can be changed to export a standard NeMo 2.0 checkpoint instead using `ExportConfig`.
     """
 
@@ -401,8 +402,7 @@ def _save_tokenizer(self, model, model_dir: str, export_dir: Path, export_fmt: s
             ):
                 model.tokenizer.tokenizer.save_pretrained(str(export_dir))
             else:
-                # Save the model context in order to restore its tokenizer later. The destination
-                # path is "nemo_context" as this name is used in nemo.export to setup tokenizer.
+                # Save the model context in order to restore its tokenizer later.
                 shutil.copytree(
                     ckpt_to_context_subdir(model_dir), os.path.join(export_dir, "nemo_context"), dirs_exist_ok=True
                 )
diff --git a/nemo/collections/vlm/api.py b/nemo/collections/vlm/api.py
index 30462bdfbd30..3339d0b4a60b 100644
--- a/nemo/collections/vlm/api.py
+++ b/nemo/collections/vlm/api.py
@@ -45,8 +45,8 @@ def ptq(
     specified quantization and export configs.
     It runs calibration for a small dataset to collect scaling factors low-precision
     GEMMs used by desired quantization method.
-    By default, this function produces TensorRT-LLM checkpoint ready for deployment using nemo.export and nemo.deploy
-    modules or directly using TensorRT-LLM library.
+    By default, this function produces TensorRT-LLM checkpoint ready for deployment using the Export-Deploy repository
+    (https://github.com/NVIDIA-NeMo/Export-Deploy) or directly using TensorRT-LLM library.
 
     Args:
         model_path (str): The path to model to be quantized.
diff --git a/nemo/collections/vlm/llama4/model/llama4_omni.py b/nemo/collections/vlm/llama4/model/llama4_omni.py
index 5e1edd0a091a..f39ba2a8e623 100644
--- a/nemo/collections/vlm/llama4/model/llama4_omni.py
+++ b/nemo/collections/vlm/llama4/model/llama4_omni.py
@@ -27,7 +27,6 @@
 from nemo.collections.vlm.llama4.model.vision import Llama4VisionConfig
 from nemo.collections.vlm.neva.model.llava import export_qkv, export_qkv_bias, import_qkv
 from nemo.collections.vlm.vision.base import MultimodalProjectorConfig
-from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import TransformFns, _ModelState
 from nemo.utils import logging
@@ -624,42 +623,6 @@ def tokenizer(self) -> "TokenizerSpec":
         """
         return io.load_context(str(self), subpath="model").tokenizer
 
-    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
-        """
-        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
-        so that it is consistent with the key names you would get from loading the checkpoint into a model.
-        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
-
-        Args:
-            path (Path): The path from which the model will be loaded.
-
-        Returns
-        -------
-            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
-        """
-        model_yaml = path / "context" / "model.yaml"
-        if not model_yaml.exists():
-            raise FileNotFoundError("model.yaml is not found in the context folder of the checkpoint.")
-        with open(model_yaml, 'r') as stream:
-            config = yaml.safe_load(stream)
-
-        dist_ckpt_folder = path / "weights"
-        state_dict = {}
-
-        langauge_layers = config['config']['language_transformer_config']['num_layers']
-        vision_layers = config['config']['vision_transformer_config']['num_layers']
-        distributed_model_weights = load_distributed_model_weights(dist_ckpt_folder, True).items()
-        for k, v in distributed_model_weights:
-            if '_extra_state' in k:
-                continue
-            new_k = k.replace("module.", "")
-            if 'layers' in new_k and (v.size(0) == langauge_layers or v.size(0) == vision_layers):
-                # Only split layers
-                for i in range(v.size(0)):
-                    state_dict[new_k.replace('layers', f'layers.{str(i)}')] = v[i]
-            state_dict[new_k] = v
-        return state_dict, config['config']
-
     def _modify_llama4_source_state(self, state_dict, source_config):
         """
         For MoE layer, we transpose the gate_up_proj and down_proj to match HF implementation.
diff --git a/nemo/collections/vlm/mllama/model/mllama.py b/nemo/collections/vlm/mllama/model/mllama.py
index 1dfecd0e5a07..79ea79002aa1 100644
--- a/nemo/collections/vlm/mllama/model/mllama.py
+++ b/nemo/collections/vlm/mllama/model/mllama.py
@@ -33,7 +33,6 @@
     MLlamaModel,
     MLlamaModelConfig,
 )
-from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import _ModelState
 from nemo.lightning.pytorch.utils import dtype_from_hf
@@ -517,40 +516,6 @@ def tokenizer(self) -> "TokenizerSpec":
         """
         return io.load_context(str(self), subpath="model").tokenizer
 
-    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
-        """
-        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
-        so that it is consistent with the key names you would get from loading the checkpoint into a model.
-        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
-
-        Args:
-            path (Path): The path from which the model will be loaded.
-
-        Returns
-        -------
-            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
-        """
-        config = io.load_context(str(self), subpath="model.config")
-        dist_ckpt_folder = path / "weights"
-        state_dict = {}
-
-        langauge_layers = config.language_model_config.num_layers
-        vision_layers = config.vision_model_config.num_layers
-        distributed_model_weights = load_distributed_model_weights(dist_ckpt_folder, True).items()
-        for k, v in distributed_model_weights:
-            if "_extra_state" in k:
-                continue
-            new_k = k.replace("module.", "")
-            if "layers" in new_k and (v.size(0) == langauge_layers or v.size(0) == vision_layers):
-                # Only split layers
-                for i in range(v.size(0)):
-                    state_dict[new_k.replace("layers", f"layers.{str(i)}")] = v[i]
-            elif "global_transformer.layers" in new_k:
-                for i in range(v.size(0)):
-                    state_dict[new_k.replace("layers", f"layers.{str(i)}")] = v[i]
-            state_dict[new_k] = v
-        return state_dict, config
-
     def _modify_mllama_source_state(self, state_dict, source_config):
         """
         - Modify state dict to integrate cross-attention layers into self-attention layer.
diff --git a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
index e9cbe154fcd1..da32902fbbc3 100755
--- a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
+++ b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
@@ -46,7 +46,6 @@
     Qwen25VLVisionConfig,
 )
 from nemo.collections.vlm.vision import MultimodalProjectorConfig
-from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import _ModelState
 from nemo.lightning.pytorch.utils import dtype_from_hf
@@ -542,39 +541,6 @@ def tokenizer(self) -> "TokenizerSpec":
         """
         return io.load_context(str(self), subpath="model").tokenizer
 
-    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
-        """
-        This function loads the state dict directly from a distributed checkpoint, and modify the state dict
-        so that it is consistent with the key names you would get from loading the checkpoint into a model.
-        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
-
-        Args:
-            path (Path): The path from which the model will be loaded.
-
-        Returns
-        -------
-            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
-        """
-        config = io.load_context(str(self), subpath="model.config")
-        dist_ckpt_folder = path / "weights"
-        state_dict = {}
-
-        langauge_layers = config.language_transformer_config.num_layers
-        vision_layers = config.vision_transformer_config.num_layers
-        distributed_model_weights = load_distributed_model_weights(dist_ckpt_folder, True).items()
-        for k, v in distributed_model_weights:
-            if "_extra_state" in k:
-                continue
-            new_k = k.replace("module.", "")
-            if "layers" in new_k and (v.size(0) == langauge_layers or v.size(0) == vision_layers):
-                # Only split layers
-                for i in range(v.size(0)):
-                    state_dict[new_k.replace("layers", f"layers.{str(i)}")] = v[i]
-            state_dict[new_k] = v
-
-        source = _ModelState(state_dict)
-        return source, config
-
     @property
     def config(self) -> "HFQwen2VLConfig":
         """
diff --git a/nemo/deploy/__init__.py b/nemo/deploy/__init__.py
deleted file mode 100644
index 0bd6c3d82b44..000000000000
--- a/nemo/deploy/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-warnings.warn(
-    "The 'nemo.deploy' module has been moved to a separate repository. "
-    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
diff --git a/nemo/deploy/deploy_pytriton.py b/nemo/deploy/deploy_pytriton.py
deleted file mode 100644
index b5f973158168..000000000000
--- a/nemo/deploy/deploy_pytriton.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Stub module for PyTriton deployment.
-
-This module has been moved to the Export-Deploy repository.
-"""
-
-
-class DeployPyTriton:
-    """Stub class that raises an error directing users to the Export-Deploy repository."""
-
-    def __init__(self, *args, **kwargs):
-        raise ImportError(
-            "The 'DeployPyTriton' class has been moved to a separate repository. "
-            "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
-            "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
-        )
-
diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py
deleted file mode 100644
index 308c134c4b33..000000000000
--- a/nemo/deploy/nlp/__init__.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Stub module for NLP deployment.
-
-This module has been moved to the Export-Deploy repository.
-"""
-
-
-class NemoQueryLLM:
-    """Stub class that raises an error directing users to the Export-Deploy repository."""
-
-    def __init__(self, *args, **kwargs):
-        raise ImportError(
-            "The 'NemoQueryLLM' class has been moved to a separate repository. "
-            "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
-            "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
-        )
-
diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
deleted file mode 100644
index f5be0ea6e2bc..000000000000
--- a/nemo/export/__init__.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import warnings
-
-warnings.warn(
-    "The 'nemo.export' module has been moved to a separate repository. "
-    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy",
-    DeprecationWarning,
-    stacklevel=2,
-)
-
diff --git a/nemo/export/tensorrt_lazy_compiler.py b/nemo/export/tensorrt_lazy_compiler.py
deleted file mode 100644
index 651a2ed4aefe..000000000000
--- a/nemo/export/tensorrt_lazy_compiler.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Stub module for TensorRT lazy compiler.
-
-This module has been moved to the Export-Deploy repository.
-"""
-
-
-def trt_compile(*args, **kwargs):
-    """Stub function that raises an error directing users to the Export-Deploy repository."""
-    raise ImportError(
-        "The 'trt_compile' function has been moved to a separate repository. "
-        "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
-        "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
-    )
-
diff --git a/nemo/export/trt_llm/__init__.py b/nemo/export/trt_llm/__init__.py
deleted file mode 100644
index 67052757bd00..000000000000
--- a/nemo/export/trt_llm/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-raise ImportError(
-    "The 'nemo.export.trt_llm' module has been moved to a separate repository. "
-    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy"
-)
-
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
deleted file mode 100644
index bd9b221bbb79..000000000000
--- a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-raise ImportError(
-    "The 'nemo.export.trt_llm.nemo_ckpt_loader' module has been moved to a separate repository. "
-    "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy"
-)
-
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
deleted file mode 100644
index b3661ee4f4a0..000000000000
--- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Stub module for load_distributed_model_weights and related functions.
-
-This module has been moved to the Export-Deploy repository.
-"""
-
-
-def load_distributed_model_weights(*args, **kwargs):
-    """Stub function that raises an error directing users to the Export-Deploy repository."""
-    raise ImportError(
-        "The 'load_distributed_model_weights' function has been moved to a separate repository. "
-        "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
-        "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
-    )
-
diff --git a/nemo/export/vllm_hf_exporter.py b/nemo/export/vllm_hf_exporter.py
deleted file mode 100644
index d70df465e3a9..000000000000
--- a/nemo/export/vllm_hf_exporter.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Stub module for vLLM HF exporter.
-
-This module has been moved to the Export-Deploy repository.
-"""
-
-
-class vLLMHFExporter:
-    """Stub class that raises an error directing users to the Export-Deploy repository."""
-
-    def __init__(self, *args, **kwargs):
-        raise ImportError(
-            "The 'vLLMHFExporter' class has been moved to a separate repository. "
-            "Please use the Export-Deploy repository: https://github.com/NVIDIA-NeMo/Export-Deploy\n"
-            "Install with: pip install git+https://github.com/NVIDIA-NeMo/Export-Deploy.git"
-        )
-
diff --git a/requirements/requirements_vllm.txt b/requirements/requirements_vllm.txt
index f6edafaa9dce..d3b125003786 100644
--- a/requirements/requirements_vllm.txt
+++ b/requirements/requirements_vllm.txt
@@ -26,3 +26,4 @@ vllm==0.8.5.post1
 webdataset>=0.2.86
 wget
 zarr>=2.18.2,<3.0.0
+
diff --git a/tests/functional_tests/L2_NeMo_2_VLLM_VISION.sh b/tests/functional_tests/L2_NeMo_2_VLLM_VISION.sh
deleted file mode 100644
index e08c689feb6c..000000000000
--- a/tests/functional_tests/L2_NeMo_2_VLLM_VISION.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2020-2025, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-TRANSFORMERS_OFFLINE=1 HF_HOME=/home/TestData/vlm/vision/hf/ coverage run -a --data-file=/workspace/.coverage --source=/workspace/nemo \
-  tests/collections/vlm/vision/test_llava_next_InternVIT.py \
-  --devices=1 \
-  --max-steps=5
-
-TRANSFORMERS_OFFLINE=1 HF_HOME=/home/TestData/vlm/vision/hf/ coverage run \
-  -a --data-file=/workspace/.coverage --source=/workspace/nemo \
-  scripts/vlm/import_hf.py --input_name_or_path="OpenGVLab/InternViT-300M-448px-V2_5"
-
-TRANSFORMERS_OFFLINE=1 HF_HOME=/home/TestData/vlm/vision/hf/ coverage run \
-  -a --data-file=/workspace/.coverage --source=/workspace/nemo \
-  scripts/vlm/import_hf.py --input_name_or_path="openai/clip-vit-large-patch14"
-
-TRANSFORMERS_OFFLINE=1 HF_HOME=/home/TestData/vlm/vision/hf/ coverage run \
-  -a --data-file=/workspace/.coverage --source=/workspace/nemo \
-  scripts/vlm/import_hf.py --input_name_or_path="google/siglip-base-patch16-224"
\ No newline at end of file

From 7b2c814e59895ce8c40ef6338eaa3e79bed18a19 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 15:56:15 -0800
Subject: [PATCH 08/15] lintfix

Signed-off-by: Pablo Garay 
---
 nemo/collections/llm/gpt/model/deepseek.py       | 3 +--
 nemo/collections/llm/gpt/model/llama.py          | 3 +--
 nemo/collections/vlm/llama4/model/llama4_omni.py | 3 +--
 nemo/collections/vlm/mllama/model/mllama.py      | 2 +-
 nemo/collections/vlm/qwen2vl/model/qwen2vl.py    | 3 +--
 5 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/nemo/collections/llm/gpt/model/deepseek.py b/nemo/collections/llm/gpt/model/deepseek.py
index bb5124682e04..c6e941bb52f0 100644
--- a/nemo/collections/llm/gpt/model/deepseek.py
+++ b/nemo/collections/llm/gpt/model/deepseek.py
@@ -16,11 +16,10 @@
 from dataclasses import asdict, dataclass, field
 from functools import cached_property, partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
 
 import torch
 import torch.nn.functional as F
-import yaml
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.transformer_config import MLATransformerConfig
diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py
index 160d69240f4c..f628219e4bfa 100644
--- a/nemo/collections/llm/gpt/model/llama.py
+++ b/nemo/collections/llm/gpt/model/llama.py
@@ -17,11 +17,10 @@
 from dataclasses import dataclass, field
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Annotated, Callable, List, Optional, Union
 
 import torch
 import torch.nn.functional as F
-import yaml
 from torch import nn
 
 from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel, torch_dtype_from_mcore_config
diff --git a/nemo/collections/vlm/llama4/model/llama4_omni.py b/nemo/collections/vlm/llama4/model/llama4_omni.py
index f39ba2a8e623..f1ad60b7345f 100644
--- a/nemo/collections/vlm/llama4/model/llama4_omni.py
+++ b/nemo/collections/vlm/llama4/model/llama4_omni.py
@@ -14,10 +14,9 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Tuple
+from typing import TYPE_CHECKING
 
 import torch
-import yaml
 from torch import nn
 
 from nemo.collections.common.tokenizers import TokenizerSpec
diff --git a/nemo/collections/vlm/mllama/model/mllama.py b/nemo/collections/vlm/mllama/model/mllama.py
index 79ea79002aa1..3522d8dcd1cb 100644
--- a/nemo/collections/vlm/mllama/model/mllama.py
+++ b/nemo/collections/vlm/mllama/model/mllama.py
@@ -15,7 +15,7 @@
 import re
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Dict, Optional, Tuple
+from typing import Dict, Optional
 
 import torch
 import torch.distributed
diff --git a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
index da32902fbbc3..b14eedc14687 100755
--- a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
+++ b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
@@ -14,7 +14,7 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Dict, Tuple, Union
+from typing import TYPE_CHECKING, Union
 
 import torch
 import transformers
@@ -47,7 +47,6 @@
 )
 from nemo.collections.vlm.vision import MultimodalProjectorConfig
 from nemo.lightning import io, teardown
-from nemo.lightning.io.state import _ModelState
 from nemo.lightning.pytorch.utils import dtype_from_hf
 from nemo.utils import logging
 

From 4d2366bf64f035551c03ea00b3df48586fdc7735 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 23:29:00 -0800
Subject: [PATCH 09/15] get load_ckpt back

Signed-off-by: Pablo Garay 
---
 nemo/collections/llm/gpt/model/deepseek.py    |  138 +-
 nemo/collections/llm/gpt/model/llama.py       |   64 +-
 .../vlm/llama4/model/llama4_omni.py           |   40 +-
 nemo/collections/vlm/mllama/model/mllama.py   |  590 +-----
 nemo/collections/vlm/qwen2vl/model/qwen2vl.py |  720 +------
 nemo/export/__init__.py                       |   21 +
 nemo/export/multimodal/__init__.py            |   13 +
 nemo/export/multimodal/build.py               |  728 +++++++
 nemo/export/multimodal/converter.py           |  412 ++++
 nemo/export/multimodal/run.py                 | 1168 +++++++++++
 nemo/export/onnx_llm_exporter.py              |  465 +++++
 nemo/export/quantize/__init__.py              |   15 +
 nemo/export/quantize/quantizer.py             |  257 +++
 nemo/export/sentencepiece_tokenizer.py        |  280 +++
 nemo/export/tarutils.py                       |  277 +++
 nemo/export/tensorrt_lazy_compiler.py         |  714 +++++++
 nemo/export/tensorrt_llm.py                   | 1805 +++++++++++++++++
 nemo/export/tensorrt_mm_exporter.py           |  367 ++++
 nemo/export/tiktoken_tokenizer.py             |  123 ++
 nemo/export/trt_llm/__init__.py               |   13 +
 nemo/export/trt_llm/converter/__init__.py     |   13 +
 .../trt_llm/converter/model_converter.py      |  307 +++
 .../converter/model_to_trt_llm_ckpt.py        |  494 +++++
 nemo/export/trt_llm/converter/utils.py        |  598 ++++++
 .../trt_llm/nemo_ckpt_loader/__init__.py      |   13 +
 .../trt_llm/nemo_ckpt_loader/nemo_file.py     |  706 +++++++
 nemo/export/trt_llm/qnemo/__init__.py         |   15 +
 .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py    |  119 ++
 nemo/export/trt_llm/qnemo/tokenizer_utils.py  |   55 +
 nemo/export/trt_llm/qnemo/utils.py            |   32 +
 nemo/export/trt_llm/tensorrt_llm_build.py     |  133 ++
 nemo/export/trt_llm/tensorrt_llm_run.py       |  931 +++++++++
 nemo/export/trt_llm/utils.py                  |   35 +
 nemo/export/utils/__init__.py                 |   45 +
 nemo/export/utils/_mock_import.py             |   79 +
 nemo/export/utils/constants.py                |   16 +
 nemo/export/utils/lora_converter.py           |  223 ++
 nemo/export/utils/model_loader.py             |  209 ++
 nemo/export/utils/utils.py                    |  155 ++
 nemo/export/vllm/__init__.py                  |   13 +
 nemo/export/vllm/engine.py                    |  140 ++
 nemo/export/vllm/model_config.py              |  245 +++
 nemo/export/vllm/model_converters.py          |  421 ++++
 nemo/export/vllm/model_loader.py              |  101 +
 nemo/export/vllm/tokenizer_group.py           |   75 +
 nemo/export/vllm_exporter.py                  |  538 +++++
 nemo/export/vllm_hf_exporter.py               |  132 ++
 47 files changed, 12766 insertions(+), 1287 deletions(-)
 mode change 100755 => 100644 nemo/collections/vlm/qwen2vl/model/qwen2vl.py
 create mode 100644 nemo/export/__init__.py
 create mode 100644 nemo/export/multimodal/__init__.py
 create mode 100644 nemo/export/multimodal/build.py
 create mode 100644 nemo/export/multimodal/converter.py
 create mode 100644 nemo/export/multimodal/run.py
 create mode 100755 nemo/export/onnx_llm_exporter.py
 create mode 100644 nemo/export/quantize/__init__.py
 create mode 100644 nemo/export/quantize/quantizer.py
 create mode 100644 nemo/export/sentencepiece_tokenizer.py
 create mode 100644 nemo/export/tarutils.py
 create mode 100644 nemo/export/tensorrt_lazy_compiler.py
 create mode 100644 nemo/export/tensorrt_llm.py
 create mode 100644 nemo/export/tensorrt_mm_exporter.py
 create mode 100644 nemo/export/tiktoken_tokenizer.py
 create mode 100644 nemo/export/trt_llm/__init__.py
 create mode 100644 nemo/export/trt_llm/converter/__init__.py
 create mode 100755 nemo/export/trt_llm/converter/model_converter.py
 create mode 100644 nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
 create mode 100755 nemo/export/trt_llm/converter/utils.py
 create mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
 create mode 100644 nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
 create mode 100644 nemo/export/trt_llm/qnemo/__init__.py
 create mode 100644 nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
 create mode 100644 nemo/export/trt_llm/qnemo/tokenizer_utils.py
 create mode 100644 nemo/export/trt_llm/qnemo/utils.py
 create mode 100755 nemo/export/trt_llm/tensorrt_llm_build.py
 create mode 100644 nemo/export/trt_llm/tensorrt_llm_run.py
 create mode 100644 nemo/export/trt_llm/utils.py
 create mode 100644 nemo/export/utils/__init__.py
 create mode 100644 nemo/export/utils/_mock_import.py
 create mode 100644 nemo/export/utils/constants.py
 create mode 100644 nemo/export/utils/lora_converter.py
 create mode 100644 nemo/export/utils/model_loader.py
 create mode 100755 nemo/export/utils/utils.py
 create mode 100644 nemo/export/vllm/__init__.py
 create mode 100644 nemo/export/vllm/engine.py
 create mode 100644 nemo/export/vllm/model_config.py
 create mode 100644 nemo/export/vllm/model_converters.py
 create mode 100644 nemo/export/vllm/model_loader.py
 create mode 100644 nemo/export/vllm/tokenizer_group.py
 create mode 100644 nemo/export/vllm_exporter.py
 create mode 100755 nemo/export/vllm_hf_exporter.py

diff --git a/nemo/collections/llm/gpt/model/deepseek.py b/nemo/collections/llm/gpt/model/deepseek.py
index c6e941bb52f0..e46e2a389434 100644
--- a/nemo/collections/llm/gpt/model/deepseek.py
+++ b/nemo/collections/llm/gpt/model/deepseek.py
@@ -13,19 +13,19 @@
 # limitations under the License.
 import json
 import re
-from dataclasses import asdict, dataclass, field
+from dataclasses import dataclass, field
 from functools import cached_property, partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
+import yaml
 from megatron.core.models.gpt.gpt_layer_specs import get_gpt_decoder_block_spec
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.transformer_config import MLATransformerConfig
 from safetensors.torch import load_file
 from torch import nn
-from transformers import AutoConfig
 
 from nemo.collections.llm.gpt.model.base import (
     HAVE_TE,
@@ -34,6 +34,7 @@
     gpt_data_step,
     torch_dtype_from_dict_config,
 )
+from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import TransformFns, _ModelState
 from nemo.lightning.pytorch.optim import OptimizerModule
@@ -43,14 +44,10 @@
 if TYPE_CHECKING:
     from megatron.core.transformer import ModuleSpec
     from transformers import AutoModelForCausalLM
-    from transformers import DeepseekV3Config as HFDeepseekV3Config
 
     from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
     from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 
-if HAVE_TE:
-    from megatron.core.utils import is_te_min_version
-
 
 @dataclass
 class DeepSeekConfig(MLATransformerConfig, GPTConfig):
@@ -89,7 +86,6 @@ class DeepSeekConfig(MLATransformerConfig, GPTConfig):
     moe_token_dispatcher_type: str = "alltoall"
     moe_router_load_balancing_type: str = 'seq_aux_loss'
     moe_shared_expert_overlap: bool = True
-    moe_router_dtype: Optional[str] = 'fp32'
 
     # MLA
     q_lora_rank: int = 1536
@@ -120,9 +116,6 @@ class DeepSeekConfig(MLATransformerConfig, GPTConfig):
     bias_dropout_fusion: bool = True
     masked_softmax_fusion: bool = True
     gradient_accumulation_fusion: bool = True
-    cross_entropy_loss_fusion: bool = True
-    cross_entropy_fusion_impl: str = "te"
-    moe_permute_fusion: bool = is_te_min_version("2.1.0") if HAVE_TE else False
 
     def __post_init__(self):
         super().__post_init__()
@@ -227,7 +220,6 @@ def apply(self, output_path: Path, convert_mtp: bool = False) -> Path:
         from transformers import AutoModelForCausalLM
 
         self.convert_mtp = convert_mtp
-        self._verify_source()
         source = AutoModelForCausalLM.from_pretrained(str(self), trust_remote_code=True, torch_dtype='auto')
         target = self.init()
         trainer = self.nemo_setup(target)
@@ -241,15 +233,6 @@ def apply(self, output_path: Path, convert_mtp: bool = False) -> Path:
 
         return output_path
 
-    def _verify_source(self):
-        source_config = AutoConfig.from_pretrained(str(self), trust_remote_code=True)
-        assert 'quantization_config' not in source_config, (
-            "HuggingFace cannot load DeepSeek V3's FP8 checkpoint directly. You must convert the checkpoint "
-            "to BF16. See NeMo documentation for more details: "
-            "https://nemo-framework-tme.gitlab-master-pages.nvidia.com/documentation/user-guide/latest/llms/"
-            "deepseek_v3.html#nemo-2-0-finetuning-recipes "
-        )
-
     def _modify_source_state(self, source: nn.Module) -> _ModelState:
         """
         In deepseek, HF weight `model.layers.*.post_attention_layernorm.weight` is mapped to mcore weight
@@ -436,7 +419,7 @@ def config(self) -> DeepSeekConfig:
             moe_router_num_groups=source.n_group,
             moe_router_group_topk=source.topk_group,
             moe_router_topk_scaling_factor=source.routed_scaling_factor,
-            moe_aux_loss_coeff=getattr(source, "aux_loss_alpha", 0.001),
+            moe_aux_loss_coeff=source.aux_loss_alpha,
             kv_lora_rank=source.kv_lora_rank,
             qk_head_dim=source.qk_nope_head_dim,
             qk_pos_emb_head_dim=source.qk_rope_head_dim,
@@ -471,27 +454,38 @@ def init(self, dtype=torch.bfloat16, model_name="deepseek-ai/DeepSeek-V3") -> "A
             type(hf_model).register_for_auto_class("AutoModelForCausalLM")
             return hf_model
 
-    def _detect_hf_deepseek_version(self, source_config: Dict[str, Any]) -> str:
+    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
         """
-        Detect the HF DeepSeek version based on the source NeMo config.
+        This function loads the state dict directly from a distributed checkpoint, and modifies the state dict
+        so that it is consistent with the key names you would get from loading the checkpoint into a model.
+        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
 
         Args:
-            source_config (Dict[str, Any]): The source NeMo model config.
+            path (Path): The path from which the model will be loaded.
 
-        Returns:
-            str: The DeepSeek version in the Hugging Face Hub convention.
+        Returns
+        -------
+            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
         """
-        if source_config['moe_router_enable_expert_bias']:
-            target_model_name = "deepseek-ai/DeepSeek-V3"
-        elif source_config['q_lora_rank'] is not None:
-            target_model_name = "deepseek-ai/DeepSeek-V2"
-        else:
-            target_model_name = "deepseek-ai/DeepSeek-V2-Lite"
-        logging.info(
-            f"Your model is determined to be {target_model_name} based on the config. If this is not correct, "
-            f"please pass in a local HF checkpoint."
-        )
-        return target_model_name
+        model_yaml = path / "context" / "model.yaml"
+        if not model_yaml.exists():
+            raise FileNotFoundError("model.yaml is not found in the context folder of the checkpoint.")
+        with open(model_yaml, 'r') as stream:
+            config = yaml.safe_load(stream)
+
+        dist_ckpt_folder = path / "weights"
+        state_dict = {}
+        for k, v in load_distributed_model_weights(dist_ckpt_folder, True).items():
+            if '_extra_state' in k:
+                continue
+            new_k = k.replace("module.", "")
+            if '.experts.experts.' in k:
+                # split experts into multiple tensors
+                for i in range(v.size(0)):
+                    state_dict[new_k.replace(".experts.experts.", ".experts.") + str(i)] = v[i]
+            else:
+                state_dict[new_k] = v
+        return state_dict, config['config']
 
     def apply(self, output_path: Path, target_model_name=None) -> Path:
         logging.info("Loading DeepSeek NeMo checkpoint. This may take a while...")
@@ -499,12 +493,21 @@ def apply(self, output_path: Path, target_model_name=None) -> Path:
         logging.info("DeepSeek NeMo checkpoint loaded.")
         if target_model_name is None:
             # Before DeepSeek is fully supported by HF, it is necessary to pass in a local HF checkpoint that
-            # is used to initialize the HF model.
+            # is used to initialize the HF model. Otherwise, a default is inferred from the config below.
             logging.warning(
                 "Before DeepSeek is officially supported in HF, you should pass in a local HF "
                 "checkpoint using llm.export_ckpt(..., target_model_name=)"
             )
-            target_model_name = self._detect_hf_deepseek_version(source_config)
+            if source_config['moe_router_enable_expert_bias']:
+                target_model_name = "deepseek-ai/DeepSeek-V3"
+            elif source_config['q_lora_rank'] is not None:
+                target_model_name = "deepseek-ai/DeepSeek-V2"
+            else:
+                target_model_name = "deepseek-ai/DeepSeek-V2-Lite"
+            logging.info(
+                f"Your model is determined to be {target_model_name} based on the config. If this is not correct, "
+                f"please pass in a local HF checkpoint."
+            )
 
         target = self.init(torch_dtype_from_dict_config(source_config), model_name=target_model_name)
         target = self.convert_state(source, target, source_config)
@@ -597,7 +600,6 @@ def convert_state(self, source, target, source_config):
             target,
             mapping=mapping,
             transforms=transforms,
-            cast_dtype=torch.bfloat16,
         )
 
     def _modify_source_state(self, source: Dict[str, Any], source_config: Dict[str, Any]) -> _ModelState:
@@ -619,60 +621,6 @@ def _modify_source_state(self, source: Dict[str, Any], source_config: Dict[str,
     def tokenizer(self) -> 'AutoTokenizer':
         return io.load_context(self, subpath="model").tokenizer
 
-    @property
-    def config(self) -> "HFDeepseekV3Config":
-        """Create a HF DeepseekV3Config from the NeMo model config.
-
-        Translates the NeMo configuration parameters to the equivalent HF
-        configuration.
-
-        Currently only supports DeepseekV3Config based on availability
-        in the Transformers library.
-
-        Returns:
-            HFDeepseekV3Config: HF configuration for DeepSeekV3 models
-        """
-        # TODO: Get config for all DeepSeek model variants once available in transformers
-
-        from transformers import DeepseekV3Config as HFDeepseekV3Config
-
-        source: DeepSeekV3Config = io.load_context(str(self)).model.config
-
-        target_model_name = self._detect_hf_deepseek_version(asdict(source))
-        if target_model_name != "deepseek-ai/DeepSeek-V3":
-            raise ValueError(f"Getting config for model other than {target_model_name} is not supported.")
-
-        # Figure out the number of zeros in the prefix of moe_layer_freq array
-        # for the HF first_k_dense_replace parameter and validate the reminder:
-        k = 0
-        while k < len(source.moe_layer_freq) and source.moe_layer_freq[k] == 0:
-            k += 1
-        assert all(x == 1 for x in source.moe_layer_freq[k:])
-
-        return HFDeepseekV3Config(
-            architectures=["DeepseekV3ForCausalLM"],
-            num_hidden_layers=source.num_layers,
-            hidden_size=source.hidden_size,
-            intermediate_size=source.ffn_hidden_size,
-            num_attention_heads=source.num_attention_heads,
-            q_lora_rank=source.q_lora_rank,
-            qk_nope_head_dim=source.qk_head_dim,
-            qk_rope_head_dim=source.qk_pos_emb_head_dim,
-            v_head_dim=source.v_head_dim,
-            kv_lora_rank=source.kv_lora_rank,
-            num_key_value_heads=source.kv_channels,
-            n_routed_experts=source.num_moe_experts,
-            moe_intermediate_size=source.moe_ffn_hidden_size,
-            first_k_dense_replace=k,
-            num_experts_per_tok=source.moe_router_topk,
-            n_group=source.moe_router_num_groups,
-            topk_group=source.moe_router_group_topk,
-            routed_scaling_factor=source.moe_router_topk_scaling_factor,
-            aux_loss_alpha=source.moe_aux_loss_coeff,
-            max_position_embeddings=source.max_position_embeddings,
-            vocab_size=self.tokenizer.vocab_size,
-        )
-
 
 __all__ = [
     "DeepSeekConfig",
diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py
index f628219e4bfa..9665f92fb3f3 100644
--- a/nemo/collections/llm/gpt/model/llama.py
+++ b/nemo/collections/llm/gpt/model/llama.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,15 +17,17 @@
 from dataclasses import dataclass, field
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Annotated, Callable, List, Optional, Union
+from typing import TYPE_CHECKING, Annotated, Any, Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
+import yaml
 from torch import nn
 
 from nemo.collections.llm.gpt.model.base import GPTConfig, GPTModel, torch_dtype_from_mcore_config
 from nemo.collections.llm.gpt.model.llama4_utils import get_llama4_layer_spec
 from nemo.collections.llm.utils import Config
+from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import OptimizerModule, io, teardown
 from nemo.lightning.ckpt_utils import ADAPTER_META_FILENAME
 from nemo.lightning.io.pl import ckpt_to_weights_subdir
@@ -77,7 +79,6 @@ class LlamaConfig(GPTConfig):
     persist_layer_norm: bool = True
     bias_dropout_fusion: bool = True
     apply_rope_fusion: bool = True
-    use_transformer_engine_op_fuser: Optional[bool] = None
 
 
 @dataclass
@@ -168,7 +169,7 @@ class Llama31Config(Llama3Config):
     old_context_len: int = 8192
     init_method_std: float = 0.02
 
-    def configure_model(self, tokenizer, pre_process=None, post_process=None, vp_stage=None) -> "MCoreGPTModel":
+    def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreGPTModel":
         """Configure and instantiate a Megatron Core Llama 3.1 model.
 
         Extends the base configuration with Llama 3.1 specific RoPE scaling.
@@ -181,7 +182,7 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None, vp_sta
         Returns:
             MCoreGPTModel: Configured Megatron Core GPT model instance
         """
-        model = super().configure_model(tokenizer, pre_process, post_process, vp_stage)
+        model = super().configure_model(tokenizer, pre_process, post_process)
         # Apply rope scaling for Llama3.1 model
         model.rotary_pos_emb.inv_freq = apply_rope_scaling(
             model.rotary_pos_emb.inv_freq,
@@ -291,7 +292,6 @@ class Llama32Config1B(Llama31Config):
     scale_factor: float = 32.0
     share_embeddings_and_output_weights: bool = True
     rotary_base: int = 500_000
-    seq_length: int = 131072
     num_layers: int = 16
     hidden_size: int = 2048
     ffn_hidden_size: int = 8192
@@ -311,7 +311,6 @@ class Llama32Config3B(Llama31Config):
     scale_factor: int = 32
     share_embeddings_and_output_weights: bool = True
     rotary_base: int = 500_000
-    seq_length: int = 131072
     num_layers: int = 28
     hidden_size: int = 3072
     ffn_hidden_size: int = 8192
@@ -766,7 +765,7 @@ def make_vocab_size_divisible_by(vocab_size):
             params_dtype=dtype_from_hf(source),
             generation_config=generation_config,
             vocab_size=source.vocab_size,
-            kv_channels=getattr(source, "head_dim", None),
+            kv_channels=getattr(source, "head_dim"),
             **args,
         )
 
@@ -901,13 +900,6 @@ def convert_state(self, source, target, source_config=None):
                     "decoder.layers.*.mlp.experts.linear_fc1.weight": "model.layers.*.feed_forward.experts.gate_up_proj",
                 }
             )
-
-            # Remove the transform with source_key "decoder.layers.*.mlp.linear_fc1.weight" from transforms
-            # Llama4's HF model has a different mapping for the MLP weights (map to feed_forward instead of mlp)
-            transforms = [
-                t for t in transforms if getattr(t, "source_key", None) != "decoder.layers.*.mlp.linear_fc1.weight"
-            ]
-
             transforms.extend(
                 [
                     io.state_transform(
@@ -1055,6 +1047,46 @@ def create_llama4_config(self, source):
         )
         return config
 
+    def ckpt_load(self, path: Path) -> Tuple[Dict, Any]:
+        """
+        This function loads the state dict directly from a distributed checkpoint, and modifies the state dict
+        so that it is consistent with the key names you would get from loading the checkpoint into a model.
+        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
+
+        Args:
+            path (Path): The path from which the model will be loaded.
+
+        Returns
+        -------
+            Tuple[Dict, Any]: The loaded state dict and the yaml config object.
+        """
+        model_yaml = path / "context" / "model.yaml"
+        if not model_yaml.exists():
+            raise FileNotFoundError("model.yaml is not found in the context folder of the checkpoint.")
+        with open(model_yaml, 'r') as stream:
+            config = yaml.safe_load(stream)
+
+        dist_ckpt_folder = path / "weights"
+        state_dict = {}
+
+        dict_to_obj = lambda d: (
+            type('Config', (), {kk: dict_to_obj(vv) for kk, vv in d.items()}) if isinstance(d, dict) else d
+        )
+        config_obj = dict_to_obj(config['config'])
+        langauge_layers = config_obj.num_layers
+        distributed_model_weights = load_distributed_model_weights(dist_ckpt_folder, True).items()
+        for k, v in distributed_model_weights:
+            if '_extra_state' in k:
+                continue
+            new_k = k.replace("module.", "")
+            if 'layers' in new_k and v.size(0) == langauge_layers:
+                # Only split layers
+                for i in range(v.size(0)):
+                    state_dict[new_k.replace('layers', f'layers.{str(i)}')] = v[i]
+            state_dict[new_k] = v
+
+        return state_dict, config_obj
+
     def _modify_llama4_source_state(self, state_dict, source_config):
         """
         For MoE layer, we transpose the gate_up_proj and down_proj to match HF implementation.
@@ -1128,7 +1160,7 @@ def apply(self, output_path: Path) -> Path:
         """
         from nemo.collections.llm.peft import CanonicalLoRA, DoRA, LoRA
 
-        self.peft_obj: Union[LoRA, DoRA, CanonicalLoRA] = io.load_context(str(self), subpath="model.model_transform")
+        self.peft_obj: Union[LoRA, DoRA, CanonicalLoRA] = io.load_context(str(self)).model.model_transform
 
         source, _ = self.nemo_load(str(self))
         target = self.init(torch_dtype_from_mcore_config(source.config))
diff --git a/nemo/collections/vlm/llama4/model/llama4_omni.py b/nemo/collections/vlm/llama4/model/llama4_omni.py
index f1ad60b7345f..5e1edd0a091a 100644
--- a/nemo/collections/vlm/llama4/model/llama4_omni.py
+++ b/nemo/collections/vlm/llama4/model/llama4_omni.py
@@ -14,9 +14,10 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict, Tuple
 
 import torch
+import yaml
 from torch import nn
 
 from nemo.collections.common.tokenizers import TokenizerSpec
@@ -26,6 +27,7 @@
 from nemo.collections.vlm.llama4.model.vision import Llama4VisionConfig
 from nemo.collections.vlm.neva.model.llava import export_qkv, export_qkv_bias, import_qkv
 from nemo.collections.vlm.vision.base import MultimodalProjectorConfig
+from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import TransformFns, _ModelState
 from nemo.utils import logging
@@ -622,6 +624,42 @@ def tokenizer(self) -> "TokenizerSpec":
         """
         return io.load_context(str(self), subpath="model").tokenizer
 
+    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
+        """
+        This function loads the state dict directly from a distributed checkpoint, and modifies the state dict
+        so that it is consistent with the key names you would get from loading the checkpoint into a model.
+        This is a more memory-efficient method to obtain a state dict without initializing the nemo model.
+
+        Args:
+            path (Path): The path from which the model will be loaded.
+
+        Returns
+        -------
+            Tuple[Dict, Dict]: The loaded state dict and the yaml config dict.
+        """
+        model_yaml = path / "context" / "model.yaml"
+        if not model_yaml.exists():
+            raise FileNotFoundError("model.yaml is not found in the context folder of the checkpoint.")
+        with open(model_yaml, 'r') as stream:
+            config = yaml.safe_load(stream)
+
+        dist_ckpt_folder = path / "weights"
+        state_dict = {}
+
+        langauge_layers = config['config']['language_transformer_config']['num_layers']
+        vision_layers = config['config']['vision_transformer_config']['num_layers']
+        distributed_model_weights = load_distributed_model_weights(dist_ckpt_folder, True).items()
+        for k, v in distributed_model_weights:
+            if '_extra_state' in k:
+                continue
+            new_k = k.replace("module.", "")
+            if 'layers' in new_k and (v.size(0) == langauge_layers or v.size(0) == vision_layers):
+                # Only split layers
+                for i in range(v.size(0)):
+                    state_dict[new_k.replace('layers', f'layers.{str(i)}')] = v[i]
+            state_dict[new_k] = v
+        return state_dict, config['config']
+
     def _modify_llama4_source_state(self, state_dict, source_config):
         """
         For MoE layer, we transpose the gate_up_proj and down_proj to match HF implementation.
diff --git a/nemo/collections/vlm/mllama/model/mllama.py b/nemo/collections/vlm/mllama/model/mllama.py
index 3522d8dcd1cb..3a9e29c75d45 100644
--- a/nemo/collections/vlm/mllama/model/mllama.py
+++ b/nemo/collections/vlm/mllama/model/mllama.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,12 +21,8 @@
 import torch.distributed
 from megatron.core.transformer import TransformerConfig
 from torch import Tensor
-from transformers import MllamaConfig as HFMllamaConfig
-from transformers import MllamaForConditionalGeneration
-from transformers.models.mllama.configuration_mllama import MllamaTextConfig, MllamaVisionConfig
 
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
-from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 from nemo.collections.vlm.mllama.model.base import (
     CrossAttentionTextConfig,
     CrossAttentionVisionConfig,
@@ -36,7 +32,6 @@
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import _ModelState
 from nemo.lightning.pytorch.utils import dtype_from_hf
-from nemo.utils import logging
 
 # pylint: disable=C0115,C0116,C0301
 
@@ -89,7 +84,9 @@ def local_path(self, base_path: Optional[Path] = None) -> Path:
         return output_path
 
     def apply(self, output_path: Path) -> Path:
-        source = MllamaForConditionalGeneration.from_pretrained(str(self), torch_dtype="auto")
+        from transformers import MllamaForConditionalGeneration
+
+        source = MllamaForConditionalGeneration.from_pretrained(str(self), torch_dtype='auto')
 
         state_dict = _rename_xattn_layer_nums_hf(source.state_dict())
         source = _ModelState(state_dict)
@@ -110,69 +107,69 @@ def convert_state(self, source, target):
         transforms = []
         mapping.update(
             {
-                "model.language_model.layers.*.self_attn.o_proj.weight": "language_model.decoder.layers.*.self_attention.linear_proj.weight",
-                "model.language_model.xattn_layers.*.cross_attn.o_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_proj.weight",
-                "model.language_model.xattn_layers.*.cross_attn.q_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.weight",
-                "model.language_model.norm.weight": "language_model.decoder.final_layernorm.weight",
-                "lm_head.weight": "language_model.output_layer.weight",
-                "model.language_model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
-                "model.language_model.layers.*.mlp.down_proj.weight": "language_model.decoder.layers.*.mlp.linear_fc2.weight",
-                "model.language_model.layers.*.input_layernorm.weight": "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
-                "model.language_model.xattn_layers.*.cross_attn.k_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.k_layernorm.weight",
-                "model.language_model.xattn_layers.*.input_layernorm.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.layer_norm_weight",
-                "model.language_model.xattn_layers.*.cross_attn.q_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.q_layernorm.weight",
-                "model.language_model.xattn_layers.*.post_attention_layernorm.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc1.layer_norm_weight",
-                "model.language_model.xattn_layers.*.mlp.down_proj.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc2.weight",
+                "language_model.model.layers.*.self_attn.o_proj.weight": "language_model.decoder.layers.*.self_attention.linear_proj.weight",
+                "language_model.model.xattn_layers.*.cross_attn.o_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_proj.weight",
+                "language_model.model.xattn_layers.*.cross_attn.q_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.weight",
+                "language_model.model.norm.weight": "language_model.decoder.final_layernorm.weight",
+                "language_model.lm_head.weight": "language_model.output_layer.weight",
+                "language_model.model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
+                "language_model.model.layers.*.mlp.down_proj.weight": "language_model.decoder.layers.*.mlp.linear_fc2.weight",
+                "language_model.model.layers.*.input_layernorm.weight": "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
+                "language_model.model.xattn_layers.*.cross_attn.k_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.k_layernorm.weight",
+                "language_model.model.xattn_layers.*.input_layernorm.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.layer_norm_weight",
+                "language_model.model.xattn_layers.*.cross_attn.q_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.q_layernorm.weight",
+                "language_model.model.xattn_layers.*.post_attention_layernorm.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc1.layer_norm_weight",
+                "language_model.model.xattn_layers.*.mlp.down_proj.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc2.weight",
             }
         )
 
         transforms.extend(
             [
                 io.state_transform(
-                    source_key="model.language_model.xattn_layers.*.cross_attn_attn_gate",
+                    source_key="language_model.model.xattn_layers.*.cross_attn_attn_gate",
                     target_key="language_model.decoder.xattn_layers.*.gate_attn",
                     fn=_import_gate,
                 ),
                 io.state_transform(
-                    source_key="model.language_model.xattn_layers.*.cross_attn_mlp_gate",
+                    source_key="language_model.model.xattn_layers.*.cross_attn_mlp_gate",
                     target_key="language_model.decoder.xattn_layers.*.gate_ffn",
                     fn=_import_gate,
                 ),
                 io.state_transform(
                     source_key=(
-                        "model.language_model.layers.*.self_attn.q_proj.weight",
-                        "model.language_model.layers.*.self_attn.k_proj.weight",
-                        "model.language_model.layers.*.self_attn.v_proj.weight",
+                        "language_model.model.layers.*.self_attn.q_proj.weight",
+                        "language_model.model.layers.*.self_attn.k_proj.weight",
+                        "language_model.model.layers.*.self_attn.v_proj.weight",
                     ),
                     target_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight",
                     fn=_import_text_qkv,
                 ),
                 io.state_transform(
                     source_key=(
-                        "model.language_model.layers.*.mlp.gate_proj.weight",
-                        "model.language_model.layers.*.mlp.up_proj.weight",
+                        "language_model.model.layers.*.mlp.gate_proj.weight",
+                        "language_model.model.layers.*.mlp.up_proj.weight",
                     ),
                     target_key="language_model.decoder.layers.*.mlp.linear_fc1.weight",
                     fn=_import_simple_concat,
                 ),
                 io.state_transform(
                     source_key=(
-                        "model.language_model.xattn_layers.*.cross_attn.k_proj.weight",
-                        "model.language_model.xattn_layers.*.cross_attn.v_proj.weight",
+                        "language_model.model.xattn_layers.*.cross_attn.k_proj.weight",
+                        "language_model.model.xattn_layers.*.cross_attn.v_proj.weight",
                     ),
                     target_key="language_model.decoder.xattn_layers.*.cross_attention.linear_kv.weight",
                     fn=_import_text_kv,
                 ),
                 io.state_transform(
                     source_key=(
-                        "model.language_model.xattn_layers.*.mlp.gate_proj.weight",
-                        "model.language_model.xattn_layers.*.mlp.up_proj.weight",
+                        "language_model.model.xattn_layers.*.mlp.gate_proj.weight",
+                        "language_model.model.xattn_layers.*.mlp.up_proj.weight",
                     ),
                     target_key="language_model.decoder.xattn_layers.*.mlp.linear_fc1.weight",
                     fn=_import_simple_concat,
                 ),
                 io.state_transform(
-                    source_key="model.language_model.embed_tokens.weight",
+                    source_key="language_model.model.embed_tokens.weight",
                     target_key=(
                         "language_model.embedding.word_embeddings.weight",
                         "language_model.learnable_embedding.weight",
@@ -185,64 +182,64 @@ def convert_state(self, source, target):
         v = "vision_model.vision_encoder"
         mapping.update(
             {
-                "model.vision_model.global_transformer.layers.*.self_attn.o_proj.weight": f"{v}.global_transformer.layers.*.self_attention.linear_proj.weight",
-                "model.vision_model.global_transformer.layers.*.gate_attn": f"{v}.global_transformer.layers.*.gate_attn",
-                "model.vision_model.global_transformer.layers.*.gate_ffn": f"{v}.global_transformer.layers.*.gate_ffn",
-                "model.vision_model.global_transformer.layers.*.input_layernorm.bias": f"{v}.global_transformer.layers.*.input_layernorm.bias",
-                "model.vision_model.global_transformer.layers.*.input_layernorm.weight": f"{v}.global_transformer.layers.*.input_layernorm.weight",
-                "model.vision_model.global_transformer.layers.*.post_attention_layernorm.bias": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.bias",
-                "model.vision_model.global_transformer.layers.*.post_attention_layernorm.weight": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.weight",
-                "model.vision_model.global_transformer.layers.*.mlp.fc1.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc1.bias",
-                "model.vision_model.global_transformer.layers.*.mlp.fc1.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc1.weight",
-                "model.vision_model.global_transformer.layers.*.mlp.fc2.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc2.bias",
-                "model.vision_model.global_transformer.layers.*.mlp.fc2.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc2.weight",
-                "model.vision_model.transformer.layers.*.self_attn.o_proj.weight": f"{v}.transformer.layers.*.self_attention.linear_proj.weight",
-                "model.vision_model.transformer.layers.*.input_layernorm.bias": f"{v}.transformer.layers.*.input_layernorm.bias",
-                "model.vision_model.transformer.layers.*.input_layernorm.weight": f"{v}.transformer.layers.*.input_layernorm.weight",
-                "model.vision_model.transformer.layers.*.post_attention_layernorm.bias": f"{v}.transformer.layers.*.pre_mlp_layernorm.bias",
-                "model.vision_model.transformer.layers.*.post_attention_layernorm.weight": f"{v}.transformer.layers.*.pre_mlp_layernorm.weight",
-                "model.vision_model.transformer.layers.*.mlp.fc1.bias": f"{v}.transformer.layers.*.mlp.linear_fc1.bias",
-                "model.vision_model.transformer.layers.*.mlp.fc1.weight": f"{v}.transformer.layers.*.mlp.linear_fc1.weight",
-                "model.vision_model.transformer.layers.*.mlp.fc2.bias": f"{v}.transformer.layers.*.mlp.linear_fc2.bias",
-                "model.vision_model.transformer.layers.*.mlp.fc2.weight": f"{v}.transformer.layers.*.mlp.linear_fc2.weight",
-                "model.vision_model.class_embedding": f"{v}.class_embedding",
-                "model.vision_model.gated_positional_embedding.embedding": f"{v}.positional_embedding",
-                "model.vision_model.gated_positional_embedding.tile_embedding.weight": f"{v}.gated_tile_positional_embedding.weight",
-                "model.vision_model.gated_positional_embedding.gate": f"{v}.gated_positional_embedding_gate",
-                "model.vision_model.layernorm_post.bias": f"{v}.ln_post.bias",
-                "model.vision_model.layernorm_post.weight": f"{v}.ln_post.weight",
-                "model.vision_model.layernorm_pre.bias": f"{v}.ln_pre.bias",
-                "model.vision_model.layernorm_pre.weight": f"{v}.ln_pre.weight",
-                "model.vision_model.post_tile_positional_embedding.embedding.weight": f"{v}.post_tile_pos_embed.embedding.weight",
-                "model.vision_model.post_tile_positional_embedding.gate": f"{v}.post_tile_pos_embed.gate",
-                "model.vision_model.pre_tile_positional_embedding.embedding.weight": f"{v}.pre_tile_pos_embed.embedding.weight",
-                "model.vision_model.pre_tile_positional_embedding.gate": f"{v}.pre_tile_pos_embed.gate",
-                "model.multi_modal_projector.bias": "vision_model.vision_projection.encoder.bias",
-                "model.multi_modal_projector.weight": "vision_model.vision_projection.encoder.weight",
+                "vision_model.global_transformer.layers.*.self_attn.o_proj.weight": f"{v}.global_transformer.layers.*.self_attention.linear_proj.weight",
+                "vision_model.global_transformer.layers.*.gate_attn": f"{v}.global_transformer.layers.*.gate_attn",
+                "vision_model.global_transformer.layers.*.gate_ffn": f"{v}.global_transformer.layers.*.gate_ffn",
+                "vision_model.global_transformer.layers.*.input_layernorm.bias": f"{v}.global_transformer.layers.*.input_layernorm.bias",
+                "vision_model.global_transformer.layers.*.input_layernorm.weight": f"{v}.global_transformer.layers.*.input_layernorm.weight",
+                "vision_model.global_transformer.layers.*.post_attention_layernorm.bias": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.bias",
+                "vision_model.global_transformer.layers.*.post_attention_layernorm.weight": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.weight",
+                "vision_model.global_transformer.layers.*.mlp.fc1.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc1.bias",
+                "vision_model.global_transformer.layers.*.mlp.fc1.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc1.weight",
+                "vision_model.global_transformer.layers.*.mlp.fc2.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc2.bias",
+                "vision_model.global_transformer.layers.*.mlp.fc2.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc2.weight",
+                "vision_model.transformer.layers.*.self_attn.o_proj.weight": f"{v}.transformer.layers.*.self_attention.linear_proj.weight",
+                "vision_model.transformer.layers.*.input_layernorm.bias": f"{v}.transformer.layers.*.input_layernorm.bias",
+                "vision_model.transformer.layers.*.input_layernorm.weight": f"{v}.transformer.layers.*.input_layernorm.weight",
+                "vision_model.transformer.layers.*.post_attention_layernorm.bias": f"{v}.transformer.layers.*.pre_mlp_layernorm.bias",
+                "vision_model.transformer.layers.*.post_attention_layernorm.weight": f"{v}.transformer.layers.*.pre_mlp_layernorm.weight",
+                "vision_model.transformer.layers.*.mlp.fc1.bias": f"{v}.transformer.layers.*.mlp.linear_fc1.bias",
+                "vision_model.transformer.layers.*.mlp.fc1.weight": f"{v}.transformer.layers.*.mlp.linear_fc1.weight",
+                "vision_model.transformer.layers.*.mlp.fc2.bias": f"{v}.transformer.layers.*.mlp.linear_fc2.bias",
+                "vision_model.transformer.layers.*.mlp.fc2.weight": f"{v}.transformer.layers.*.mlp.linear_fc2.weight",
+                "vision_model.class_embedding": f"{v}.class_embedding",
+                "vision_model.gated_positional_embedding.embedding": f"{v}.positional_embedding",
+                "vision_model.gated_positional_embedding.tile_embedding.weight": f"{v}.gated_tile_positional_embedding.weight",
+                "vision_model.gated_positional_embedding.gate": f"{v}.gated_positional_embedding_gate",
+                "vision_model.layernorm_post.bias": f"{v}.ln_post.bias",
+                "vision_model.layernorm_post.weight": f"{v}.ln_post.weight",
+                "vision_model.layernorm_pre.bias": f"{v}.ln_pre.bias",
+                "vision_model.layernorm_pre.weight": f"{v}.ln_pre.weight",
+                "vision_model.post_tile_positional_embedding.embedding.weight": f"{v}.post_tile_pos_embed.embedding.weight",
+                "vision_model.post_tile_positional_embedding.gate": f"{v}.post_tile_pos_embed.gate",
+                "vision_model.pre_tile_positional_embedding.embedding.weight": f"{v}.pre_tile_pos_embed.embedding.weight",
+                "vision_model.pre_tile_positional_embedding.gate": f"{v}.pre_tile_pos_embed.gate",
+                "multi_modal_projector.bias": "vision_model.vision_projection.encoder.bias",
+                "multi_modal_projector.weight": "vision_model.vision_projection.encoder.weight",
             }
         )
         transforms.extend(
             [
                 io.state_transform(
                     source_key=(
-                        "model.vision_model.global_transformer.layers.*.self_attn.q_proj.weight",
-                        "model.vision_model.global_transformer.layers.*.self_attn.k_proj.weight",
-                        "model.vision_model.global_transformer.layers.*.self_attn.v_proj.weight",
+                        "vision_model.global_transformer.layers.*.self_attn.q_proj.weight",
+                        "vision_model.global_transformer.layers.*.self_attn.k_proj.weight",
+                        "vision_model.global_transformer.layers.*.self_attn.v_proj.weight",
                     ),
                     target_key=(f"{v}.global_transformer.layers.*.self_attention.linear_qkv.weight"),
                     fn=_import_vision_qkv,
                 ),
                 io.state_transform(
                     source_key=(
-                        "model.vision_model.transformer.layers.*.self_attn.q_proj.weight",
-                        "model.vision_model.transformer.layers.*.self_attn.k_proj.weight",
-                        "model.vision_model.transformer.layers.*.self_attn.v_proj.weight",
+                        "vision_model.transformer.layers.*.self_attn.q_proj.weight",
+                        "vision_model.transformer.layers.*.self_attn.k_proj.weight",
+                        "vision_model.transformer.layers.*.self_attn.v_proj.weight",
                     ),
                     target_key=(f"{v}.transformer.layers.*.self_attention.linear_qkv.weight"),
                     fn=_import_vision_qkv,
                 ),
                 io.state_transform(
-                    source_key="model.vision_model.patch_embedding.weight",
+                    source_key="vision_model.patch_embedding.weight",
                     target_key=f"{v}.conv1._linear.weight",
                     fn=_import_patch_embedding_hf,
                 ),
@@ -274,8 +271,7 @@ def _calculate_num_layers(num_hidden_layers, cross_attention_layers):
             rotary_base=source.text_config.rope_theta,
             seq_length=8192,
             num_layers=_calculate_num_layers(
-                source.text_config.num_hidden_layers,
-                source.text_config.cross_attention_layers,
+                source.text_config.num_hidden_layers, source.text_config.cross_attention_layers
             ),
             num_cross_attention_layers=len(source.text_config.cross_attention_layers),
             hidden_size=source.text_config.hidden_size,
@@ -302,356 +298,16 @@ def _vision_model_config(self, source) -> Optional[CrossAttentionVisionConfig]:
         )
 
 
-@io.model_exporter(MLlamaModel, "hf")
-class HFMLlamaExporter(io.ModelConnector[MLlamaModel, "MllamaForConditionalGeneration"]):
-    """
-    Exporter class for converting NeMo MLlama model to HuggingFace format.
-
-    Inherits:
-        io.ModelConnector: Connector interface to handle setup, save, and load using the Lightning framework.
-
-    Methods:
-        init: Initializes a new HuggingFace MLlama model instance.
-        apply: Converts the NeMo model to HuggingFace format and saves it.
-        convert_state: Maps and transforms the state dictionary from NeMo to HuggingFace format.
-        config: Generates and returns the HuggingFace MLlama config for the model.
-    """
-
-    def init(self, dtype=torch.bfloat16) -> "MllamaForConditionalGeneration":
-        """
-        Initializes a HuggingFace MllamaForConditionalGeneration model.
-
-        Args:
-            dtype: The data type to use for the model (default: torch.bfloat16)
-
-        Returns:
-            MllamaForConditionalGeneration: A HuggingFace MLlama model initialized with the configuration.
-        """
-        from transformers.modeling_utils import no_init_weights
-
-        with no_init_weights():
-            return MllamaForConditionalGeneration._from_config(self.config, torch_dtype=dtype)
-
-    def apply(self, output_path: Path) -> Path:
-        """
-        Converts the NeMo MLlama model to HuggingFace format and saves it to the specified path.
-
-        Args:
-            output_path (Path): The path where the converted HuggingFace model will be saved.
-
-        Returns:
-            Path: The output path where the HuggingFace model was saved.
-        """
-        logging.info("Loading MLlama NeMo checkpoint. This may take a while...")
-        source, source_config = self.ckpt_load(self)
-        logging.info("MLlama NeMo checkpoint loaded.")
-        logging.info("Initializing the HF model..")
-        target = self.init()
-        logging.info("Start Converting the model..")
-        target = self.convert_state(source, target, source_config)
-        target = target.cpu()
-        target.save_pretrained(output_path)
-
-        try:
-            self.tokenizer.tokenizer.save_pretrained(output_path)
-        except Exception:
-            logging.warning("Failed to save tokenizer")
-
-        print(f"Converted MLlama model saved to {output_path}")
-
-        return output_path
-
-    def convert_state(self, source, target, source_config):
-        # pylint: disable=C0115,C0116,line-too-long
-        """
-        Maps and transforms the state dictionary from NeMo to HuggingFace format.
-
-        Args:
-            source: The source NeMo model.
-            target: The target HuggingFace model.
-
-        Returns:
-            The target HuggingFace model with the converted state.
-        """
-        source = self._modify_mllama_source_state(source, source_config)
-        mapping = {}
-        transforms = []
-        # Define the state mapping from NeMo to HuggingFace
-        mapping.update(
-            {
-                "language_model.decoder.layers.*.self_attention.linear_proj.weight": "model.language_model.layers.*.self_attn.o_proj.weight",
-                "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.language_model.layers.*.input_layernorm.weight",
-                "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.language_model.layers.*.post_attention_layernorm.weight",
-                "language_model.decoder.layers.*.mlp.linear_fc2.weight": "model.language_model.layers.*.mlp.down_proj.weight",
-                "language_model.decoder.xattn_layers.*.cross_attention.q_layernorm.weight": "model.language_model.layers.*.cross_attn.q_norm.weight",
-                "language_model.decoder.xattn_layers.*.cross_attention.linear_q.weight": "model.language_model.layers.*.cross_attn.q_proj.weight",
-                "language_model.decoder.xattn_layers.*.cross_attention.k_layernorm.weight": "model.language_model.layers.*.cross_attn.k_norm.weight",
-                "language_model.decoder.xattn_layers.*.cross_attention.linear_proj.weight": "model.language_model.layers.*.cross_attn.o_proj.weight",
-                "language_model.decoder.final_layernorm.weight": "model.language_model.norm.weight",
-                "language_model.output_layer.weight": "lm_head.weight",
-            }
-        )
-        transforms.extend(
-            [
-                io.state_transform(
-                    source_key="language_model.decoder.xattn_layers.*.gate_attn",
-                    target_key="model.language_model.layers.*.cross_attn_attn_gate",
-                    fn=_export_gate,
-                ),
-                io.state_transform(
-                    source_key="language_model.decoder.xattn_layers.*.gate_ffn",
-                    target_key="model.language_model.layers.*.cross_attn_mlp_gate",
-                    fn=_export_gate,
-                ),
-                io.state_transform(
-                    source_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight",
-                    target_key=(
-                        "model.language_model.layers.*.self_attn.q_proj.weight",
-                        "model.language_model.layers.*.self_attn.k_proj.weight",
-                        "model.language_model.layers.*.self_attn.v_proj.weight",
-                    ),
-                    fn=_export_text_qkv,
-                ),
-                io.state_transform(
-                    source_key="language_model.decoder.layers.*.mlp.linear_fc1.weight",
-                    target_key=(
-                        "model.language_model.layers.*.mlp.gate_proj.weight",
-                        "model.language_model.layers.*.mlp.up_proj.weight",
-                    ),
-                    fn=_export_simple_split,
-                ),
-                io.state_transform(
-                    source_key="language_model.decoder.xattn_layers.*.cross_attention.linear_kv.weight",
-                    target_key=(
-                        "model.language_model.layers.*.cross_attn.k_proj.weight",
-                        "model.language_model.layers.*.cross_attn.v_proj.weight",
-                    ),
-                    fn=_export_text_kv,
-                ),
-                io.state_transform(
-                    source_key=(
-                        "language_model.embedding.word_embeddings.weight",
-                        "language_model.learnable_embedding.weight",
-                    ),
-                    target_key="model.language_model.embed_tokens.weight",
-                    fn=_export_embedding_hf,
-                ),
-            ]
-        )
-        v = "vision_model.vision_encoder"
-        mapping.update(
-            {
-                f"{v}.global_transformer.layers.*.self_attention.linear_proj.weight": "model.vision_model.global_transformer.layers.*.self_attn.o_proj.weight",
-                f"{v}.global_transformer.layers.*.gate_attn": "model.vision_model.global_transformer.layers.*.gate_attn",
-                f"{v}.global_transformer.layers.*.gate_ffn": "model.vision_model.global_transformer.layers.*.gate_ffn",
-                f"{v}.global_transformer.layers.*.input_layernorm.bias": "model.vision_model.global_transformer.layers.*.input_layernorm.bias",
-                f"{v}.global_transformer.layers.*.input_layernorm.weight": "model.vision_model.global_transformer.layers.*.input_layernorm.weight",
-                f"{v}.global_transformer.layers.*.pre_mlp_layernorm.bias": "model.vision_model.global_transformer.layers.*.post_attention_layernorm.bias",
-                f"{v}.global_transformer.layers.*.pre_mlp_layernorm.weight": "model.vision_model.global_transformer.layers.*.post_attention_layernorm.weight",
-                f"{v}.global_transformer.layers.*.mlp.linear_fc1.bias": "model.vision_model.global_transformer.layers.*.mlp.fc1.bias",
-                f"{v}.global_transformer.layers.*.mlp.linear_fc1.weight": "model.vision_model.global_transformer.layers.*.mlp.fc1.weight",
-                f"{v}.global_transformer.layers.*.mlp.linear_fc2.bias": "model.vision_model.global_transformer.layers.*.mlp.fc2.bias",
-                f"{v}.global_transformer.layers.*.mlp.linear_fc2.weight": "model.vision_model.global_transformer.layers.*.mlp.fc2.weight",
-                f"{v}.transformer.layers.*.self_attention.linear_proj.weight": "model.vision_model.transformer.layers.*.self_attn.o_proj.weight",
-                f"{v}.transformer.layers.*.input_layernorm.bias": "model.vision_model.transformer.layers.*.input_layernorm.bias",
-                f"{v}.transformer.layers.*.input_layernorm.weight": "model.vision_model.transformer.layers.*.input_layernorm.weight",
-                f"{v}.transformer.layers.*.pre_mlp_layernorm.bias": "model.vision_model.transformer.layers.*.post_attention_layernorm.bias",
-                f"{v}.transformer.layers.*.pre_mlp_layernorm.weight": "model.vision_model.transformer.layers.*.post_attention_layernorm.weight",
-                f"{v}.transformer.layers.*.mlp.linear_fc1.bias": "model.vision_model.transformer.layers.*.mlp.fc1.bias",
-                f"{v}.transformer.layers.*.mlp.linear_fc1.weight": "model.vision_model.transformer.layers.*.mlp.fc1.weight",
-                f"{v}.transformer.layers.*.mlp.linear_fc2.bias": "model.vision_model.transformer.layers.*.mlp.fc2.bias",
-                f"{v}.transformer.layers.*.mlp.linear_fc2.weight": "model.vision_model.transformer.layers.*.mlp.fc2.weight",
-                f"{v}.class_embedding": "model.vision_model.class_embedding",
-                f"{v}.positional_embedding": "model.vision_model.gated_positional_embedding.embedding",
-                f"{v}.gated_tile_positional_embedding.weight": "model.vision_model.gated_positional_embedding.tile_embedding.weight",
-                f"{v}.gated_positional_embedding_gate": "model.vision_model.gated_positional_embedding.gate",
-                f"{v}.ln_post.bias": "model.vision_model.layernorm_post.bias",
-                f"{v}.ln_post.weight": "model.vision_model.layernorm_post.weight",
-                f"{v}.ln_pre.bias": "model.vision_model.layernorm_pre.bias",
-                f"{v}.ln_pre.weight": "model.vision_model.layernorm_pre.weight",
-                f"{v}.post_tile_pos_embed.embedding.weight": "model.vision_model.post_tile_positional_embedding.embedding.weight",
-                f"{v}.post_tile_pos_embed.gate": "model.vision_model.post_tile_positional_embedding.gate",
-                f"{v}.pre_tile_pos_embed.embedding.weight": "model.vision_model.pre_tile_positional_embedding.embedding.weight",
-                f"{v}.pre_tile_pos_embed.gate": "model.vision_model.pre_tile_positional_embedding.gate",
-                "vision_model.vision_projection.encoder.bias": "model.multi_modal_projector.bias",
-                "vision_model.vision_projection.encoder.weight": "model.multi_modal_projector.weight",
-            }
-        )
-        transforms.extend(
-            [
-                io.state_transform(
-                    source_key=(f"{v}.global_transformer.layers.*.self_attention.linear_qkv.weight"),
-                    target_key=(
-                        "model.vision_model.global_transformer.layers.*.self_attn.q_proj.weight",
-                        "model.vision_model.global_transformer.layers.*.self_attn.k_proj.weight",
-                        "model.vision_model.global_transformer.layers.*.self_attn.v_proj.weight",
-                    ),
-                    fn=_export_vision_qkv,
-                ),
-                io.state_transform(
-                    source_key=(f"{v}.transformer.layers.*.self_attention.linear_qkv.weight"),
-                    target_key=(
-                        "model.vision_model.transformer.layers.*.self_attn.q_proj.weight",
-                        "model.vision_model.transformer.layers.*.self_attn.k_proj.weight",
-                        "model.vision_model.transformer.layers.*.self_attn.v_proj.weight",
-                    ),
-                    fn=_export_vision_qkv,
-                ),
-                io.state_transform(
-                    source_key=f"{v}.conv1._linear.weight",
-                    target_key="model.vision_model.patch_embedding.weight",
-                    fn=_export_patch_embedding_hf,
-                ),
-            ]
-        )
-        return io.apply_transforms(source, target, mapping=mapping, transforms=transforms)
-
-    @property
-    def tokenizer(self) -> "TokenizerSpec":
-        """
-        Gets the tokenizer from the loaded model context.
-
-        Returns:
-            The tokenizer specification.
-        """
-        return io.load_context(str(self), subpath="model").tokenizer
-
-    def _modify_mllama_source_state(self, state_dict, source_config):
-        """
-        - Modify state dict to integrate cross-attention layers into self-attention layer.
-        e.g. 11B: 32 self-attn + 8 cross-attn -> 40 layers, 90B: 80 self-attn + 20 cross-attn -> 100 layers
-        - Change the layer index to match the cross_attention_layers in the model config.
-        e.g. 11B: [3, 7, 11, 15, 19, 23, 27, 31] -> [3, 8, 13, 18, 23, 28, 33, 38]
-
-        Args:
-            state_dict: Source model state dict
-            source_config: Model config dict
-
-        Returns:
-            _ModelState: Modified state
-        """
-
-        def convert_layer_num(match):
-            layer_num = int(match.group(1))
-            x_num = (layer_num - 3) // (cross_attention_frequency)
-            if (layer_num - 3) % (cross_attention_frequency) == 0:
-                new_layer_num = x_num + layer_num
-                return f".{new_layer_num}."
-            raise ValueError(
-                f"Unexpected layer_num: {layer_num} (does not align with cross_attention_frequency={cross_attention_frequency})"
-            )
-
-        text_config = source_config.language_model_config
-        cross_attention_frequency = text_config.num_layers // text_config.num_cross_attention_layers
-        total_num_layer = text_config.num_layers + text_config.num_cross_attention_layers
-        prefix = "language_model.decoder"
-
-        new_state_dict = {}
-        # Integrating layer indexes of self-attention and cross-attention
-        for i in range(total_num_layer):
-            cross_num = (i - 3) // (cross_attention_frequency + 1)
-            if (i - 3) % (cross_attention_frequency + 1) == 0:
-                xattn_index = cross_num * cross_attention_frequency + 3
-                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.layer_norm_weight"] = state_dict.pop(
-                    f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.layer_norm_weight"
-                )
-                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc2.weight"] = state_dict.pop(
-                    f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc2.weight"
-                )
-                new_state_dict[f"{prefix}.layers.{i}.self_attention.linear_qkv.layer_norm_weight"] = state_dict.pop(
-                    f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_q.layer_norm_weight"
-                )
-                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.weight"] = state_dict.pop(
-                    f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.weight"
-                )
-            else:
-                attn_index = i - cross_num - 1
-                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.layer_norm_weight"] = state_dict.pop(
-                    f"{prefix}.layers.{attn_index}.mlp.linear_fc1.layer_norm_weight"
-                )
-                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc2.weight"] = state_dict.pop(
-                    f"{prefix}.layers.{attn_index}.mlp.linear_fc2.weight"
-                )
-                new_state_dict[f"{prefix}.layers.{i}.self_attention.linear_qkv.layer_norm_weight"] = state_dict.pop(
-                    f"{prefix}.layers.{attn_index}.self_attention.linear_qkv.layer_norm_weight"
-                )
-                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.weight"] = state_dict.pop(
-                    f"{prefix}.layers.{attn_index}.mlp.linear_fc1.weight"
-                )
-
-        for k, v in new_state_dict.items():
-            state_dict[k] = v
-
-        new_state_dict = {}
-        # Align the cross-attention layer index with HF
-        for k, v in state_dict.items():
-            if "xattn_layers" in k:
-                new_state_dict[re.sub(r"\.(\d+)\.", convert_layer_num, k)] = v
-            else:
-                new_state_dict[k] = v
-
-        source = _ModelState(new_state_dict)
-        return source
-
-    @property
-    def config(self) -> "HFMllamaConfig":
-        """
-        Generates the configuration for the HuggingFace MLlama model based on the NeMo model.
-
-        Returns:
-            HFMllamaConfig: A configuration object for the HuggingFace MLlama model.
-        """
-        source = io.load_context(str(self), subpath="model.config")
-        vision_model_config = source.vision_model_config
-        language_config = source.language_model_config
-
-        vision_config = MllamaVisionConfig(
-            num_hidden_layers=vision_model_config.num_layers,
-            hidden_size=vision_model_config.hidden_size,
-            attention_heads=vision_model_config.num_attention_heads,
-            image_size=vision_model_config.vision_chunk_size,
-            max_num_tiles=vision_model_config.vision_max_num_chunks,
-            torch_dtype="bfloat16",
-        )
-        cross_attention_layers = [
-            x + i
-            for i, x in enumerate(language_config._init_fusion_schedule(language_config.num_cross_attention_layers))
-        ]
-        # Create text config for HuggingFace model
-        text_config = MllamaTextConfig(
-            rope_theta=language_config.rotary_base,
-            num_hidden_layers=language_config.num_layers + language_config.num_cross_attention_layers,
-            tie_word_embeddings=language_config.share_embeddings_and_output_weights,
-            cross_attention_layers=cross_attention_layers,
-            hidden_size=language_config.hidden_size,
-            intermediate_size=language_config.ffn_hidden_size,
-            num_attention_heads=language_config.num_attention_heads,
-            num_key_value_heads=language_config.num_query_groups,
-            vocab_size=language_config.vocab_size,
-            rope_scaling={
-                "factor": 8.0,
-                "high_freq_factor": 4.0,
-                "low_freq_factor": 1.0,
-                "original_max_position_embeddings": 8192,
-                "rope_type": "llama3",
-            },
-            eos_token_id=[128001, 128008, 128009],
-            torch_dtype="bfloat16",
-        )
-        # Create the MllamaConfig for HuggingFace
-        return HFMllamaConfig(vision_config=vision_config, text_config=text_config, torch_dtype="bfloat16")
-
-
 def _rename_xattn_layer_nums_hf(source: Dict):
     def convert_layer_num(match):
         layer_num = int(match.group(1))
         cross_num = (layer_num - 3) // (cross_attention_frequency + 1)
         if (layer_num - 3) % (cross_attention_frequency + 1) == 0:
             new_layer_num = cross_num * cross_attention_frequency + 3
-            return f"xattn_layers.{new_layer_num}."
+            return f'xattn_layers.{new_layer_num}.'
 
         new_layer_num = layer_num - cross_num - 1
-        return f"layers.{new_layer_num}."
+        return f'layers.{new_layer_num}.'
 
     cross_attention_frequency = 4
 
@@ -706,19 +362,7 @@ def _import_text_kv(ctx: io.TransformCTX, k, v):
     return _merge_kv(k, v, head_num, num_query_groups, head_size, hidden_size)
 
 
-def _import_simple_concat(a, b):
-    # for both (w1, w3) -> fc1, and (wk, wv) -> wkv
-    return torch.cat((a, b), dim=0)
-
-
-def _merge_kv(
-    k: Tensor,
-    v: Tensor,
-    head_num: int,
-    num_query_groups: int,
-    head_size: int,
-    hidden_size: int,
-):
+def _merge_kv(k: Tensor, v: Tensor, head_num: int, num_query_groups: int, head_size: int, hidden_size: int):
     old_tensor_shape = k.size()
     new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]
 
@@ -737,13 +381,7 @@ def _merge_kv(
 
 
 def _merge_qkv(
-    q: Tensor,
-    k: Tensor,
-    v: Tensor,
-    head_num: int,
-    num_query_groups: int,
-    head_size: int,
-    hidden_size: int,
+    q: Tensor, k: Tensor, v: Tensor, head_num: int, num_query_groups: int, head_size: int, hidden_size: int
 ):
     heads_per_group = head_num // num_query_groups
     old_tensor_shape = q.size()
@@ -770,26 +408,6 @@ def _merge_qkv(
     return qkv_weights
 
 
-def _split_kv(
-    kv: Tensor,
-    head_num: int,
-    num_query_groups: int,
-    head_size: int,
-    hidden_size: int,
-):
-    kv_total_dim = 2 * num_query_groups
-
-    linear_kv = kv.reshape([kv_total_dim, head_size, hidden_size])
-
-    k_slice = torch.arange(0, kv_total_dim, 2)
-    v_slice = torch.arange(1, kv_total_dim, 2)
-
-    k_proj = linear_kv[k_slice].reshape(-1, hidden_size).cpu()
-    v_proj = linear_kv[v_slice].reshape(-1, hidden_size).cpu()
-
-    return k_proj, v_proj
-
-
 def _split_qkv(qkv, head_num: int, num_query_groups: int, head_size: int, hidden_size: int):
     heads_per_group = head_num // num_query_groups
     qkv_total_dim = head_num + 2 * num_query_groups
@@ -811,50 +429,20 @@ def _split_qkv(qkv, head_num: int, num_query_groups: int, head_size: int, hidden
     return q_proj, k_proj, v_proj
 
 
-def _export_gate(gate):
-    return gate[0:1]
-
-
-def _export_patch_embedding_hf(a):
-    return a.reshape(a.shape[0], 3, 14, 14)
-
-
-def _export_vision_qkv(ctx: io.TransformCTX, qkv):
-    vision_config = ctx.target.config.vision_config
-
-    head_num = vision_config.attention_heads
-    num_query_groups = vision_config.attention_heads
-    hidden_size = vision_config.hidden_size
-    head_size = hidden_size // head_num
-    return _split_qkv(qkv, head_num, num_query_groups, head_size, hidden_size)
-
-
-def _export_text_kv(ctx: io.TransformCTX, kv):
-    text_config = ctx.target.config.text_config
-
-    head_num = text_config.num_attention_heads
-    num_query_groups = text_config.num_key_value_heads
-    hidden_size = text_config.hidden_size
-    head_size = hidden_size // head_num
-    return _split_kv(kv, head_num, num_query_groups, head_size, hidden_size)
-
-
-def _export_text_qkv(ctx: io.TransformCTX, qkv):
-    text_config = ctx.target.config.text_config
-
-    head_num = text_config.num_attention_heads
-    num_query_groups = text_config.num_key_value_heads
-    hidden_size = text_config.hidden_size
-    head_size = hidden_size // head_num
-    return _split_qkv(qkv, head_num, num_query_groups, head_size, hidden_size)
-
+def _import_simple_concat(a, b):
+    # for both (w1, w3) -> fc1, and (wk, wv) -> wkv
+    return torch.cat((a, b), dim=0)
 
-def _export_simple_split(linear_fc1):
-    """Splits NeMo's fused MLP linear_fc1 weight into gate_proj and up_proj for HuggingFace format."""
-    gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0)
-    return gate_proj, up_proj
 
+def _rename_xattn_layer_nums(source: Dict):
+    def convert_layer_num(match):
+        new_layer_num = int(match.group(1)) * 4 + 3
+        return f'.{new_layer_num}.'
 
-def _export_embedding_hf(word_embeddings, learnable_embedding):
-    """Transforms the word embeddings from NeMo to HuggingFace format."""
-    return torch.cat((word_embeddings, learnable_embedding), dim=0)
+    output_dict = {}
+    for k, v in source.items():
+        if "cross_attention_layers" in k:
+            output_dict[re.sub(r"\.(\d+)\.", convert_layer_num, k)] = v
+        else:
+            output_dict[k] = v
+    return output_dict
diff --git a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
old mode 100755
new mode 100644
index b14eedc14687..77ca41b96f99
--- a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
+++ b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,38 +17,12 @@
 from typing import TYPE_CHECKING, Union
 
 import torch
-import transformers
 from megatron.core.transformer.transformer_config import TransformerConfig
-from transformers import AutoConfig as HFAutoConfig
-from transformers import AutoModelForImageTextToText
-from transformers import Qwen2_5_VLConfig as HFQwen25VLConfig
-from transformers import Qwen2VLConfig as HFQwen2VLConfig
-from transformers import Qwen2VLForConditionalGeneration
-from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig as HFQwen25VLVisionConfig
-from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig as HFQwen2VLVisionConfig
-
-from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
-from nemo.collections.llm import (
-    Qwen2Config,
-    Qwen2Config1P5B,
-    Qwen2Config7B,
-    Qwen2Config72B,
-    Qwen25Config3B,
-    Qwen25Config7B,
-    Qwen25Config32B,
-    Qwen25Config72B,
-)
-from nemo.collections.vlm.neva.model.llava import export_qkv, export_qkv_bias
-from nemo.collections.vlm.qwen2vl.model.base import (
-    Qwen2VLConfig,
-    Qwen2VLModel,
-    Qwen2VLVisionConfig,
-    Qwen25VLVisionConfig,
-)
+
+from nemo.collections.llm import Qwen2Config, Qwen2Config1P5B, Qwen2Config7B, Qwen2Config72B
+from nemo.collections.vlm.qwen2vl.model.base import Qwen2VLConfig, Qwen2VLModel, Qwen2VLVisionConfig
 from nemo.collections.vlm.vision import MultimodalProjectorConfig
 from nemo.lightning import io, teardown
-from nemo.lightning.pytorch.utils import dtype_from_hf
-from nemo.utils import logging
 
 if TYPE_CHECKING:
     from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
@@ -62,9 +36,7 @@ class Qwen2VLConfig2B(Qwen2VLConfig):
 
     from transformers import PretrainedConfig
 
-    language_transformer_config: TransformerConfig = field(
-        default_factory=lambda: Qwen2Config1P5B(share_embeddings_and_output_weights=True)
-    )
+    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen2Config1P5B())
     vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
         default_factory=lambda: Qwen2VLVisionConfig(num_layers=32, num_attention_heads=16)
     )
@@ -103,95 +75,27 @@ class Qwen2VLConfig72B(Qwen2VLConfig):
     )
 
 
-@dataclass
-class Qwen25VLConfig3B(Qwen2VLConfig):
-    """Qwen2.5VL Config 3B"""
-
-    from transformers import PretrainedConfig
-
-    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config3B())
-    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
-        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16)
-    )
-    vision_projection_config: TransformerConfig = field(
-        default_factory=lambda: MultimodalProjectorConfig(
-            projector_type="mcore_mlp", input_size=5120, hidden_size=2048, ffn_hidden_size=5120
-        )
-    )
-
-
-@dataclass
-class Qwen25VLConfig7B(Qwen2VLConfig):
-    """Qwen2.5VL Config 7B"""
-
-    from transformers import PretrainedConfig
-
-    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config7B())
-    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
-        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16)
-    )
-    vision_projection_config: TransformerConfig = field(
-        default_factory=lambda: MultimodalProjectorConfig(
-            projector_type="mcore_mlp", input_size=5120, hidden_size=3584, ffn_hidden_size=5120
-        )
-    )
-
-
-@dataclass
-class Qwen25VLConfig32B(Qwen2VLConfig):
-    """Qwen2.5VL Config 32B"""
-
-    from transformers import PretrainedConfig
-
-    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config32B())
-    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
-        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16, ffn_hidden_size=3456)
-    )
-    vision_projection_config: TransformerConfig = field(
-        default_factory=lambda: MultimodalProjectorConfig(
-            projector_type="mcore_mlp", input_size=5120, hidden_size=5120, ffn_hidden_size=5120
-        )
-    )
-
-
-@dataclass
-class Qwen25VLConfig72B(Qwen2VLConfig):
-    """Qwen2.5VL Config 72B"""
-
-    from transformers import PretrainedConfig
-
-    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config72B())
-    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
-        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16, ffn_hidden_size=3456)
-    )
-    vision_projection_config: TransformerConfig = field(
-        default_factory=lambda: MultimodalProjectorConfig(
-            projector_type="mcore_mlp", input_size=5120, hidden_size=8192, ffn_hidden_size=5120
-        )
-    )
-
-
 @io.model_importer(Qwen2VLModel, "hf")
 class HFQwen2VLImporter(io.ModelConnector["Qwen2VLForConditionalGeneration", Qwen2VLModel]):
     """Qwen2VL Model HF Importer"""
 
     def init(self) -> Qwen2VLModel:
         # pylint: disable=C0115,C0116
-        return Qwen2VLModel(self.config, model_version="qwen2-vl", tokenizer=self.tokenizer)
+        return Qwen2VLModel(self.config, tokenizer=self.tokenizer)
 
     def apply(self, output_path: Path) -> Path:
         # pylint: disable=C0115,C0116
-        source = AutoModelForImageTextToText.from_pretrained(str(self), trust_remote_code=True)
-        hf_config = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True)
-        self.is_v2_5 = hf_config.model_type == "qwen2_5_vl"
+        from transformers import Qwen2VLForConditionalGeneration
 
+        source = Qwen2VLForConditionalGeneration.from_pretrained(str(self))
         target = self.init()
         trainer = self.nemo_setup(target)
-        source = source.to(dtype_from_hf(hf_config))
-        target = target.to(dtype_from_hf(hf_config))
         self.convert_state(source, target)
         print(f"Converted Qwen2VL model to Nemo, saving to {output_path}")
+        # for name, param in target.named_parameters():
+        #     print(name, param.shape)
         self.nemo_save(output_path, trainer)
+
         print(f"Converted Qwen2VL model saved to {output_path}")
 
         teardown(trainer, target)
@@ -209,36 +113,21 @@ def convert_state(self, source, target):
             "visual.blocks.*.norm2.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias",
             "visual.blocks.*.attn.proj.weight": "vision_model.decoder.layers.*.self_attention.linear_proj.weight",
             "visual.blocks.*.attn.proj.bias": "vision_model.decoder.layers.*.self_attention.linear_proj.bias",
+            "visual.blocks.*.mlp.fc1.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.weight",
+            "visual.blocks.*.mlp.fc1.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.bias",
+            "visual.blocks.*.mlp.fc2.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight",
+            "visual.blocks.*.mlp.fc2.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias",
+            "visual.merger.ln_q.weight": "vision_model.decoder.final_layernorm.weight",
+            "visual.merger.ln_q.bias": "vision_model.decoder.final_layernorm.bias",
             "model.embed_tokens.weight": "language_model.embedding.word_embeddings.weight",
             "model.layers.*.self_attn.o_proj.weight": "language_model.decoder.layers.*.self_attention.linear_proj.weight",
             "model.layers.*.mlp.down_proj.weight": "language_model.decoder.layers.*.mlp.linear_fc2.weight",
             "model.layers.*.input_layernorm.weight": "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
             "model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
             "model.norm.weight": "language_model.decoder.final_layernorm.weight",
-            # "lm_head.weight": "language_model.output_layer.weight",
+            "lm_head.weight": "language_model.output_layer.weight",
         }
-        if not target.config.language_transformer_config.share_embeddings_and_output_weights:
-            mapping.update({"lm_head.weight": "language_model.output_layer.weight"})
 
-        if self.is_v2_5:
-            mapping.update(
-                {
-                    "visual.blocks.*.mlp.down_proj.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight",
-                    "visual.blocks.*.mlp.down_proj.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias",
-                    "visual.merger.ln_q.weight": "vision_model.decoder.final_layernorm.weight",
-                }
-            )
-        else:
-            mapping.update(
-                {
-                    "visual.blocks.*.mlp.fc1.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.weight",
-                    "visual.blocks.*.mlp.fc1.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.bias",
-                    "visual.blocks.*.mlp.fc2.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight",
-                    "visual.blocks.*.mlp.fc2.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias",
-                    "visual.merger.ln_q.weight": "vision_model.decoder.final_layernorm.weight",
-                    "visual.merger.ln_q.bias": "vision_model.decoder.final_layernorm.bias",
-                }
-            )
         if "vision_projection.encoder.linear_fc1.weight" in target.module.state_dict().keys():
             mapping.update(
                 {
@@ -260,23 +149,17 @@ def convert_state(self, source, target):
         else:
             raise KeyError("Unable to map vision projection keys.")
 
-        transforms = [
-            _import_language_qkv,
-            _import_language_qkv_bias,
-            _import_vision_qkv,
-            _import_vision_qkv_bias,
-            _import_linear_fc1,
-        ]
-        if self.is_v2_5:
-            transforms += [
-                _import_vision_linear_fc1_weight,
-                _import_vision_linear_fc1_bias,
-            ]
         return io.apply_transforms(
             source,
             target,
             mapping=mapping,
-            transforms=transforms,
+            transforms=[
+                _import_language_qkv,
+                _import_language_qkv_bias,
+                _import_vision_qkv,
+                _import_vision_qkv_bias,
+                _import_linear_fc1,
+            ],
         )
 
     @property
@@ -289,17 +172,9 @@ def tokenizer(self) -> "AutoTokenizer":
     @property
     def config(self) -> Qwen2VLConfig:
         # pylint: disable=C0115,C0116
-        from packaging.version import Version
-
-        if Version(transformers.__version__) > Version('4.51.3'):
-            # Todo: need to fix with newest version of transformers
-            raise ValueError(
-                f"Current version of transformers is {transformers.__version__},"
-                f"Please lower the version to be <= 4.51.3"
-            )
+        from transformers import Qwen2VLConfig as HFQwen2VLConfig
 
-        hf_config = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True)
-        is_v2_5 = hf_config.model_type == "qwen2_5_vl"
+        hf_config = HFQwen2VLConfig.from_pretrained(str(self))
 
         def make_vocab_size_divisible_by(vocab_size):
             # pylint: disable=C0115,C0116
@@ -308,349 +183,41 @@ def make_vocab_size_divisible_by(vocab_size):
                 base //= 2
             return base
 
-        text_config = hf_config
         language_transformer_config = Qwen2Config(
-            num_layers=text_config.num_hidden_layers,
-            hidden_size=text_config.hidden_size,
-            ffn_hidden_size=text_config.intermediate_size,
-            num_attention_heads=text_config.num_attention_heads,
-            init_method_std=text_config.initializer_range,
-            layernorm_epsilon=text_config.rms_norm_eps,
-            num_query_groups=text_config.num_key_value_heads,
-            rotary_base=text_config.rope_theta,
+            num_layers=hf_config.num_hidden_layers,
+            hidden_size=hf_config.hidden_size,
+            ffn_hidden_size=hf_config.intermediate_size,
+            num_attention_heads=hf_config.num_attention_heads,
+            init_method_std=hf_config.initializer_range,
+            layernorm_epsilon=hf_config.rms_norm_eps,
+            num_query_groups=hf_config.num_key_value_heads,
+            rotary_base=hf_config.rope_theta,
             gated_linear_unit=True,
-            make_vocab_size_divisible_by=make_vocab_size_divisible_by(text_config.vocab_size),
-            share_embeddings_and_output_weights=text_config.tie_word_embeddings,
-            vocab_size=text_config.vocab_size,
-            fp16=(dtype_from_hf(text_config) == torch.float16),
-            bf16=(dtype_from_hf(text_config) == torch.bfloat16),
-            params_dtype=dtype_from_hf(text_config),
+            make_vocab_size_divisible_by=make_vocab_size_divisible_by(hf_config.vocab_size),
+            share_embeddings_and_output_weights=False,
+            vocab_size=hf_config.vocab_size,
         )
 
         # Use MCore instead of Pytorch
-        vision_config = hf_config.vision_config
-        if is_v2_5:
-            vision_transformer_config = Qwen25VLVisionConfig(
-                ffn_hidden_size=vision_config.intermediate_size,
-                fp16=(dtype_from_hf(hf_config) == torch.float16),
-                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
-                params_dtype=dtype_from_hf(hf_config),
-            )
-            merge_hidden_size = vision_config.hidden_size * (vision_config.spatial_merge_size**2)
-            vision_projection_config = MultimodalProjectorConfig(
-                input_size=merge_hidden_size,
-                hidden_size=vision_config.out_hidden_size,
-                ffn_hidden_size=merge_hidden_size,
-                projector_type="mcore_mlp",
-                fp16=(dtype_from_hf(hf_config) == torch.float16),
-                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
-                params_dtype=dtype_from_hf(hf_config),
-            )
-        else:
-            vision_transformer_config = Qwen2VLVisionConfig(
-                fp16=(dtype_from_hf(hf_config) == torch.float16),
-                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
-                params_dtype=dtype_from_hf(hf_config),
-            )
-            merge_hidden_size = vision_config.embed_dim * (vision_config.spatial_merge_size**2)
-            vision_projection_config = MultimodalProjectorConfig(
-                input_size=merge_hidden_size,
-                hidden_size=vision_config.hidden_size,
-                ffn_hidden_size=merge_hidden_size,
-                projector_type="mcore_mlp",
-                fp16=(dtype_from_hf(hf_config) == torch.float16),
-                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
-                params_dtype=dtype_from_hf(hf_config),
-            )
+        vision_transformer_config = Qwen2VLVisionConfig()
+        merge_hidden_size = hf_config.vision_config.embed_dim * (hf_config.vision_config.spatial_merge_size**2)
+        vision_projection_config = MultimodalProjectorConfig(
+            input_size=merge_hidden_size,
+            hidden_size=hf_config.vision_config.hidden_size,
+            ffn_hidden_size=merge_hidden_size,
+            projector_type="mcore_mlp",
+        )
 
         output = Qwen2VLConfig(
             language_transformer_config=language_transformer_config,
             vision_transformer_config=vision_transformer_config,
             vision_projection_config=vision_projection_config,
             vision_feature_layer=-1,
-            fp16=(dtype_from_hf(hf_config) == torch.float16),
-            bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
-            params_dtype=dtype_from_hf(hf_config),
         )
 
         return output
 
 
-@io.model_exporter(Qwen2VLModel, "hf")
-class HFQwen2VLExporter(io.ModelConnector[Qwen2VLModel, "Qwen2VLForConditionalGeneration"]):
-    """
-    Exporter class for converting NeMo Qwen2VL model to HuggingFace format.
-
-    Inherits:
-        io.ModelConnector: Connector interface to handle setup, save, and load using the Lightning framework.
-
-    Methods:
-        init: Initializes a new HuggingFace Qwen2VL model instance.
-        apply: Converts the NeMo model to HuggingFace format and saves it.
-        convert_state: Maps and transforms the state dictionary from NeMo to HuggingFace format.
-        config: Generates and returns the HuggingFace Qwen2VL config for the model.
-    """
-
-    def init(self, dtype=torch.bfloat16) -> "Qwen2VLForConditionalGeneration":
-        """
-        Initializes a HuggingFace Qwen2VLForConditionalGeneration model.
-
-        Args:
-            dtype: The data type to use for the model (default: torch.bfloat16)
-
-        Returns:
-            Qwen2VLForConditionalGeneration: A HuggingFace Qwen2VL model initialized with the configuration.
-        """
-        from transformers.modeling_utils import no_init_weights
-
-        with no_init_weights():
-            return AutoModelForImageTextToText.from_config(self.config, torch_dtype=dtype)
-
-    def apply(self, output_path: Path) -> Path:
-        """
-        Converts the NeMo Qwen2VL model to HuggingFace format and saves it to the specified path.
-
-        Args:
-            output_path (Path): The path where the converted HuggingFace model will be saved.
-
-        Returns:
-            Path: The output path where the HuggingFace model was saved.
-        """
-        logging.info("Loading Qwen2VL NeMo checkpoint. This may take a while...")
-        source, source_config = self.ckpt_load(self)
-        logging.info("Qwen2VL NeMo checkpoint loaded.")
-        logging.info("Initializing the HF model..")
-        target = self.init()
-        logging.info("Start Converting the model..")
-        target = self.convert_state(source, target, source_config)
-        target = target.cpu()
-        target.save_pretrained(output_path)
-
-        try:
-            self.tokenizer.tokenizer.save_pretrained(output_path)
-        except Exception:
-            logging.warning("Failed to save tokenizer")
-
-        print(f"Converted Qwen2VL model saved to {output_path}")
-
-        return output_path
-
-    def convert_state(self, source, target, source_config):
-        # pylint: disable=C0115,C0116,line-too-long
-        """
-        Maps and transforms the state dictionary from NeMo to HuggingFace format.
-
-        Args:
-            source: The source NeMo model.
-            target: The target HuggingFace model.
-
-        Returns:
-            The target HuggingFace model with the converted state.
-        """
-
-        mapping = {
-            "vision_model.conv1.weight": "visual.patch_embed.proj.weight",
-            "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "visual.blocks.*.norm1.weight",
-            "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_bias": "visual.blocks.*.norm1.bias",
-            "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "visual.blocks.*.norm2.weight",
-            "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias": "visual.blocks.*.norm2.bias",
-            "vision_model.decoder.layers.*.self_attention.linear_proj.weight": "visual.blocks.*.attn.proj.weight",
-            "vision_model.decoder.layers.*.self_attention.linear_proj.bias": "visual.blocks.*.attn.proj.bias",
-            "language_model.embedding.word_embeddings.weight": "model.embed_tokens.weight",
-            "language_model.decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
-            "language_model.decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
-            "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
-            "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
-            "language_model.decoder.final_layernorm.weight": "model.norm.weight",
-            # "language_model.output_layer.weight": "lm_head.weight",
-        }
-        if source_config.language_transformer_config.share_embeddings_and_output_weights:
-            mapping.update({"language_model.embedding.word_embeddings.weight": "lm_head.weight"})
-        else:
-            mapping.update({"language_model.output_layer.weight": "lm_head.weight"})
-
-        if self.is_v2_5:
-            mapping.update(
-                {
-                    "vision_model.decoder.layers.*.mlp.linear_fc2.weight": "visual.blocks.*.mlp.down_proj.weight",
-                    "vision_model.decoder.layers.*.mlp.linear_fc2.bias": "visual.blocks.*.mlp.down_proj.bias",
-                    "vision_model.decoder.final_layernorm.weight": "visual.merger.ln_q.weight",
-                }
-            )
-
-        else:
-            mapping.update(
-                {
-                    "vision_model.decoder.layers.*.mlp.linear_fc1.weight": "visual.blocks.*.mlp.fc1.weight",
-                    "vision_model.decoder.layers.*.mlp.linear_fc1.bias": "visual.blocks.*.mlp.fc1.bias",
-                    "vision_model.decoder.layers.*.mlp.linear_fc2.weight": "visual.blocks.*.mlp.fc2.weight",
-                    "vision_model.decoder.layers.*.mlp.linear_fc2.bias": "visual.blocks.*.mlp.fc2.bias",
-                    "vision_model.decoder.final_layernorm.weight": "visual.merger.ln_q.weight",
-                    "vision_model.decoder.final_layernorm.bias": "visual.merger.ln_q.bias",
-                }
-            )
-        if "vision_projection.encoder.linear_fc1.weight" in source.state_dict().keys():
-            mapping.update(
-                {
-                    "vision_projection.encoder.linear_fc1.weight": "visual.merger.mlp.0.weight",
-                    "vision_projection.encoder.linear_fc1.bias": "visual.merger.mlp.0.bias",
-                    "vision_projection.encoder.linear_fc2.weight": "visual.merger.mlp.2.weight",
-                    "vision_projection.encoder.linear_fc2.bias": "visual.merger.mlp.2.bias",
-                }
-            )
-        elif "vision_projection.0.weight" in source.state_dict().keys():
-            mapping.update(
-                {
-                    "vision_projection.0.weight": "visual.merger.mlp.0.weight",
-                    "vision_projection.0.bias": "visual.merger.mlp.0.bias",
-                    "vision_projection.2.weight": "visual.merger.mlp.2.weight",
-                    "vision_projection.2.bias": "visual.merger.mlp.2.bias",
-                }
-            )
-        else:
-            raise KeyError("Unable to map vision projection keys.")
-
-        transforms = [
-            _export_language_qkv,
-            _export_language_qkv_bias,
-            _export_vision_qkv,
-            _export_vision_qkv_bias,
-            _export_linear_fc1,
-        ]
-        if self.is_v2_5:
-            transforms += [
-                _export_vision_linear_fc1_weight,
-                _export_vision_linear_fc1_bias,
-            ]
-
-        return io.apply_transforms(
-            source,
-            target,
-            mapping=mapping,
-            transforms=transforms,
-        )
-
-    @property
-    def tokenizer(self) -> "TokenizerSpec":
-        """
-        Gets the tokenizer from the loaded model context.
-
-        Returns:
-            The tokenizer specification.
-        """
-        return io.load_context(str(self), subpath="model").tokenizer
-
-    @property
-    def config(self) -> "HFQwen2VLConfig":
-        """
-        Generates the configuration for the HuggingFace Qwen2VL model based on the NeMo model.
-
-        Returns:
-            HFQwen2VLConfig: A configuration object for the HuggingFace Qwen2VL model.
-        """
-        from packaging.version import Version
-
-        if Version(transformers.__version__) > Version('4.51.3'):
-            # Todo: need to fix with newest version of transformers
-            raise ValueError(
-                f"Current version of transformers is {transformers.__version__},"
-                f"Please lower the version to be <= 4.51.3"
-            )
-        source = io.load_context(str(self), subpath="model.config")
-
-        language_config = source.language_transformer_config
-        vision_model_config = source.vision_transformer_config
-        vision_projection_config = source.vision_projection_config
-
-        self.is_v2_5 = hasattr(vision_model_config, "fullatt_block_indexes") and (
-            vision_model_config.fullatt_block_indexes != None
-        )
-
-        if self.is_v2_5:
-            vision_config = HFQwen25VLVisionConfig(
-                depth=vision_model_config.num_layers,
-                embed_dim=vision_model_config.embed_dim,
-                hidden_size=vision_model_config.hidden_size,
-                out_hidden_size=language_config.hidden_size,
-                hidden_act="silu",
-                mlp_ratio=int(vision_projection_config.ffn_hidden_size // vision_model_config.hidden_size),
-                num_heads=vision_model_config.num_attention_heads,
-                in_channels=3,
-                patch_size=vision_model_config.patch_dim,
-                spatial_merge_size=vision_model_config.spatial_merge_size,
-                spatial_patch_size=vision_model_config.spatial_patch_size,
-                temporal_patch_size=vision_model_config.temporal_patch_size,
-                initializer_range=vision_model_config.init_method_std,
-                fullatt_block_indexes=[7, 15, 23, 31],
-                tokens_per_second=2,
-                model_type="qwen2_5_vl",
-                torch_dtype="bfloat16",
-            ).to_dict()
-
-            # Create the LlavaConfig for HuggingFace
-            hf_config = HFQwen25VLConfig(
-                vision_config=vision_config,
-                num_hidden_layers=language_config.num_layers,
-                hidden_size=language_config.hidden_size,
-                intermediate_size=language_config.ffn_hidden_size,
-                num_attention_heads=language_config.num_attention_heads,
-                max_window_layers=70,
-                max_position_embeddings=language_config.seq_length,
-                initializer_range=language_config.init_method_std,
-                rms_norm_eps=language_config.layernorm_epsilon,
-                num_key_value_heads=language_config.num_query_groups,
-                rope_theta=language_config.rotary_base,
-                vocab_size=language_config.vocab_size,
-                rope_scaling={"type": "mrope", "mrope_section": [16, 24, 24]},
-                tie_word_embeddings=language_config.share_embeddings_and_output_weights,
-                torch_dtype="bfloat16",
-                # vocab_size=self.tokenizer.vocab_size,
-                bos_token_id=151643,
-                eos_token_id=151645,
-                vision_start_token_id=151652,
-                vision_end_token_id=151653,
-                vision_token_id=151654,
-                image_token_id=151655,
-                video_token_id=51656,
-            )
-            return hf_config
-        else:
-            vision_config = HFQwen2VLVisionConfig(
-                depth=vision_model_config.num_layers,
-                embed_dim=vision_model_config.embed_dim,
-                hidden_size=vision_projection_config.hidden_size,
-                hidden_act="quick_gelu",
-                mlp_ratio=int(vision_projection_config.ffn_hidden_size // vision_model_config.hidden_size),
-                num_heads=vision_model_config.num_attention_heads,
-                in_channels=3,
-                patch_size=vision_model_config.patch_dim,
-                spatial_merge_size=vision_model_config.spatial_merge_size,
-                spatial_patch_size=vision_model_config.spatial_patch_size,
-                temporal_patch_size=vision_model_config.temporal_patch_size,
-                initializer_range=vision_model_config.init_method_std,
-                model_type="qwen2_vl",
-                torch_dtype="bfloat16",
-            ).to_dict()
-
-            # Create the Qwen2VLConfig for HuggingFace
-            # if transformers > 4.51.3, use Qwen2VLTextConfig as text_config
-            # https://github.com/huggingface/transformers/pull/37268
-            return HFQwen2VLConfig(
-                num_hidden_layers=language_config.num_layers,
-                hidden_size=language_config.hidden_size,
-                intermediate_size=language_config.ffn_hidden_size,
-                num_attention_heads=language_config.num_attention_heads,
-                initializer_range=language_config.init_method_std,
-                rms_norm_eps=language_config.layernorm_epsilon,
-                num_key_value_heads=language_config.num_query_groups,
-                rope_theta=language_config.rotary_base,
-                tie_word_embeddings=language_config.share_embeddings_and_output_weights,
-                vocab_size=language_config.vocab_size,
-                vision_config=vision_config,
-                torch_dtype="bfloat16",
-            )
-
-
 def import_qkv(q, k, v, head_num, num_query_groups, heads_per_group, hidden_size, head_size):
     # pylint: disable=C0115,C0116
     old_tensor_shape = q.size()
@@ -795,198 +362,3 @@ def _import_cls_token(ctx: io.TransformCTX, cls_token):
 def _import_linear_fc1(down, gate):
     # pylint: disable=C0115,C0116
     return torch.cat((down, gate), axis=0)
-
-
-@io.state_transform(
-    source_key=("visual.blocks.*.mlp.gate_proj.weight", "visual.blocks.*.mlp.up_proj.weight"),
-    target_key="vision_model.decoder.layers.*.mlp.linear_fc1.weight",
-)
-def _import_vision_linear_fc1_weight(down, gate):
-    # pylint: disable=C0115,C0116
-    return torch.cat((down, gate), axis=0)
-
-
-@io.state_transform(
-    source_key=("visual.blocks.*.mlp.gate_proj.bias", "visual.blocks.*.mlp.up_proj.bias"),
-    target_key="vision_model.decoder.layers.*.mlp.linear_fc1.bias",
-)
-def _import_vision_linear_fc1_bias(down, gate):
-    # pylint: disable=C0115,C0116
-    return torch.cat((down, gate), axis=0)
-
-
-def export_qkv(linear_qkv, head_num, num_query_groups, heads_per_group, hidden_size, head_size):
-    # pylint: disable=C0115,C0116
-    qkv_total_dim = head_num + 2 * num_query_groups
-
-    linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, -1])
-    hidden_size = linear_qkv.size(-1)
-    q_slice = torch.cat(
-        [
-            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
-            for i in range(num_query_groups)
-        ]
-    )
-    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
-    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
-
-    q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
-    k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
-    v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()
-
-    return q_proj, k_proj, v_proj
-
-
-def export_qkv_bias(qkv_bias: torch.Tensor, head_num, num_query_groups, heads_per_group, head_size):
-    """
-    Split interleave-concatenated qkv bias to separate q, k, v bias
-
-    Example: export layer linear_qkv bias to HF {q|k|v}_proj bias
-    """
-    qkv_total_dim = head_num + 2 * num_query_groups
-
-    qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size])
-    q_slice = torch.cat(
-        [
-            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
-            for i in range(num_query_groups)
-        ]
-    )
-    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
-    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
-
-    q_bias = qkv_bias[q_slice].reshape(-1).cpu()
-    k_bias = qkv_bias[k_slice].reshape(-1).cpu()
-    v_bias = qkv_bias[v_slice].reshape(-1).cpu()
-
-    return q_bias, k_bias, v_bias
-
-
-@io.state_transform(
-    source_key="vision_model.decoder.layers.*.self_attention.linear_qkv.weight",
-    target_key="visual.blocks.*.attn.qkv.weight",
-)
-def _export_vision_qkv(ctx: io.TransformCTX, qkv):
-    # pylint: disable=C0115,C0116
-    hf_config = ctx.target.config.vision_config
-    hidden_size = hf_config.embed_dim if hf_config.model_type == "qwen2_vl" else hf_config.hidden_size
-    return torch.cat(
-        export_qkv(
-            qkv,
-            head_num=hf_config.num_heads,
-            num_query_groups=hf_config.num_heads,
-            heads_per_group=hf_config.num_heads // hf_config.num_heads,
-            hidden_size=hidden_size,
-            head_size=hidden_size // hf_config.num_heads,
-        ),
-        axis=0,
-    )
-
-
-@io.state_transform(
-    source_key="vision_model.decoder.layers.*.self_attention.linear_qkv.bias",
-    target_key="visual.blocks.*.attn.qkv.bias",
-)
-def _export_vision_qkv_bias(ctx: io.TransformCTX, qkv_bias):
-    # pylint: disable=C0115,C0116
-    hf_config = ctx.target.config.vision_config
-    hidden_size = hf_config.embed_dim if hf_config.model_type == "qwen2_vl" else hf_config.hidden_size
-    return torch.cat(
-        export_qkv_bias(
-            qkv_bias,
-            head_num=hf_config.num_heads,
-            num_query_groups=hf_config.num_heads,
-            heads_per_group=hf_config.num_heads // hf_config.num_heads,
-            head_size=hidden_size // hf_config.num_heads,
-        ),
-        axis=0,
-    )
-
-
-@io.state_transform(
-    source_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight",
-    target_key=(
-        "model.layers.*.self_attn.q_proj.weight",
-        "model.layers.*.self_attn.k_proj.weight",
-        "model.layers.*.self_attn.v_proj.weight",
-    ),
-)
-def _export_language_qkv(ctx: io.TransformCTX, qkv):
-    # pylint: disable=C0115,C0116
-    hf_config = ctx.target.config
-    return export_qkv(
-        qkv,
-        head_num=hf_config.num_attention_heads,
-        num_query_groups=hf_config.num_key_value_heads,
-        heads_per_group=hf_config.num_attention_heads // hf_config.num_key_value_heads,
-        hidden_size=hf_config.hidden_size,
-        head_size=hf_config.hidden_size // hf_config.num_attention_heads,
-    )
-
-
-@io.state_transform(
-    source_key="language_model.decoder.layers.*.self_attention.linear_qkv.bias",
-    target_key=(
-        "model.layers.*.self_attn.q_proj.bias",
-        "model.layers.*.self_attn.k_proj.bias",
-        "model.layers.*.self_attn.v_proj.bias",
-    ),
-)
-def _export_language_qkv_bias(ctx: io.TransformCTX, qkv_bias):
-    # pylint: disable=C0115,C0116
-    hf_config = ctx.target.config
-    return export_qkv_bias(
-        qkv_bias,
-        head_num=hf_config.num_attention_heads,
-        num_query_groups=hf_config.num_key_value_heads,
-        heads_per_group=hf_config.num_attention_heads // hf_config.num_key_value_heads,
-        head_size=hf_config.hidden_size // hf_config.num_attention_heads,
-    )
-
-
-@io.state_transform(
-    source_key="vision_model.class_token",
-    target_key="vision_model.embeddings.class_embedding",
-)
-def _export_cls_token(ctx: io.TransformCTX, cls_token):
-    # pylint: disable=C0115,C0116
-    return cls_token.squeeze()
-
-
-@io.state_transform(
-    source_key="language_model.decoder.layers.*.mlp.linear_fc1.weight",
-    target_key=(
-        "model.layers.*.mlp.gate_proj.weight",
-        "model.layers.*.mlp.up_proj.weight",
-    ),
-)
-def _export_linear_fc1(linear_fc1):
-    # pylint: disable=C0115,C0116
-    gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0)
-    return gate_proj, up_proj
-
-
-@io.state_transform(
-    source_key="vision_model.decoder.layers.*.mlp.linear_fc1.weight",
-    target_key=(
-        "visual.blocks.*.mlp.gate_proj.weight",
-        "visual.blocks.*.mlp.up_proj.weight",
-    ),
-)
-def _export_vision_linear_fc1_weight(vision_fc1_weight):
-    # pylint: disable=C0115,C0116
-    gate_proj, up_proj = torch.chunk(vision_fc1_weight, 2, dim=0)
-    return gate_proj, up_proj
-
-
-@io.state_transform(
-    source_key="vision_model.decoder.layers.*.mlp.linear_fc1.bias",
-    target_key=(
-        "visual.blocks.*.mlp.gate_proj.bias",
-        "visual.blocks.*.mlp.up_proj.bias",
-    ),
-)
-def _export_vision_linear_fc1_bias(vision_fc1_bias):
-    # pylint: disable=C0115,C0116
-    gate_proj, up_proj = torch.chunk(vision_fc1_bias, 2, dim=0)
-    return gate_proj, up_proj
diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
new file mode 100644
index 000000000000..3685c9a5cb07
--- /dev/null
+++ b/nemo/export/__init__.py
@@ -0,0 +1,21 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Workaround (WAR) for the trtllm/lightning conflict: re-export ``io`` only when nemo.lightning imports cleanly.
+try:
+    from nemo.lightning import io
+
+    __all__ = ["io"]
+except (ImportError, ModuleNotFoundError):
+    pass  # keep this package importable when nemo.lightning cannot be imported
diff --git a/nemo/export/multimodal/__init__.py b/nemo/export/multimodal/__init__.py
new file mode 100644
index 000000000000..d9155f923f18
--- /dev/null
+++ b/nemo/export/multimodal/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py
new file mode 100644
index 000000000000..6d5c792d02d9
--- /dev/null
+++ b/nemo/export/multimodal/build.py
@@ -0,0 +1,728 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import os
+import shutil
+import tarfile
+import tempfile
+from pathlib import Path
+from time import time
+from typing import List
+
+import tensorrt as trt
+import torch
+import yaml
+from omegaconf import OmegaConf
+from PIL import Image
+from tensorrt_llm._common import check_max_num_tokens
+from tensorrt_llm.builder import BuildConfig, Builder
+from tensorrt_llm.commands.build import build as build_trtllm
+from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.models import MLLaMAForCausalLM
+from tensorrt_llm.plugin import PluginConfig
+from transformers import AutoModel, AutoProcessor, MllamaForConditionalGeneration
+
+from nemo.collections.multimodal.speech_llm.modules.perception_modules import AudioPerceptionModule
+from nemo.core.classes.common import typecheck
+from nemo.export.tensorrt_llm import TensorRTLLM
+from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_nemo_model
+
+from .converter import convert_mllama_nemo_to_hf
+
+logger = trt.Logger(trt.Logger.INFO)
+
+
+def build_trtllm_engine(
+    model_dir: str,
+    visual_checkpoint_path: str,
+    llm_checkpoint_path: str = None,
+    model_type: str = "neva",  # NOTE(review): not referenced anywhere in this function body
+    llm_model_type: str = "llama",
+    tensor_parallelism_size: int = 1,
+    max_input_len: int = 256,
+    max_output_len: int = 256,
+    max_batch_size: int = 1,
+    max_multimodal_len: int = 1024,
+    dtype: str = "bfloat16",
+    use_lora_plugin: str = None,
+    lora_target_modules: List[str] = None,
+    max_lora_rank: int = 64,
+    lora_ckpt_list: List[str] = None,
+):
+    """Build a TRT-LLM engine by running NeMo's ``TensorRTLLM`` exporter on the LLM (or visual) checkpoint."""
+    trt_llm_exporter = TensorRTLLM(model_dir=model_dir, lora_ckpt_list=lora_ckpt_list, load_model=False)
+    trt_llm_exporter.export(  # exports llm_checkpoint_path, or visual_checkpoint_path when it is None
+        nemo_checkpoint_path=visual_checkpoint_path if llm_checkpoint_path is None else llm_checkpoint_path,
+        model_type=llm_model_type,  # the exporter receives the LLM model type, not ``model_type``
+        tensor_parallelism_size=tensor_parallelism_size,
+        max_input_len=max_input_len,
+        max_output_len=max_output_len,
+        max_seq_len=max_input_len + max_output_len,  # input and output lengths combined
+        max_batch_size=max_batch_size,
+        max_prompt_embedding_table_size=max_multimodal_len,  # multimodal length sizes the prompt embedding table
+        dtype=dtype,
+        load_model=False,
+        use_lora_plugin=use_lora_plugin,
+        lora_target_modules=lora_target_modules,
+        max_lora_rank=max_lora_rank,
+        use_mcore_path=False,
+    )
+
+
+def build_mllama_trtllm_engine(
+    model_dir: str,
+    hf_model_path: str,
+    tensor_parallelism_size: int = 1,
+    max_input_len: int = 256,
+    max_output_len: int = 256,
+    max_batch_size: int = 1,
+    max_multimodal_len: int = 1024,
+    dtype: str = "bfloat16",
+    use_lora_plugin: str = None,  # NOTE(review): the four LoRA arguments are not referenced in this body
+    lora_target_modules: List[str] = None,
+    max_lora_rank: int = 64,
+    lora_ckpt_list: List[str] = None,
+):
+    """Build and save one mllama TRT-LLM engine per tensor-parallel rank from a HF checkpoint."""
+    if max_batch_size < 4:  # smaller values are overridden (with a printed notice), not rejected
+        print(
+            "TensorRT LLM may hit a runtime issue with batch size is smaller than 4 on some models." " Force set to 4"
+        )
+        max_batch_size = 4
+
+    plugin_config = PluginConfig()
+    plugin_config.gpt_attention_plugin = "auto"
+    plugin_config.gemm_plugin = "auto"
+    plugin_config.enable_paged_kv_cache(tokens_per_block=128)
+    plugin_config.remove_input_padding = True
+    plugin_config.use_paged_context_fmha = True
+
+    max_seq_len = max_input_len + max_output_len  # input and output lengths combined
+    max_num_tokens, opt_num_tokens = check_max_num_tokens(
+        max_num_tokens=None,
+        opt_num_tokens=None,
+        max_seq_len=max_seq_len,
+        max_batch_size=max_batch_size,
+        max_input_len=max_input_len,
+        max_beam_width=1,
+        remove_input_padding=True,  # same setting as plugin_config.remove_input_padding above
+        enable_context_fmha=plugin_config.context_fmha,
+        tokens_per_block=128,  # same value as passed to enable_paged_kv_cache above
+        multiple_profiles=False,
+    )
+
+    build_dict = {
+        'max_input_len': max_input_len,
+        'max_output_len': max_output_len,
+        'max_encoder_input_len': max_multimodal_len,  # the multimodal length is used as the encoder input limit
+        'max_batch_size': max_batch_size,
+        'max_beam_width': 1,
+        'max_seq_len': max_seq_len,
+        'max_num_tokens': max_num_tokens,
+        'opt_num_tokens': opt_num_tokens,
+        'strongly_typed': True,
+        'builder_opt': None,
+    }
+    build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
+
+    for rank in range(tensor_parallelism_size):  # world_size equals tp_size: tensor parallelism only
+        mapping = Mapping(world_size=tensor_parallelism_size, rank=rank, tp_size=tensor_parallelism_size)
+        model = MLLaMAForCausalLM.from_hugging_face(
+            hf_model_path,
+            dtype,
+            mapping=mapping,
+        )
+
+        engine = build_trtllm(model, build_config)
+        engine.save(model_dir)  # every rank saves into the same model_dir
+
+
+def export_visual_wrapper_onnx(
+    visual_wrapper, input, output_dir, input_names=['input'], dynamic_axes={'input': {0: 'batch'}}
+):  # NOTE(review): the list/dict defaults are shared between calls and handed straight to torch.onnx.export
+    """Export ``visual_wrapper`` to ``{output_dir}/onnx/visual_encoder.onnx`` with opset 17."""
+    logger.log(trt.Logger.INFO, "Exporting onnx")
+    os.makedirs(f'{output_dir}/onnx', exist_ok=True)  # create the onnx/ subfolder if it does not exist yet
+    torch.onnx.export(
+        visual_wrapper,
+        input,
+        f'{output_dir}/onnx/visual_encoder.onnx',
+        opset_version=17,
+        input_names=input_names,
+        output_names=['output'],  # the output name is fixed; only input names are configurable
+        dynamic_axes=dynamic_axes,
+    )
+
+
+def export_perception_wrapper_onnx(
+    perception_wrapper,
+    input,
+    output_dir,
+    input_names=['processed_signal', 'processed_signal_length'],
+    output_names=['encoded', 'encoded_length'],
+    dynamic_axes={
+        'processed_signal': {0: 'batch', 2: 'time'},
+        'processed_signal_length': {0: 'batch'},
+        'encoded': {0: 'batch', 1: 'time'},
+        'encoded_length': {0: 'batch'},
+    },
+):  # NOTE(review): the list/dict defaults above are shared between calls and passed to torch.onnx.export
+    """Export ``perception_wrapper`` to ``{output_dir}/onnx/perception_encoder.onnx`` with opset 17."""
+    logger.log(trt.Logger.INFO, "Exporting onnx")
+    os.makedirs(f'{output_dir}/onnx', exist_ok=True)  # create the onnx/ subfolder if it does not exist yet
+    torch.onnx.export(
+        perception_wrapper,
+        input,
+        f'{output_dir}/onnx/perception_encoder.onnx',
+        opset_version=17,
+        input_names=input_names,
+        output_names=output_names,
+        dynamic_axes=dynamic_axes,
+    )
+
+
+def build_trt_engine(
+    model_type,
+    input_sizes,
+    output_dir,
+    vision_max_batch_size,
+    dtype=torch.bfloat16,
+    image_size=None,
+    num_frames=None,
+    nemo_config=None,
+    part_name='visual_encoder',
+):
+    """Build ``{output_dir}/{part_name}.engine`` from ``{output_dir}/onnx/{part_name}.onnx``.
+
+    ``input_sizes`` is a list of ints (one fixed shape), a list of three int lists (min/opt/max shapes), or
+    a list of three dicts mapping each input name to its min/opt/max shape; the batch dimension is prepended
+    for the first two forms only. The ``onnx`` folder is deleted after parsing. Raises ``RuntimeError`` when
+    ONNX parsing or the engine build fails and ``ValueError`` for an unrecognised ``input_sizes`` layout.
+    """
+    onnx_file = '%s/onnx/%s.onnx' % (output_dir, part_name)
+    engine_file = '%s/%s.engine' % (output_dir, part_name)
+    config_file = '%s/%s' % (output_dir, "config.json")
+    nemo_config_file = '%s/%s' % (output_dir, "nemo_config.yaml")
+
+    with open(nemo_config_file, 'w') as f:
+        yaml.dump(nemo_config, f)
+
+    logger.log(trt.Logger.INFO, "Building TRT engine for %s" % part_name)
+
+    builder = trt.Builder(logger)
+    network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+    profile = builder.create_optimization_profile()
+
+    config_args = {"precision": str(dtype).split('.')[-1], "model_type": model_type}
+    if image_size is not None:
+        config_args["image_size"] = image_size
+    if num_frames is not None:
+        config_args["num_frames"] = num_frames
+
+    config_wrapper = Builder().create_builder_config(**config_args)
+    config = config_wrapper.trt_builder_config
+
+    parser = trt.OnnxParser(network, logger)
+
+    with open(onnx_file, 'rb') as model:
+        if not parser.parse(model.read(), os.path.abspath(onnx_file)):
+            logger.log(trt.Logger.ERROR, "Failed parsing %s" % onnx_file)
+            for error in range(parser.num_errors):
+                logger.log(trt.Logger.ERROR, str(parser.get_error(error)))  # log() is given a plain string
+            raise RuntimeError("Failed parsing %s" % onnx_file)  # stop here; the ONNX file is kept for debugging
+        logger.log(trt.Logger.INFO, "Succeeded parsing %s" % onnx_file)
+
+    # Delete onnx files since we don't need them now
+    shutil.rmtree(f'{output_dir}/onnx')
+
+    nBS = -1  # -1 marks the batch dimension as dynamic in the network input shape
+    nMinBS = 1
+    nOptBS = max(nMinBS, int(vision_max_batch_size / 2))
+    nMaxBS = vision_max_batch_size
+
+    inputT = network.get_input(0)
+
+    assert isinstance(input_sizes, list), "input_sizes must be a list"
+    if isinstance(input_sizes[0], int):
+        logger.log(trt.Logger.INFO, f"Processed input sizes {input_sizes}")
+        inputT.shape = [nBS, *input_sizes]
+        min_size = opt_size = max_size = input_sizes
+    elif len(input_sizes) == 3 and isinstance(input_sizes[0], list):
+        min_size, opt_size, max_size = input_sizes
+        logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {min_size}/{opt_size}/{max_size}")
+    elif len(input_sizes) == 3 and isinstance(input_sizes[0], dict):
+        logger.log(trt.Logger.INFO, f"Processed min/opt/max input sizes {input_sizes}")
+    else:
+        raise ValueError(f"invalid input sizes: {input_sizes}")
+
+    if isinstance(input_sizes[0], dict):
+        for i in range(network.num_inputs):  # one profile entry per network input, looked up by input name
+            inputT = network.get_input(i)
+            input_name = inputT.name
+            min_size = input_sizes[0][input_name]
+            opt_size = input_sizes[1][input_name]
+            max_size = input_sizes[2][input_name]
+            logger.log(trt.Logger.INFO, f"{input_name} min/opt/max input sizes {min_size}/{opt_size}/{max_size}")
+            profile.set_shape(input_name, min_size, opt_size, max_size)
+    else:
+        profile.set_shape(inputT.name, [nMinBS, *min_size], [nOptBS, *opt_size], [nMaxBS, *max_size])
+
+    config.add_optimization_profile(profile)
+
+    t0 = time()
+    engine_string = builder.build_serialized_network(network, config)
+    t1 = time()
+    if engine_string is None:
+        raise RuntimeError("Failed building %s" % (engine_file))
+    logger.log(trt.Logger.INFO, "Succeeded building %s in %d s" % (engine_file, t1 - t0))
+    with open(engine_file, 'wb') as f:
+        f.write(engine_string)
+
+    Builder.save_config(config_wrapper, config_file)
+
+
+def build_neva_engine(
+    model_type: str,
+    model_dir: str,
+    visual_checkpoint_path: str,
+    vision_max_batch_size: int = 1,
+):
+    """Build neva visual engine"""
+    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
+
+    if os.path.isdir(visual_checkpoint_path):
+        # load untar checkpoint
+        config_path = os.path.join(visual_checkpoint_path, 'model_config.yaml')
+        with open(config_path, 'r') as f:
+            nemo_config = yaml.safe_load(f)
+        try:
+            weights_path = os.path.join(visual_checkpoint_path, 'model_weights.ckpt')
+            mp0_weights = torch.load(weights_path, map_location=device)
+        except FileNotFoundError:
+            weights_path = os.path.join(visual_checkpoint_path, 'mp_rank_00/model_weights.ckpt')
+            mp0_weights = torch.load(weights_path, map_location=device)
+    else:
+        # extract NeMo checkpoint
+        with tempfile.TemporaryDirectory() as temp:
+            temp_path = Path(temp)
+            mp0_weights, nemo_config, _ = load_nemo_model(visual_checkpoint_path, temp_path)
+
+    vision_config = nemo_config["mm_cfg"]["vision_encoder"]
+
+    class DownSampleBlock(torch.nn.Module):
+        # pylint: disable=C0115,C0116
+        def forward(self, x):
+            vit_embeds = x
+            h = w = int(vit_embeds.shape[1] ** 0.5)
+            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1)
+            vit_embeds = self.flat_square(vit_embeds)
+            vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], -1, vit_embeds.shape[-1])
+            return vit_embeds
+
+        def flat_square(self, x):
+            n, w, h, c = x.size()
+            if w % 2 == 1:
+                x = torch.cat([x, torch.zeros((n, 1, h, c), dtype=x.dtype).to(x.device)], dim=1).contiguous()
+                n, w, h, c = x.size()
+            if h % 2 == 1:
+                x = torch.cat([x, torch.zeros((n, w, 1, c), dtype=x.dtype).to(x.device)], dim=2).contiguous()
+                n, w, h, c = x.size()
+            x = x.view(n, w, int(h / 2), int(c * 2))
+            x = x.permute(0, 2, 1, 3).contiguous()
+            x = x.view(n, int(h / 2), int(w / 2), int(c * 4))
+            return x
+
+    class VisionEncoderWrapper(torch.nn.Module):
+        # pylint: disable=C0115,C0116
+        def __init__(self, encoder, connector):
+            super().__init__()
+            self.encoder = encoder
+            self.connector = connector
+
+        def forward(self, images):
+            vision_x = self.encoder(pixel_values=images, output_hidden_states=True)
+            vision_x = vision_x.hidden_states[-2]
+            vision_x = self.connector(vision_x)
+            return vision_x
+
+    encoder = AutoModel.from_pretrained(
+        vision_config["from_pretrained"],
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        attn_implementation='eager',
+    )
+    vision_encoder = encoder.vision_model
+    hf_config = encoder.config
+    dtype = hf_config.torch_dtype
+
+    # connector
+    if nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "mlp2x_gelu":
+        vision_connector = torch.nn.Sequential(
+            torch.nn.Linear(vision_config["hidden_size"], nemo_config["hidden_size"], bias=True),
+            torch.nn.GELU(),
+            torch.nn.Linear(nemo_config["hidden_size"], nemo_config["hidden_size"], bias=True),
+        ).to(dtype=dtype)
+
+        key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector"
+        for layer in range(0, 3, 2):
+            vision_connector[layer].load_state_dict(
+                {
+                    'weight': mp0_weights[f"{key_prefix}.{layer}.weight"].to(dtype),
+                    'bias': mp0_weights[f"{key_prefix}.{layer}.bias"].to(dtype),
+                }
+            )
+    elif nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "linear":
+        vision_connector = torch.nn.Linear(vision_config["hidden_size"], nemo_config["hidden_size"], bias=True)
+        key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector"
+        vision_connector.load_state_dict(
+            {
+                'weight': mp0_weights[f"{key_prefix}.weight"].to(dtype),
+                'bias': mp0_weights[f"{key_prefix}.bias"].to(dtype),
+            }
+        )
+    elif nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "mlp_downsample":
+        vision_connector = torch.nn.Sequential(
+            DownSampleBlock(),
+            torch.nn.LayerNorm(vision_config["hidden_size"] * 4),
+            torch.nn.Linear(vision_config["hidden_size"] * 4, nemo_config["hidden_size"], bias=True),
+            torch.nn.GELU(),
+            torch.nn.Linear(nemo_config["hidden_size"], nemo_config["hidden_size"], bias=True),
+        ).to(dtype=dtype)
+        key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector"
+        for layer in [1, 2, 4]:
+            vision_connector[layer].load_state_dict(
+                {
+                    'weight': mp0_weights[f"{key_prefix}.{layer}.weight"].to(dtype),
+                    'bias': mp0_weights[f"{key_prefix}.{layer}.bias"].to(dtype),
+                }
+            )
+
+    else:
+        raise ValueError(f"Unknown projector type: {nemo_config['mm_cfg']['mm_mlp_adapter_type']}")
+
+    # export the whole wrapper
+    lita_num_frames = None
+    wrapper = VisionEncoderWrapper(vision_encoder, vision_connector).to(device, dtype)
+    if model_type == "lita" or model_type == "vila":
+        image_size = hf_config.image_size
+        if model_type == "lita":
+            lita_num_frames = nemo_config['mm_cfg']['lita']['sample_frames']
+    else:
+        image_size = hf_config.vision_config.image_size
+        if model_type == "vita":
+            lita_num_frames = nemo_config['mm_cfg']['lita']['sample_frames']
+    dummy_image = torch.empty(
+        1, 3, image_size, image_size, dtype=dtype, device=device
+    )  # dummy image shape [B, C, H, W]
+
+    export_visual_wrapper_onnx(wrapper, dummy_image, model_dir)
+    build_trt_engine(
+        model_type,
+        [3, image_size, image_size],
+        model_dir,
+        vision_max_batch_size,
+        dtype,
+        image_size=image_size,
+        num_frames=lita_num_frames if model_type == "lita" or model_type == 'vita' else None,
+        nemo_config=nemo_config,
+    )
+
+
+def build_video_neva_engine(
+    model_dir: str,
+    visual_checkpoint_path: str,
+    vision_max_batch_size: int = 1,
+):
+    """Export the video-NeVA vision encoder plus linear projector to ONNX and build its visual engine."""
+    device = torch.device("cuda") if torch.cuda.is_available() else "cpu"  # torch.device on GPU, plain str on CPU
+    # extract NeMo checkpoint: config and weights are read straight out of the tar archive
+    with tarfile.open(visual_checkpoint_path) as tar:
+        nemo_config = yaml.safe_load(tar.extractfile("./model_config.yaml"))
+        try:
+            # trained without TP: the weights file sits at the archive root
+            mp0_weights = torch.load(tar.extractfile("./model_weights.ckpt"), map_location=device)
+        except KeyError:
+            # trained with TP: the root member is missing (KeyError), so read the mp_rank_00 copy instead
+            mp0_weights = torch.load(tar.extractfile("./mp_rank_00/model_weights.ckpt"), map_location=device)
+
+    vision_config = nemo_config["mm_cfg"]["vision_encoder"]
+
+    class VisionEncoderWrapper(torch.nn.Module):
+        # pylint: disable=C0115,C0116
+        def __init__(self, encoder, connector):
+            super().__init__()
+            self.encoder = encoder
+            self.connector = connector
+
+        def forward(self, images):
+            b, num_frames, c, h, w = images.shape
+            images = images.view(b * num_frames, c, h, w)  # fold frames into the batch dim for the encoder
+            vision_x = self.encoder(pixel_values=images, output_hidden_states=True)  # [(B num_frames), C, H, W]
+            vision_x = vision_x.hidden_states[-2]
+            vision_x = vision_x[:, 1:]  # drop the first position (presumably the class token - TODO confirm)
+
+            # reshape back to [B, num_frames, seq_len, hidden_size]; seq_len is inferred by the -1
+            vision_x = vision_x.view(b, num_frames, -1, vision_x.shape[-1])
+
+            vision_x = self.connector(vision_x)
+            return vision_x
+
+    encoder = AutoModel.from_pretrained(
+        vision_config["from_pretrained"],
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True,
+        attn_implementation='eager',
+    )
+    vision_encoder = encoder.vision_model
+    hf_config = encoder.config
+    dtype = hf_config.torch_dtype  # dtype used for the projector weights, the wrapper and the dummy input
+
+    # connector
+    assert nemo_config["mm_cfg"]["mm_mlp_adapter_type"] == "linear"  # only the linear projector is handled here
+    vision_connector = torch.nn.Linear(vision_config["hidden_size"], nemo_config["hidden_size"], bias=True)
+
+    key_prefix = "model.embedding.word_embeddings.adapter_layer.mm_projector_adapter.mm_projector"
+    vision_connector.load_state_dict(
+        {
+            'weight': mp0_weights[f"{key_prefix}.weight"].to(dtype),
+            'bias': mp0_weights[f"{key_prefix}.bias"].to(dtype),
+        }
+    )
+
+    # export the whole wrapper
+    wrapper = VisionEncoderWrapper(vision_encoder, vision_connector).to(device, dtype)
+    image_size = hf_config.vision_config.image_size
+    num_frames = nemo_config['data']['num_frames']
+    dummy_video = torch.empty(1, num_frames, 3, image_size, image_size, dtype=dtype, device=device)  # dummy video
+    export_visual_wrapper_onnx(wrapper, dummy_video, model_dir)
+    build_trt_engine(
+        "video-neva",
+        [num_frames, 3, image_size, image_size],  # [num_frames, 3, H, W]
+        model_dir,
+        vision_max_batch_size,
+        dtype,
+        image_size=image_size,
+        num_frames=num_frames,
+    )
+
+
+def build_perception_engine(
+    model_dir: str,
+    perception_checkpoint_path: str,
+    model_type: str = "salm",
+    max_batch_size: int = 1,
+):
+    """Export the SALM perception module (TorchScript feature extractor, ONNX encoder) and build its engine."""
+    assert model_type == "salm", f"Invalid model type {model_type}"  # "salm" is the only accepted model type
+
+    def load_perception_model(perception_checkpoint_path):
+        weights = "model_weights.ckpt"
+        perception_state_dict = torch.load(os.path.join(perception_checkpoint_path, weights))
+        config = "model_config.yaml"
+        config = OmegaConf.load(os.path.join(perception_checkpoint_path, config))  # name reused: file name -> config
+        perception = AudioPerceptionModule(cfg=config)
+        perception.load_state_dict(perception_state_dict)
+        perception.eval()
+        return perception
+
+    if not os.path.exists(model_dir):
+        os.makedirs(model_dir)
+    # load perception model
+    perception_model = load_perception_model(perception_checkpoint_path)
+    feature_extractor = perception_model.preprocessor
+    input_signal = torch.randn(1, 1000, dtype=torch.float32)  # dummy signal of shape [1, 1000]
+    input_signal_length = torch.tensor([1000], dtype=torch.int32)
+
+    processed_signal, processed_signal_length = feature_extractor(
+        input_signal=input_signal, length=input_signal_length
+    )
+    processed_signal_length = processed_signal_length.to(torch.int32)
+    dump_path = model_dir + "/feature_extractor.ts"  # dump the feature extractor as torchscript
+    feature_extractor.export(dump_path, (input_signal, input_signal_length))
+
+    class PerceptionWrapper(torch.nn.Module):
+        # pylint: disable=C0115,C0116
+        def __init__(self, encoder, modality_adapter, proj):
+            super().__init__()
+            self.encoder = encoder
+            self.modality_adapter = modality_adapter
+            self.proj = proj
+
+        @typecheck.disable_checks()
+        def forward(self, processed_signal, processed_signal_length):
+            encoded, encoded_len = self.encoder(audio_signal=processed_signal, length=processed_signal_length)
+            encoded, encoded_len = self.modality_adapter(audio_signal=encoded, length=encoded_len)
+            # b, c, t -> b, t, c
+            encoded = self.proj(encoded.transpose(1, 2))
+            encoded_len = encoded_len.to(torch.int32)  # lengths leave the wrapper as int32
+            return encoded, encoded_len
+
+    perception = PerceptionWrapper(perception_model.encoder, perception_model.modality_adapter, perception_model.proj)
+    export_perception_wrapper_onnx(perception, (processed_signal, processed_signal_length), model_dir)
+    # export the onnx perception model to tensorrt engine
+    # 512 -> 5.12 sec, 3072 -> 30.72 sec
+    opt_batch_size = max(1, max_batch_size // 2)  # half of the maximum batch size, but never below 1
+    shapes = [  # three shape sets for build_trt_engine - presumably the min / opt / max profiles (TODO confirm)
+        {"processed_signal": [1, 80, 64], "processed_signal_length": [1]},
+        {"processed_signal": [opt_batch_size, 80, 512], "processed_signal_length": [opt_batch_size]},
+        {"processed_signal": [max_batch_size, 80, 3072], "processed_signal_length": [max_batch_size]},
+    ]
+    build_trt_engine(
+        model_type,
+        shapes,
+        model_dir,
+        max_batch_size,
+        dtype=torch.float16,
+        nemo_config=None,
+        part_name='perception_encoder',
+    )
+
+
+def build_mllama_visual_engine(
+    model_dir: str,
+    hf_model_path: str,
+    processor_name: str = "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    vision_max_batch_size: int = 1,
+):
+    """Export the HF Mllama vision model plus multi-modal projector to ONNX and build the "mllama" engine."""
+    hf_model = MllamaForConditionalGeneration.from_pretrained(hf_model_path, torch_dtype="auto", device_map="auto")
+    model_dtype = hf_model.dtype
+
+    class MLLaMAVisionWrapper(torch.nn.Module):
+        # pylint: disable=C0115,C0116
+        def __init__(self, vision_model, output_proj):
+            super().__init__()
+            self.vision_model = vision_model
+            self.output_proj = output_proj
+
+        def forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask):
+            out = self.vision_model(pixel_values, aspect_ratio_ids, aspect_ratio_mask).last_hidden_state
+            out = self.output_proj(out)
+            return out
+
+    wrapper = MLLaMAVisionWrapper(hf_model.vision_model, hf_model.multi_modal_projector)
+
+    processor = AutoProcessor.from_pretrained(processor_name)
+    image = Image.new('RGB', [2048, 2688])  # blank image, used only to obtain example processor outputs
+    inputs = processor(images=image, return_tensors="pt").to(model_dtype)
+
+    export_visual_wrapper_onnx(
+        wrapper,
+        tuple([value for _, value in inputs.items()]),  # positional inputs, in the processor's key order
+        model_dir,
+        input_names=[key for key in inputs],
+        dynamic_axes={key: {0: "batch"} for key in inputs},  # dim 0 of every input is exported as dynamic
+    )
+    shapes = [{k: list(v.shape) for k, v in inputs.items()}] * 3  # three entries that initially share one dict
+    shapes[2] = shapes[0].copy()  # give the last entry its own dict before it is edited
+    for k, v in shapes[2].items():
+        shapes[2][k] = [vision_max_batch_size] + v[1:]  # last entry: dim 0 becomes vision_max_batch_size
+    build_trt_engine("mllama", shapes, model_dir, vision_max_batch_size, model_dtype)
+
+
+def build_visual_engine(
+    model_dir: str,
+    visual_checkpoint_path: str,
+    model_type: str = "neva",
+    vision_max_batch_size: int = 1,
+):
+    """Dispatch to the NeVA-family or video-NeVA engine builder; raise RuntimeError for other model types."""
+    if model_type == "video-neva":
+        build_video_neva_engine(model_dir, visual_checkpoint_path, vision_max_batch_size)
+    elif model_type in ('neva', 'lita', 'vila', 'vita'):
+        # These four types share one builder, which takes the model type as its first argument.
+        build_neva_engine(model_type, model_dir, visual_checkpoint_path, vision_max_batch_size)
+    else:
+        raise RuntimeError(f"Invalid model type {model_type}")
+
+
+def extract_lora_ckpt(
+    lora_ckpt: str,
+    output_dir: str,
+):
+    """Repack the non-projector weights of a LoRA checkpoint directory into ``<output_dir>/llm_lora.nemo``.
+
+    Args:
+        lora_ckpt: directory holding ``model_config.yaml`` and ``model_weights.ckpt``; the weights may
+            instead sit under ``mp_rank_00/``.
+        output_dir: existing directory that receives ``llm_lora.nemo``.
+
+    Returns:
+        Path of the written tar archive.
+
+    Raises:
+        RuntimeError: if the config or the weights file is missing.
+    """
+    model_config = os.path.join(lora_ckpt, "model_config.yaml")
+    candidates = [os.path.join(lora_ckpt, sub, "model_weights.ckpt") for sub in ("", "mp_rank_00")]
+    weight_path = next((path for path in candidates if os.path.exists(path)), None)
+    # Validate the layout before torch.load so a bad checkpoint fails fast instead of after deserialization.
+    if weight_path is None or not os.path.exists(model_config):
+        raise RuntimeError("Incompatible lora checkpoint format")
+    model_weight = torch.load(weight_path)
+    llm_lora_weight = {k: v for k, v in model_weight.items() if "mm_projector" not in k}
+    llm_lora_path = os.path.join(output_dir, "llm_lora.nemo")
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        llm_weight_path = os.path.join(tmp_dir, "model_weights.ckpt")
+        torch.save(llm_lora_weight, llm_weight_path)
+        with tarfile.open(llm_lora_path, "w") as tar:
+            tar.add(llm_weight_path, arcname="model_weights.ckpt")
+            tar.add(model_config, arcname="model_config.yaml")
+    return llm_lora_path
+
+
+def build_mllama_engine(
+    model_dir: str,
+    checkpoint_path: str,
+    processor_name: str = "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    vision_max_batch_size: int = 1,
+    tensor_parallelism_size: int = 1,
+    max_input_len: int = 256,
+    max_output_len: int = 256,
+    max_batch_size: int = 1,
+    max_multimodal_len: int = 1024,
+    dtype: str = "bfloat16",
+    use_lora_plugin: str = None,
+    lora_target_modules: List[str] = None,
+    max_lora_rank: int = 64,
+    lora_ckpt_list: List[str] = None,
+):
+    """Convert a NeMo Mllama checkpoint to HF format, then build its visual and LLM engines under model_dir."""
+    new_state_dict, config = convert_mllama_nemo_to_hf(checkpoint_path, processor_name)
+    # NOTE(review): use_lora_plugin, lora_target_modules, max_lora_rank and lora_ckpt_list are accepted but unused.
+    hf_model = MllamaForConditionalGeneration(config)
+    hf_model = hf_model.to(torch.bfloat16)  # always bfloat16; `dtype` is only passed to the LLM engine builder
+    hf_model.load_state_dict(new_state_dict)
+    # The HF checkpoint only has to exist while the two builders below read it, hence the temporary directory.
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        hf_model_path = os.path.join(tmp_dir, "hf_checkpoint")
+        hf_model.save_pretrained(hf_model_path)
+        del hf_model, new_state_dict  # drop the in-memory copies before the engine builds start
+        build_mllama_visual_engine(
+            os.path.join(model_dir, "visual_engine"),
+            hf_model_path,
+            processor_name=processor_name,  # same processor as the conversion, not the callee's default
+            vision_max_batch_size=vision_max_batch_size,
+        )
+        build_mllama_trtllm_engine(
+            os.path.join(model_dir, "llm_engine"),
+            hf_model_path,
+            tensor_parallelism_size,
+            max_input_len,
+            max_output_len,
+            max_batch_size,
+            max_multimodal_len,
+            dtype,
+        )
diff --git a/nemo/export/multimodal/converter.py b/nemo/export/multimodal/converter.py
new file mode 100644
index 000000000000..747ddf80eaea
--- /dev/null
+++ b/nemo/export/multimodal/converter.py
@@ -0,0 +1,412 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import torch
+from transformers import AutoProcessor, MllamaConfig
+from transformers.models.mllama.configuration_mllama import MllamaTextConfig, MllamaVisionConfig
+
+from nemo import lightning as nl
+from nemo.collections import vlm
+
+
+def split_qkv_weight(qkv_weight, model_config):
+    """Split a fused NeMo QKV projection weight into separate HF-style q/k/v weights.
+
+    The fused rows are read group by group: ``heads_per_group`` query heads, then one key head and one
+    value head. Outputs are allocated with ``new_empty`` so they keep the dtype and device of ``qkv_weight``.
+
+    Returns:
+        ``[('q_proj', q), ('k_proj', k), ('v_proj', v)]`` with tensors shaped (heads, head_size, hidden_size).
+    """
+    hidden_size = model_config.hidden_size
+    head_num = model_config.num_attention_heads
+    num_query_groups = model_config.num_query_groups or head_num
+    head_size = model_config.kv_channels or (hidden_size // head_num)
+    heads_per_group = head_num // num_query_groups
+    qkv_weight = qkv_weight.reshape(-1, head_size, hidden_size)
+    q_weight = qkv_weight.new_empty((head_num, head_size, hidden_size))
+    k_weight = qkv_weight.new_empty((num_query_groups, head_size, hidden_size))
+    v_weight = qkv_weight.new_empty((num_query_groups, head_size, hidden_size))
+    for i in range(num_query_groups):
+        start = i * (heads_per_group + 2)  # every group spans its query heads plus one key and one value head
+        q_weight[i * heads_per_group : (i + 1) * heads_per_group] = qkv_weight[start : start + heads_per_group]
+        k_weight[i] = qkv_weight[start + heads_per_group]
+        v_weight[i] = qkv_weight[start + heads_per_group + 1]
+    return [('q_proj', q_weight), ('k_proj', k_weight), ('v_proj', v_weight)]
+
+
+def split_kv_weight(kv_weight, model_config):
+    """Split a fused NeMo cross-attention KV weight into separate HF-style k/v weights.
+
+    Rows alternate key head, value head per query group; outputs keep the dtype and device of ``kv_weight``.
+
+    Returns:
+        ``[('k_proj', k), ('v_proj', v)]`` with tensors shaped (num_query_groups, head_size, hidden_size).
+    """
+    hidden_size = model_config.hidden_size
+    head_num = model_config.num_attention_heads
+    num_query_groups = model_config.num_query_groups or head_num
+    head_size = model_config.kv_channels or (hidden_size // head_num)
+    kv_weight = kv_weight.reshape(-1, head_size, hidden_size)
+    k_weight = kv_weight.new_empty((num_query_groups, head_size, hidden_size))
+    v_weight = kv_weight.new_empty((num_query_groups, head_size, hidden_size))
+    for i in range(num_query_groups):
+        k_weight[i], v_weight[i] = kv_weight[2 * i], kv_weight[2 * i + 1]
+    return [('k_proj', k_weight), ('v_proj', v_weight)]
+
+
+def split_gate_weight(gate_weight):
+    """Halve a fused fc1 weight along dim 0 into its gate_proj (first half) and up_proj (second half) parts."""
+    halves = torch.chunk(gate_weight, 2, dim=0)
+    # torch.chunk returns views of the fused tensor, so no weight data is copied here.
+    return [('gate_proj', halves[0]), ('up_proj', halves[1])]
+
+
+def convert_mllama_config(source_vision, source_text):
+    """Build an HF MllamaConfig from the NeMo vision (source_vision) and language (source_text) model configs."""
+    vision_config = MllamaVisionConfig(
+        num_hidden_layers=source_vision.num_layers,
+        hidden_size=source_vision.hidden_size,
+        attention_heads=source_vision.num_attention_heads,
+        image_size=source_vision.vision_chunk_size,
+        max_num_tiles=source_vision.vision_max_num_chunks,
+        torch_dtype="bfloat16",
+    )
+
+    cross_attention_layers = [  # entry i of the NeMo fusion schedule, shifted by i
+        x + i for i, x in enumerate(source_text._init_fusion_schedule(source_text.num_cross_attention_layers))
+    ]
+    text_config = MllamaTextConfig(
+        rope_theta=source_text.rotary_base,
+        num_hidden_layers=source_text.num_layers + source_text.num_cross_attention_layers,  # both layer kinds
+        cross_attention_layers=cross_attention_layers,
+        hidden_size=source_text.hidden_size,
+        intermediate_size=source_text.ffn_hidden_size,
+        num_attention_heads=source_text.num_attention_heads,
+        num_key_value_heads=source_text.num_query_groups,
+        vocab_size=source_text.vocab_size,
+        rope_scaling={  # NOTE(review): hard-coded here, not read from source_text
+            "factor": 8.0,
+            "high_freq_factor": 4.0,
+            "low_freq_factor": 1.0,
+            "original_max_position_embeddings": 8192,
+            "rope_type": "llama3",
+        },
+        eos_token_id=[128001, 128008, 128009],  # NOTE(review): hard-coded token ids
+        torch_dtype="bfloat16",
+    )
+
+    return MllamaConfig(vision_config, text_config, torch_dtype="bfloat16")
+
+
+def convert_mllama_nemo_to_hf(checkpoint_path, processor_name):
+    """Load a NeMo Mllama checkpoint and return (HF-style state dict, HF MllamaConfig)."""
+    processor = AutoProcessor.from_pretrained(processor_name)
+
+    strategy = nl.MegatronStrategy(
+        tensor_model_parallel_size=1,
+        ckpt_load_optimizer=False,
+        ckpt_save_optimizer=False,
+    )
+    trainer = nl.Trainer(
+        devices=1,
+        max_steps=1000,
+        accelerator="gpu",
+        strategy=strategy,
+        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
+        val_check_interval=1000,
+        limit_val_batches=50,
+    )
+
+    fabric = trainer.to_fabric()  # the trainer is only used to obtain a fabric for loading the checkpoint
+
+    tokenizer = processor.tokenizer
+    model = vlm.MLlamaModel(vlm.MLlamaConfig11BInstruct(), tokenizer=tokenizer)  # always the 11B-Instruct config
+    config = model.config
+    vision_model_config = config.vision_model_config
+    language_model_config = config.language_model_config
+    model = fabric.load_model(checkpoint_path, model)
+    model = model.module.module.module.module  # unwrap four wrapper levels before reading the state dict
+
+    state_dict = model.state_dict()
+    del model
+
+    v = "vision_model.vision_encoder"  # NeMo-side prefix of the vision encoder keys
+    key_map = [  # (HF key, NeMo key) pairs whose tensors are copied over unchanged
+        ("vision_model.class_embedding", f"{v}.class_embedding"),
+        ("vision_model.gated_positional_embedding.embedding", f"{v}.positional_embedding"),
+        (
+            "vision_model.gated_positional_embedding.tile_embedding.weight",
+            f"{v}.gated_tile_positional_embedding.weight",
+        ),
+        ("vision_model.gated_positional_embedding.gate", f"{v}.gated_positional_embedding_gate"),
+        ("vision_model.layernorm_post.bias", f"{v}.ln_post.bias"),
+        ("vision_model.layernorm_post.weight", f"{v}.ln_post.weight"),
+        ("vision_model.layernorm_pre.bias", f"{v}.ln_pre.bias"),
+        ("vision_model.layernorm_pre.weight", f"{v}.ln_pre.weight"),
+        ("vision_model.post_tile_positional_embedding.embedding.weight", f"{v}.post_tile_pos_embed.embedding.weight"),
+        ("vision_model.post_tile_positional_embedding.gate", f"{v}.post_tile_pos_embed.gate"),
+        ("vision_model.pre_tile_positional_embedding.embedding.weight", f"{v}.pre_tile_pos_embed.embedding.weight"),
+        ("vision_model.pre_tile_positional_embedding.gate", f"{v}.pre_tile_pos_embed.gate"),
+        ("multi_modal_projector.bias", "vision_model.vision_projection.encoder.bias"),
+        ("multi_modal_projector.weight", "vision_model.vision_projection.encoder.weight"),
+        ("language_model.model.norm.weight", "language_model.decoder.final_layernorm.weight"),
+        ("language_model.lm_head.weight", "language_model.output_layer.weight"),
+    ]
+
+    for i in range(vision_model_config.num_layers):
+        key_map.extend(
+            [
+                (
+                    f"vision_model.transformer.layers.{i}.self_attn.o_proj.weight",
+                    f"{v}.transformer.layers.{i}.self_attention.linear_proj.weight",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.input_layernorm.bias",
+                    f"{v}.transformer.layers.{i}.input_layernorm.bias",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.input_layernorm.weight",
+                    f"{v}.transformer.layers.{i}.input_layernorm.weight",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.post_attention_layernorm.bias",
+                    f"{v}.transformer.layers.{i}.pre_mlp_layernorm.bias",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.post_attention_layernorm.weight",
+                    f"{v}.transformer.layers.{i}.pre_mlp_layernorm.weight",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.mlp.fc1.bias",
+                    f"{v}.transformer.layers.{i}.mlp.linear_fc1.bias",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.mlp.fc1.weight",
+                    f"{v}.transformer.layers.{i}.mlp.linear_fc1.weight",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.mlp.fc2.bias",
+                    f"{v}.transformer.layers.{i}.mlp.linear_fc2.bias",
+                ),
+                (
+                    f"vision_model.transformer.layers.{i}.mlp.fc2.weight",
+                    f"{v}.transformer.layers.{i}.mlp.linear_fc2.weight",
+                ),
+            ]
+        )
+
+    for i in range(vision_model_config.num_global_layers):
+        key_map.extend(
+            [
+                (
+                    f"vision_model.global_transformer.layers.{i}.self_attn.o_proj.weight",
+                    f"{v}.global_transformer.layers.{i}.self_attention.linear_proj.weight",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.gate_attn",
+                    f"{v}.global_transformer.layers.{i}.gate_attn",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.gate_ffn",
+                    f"{v}.global_transformer.layers.{i}.gate_ffn",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.input_layernorm.bias",
+                    f"{v}.global_transformer.layers.{i}.input_layernorm.bias",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.input_layernorm.weight",
+                    f"{v}.global_transformer.layers.{i}.input_layernorm.weight",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.post_attention_layernorm.bias",
+                    f"{v}.global_transformer.layers.{i}.pre_mlp_layernorm.bias",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.post_attention_layernorm.weight",
+                    f"{v}.global_transformer.layers.{i}.pre_mlp_layernorm.weight",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.mlp.fc1.bias",
+                    f"{v}.global_transformer.layers.{i}.mlp.linear_fc1.bias",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.mlp.fc1.weight",
+                    f"{v}.global_transformer.layers.{i}.mlp.linear_fc1.weight",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.mlp.fc2.bias",
+                    f"{v}.global_transformer.layers.{i}.mlp.linear_fc2.bias",
+                ),
+                (
+                    f"vision_model.global_transformer.layers.{i}.mlp.fc2.weight",
+                    f"{v}.global_transformer.layers.{i}.mlp.linear_fc2.weight",
+                ),
+            ]
+        )
+
+    cross_attention_frequency = language_model_config.num_layers // language_model_config.num_cross_attention_layers
+    toal_num_layer = language_model_config.num_layers + language_model_config.num_cross_attention_layers
+    prefix = "language_model.decoder"  # NeMo-side prefix of the language decoder keys
+    for i in range(toal_num_layer):  # i is the HF layer index, counting self- and cross-attention layers together
+        cross_num = (i - 3) // (cross_attention_frequency + 1)  # NOTE(review): the offset 3 is hard-coded
+        if (i - 3) % (cross_attention_frequency + 1) == 0:  # HF layer i is a cross-attention layer
+            xattn_index = cross_num * cross_attention_frequency + 3  # index into the NeMo xattn_layers
+            key_map.extend(
+                [
+                    (
+                        f"language_model.model.layers.{i}.cross_attn.o_proj.weight",
+                        f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_proj.weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.cross_attn.q_proj.weight",
+                        f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_q.weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.cross_attn.k_norm.weight",
+                        f"{prefix}.xattn_layers.{xattn_index}.cross_attention.k_layernorm.weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.input_layernorm.weight",
+                        f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_q.layer_norm_weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.cross_attn.q_norm.weight",
+                        f"{prefix}.xattn_layers.{xattn_index}.cross_attention.q_layernorm.weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.post_attention_layernorm.weight",
+                        f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.layer_norm_weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.mlp.down_proj.weight",
+                        f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc2.weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.cross_attn_attn_gate",
+                        f"{prefix}.xattn_layers.{xattn_index}.gate_attn",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.cross_attn_mlp_gate",
+                        f"{prefix}.xattn_layers.{xattn_index}.gate_ffn",
+                    ),
+                ]
+            )
+        else:
+            attn_index = i - cross_num - 1  # index into the NeMo self-attention `layers`
+            key_map.extend(
+                [
+                    (
+                        f"language_model.model.layers.{i}.self_attn.o_proj.weight",
+                        f"{prefix}.layers.{attn_index}.self_attention.linear_proj.weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.post_attention_layernorm.weight",
+                        f"{prefix}.layers.{attn_index}.mlp.linear_fc1.layer_norm_weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.mlp.down_proj.weight",
+                        f"{prefix}.layers.{attn_index}.mlp.linear_fc2.weight",
+                    ),
+                    (
+                        f"language_model.model.layers.{i}.input_layernorm.weight",
+                        f"{prefix}.layers.{attn_index}.self_attention.linear_qkv.layer_norm_weight",
+                    ),
+                ]
+            )
+
+    new_state_dict = {}
+    for new_key, old_key in key_map:  # a NeMo key missing from the checkpoint raises KeyError here
+        new_state_dict[new_key] = state_dict[old_key]
+
+    def convert_vision_qkv_weight(state_dict, vision_model_config):  # split fused QKV of both vision transformers
+        hidden_size = vision_model_config.hidden_size
+
+        new_state_dict = {}
+        for i in range(vision_model_config.num_layers):
+            qkv_weights = state_dict[
+                f"vision_model.vision_encoder.transformer.layers.{i}.self_attention.linear_qkv.weight"
+            ]
+
+            for name, weight in split_qkv_weight(qkv_weights, vision_model_config):
+                new_key = f'vision_model.transformer.layers.{i}.self_attn.{name}.weight'
+                new_state_dict[new_key] = weight.reshape(-1, hidden_size)  # flatten (heads, head_size) into rows
+
+        for i in range(vision_model_config.num_global_layers):
+            qkv_weights = state_dict[
+                f"vision_model.vision_encoder.global_transformer.layers.{i}.self_attention.linear_qkv.weight"
+            ]
+
+            for name, weight in split_qkv_weight(qkv_weights, vision_model_config):
+                new_key = f'vision_model.global_transformer.layers.{i}.self_attn.{name}.weight'
+                new_state_dict[new_key] = weight.reshape(-1, hidden_size)
+
+        return new_state_dict
+
+    def convert_patch_embeding(state_dict):  # linear conv1 weight -> (out, 3, 14, 14); the sizes are hard-coded
+        conv1_weight = state_dict["vision_model.vision_encoder.conv1._linear.weight"]
+        return {"vision_model.patch_embedding.weight": conv1_weight.reshape(conv1_weight.shape[0], 3, 14, 14)}
+
+    def convert_language_qkv_weight(state_dict, language_model_config):  # split fused QKV / KV per language layer
+        hidden_size = language_model_config.hidden_size
+        new_state_dict = {}
+        for i in range(toal_num_layer):  # same layer-index arithmetic as the key_map loop above
+            cross_num = (i - 3) // (cross_attention_frequency + 1)
+            if (i - 3) % (cross_attention_frequency + 1) == 0:
+                xattn_index = cross_num * cross_attention_frequency + 3
+                kv_weights = state_dict[f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_kv.weight"]
+                for name, weight in split_kv_weight(kv_weights, language_model_config):
+                    new_key = f"language_model.model.layers.{i}.cross_attn.{name}.weight"
+                    new_state_dict[new_key] = weight.reshape(-1, hidden_size)
+            else:
+                attn_index = i - cross_num - 1
+                qkv_weights = state_dict[f"{prefix}.layers.{attn_index}.self_attention.linear_qkv.weight"]
+                for name, weight in split_qkv_weight(qkv_weights, language_model_config):
+                    new_key = f"language_model.model.layers.{i}.self_attn.{name}.weight"
+                    new_state_dict[new_key] = weight.reshape(-1, hidden_size)
+
+        return new_state_dict
+
+    def convert_gate(state_dict):  # split every fused linear_fc1 weight into gate_proj / up_proj
+        new_state_dict = {}
+        for i in range(toal_num_layer):
+            cross_num = (i - 3) // (cross_attention_frequency + 1)
+            if (i - 3) % (cross_attention_frequency + 1) == 0:
+                xattn_index = cross_num * cross_attention_frequency + 3
+                gate_weight = state_dict[f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.weight"]
+            else:
+                attn_index = i - cross_num - 1
+                gate_weight = state_dict[f"{prefix}.layers.{attn_index}.mlp.linear_fc1.weight"]
+
+            for name, weight in split_gate_weight(gate_weight):
+                new_key = f"language_model.model.layers.{i}.mlp.{name}.weight"
+                new_state_dict[new_key] = weight
+
+        return new_state_dict
+
+    def convert_embedding(state_dict):  # HF embed_tokens = word embeddings followed by the learnable embedding
+        word_embeddings = state_dict["language_model.embedding.word_embeddings.weight"]
+        learnable_embedding = state_dict["language_model.learnable_embedding.weight"]
+
+        return {"language_model.model.embed_tokens.weight": torch.cat((word_embeddings, learnable_embedding), dim=0)}
+
+    new_state_dict.update(convert_vision_qkv_weight(state_dict, vision_model_config))
+    new_state_dict.update(convert_patch_embeding(state_dict))
+    new_state_dict.update(convert_language_qkv_weight(state_dict, language_model_config))
+    new_state_dict.update(convert_gate(state_dict))
+    new_state_dict.update(convert_embedding(state_dict))
+
+    return new_state_dict, convert_mllama_config(vision_model_config, language_model_config)
diff --git a/nemo/export/multimodal/run.py b/nemo/export/multimodal/run.py
new file mode 100644
index 000000000000..be2e74dc685d
--- /dev/null
+++ b/nemo/export/multimodal/run.py
@@ -0,0 +1,1168 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import json
+import os
+
+try:
+    import decord
+except Exception:
+    import logging
+
+    logging.warning("The package `decord` was not installed in this environment.")
+
+import einops
+import numpy as np
+import soundfile as sf
+import tensorrt as trt
+import tensorrt_llm
+import tensorrt_llm.profiler as profiler
+import torch
+import yaml
+from PIL import Image
+from tensorrt_llm import logger
+from tensorrt_llm._utils import str_dtype_to_trt, torch_dtype_to_trt
+from tensorrt_llm.runtime import ModelRunner, Session, TensorInfo
+from torch.nn import functional as F
+from torchvision import transforms
+from transformers import AutoProcessor, CLIPImageProcessor
+
+from nemo.export.utils.constants import TRTLLM_ENGINE_DIR
+
+
+def trt_dtype_to_torch(dtype):
+    """Return the torch dtype that corresponds to a TensorRT data type.
+
+    Raises:
+        TypeError: if `dtype` is not float16, float32, int32 or bfloat16.
+    """
+    # Checked in this order; attributes are looked up one at a time so a later name is only
+    # touched when the earlier ones did not match.
+    for type_name in ("float16", "float32", "int32", "bfloat16"):
+        if dtype == getattr(trt, type_name):
+            return getattr(torch, type_name)
+    raise TypeError("%s is not supported" % dtype)
+
+
+class MultimodalModelRunner:
+
+    def __init__(self, visual_engine_dir, llm_engine_dir, modality='vision'):
+        """Select the CUDA device/stream, read the visual engine config and load the engines.
+
+        Args:
+            visual_engine_dir: directory containing the visual engine `config.json`; it is also
+                handed to the image-encoder and vision-preprocessor initialisers.
+            llm_engine_dir: directory containing the tokenizer files; the LLM engine is loaded from
+                its `TRTLLM_ENGINE_DIR` subdirectory.
+            modality: the image encoder session is only created when this equals 'vision'.
+        """
+        self.modality = modality
+        # One device per MPI rank, wrapping around the number of visible GPUs.
+        self.runtime_rank = tensorrt_llm.mpi_rank()
+        device_id = self.runtime_rank % torch.cuda.device_count()
+        torch.cuda.set_device(device_id)
+        self.device = "cuda:%d" % (device_id)
+
+        # All engine work is issued on a dedicated stream that is also made the current stream.
+        self.stream = torch.cuda.Stream(torch.cuda.current_device())
+        torch.cuda.set_stream(self.stream)
+
+        # parse model type from visual engine config
+        with open(os.path.join(visual_engine_dir, "config.json"), "r") as f:
+            config = json.load(f)
+        self.model_type = config['builder_config']['model_type']
+        # Both attributes are filled from the same `precision` entry.
+        self.vision_precision = config['builder_config']['precision']
+        self.modality_precision = config['builder_config']['precision']
+
+        # Optional entries; None when the config does not define them.
+        self.num_frames = config['builder_config'].get('num_frames', None)
+        self.image_size = config['builder_config'].get('image_size', None)
+
+        self.profiling_iterations = 20
+
+        if modality == 'vision':
+            self.init_image_encoder(visual_engine_dir)
+        # The tokenizer is created before the vision preprocessor; `init_tokenizer` reads `self.model_type`.
+        self.init_tokenizer(llm_engine_dir)
+        self.init_llm(os.path.join(llm_engine_dir, TRTLLM_ENGINE_DIR))  # Engine is stored in subdirectory
+        if self.model_type == 'lita' or self.model_type == 'vila' or self.model_type == 'vita':
+            self.init_vision_preprocessor(visual_engine_dir)
+
+    def init_tokenizer(self, llm_engine_dir):
+        """Create `self.tokenizer` from the files found in `llm_engine_dir`.
+
+        When `tokenizer_config.json` exists a Hugging Face `AutoTokenizer` is loaded. Otherwise
+        `tokenizer.model` is loaded with SentencePiece and wrapped in a small adapter exposing the
+        parts of the Hugging Face tokenizer interface defined below (`encode`, `__call__`, `decode`,
+        `batch_decode` and the special-token id attributes).
+
+        Args:
+            llm_engine_dir: directory containing either `tokenizer_config.json` or `tokenizer.model`.
+        """
+        if os.path.exists(os.path.join(llm_engine_dir, "tokenizer_config.json")):
+            from transformers import AutoTokenizer
+
+            self.tokenizer = AutoTokenizer.from_pretrained(llm_engine_dir)
+            self.tokenizer.pad_token = self.tokenizer.eos_token
+            if self.model_type == 'vita':
+                # NOTE(review): all four token literals below are empty strings, so the four ids
+                # resolve to the same value. The marker token names appear to be missing - confirm
+                # the intended image/video start/end tokens.
+                self.tokenizer.im_start_id = self.tokenizer.convert_tokens_to_ids("")
+                self.tokenizer.im_end_id = self.tokenizer.convert_tokens_to_ids("")
+                self.tokenizer.vid_start_id = self.tokenizer.convert_tokens_to_ids("")
+                self.tokenizer.vid_end_id = self.tokenizer.convert_tokens_to_ids("")
+        else:
+            from sentencepiece import SentencePieceProcessor
+
+            sp = SentencePieceProcessor(os.path.join(llm_engine_dir, 'tokenizer.model'))
+
+            class return_obj:
+                """Minimal stand-in for a tokenizer output: exposes `input_ids` as attribute and item."""
+
+                def __init__(self, input_ids):
+                    self.input_ids = input_ids
+
+                def __getitem__(self, name):
+                    # Exact match only; a substring test would also accept keys such as "ids" or "".
+                    if name == "input_ids":
+                        return self.input_ids
+                    else:
+                        raise AttributeError(f"'return_obj' has no item '{name}'")
+
+            # sentencepiece does not follow the same interface as HF
+            class HFTokenizerInterface:
+                """Adapter giving the SentencePiece processor `sp` a Hugging Face-like call surface."""
+
+                def encode(self, x, return_tensors=None, **kwargs):
+                    out = sp.encode(x)
+                    if return_tensors == "pt":
+                        out = torch.tensor(out)
+                    return return_obj(out)
+
+                def __call__(self, x, return_tensors=None, **kwargs):
+                    return self.encode(x, return_tensors, **kwargs)
+
+                def decode(self, x, **kwargs):
+                    return sp.decode(x.tolist())
+
+                def batch_decode(self, x, **kwargs):
+                    return self.decode(x, **kwargs)
+
+            self.tokenizer = HFTokenizerInterface()
+            self.tokenizer.eos_token_id = sp.eos_id()
+            self.tokenizer.bos_token_id = sp.bos_id()
+            self.tokenizer.pad_token_id = sp.pad_id()
+
+            self.tokenizer.padding_side = "right"
+
+            if self.model_type == 'lita':
+                # NOTE(review): same as the 'vita' branch above - the piece names are empty strings,
+                # so the four ids are identical. Confirm the intended marker pieces.
+                self.tokenizer.im_start_id = sp.piece_to_id("")
+                self.tokenizer.im_end_id = sp.piece_to_id("")
+                self.tokenizer.vid_start_id = sp.piece_to_id("")
+                self.tokenizer.vid_end_id = sp.piece_to_id("")
+
+    def init_image_encoder(self, visual_engine_dir):
+        """Read `visual_encoder.engine` from `visual_engine_dir` and open a session on it.
+
+        The resulting session is stored in `self.visual_encoder_session`.
+        """
+        engine_path = os.path.join(visual_engine_dir, 'visual_encoder.engine')
+
+        logger.info(f'Loading engine from {engine_path}')
+        with open(engine_path, 'rb') as engine_file:
+            serialized_engine = engine_file.read()
+
+        logger.info(f'Creating session from engine {engine_path}')
+        self.visual_encoder_session = Session.from_serialized_engine(serialized_engine)
+
+    def init_vision_preprocessor(self, visual_encoder_dir):
+        """Load `nemo_config.yaml` and build the image processor that matches `self.model_type`.
+
+        Sets `self.nemo_config` and `self.image_processor`.
+
+        Raises:
+            ValueError: if the model type is not 'lita', 'vila' or 'vita'.
+        """
+        config_path = os.path.join(visual_encoder_dir, 'nemo_config.yaml')
+        with open(config_path, 'r') as config_file:
+            self.nemo_config = yaml.safe_load(config_file)
+
+        vision_config = self.nemo_config["mm_cfg"]["vision_encoder"]
+
+        # Pick the processor class first; both variants are constructed with the same arguments.
+        if self.model_type == 'lita':
+            processor_cls = AutoProcessor
+        elif self.model_type in ('vila', 'vita'):
+            from transformers import SiglipImageProcessor
+
+            processor_cls = SiglipImageProcessor
+        else:
+            raise ValueError(f"Invalid model type: {self.model_type}")
+
+        self.image_processor = processor_cls.from_pretrained(
+            vision_config["from_pretrained"], torch_dtype=torch.bfloat16, trust_remote_code=True
+        )
+
+    def init_llm(self, llm_engine_dir):
+        """Load the LLM runner from `llm_engine_dir` and cache its model config and mapping."""
+        runner = ModelRunner.from_dir(
+            llm_engine_dir,
+            rank=tensorrt_llm.mpi_rank(),
+            debug_mode=False,
+            stream=self.stream,
+        )
+        session = runner.session
+
+        self.model = runner
+        self.model_config = session._model_config
+        self.runtime_mapping = session.mapping
+
+    def video_preprocess(self, video_path):
+        """Sample frames from a video and convert them to a batched tensor for the vision encoder.
+
+        Args:
+            video_path: a path string (decoded with decord), a numpy array indexed by frame along
+                axis 0, or any other object, which is handed to the image processor unchanged.
+
+        Returns:
+            The processed pixel values cast to the vision encoder precision, with a leading batch
+            dimension of 1 added.
+
+        When `self.num_frames` is -1 every frame is used. Otherwise frames are sampled at evenly
+        spaced indices, and if the video has fewer frames than `self.num_frames` the last sampled
+        frame is repeated until `self.num_frames` frames are available.
+        """
+        target_frames = self.num_frames
+
+        if isinstance(video_path, str):
+            # decord is optional, so only import it when a file actually has to be decoded.
+            from decord import VideoReader
+
+            vr = VideoReader(video_path)
+            if target_frames == -1:
+                frames = [Image.fromarray(frame.asnumpy()).convert('RGB') for frame in vr]
+            else:
+                # Sample at most as many frames as the video has ...
+                num_sampled = min(target_frames, len(vr))
+                indices = np.linspace(0, len(vr) - 1, num=num_sampled, dtype=int)
+                frames = [Image.fromarray(vr[idx].asnumpy()).convert('RGB') for idx in indices]
+                # ... and pad short videos by repeating the last frame. The comparison must use the
+                # requested count, not the clamped one, or the padding can never trigger.
+                if frames and len(frames) < target_frames:
+                    frames += [frames[-1]] * (target_frames - len(frames))
+        elif isinstance(video_path, np.ndarray):
+            if target_frames == -1:
+                frames = [Image.fromarray(frame).convert('RGB') for frame in video_path]
+            else:
+                num_sampled = min(target_frames, video_path.shape[0])
+                indices = np.linspace(0, video_path.shape[0] - 1, num=num_sampled, dtype=int)
+                frames = [Image.fromarray(video_path[idx]).convert('RGB') for idx in indices]
+                if frames and len(frames) < target_frames:
+                    frames += [frames[-1]] * (target_frames - len(frames))
+        else:
+            # Anything else is forwarded as-is (the instance has no `video_path` attribute).
+            frames = video_path
+
+        processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14", torch_dtype=torch.bfloat16)
+        frames = processor.preprocess(frames, return_tensors="pt")['pixel_values']
+        # make dtype consistent with vision encoder
+        media_tensors = frames.to(
+            tensorrt_llm._utils.str_dtype_to_torch(self.vision_precision)
+        )  # [num_frames, 3, H, W]
+        return media_tensors.unsqueeze(0)  # [1, num_frames, 3, H, W]
+
+    def insert_tokens_by_index(self, input_ids, num_frames):
+        """Expand the zero-valued placeholder in `input_ids` into per-frame and video marker triples.
+
+        After the placeholder position, `num_frames` triples `(im_start_id, 0, im_end_id)` are
+        inserted, followed by one triple `(vid_start_id, 0, vid_end_id)`; the original placeholder
+        is then removed.
+
+        Returns:
+            A `torch.long` tensor with a leading dimension of 1 holding the expanded ids.
+        """
+        im_start_id = self.tokenizer.im_start_id
+        im_end_id = self.tokenizer.im_end_id
+        vid_start_id = self.tokenizer.vid_start_id
+        vid_end_id = self.tokenizer.vid_end_id
+
+        # NOTE(review): assumes `input_ids` is 2-D with exactly one zero entry. In that case
+        # nonzero() yields a single [row, col] pair, squeeze().tolist() gives [row, col], and
+        # element [1] used below is the placeholder position - confirm against callers.
+        image_token_indices = (input_ids == 0).nonzero(as_tuple=False).squeeze().tolist()
+        input_ids = input_ids.squeeze().tolist()
+        offset = 0
+
+        # Insert the image tokens and corresponding start/end tokens
+        for i in range(num_frames):
+            idx = image_token_indices[1] + offset
+            # Inserting in reverse order at the same index yields im_start, 0, im_end.
+            input_ids.insert(idx + 1, im_end_id)
+            input_ids.insert(idx + 1, 0)
+            input_ids.insert(idx + 1, im_start_id)
+            offset += 3
+
+        # Insert the video start and end tokens around the video token
+        vid_idx = image_token_indices[1] + offset
+        input_ids.insert(vid_idx + 1, vid_end_id)
+        input_ids.insert(vid_idx + 1, 0)
+        input_ids.insert(vid_idx + 1, vid_start_id)
+
+        # Drop the original placeholder now that its expansions follow it.
+        input_ids.pop(image_token_indices[1])
+        input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
+
+        return input_ids
+
+    def preprocess(self, warmup, pre_prompt, post_prompt, image, attention_mask, batch_size):
+        """Run the vision encoder and assemble the LLM inputs for the current model type.
+
+        Args:
+            warmup: when true, the profiler calls at the top are skipped.
+            pre_prompt: list of prompt strings placed before the visual tokens.
+            post_prompt: list of prompt strings placed after the visual tokens; for the generic
+                branch `post_prompt[0]` may be None.
+            image: visual input passed to `get_visual_features`.
+            attention_mask: optional mask forwarded to `get_visual_features`.
+            batch_size: number of sequences in the batch.
+
+        Returns:
+            Tuple `(input_ids, input_lengths, ptuning_args, visual_features)`.
+
+        Raises:
+            ValueError: for 'lita'/'vita' models when `batch_size` is not 1.
+        """
+        # NOTE(review): the profiler window below is opened and closed with no work in between,
+        # so nothing is attributed to it - confirm whether the encoder call was meant to sit inside.
+        if not warmup:
+            profiler.start(self.modality.capitalize())
+
+        if not warmup:
+            profiler.stop(self.modality.capitalize())
+
+        if self.model_type == 'vila':
+            visual_features, visual_atts = self.get_visual_features(image, attention_mask)
+            # Only the first pre/post prompt is used; it is replicated across the batch.
+            input_ids = self.tokenizer_image_token(batch_size, pre_prompt[0] + post_prompt[0], self.tokenizer)
+            batch_split_prompts = self.split_prompt_by_images(input_ids)
+            first_batch_split_prompts = batch_split_prompts[0]
+            # compute prompt length + visual length
+            length = sum([ids.shape[1] for ids in first_batch_split_prompts])
+            if batch_size == 1 and len(image) > 1:
+                # mode 1: multiple image as a whole, flatten visual dims
+                length += visual_atts.shape[0] * visual_atts.shape[1]
+            else:
+                # mode 2: multiple images individually (replicate prompt for each image)
+                length += visual_atts.shape[1]
+
+            input_lengths = torch.IntTensor([length] * batch_size).to(torch.int32)
+            input_ids, ptuning_args = self.setup_fake_prompts_vila(
+                batch_size, visual_features, first_batch_split_prompts, input_lengths
+            )
+            return input_ids, input_lengths, ptuning_args, visual_features
+
+        elif self.model_type == 'lita' or self.model_type == 'vita':
+            visual_input = []
+            # NOTE(review): each iteration overwrites `visual_features`, so only the result for the
+            # last element of `image` is used after the loop - confirm this is intended.
+            for i, img in enumerate(image):
+                visual_features, visual_atts = self.get_visual_features(img, attention_mask)
+            visual_features = visual_features.unsqueeze(0)
+            im_tokens, vid_tokens, num_sample_frames = self.preprocess_lita_visual(visual_features, self.nemo_config)
+            visual_input.extend([im_tokens, vid_tokens])
+
+            input_ids = self.tokenizer_image_token(batch_size, pre_prompt[0] + post_prompt[0], self.tokenizer)
+            input_ids = self.insert_tokens_by_index(input_ids, num_sample_frames)
+            batch_splits = self.split_prompt_by_images(input_ids)
+            first_batch_split_prompts = batch_splits[0]
+            length = sum([ids.shape[1] for ids in first_batch_split_prompts])
+
+            # Update visual atts shape to match im_tokens shape and vid_tokens shape
+            im_tokens = im_tokens.view(1, -1, im_tokens.shape[-1])
+            visual_features = torch.cat([im_tokens, vid_tokens], dim=1)
+            visual_atts = torch.ones(visual_features.size()[:-1], dtype=torch.long).to(image.device)
+
+            if batch_size == 1:
+                length += visual_atts.shape[0] * visual_atts.shape[1]
+            else:
+                raise ValueError("Batch size greater than 1 is not supported for LITA and VITA models")
+
+            input_lengths = torch.IntTensor([length] * batch_size).to(torch.int32)
+            # `visual_input` still holds the un-flattened [im_tokens, vid_tokens] pair.
+            input_ids, ptuning_args = self.setup_fake_prompts_vila(
+                batch_size, visual_input, first_batch_split_prompts, input_lengths
+            )
+            return input_ids, input_lengths, ptuning_args, visual_features
+        else:
+            # Generic path: tokenize pre/post prompts directly and add the visual token count.
+            visual_features, visual_atts = self.get_visual_features(image, attention_mask)
+            pre_input_ids = self.tokenizer(pre_prompt, return_tensors="pt", padding=True).input_ids
+            if post_prompt[0] is not None:
+                post_input_ids = self.tokenizer(post_prompt, return_tensors="pt", padding=True).input_ids
+                if self.model_type == 'video-neva':
+                    # Two visual dimensions contribute to the length for video input.
+                    length = (
+                        pre_input_ids.shape[1] + post_input_ids.shape[1] + visual_atts.shape[2] * visual_atts.shape[1]
+                    )
+                else:
+                    length = pre_input_ids.shape[1] + post_input_ids.shape[1] + visual_atts.shape[1]
+            else:
+                post_input_ids = None
+                length = pre_input_ids.shape[1] + visual_atts.shape[1]
+
+        # Only reached from the generic branch; the other branches return early.
+        input_lengths = torch.IntTensor([length] * batch_size).to(torch.int32)
+
+        input_ids, ptuning_args = self.setup_fake_prompts(
+            visual_features, pre_input_ids, post_input_ids, input_lengths
+        )
+
+        return input_ids, input_lengths, ptuning_args, visual_features
+
+    @staticmethod
+    def tokenizer_image_token(batch_size, prompt, tokenizer, image_token_index=-200):
+        """Tokenize `prompt`, writing id 0 at every position where an `<image>` placeholder stood.
+
+        The prompt is split on the literal `<image>` placeholder, each text chunk is tokenized on
+        its own, and the chunks are re-joined with a single placeholder id between them. A leading
+        BOS token produced by the tokenizer is kept once at the front and removed from every chunk.
+
+        Args:
+            batch_size: number of rows in the returned tensor (the same ids repeated).
+            prompt: prompt text containing zero or more `<image>` placeholders.
+            tokenizer: callable returning an object with an `input_ids` list and exposing
+                `bos_token_id`.
+            image_token_index: temporary id used for the placeholder before it is rewritten to 0.
+
+        Returns:
+            A `torch.long` tensor with `batch_size` rows.
+        """
+        # Splitting on an empty separator raises ValueError, so the placeholder must be spelled out.
+        prompt_chunks = [tokenizer(chunk).input_ids for chunk in prompt.split("<image>")]
+
+        def insert_separator(X, sep):
+            # Interleave `sep` between the elements of X (the trailing separator is dropped).
+            return [ele for sublist in zip(X, [sep] * len(X)) for ele in sublist][:-1]
+
+        input_ids = []
+        offset = 0
+        if len(prompt_chunks) > 0 and len(prompt_chunks[0]) > 0 and prompt_chunks[0][0] == tokenizer.bos_token_id:
+            offset = 1
+            input_ids.append(prompt_chunks[0][0])
+
+        # The separator is `offset + 1` ids long so that slicing off `offset` leaves exactly one.
+        for x in insert_separator(prompt_chunks, [image_token_index] * (offset + 1)):
+            input_ids.extend(x[offset:])
+
+        input_ids = torch.tensor(input_ids, dtype=torch.long)
+        input_ids[input_ids == image_token_index] = 0
+        input_ids = input_ids.unsqueeze(0).expand(batch_size, -1)
+
+        return input_ids
+
+    def split_prompt_by_images(self, tensor):
+        """Split every row of `tensor` into the segments that lie between zero-valued entries.
+
+        Returns:
+            A list with one entry per row. Each entry is a list of non-empty tensors, each with a
+            leading dimension of 1 added by `unsqueeze(0)`; the zero entries themselves are dropped.
+        """
+        batch_splits = []
+        for batch in tensor:
+            # Find indices where value is zero (the image placeholder id)
+            zero_indices = (batch == 0).nonzero(as_tuple=False).squeeze(0)
+            # Add starting point for slicing
+            start_idx = 0
+            splits = []
+            for idx in zero_indices:
+                if start_idx != idx:  # Ensure not slicing zero-length tensors
+                    splits.append(batch[start_idx:idx].unsqueeze(0))
+                start_idx = idx + 1  # Move start index past the zero
+            if start_idx < len(batch):  # Handle last segment if it's not zero-ending
+                splits.append(batch[start_idx:].unsqueeze(0))
+            # Remove empty tensors resulting from consecutive zeros
+            splits = [split for split in splits if split.numel() > 0]
+            batch_splits.append(splits)
+
+        return batch_splits
+
+    def generate(
+        self,
+        pre_prompt,
+        post_prompt,
+        image,
+        decoder_input_ids,
+        max_new_tokens,
+        attention_mask,
+        warmup,
+        batch_size,
+        top_k,
+        top_p,
+        temperature,
+        repetition_penalty,
+        num_beams,
+        lora_uids=None,
+    ):
+        """Preprocess the inputs, run the LLM and decode the generated tokens.
+
+        `decoder_input_ids` is accepted but not used in this method.
+
+        Returns:
+            None when `warmup` is true or on MPI ranks other than 0. On rank 0, a list with one
+            entry per batch item, each a list of `num_beams` decoded strings with surrounding
+            whitespace stripped.
+        """
+        if not warmup:
+            profiler.start("Generate")
+
+        input_ids, input_lengths, ptuning_args, visual_features = self.preprocess(
+            warmup, pre_prompt, post_prompt, image, attention_mask, batch_size
+        )
+
+        # Warmup only exercises preprocessing (including the vision encoder call).
+        if warmup:
+            return None
+
+        profiler.start("LLM")
+        end_id = self.tokenizer.eos_token_id
+
+        # Add a leading dimension to the prompt table before handing it to the runner.
+        ptuning_args[0] = torch.stack([ptuning_args[0]])
+        output_ids = self.model.generate(
+            input_ids,
+            sampling_config=None,
+            prompt_table=ptuning_args[0],
+            max_new_tokens=max_new_tokens,
+            end_id=end_id,
+            # Fall back to the first special id when the tokenizer defines no pad token.
+            pad_id=(
+                self.tokenizer.pad_token_id
+                if self.tokenizer.pad_token_id is not None
+                else self.tokenizer.all_special_ids[0]
+            ),
+            top_k=top_k,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            num_beams=num_beams,
+            output_sequence_lengths=False,
+            lora_uids=lora_uids,
+            return_dict=False,
+        )
+
+        profiler.stop("LLM")
+
+        if tensorrt_llm.mpi_rank() == 0:
+            # Extract a list of tensors of shape beam_width x output_ids.
+            # The slice drops the first `input_lengths[batch_idx]` positions, i.e. the prompt.
+            output_beams_list = [
+                self.tokenizer.batch_decode(
+                    output_ids[batch_idx, :, input_lengths[batch_idx] :], skip_special_tokens=True
+                )
+                for batch_idx in range(batch_size)
+            ]
+
+            stripped_text = [
+                [output_beams_list[batch_idx][beam_idx].strip() for beam_idx in range(num_beams)]
+                for batch_idx in range(batch_size)
+            ]
+            profiler.stop("Generate")
+            return stripped_text
+        else:
+            profiler.stop("Generate")
+            return None
+
+    def get_visual_features(self, image, attention_mask):
+        """Run the visual encoder session on `image` and return its embeddings with an all-ones mask.
+
+        Args:
+            image: input tensor; it is cast to the vision precision before being fed to the engine.
+            attention_mask: optional tensor bound to the engine input `attention_mask`.
+
+        Returns:
+            Tuple `(image_embeds, image_atts)` where `image_embeds` is the engine output named
+            `output` and `image_atts` is a `torch.long` tensor of ones shaped like `image_embeds`
+            without its last dimension.
+        """
+        visual_features = {'input': image.to(tensorrt_llm._utils.str_dtype_to_torch(self.vision_precision))}
+        if attention_mask is not None:
+            visual_features['attention_mask'] = attention_mask
+        # Describe the input tensors so the session can infer the output shapes.
+        tensor_info = [TensorInfo('input', str_dtype_to_trt(self.vision_precision), image.shape)]
+        if attention_mask is not None:
+            tensor_info.append(TensorInfo('attention_mask', trt.DataType.INT32, attention_mask.shape))
+
+        visual_output_info = self.visual_encoder_session.infer_shapes(tensor_info)
+
+        # Allocate one output buffer per inferred output, on the same device as the input.
+        visual_outputs = {
+            t.name: torch.empty(tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device=image.device)
+            for t in visual_output_info
+        }
+
+        ok = self.visual_encoder_session.run(visual_features, visual_outputs, self.stream.cuda_stream)
+        assert ok, "Runtime execution failed for vision encoder session"
+        # Wait for the engine to finish before the outputs are read.
+        self.stream.synchronize()
+
+        image_embeds = visual_outputs['output']
+        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)
+
+        return image_embeds, image_atts
+
+    def setup_fake_prompts(self, visual_features, pre_input_ids, post_input_ids, input_lengths):
+        """Build input ids in which the visual tokens are represented by ids at or above `vocab_size`.
+
+        Returns:
+            Tuple `(input_ids, ptuning_args)`; `input_ids` is an int32 tensor and `ptuning_args` is
+            the list returned by `ptuning_setup`.
+        """
+        # Assemble fake prompt ids that stand in for the image embeddings
+        # (presumably resolved through the prompt table by the runtime - confirm).
+        if hasattr(self, 'num_frames') and (visual_features.shape[1] == self.num_frames):
+            # Merge the dimensions between the first and last into one.
+            visual_features = visual_features.view(visual_features.shape[0], -1, visual_features.shape[-1])
+
+        # One consecutive id per visual token, starting right after the real vocabulary.
+        fake_prompt_id = torch.arange(
+            self.model_config.vocab_size,
+            self.model_config.vocab_size + visual_features.shape[0] * visual_features.shape[1],
+        )
+        fake_prompt_id = fake_prompt_id.reshape(visual_features.shape[0], visual_features.shape[1])
+
+        if post_input_ids is not None:
+            input_ids = [pre_input_ids, fake_prompt_id, post_input_ids]
+        else:
+            # Without a post prompt the visual ids come first, followed by the pre prompt.
+            input_ids = [fake_prompt_id, pre_input_ids]
+        input_ids = torch.cat(input_ids, dim=1).contiguous().to(torch.int32)
+
+        ptuning_args = self.ptuning_setup(visual_features, input_ids, input_lengths)
+
+        return input_ids, ptuning_args
+
+    def setup_fake_prompts_vila(self, batch_size, visual_features, split_input_ids, input_lengths):
+        """Interleave text segments with fake prompt ids for VILA, LITA and VITA models.
+
+        Args:
+            batch_size: number of sequences; the result is reshaped to `batch_size` rows.
+            visual_features: for 'vila' an iterable of per-image features; for 'lita'/'vita' a
+                two-element list `[image_tokens, video_tokens]`.
+            split_input_ids: text segments of the prompt, in order, as returned for one row by
+                `split_prompt_by_images`.
+            input_lengths: per-sequence lengths forwarded to `ptuning_setup`.
+
+        Returns:
+            Tuple `(input_ids, ptuning_args)`; `input_ids` is an int32 tensor with `batch_size` rows.
+        """
+
+        if self.model_type == 'lita' or self.model_type == 'vita':
+            # Turn the image tokens into one list entry per element of their second dimension and
+            # append the video tokens as the final entry.
+            squeeze_img_tokens = visual_features[0].squeeze(0)
+            reshape_img_tokens = [t.unsqueeze(0) for t in squeeze_img_tokens]
+            visual_features = reshape_img_tokens + [visual_features[1]]
+
+        # Fake ids are handed out consecutively, starting right after the real vocabulary.
+        fake_prompt_counter = self.model_config.vocab_size
+        if batch_size == 1:
+            # only check for multi-image inference (mode 1)
+            assert len(visual_features) <= len(
+                split_input_ids
+            ), "Unexpected number of visual features. Please check #<image> in prompt and the #image files."
+
+        input_ids = []
+        if batch_size == 1:
+            input_ids = [split_input_ids[0]]
+
+            if self.model_type == 'vila':
+                # mode 1: multiple image as a whole, concat all prompts together,
+                # <pre><image1><inter><image2>...<post>
+                for idx, visual_feature in enumerate(visual_features):
+                    fake_prompt_id = torch.arange(fake_prompt_counter, fake_prompt_counter + visual_feature.shape[0])
+                    fake_prompt_counter += visual_feature.shape[0]
+                    fake_prompt_id = fake_prompt_id.unsqueeze(0)
+                    input_ids.append(fake_prompt_id)
+
+                    # in case no post prompt
+                    if len(split_input_ids) > idx + 1:
+                        input_ids.append(split_input_ids[idx + 1])
+            elif self.model_type == 'lita' or self.model_type == 'vita':
+                for idx, visual_f in enumerate(visual_features):
+                    # These entries keep a leading dimension of 1, so the token count is shape[1].
+                    fake_prompt_id = torch.arange(fake_prompt_counter, fake_prompt_counter + visual_f.shape[1])
+                    fake_prompt_id = fake_prompt_id.reshape(visual_f.shape[1])
+                    fake_prompt_counter += visual_f.shape[1]
+                    fake_prompt_id = fake_prompt_id.unsqueeze(0)
+                    input_ids.append(fake_prompt_id)
+
+                    # in case no post prompt
+                    if len(split_input_ids) > idx + 1:
+                        input_ids.append(split_input_ids[idx + 1])
+
+        elif batch_size > 1 and self.model_type == 'vila':
+            # mode 2: each image have individual prompt, <pre><image><post>
+            for idx, visual_feature in enumerate(visual_features):
+                input_ids.append(split_input_ids[0])
+                fake_prompt_id = torch.arange(fake_prompt_counter, fake_prompt_counter + visual_feature.shape[0])
+                fake_prompt_counter += visual_feature.shape[0]
+                fake_prompt_id = fake_prompt_id.unsqueeze(0)
+                input_ids.append(fake_prompt_id)
+                if len(split_input_ids) > 1:
+                    input_ids.append(split_input_ids[1])
+
+        input_ids = torch.cat(input_ids, dim=1).contiguous().to(torch.int32)
+        input_ids = input_ids.reshape(batch_size, -1)
+        ptuning_args = self.ptuning_setup(visual_features, input_ids, input_lengths)
+        return input_ids, ptuning_args
+
+    def preprocess_lita_visual(self, visual_features, config):
+        """Derive the two token groups used by LITA/VITA from the encoder output.
+
+        Args:
+            visual_features: 4-D features, unpacked below as `(b, t, s, d)`.
+            config: configuration mapping; `config['mm_cfg']['lita']` selects the token layout.
+
+        Returns:
+            For `visual_token_format == 'im_vid_start_end'`: `(im_features, vid_features,
+            num_image_frames)`, i.e. the features of the sampled frames, the mean over the `s`
+            dimension for every frame, and the number of sampled frames.
+            For `lita_video_arch == 'temporal_spatial_pool'`: `(t_tokens, s_tokens, pool_size**2)`.
+
+        Raises:
+            ValueError: if neither layout is configured.
+        """
+
+        b, t, s, d = visual_features.shape
+        lita_cfg = config['mm_cfg']['lita']
+
+        num_frames = t
+        if 'visual_token_format' in lita_cfg and lita_cfg['visual_token_format'] == 'im_vid_start_end':
+            # Sample at most `sample_frames` frames at evenly spaced (rounded) positions.
+            num_image_frames = min(num_frames, lita_cfg['sample_frames'])
+            idx = np.round(np.linspace(0, num_frames - 1, num_image_frames)).astype(int)
+
+            # Image and video features
+            im_features = visual_features[:, idx, ...]
+
+            vid_features = einops.reduce(visual_features, 'b t s d -> b t d', 'mean')
+            return im_features, vid_features, num_image_frames
+
+        elif 'lita_video_arch' in lita_cfg and lita_cfg['lita_video_arch'] == 'temporal_spatial_pool':
+            pool_size = 2
+            selected_frames = np.round(np.linspace(0, visual_features.shape[1] - 1, pool_size * pool_size)).astype(int)
+            s_tokens = visual_features[:, selected_frames, ...]
+            # The `s` dimension is unfolded into a fixed 16 x 16 grid before average pooling.
+            s_tokens = einops.rearrange(s_tokens, 'b t (h w) d -> (b t) d h w', h=16, w=16)
+            s_tokens = F.avg_pool2d(s_tokens, kernel_size=pool_size)
+            s_tokens = einops.rearrange(s_tokens, '(b t) d h w -> b (t h w) d', b=b)
+
+            t_tokens = einops.reduce(visual_features, 'b t s d -> b t d', 'mean')
+
+            return t_tokens, s_tokens, pool_size**2
+
+        else:
+            # Use .get(): the key may be absent here, and indexing would raise KeyError instead of
+            # the intended ValueError.
+            raise ValueError(f'Invalid visual token format: {lita_cfg.get("visual_token_format")}')
+
+    def ptuning_setup(self, prompt_table, input_ids, input_lengths):
+        """Prepare the prompt-table arguments passed to the LLM runner.
+
+        Args:
+            prompt_table: visual embeddings, or None. For 'lita'/'vita' models a list of tensors
+                that is concatenated along dim 1 first.
+            input_ids: token ids; only the shape is used, when input padding is kept.
+            input_lengths: per-sequence lengths; their sum sizes `tasks` when padding is removed.
+
+        Returns:
+            List `[prompt_table, tasks, task_vocab_size]`, all placed on the GPU.
+        """
+        # Expected embedding width: per-rank hidden size times the tensor-parallel size.
+        hidden_size = self.model_config.hidden_size * self.runtime_mapping.tp_size
+
+        if self.model_type == 'lita' or self.model_type == 'vita':
+            prompt_table = torch.cat(prompt_table, dim=1)
+        if prompt_table is not None:
+            task_vocab_size = torch.tensor(
+                [prompt_table.shape[1]],
+                dtype=torch.int32,
+            ).cuda()
+            # Flatten the first two dimensions into a single row axis.
+            prompt_table = prompt_table.view((prompt_table.shape[0] * prompt_table.shape[1], prompt_table.shape[2]))
+
+            assert prompt_table.shape[1] == hidden_size, "Prompt table dimensions do not match hidden size"
+
+            prompt_table = prompt_table.cuda().to(
+                dtype=tensorrt_llm._utils.str_dtype_to_torch(self.model_config.dtype)
+            )
+        else:
+            prompt_table = torch.empty([1, hidden_size]).cuda()
+            # NOTE(review): created with the default dtype, unlike the int32 tensor above - confirm.
+            task_vocab_size = torch.zeros([1]).cuda()
+
+        if self.model_config.remove_input_padding:
+            tasks = torch.zeros([torch.sum(input_lengths)], dtype=torch.int32).cuda()
+        else:
+            tasks = torch.zeros(input_ids.shape, dtype=torch.int32).cuda()
+
+        return [prompt_table, tasks, task_vocab_size]
+
+    def expand2square_pt(self, images, background_color):
+        """Pad a batch of images to a square canvas filled with `background_color`.
+
+        Args:
+            images: batch whose last two dimensions are read as `(height, width)`.
+            background_color: per-channel fill values, converted with `torch.Tensor(...)`.
+
+        Returns:
+            `images` unchanged when already square; otherwise a new tensor whose side equals the
+            longer edge, with the input centred along the shorter edge.
+        """
+        height, width = images.shape[-2:]
+        b = len(images)
+        background_color = torch.Tensor(background_color)
+        if width == height:
+            return images
+        elif width > height:
+            # Wider than tall: fill a width x width canvas and paste into the middle rows.
+            result = einops.repeat(background_color, 'c -> b c h w', b=b, h=width, w=width).clone()
+            paste_start = (width - height) // 2
+            paste_end = paste_start + height
+            result[:, :, paste_start:paste_end, :] = images
+            return result
+        else:
+            # Taller than wide: fill a height x height canvas and paste into the middle columns.
+            result = einops.repeat(background_color, 'c -> b c h w', b=b, h=height, w=height).clone()
+            paste_start = (height - width) // 2
+            paste_end = paste_start + width
+            result[:, :, :, paste_start:paste_end] = images
+            return result
+
+    def load_video(self, config, video_path, processor, num_frames=None):
+        """Load video frames from a file path or numpy array and run them through `preprocess_frames`.
+
+        Args:
+            config: configuration mapping forwarded to `preprocess_frames`.
+            video_path: path string (decoded with decord) or numpy array of frames.
+            processor: image processor forwarded to `preprocess_frames`.
+            num_frames: for path input, the number of evenly spaced frames to read; None reads all.
+
+        Returns:
+            The value returned by `preprocess_frames`.
+        """
+        # NOTE(review): `frames` stays None for any other input type and is still passed on below.
+        frames = None
+        if isinstance(video_path, str):
+            # `decord` is the optional module-level import; it is undefined if that import failed.
+            decord.bridge.set_bridge('torch')
+            video_reader = decord.VideoReader(uri=video_path)
+            if num_frames is not None:
+                idx = np.round(np.linspace(0, len(video_reader) - 1, num_frames)).astype(int)
+                frames = video_reader.get_batch(idx)
+            else:
+                # NOTE(review): torch.cat joins along the existing first dimension rather than
+                # stacking frames on a new one, and `.asnumpy()` is called although the torch
+                # bridge was selected above - confirm this path behaves as intended.
+                frames = torch.cat([torch.tensor(f.asnumpy()) for f in video_reader])
+        elif isinstance(video_path, np.ndarray):
+            frames = torch.tensor(video_path, dtype=torch.float32)
+
+        return self.preprocess_frames(frames, config, processor)
+
+    def preprocess_frames(self, frames, config, processor):
+        """Move channels ahead of height/width, optionally pad to square, then run `processor`.
+
+        Returns:
+            The `pixel_values` entry of the processor output.
+        """
+        channels_first = einops.rearrange(frames, 't h w c -> t c h w')
+
+        pad_to_square = config['data']['image_aspect_ratio'] == 'pad'
+        if pad_to_square:
+            # Fill colour is the processor's per-channel mean scaled to the 0-255 range.
+            fill_color = tuple(int(x * 255) for x in processor.image_mean)
+            channels_first = self.expand2square_pt(channels_first, fill_color)
+
+        return processor.preprocess(channels_first, return_tensors='pt')['pixel_values']
+
+    def get_num_sample_frames(self, config, vid_len):
+        """Return how many frames to sample from a video that is `vid_len` frames long.
+
+        With `visual_token_format == 'im_vid_start_end'` the count is capped by
+        `config['data']['num_frames']` through an integer subsampling factor; in every other case
+        the configured `sample_frames` value is returned.
+        """
+        uses_start_end_format = (
+            'visual_token_format' in config['mm_cfg']['lita']
+            and config['mm_cfg']['lita']['visual_token_format'] == 'im_vid_start_end'
+        )
+        if not uses_start_end_format:
+            return config['mm_cfg']['lita']['sample_frames']
+
+        max_frames = config['data']['num_frames']
+        if vid_len <= max_frames:
+            return vid_len
+
+        # Smallest integer stride that brings the frame count down to at most `max_frames`.
+        subsample = int(np.ceil(float(vid_len) / max_frames))
+        return int(np.round(float(vid_len) / subsample))
+
+    def process_lita_video(self, nemo_config, video_path, image_processor):
+        """Load a video for LITA and return it as a bfloat16 batch on `self.device`.
+
+        For a path string the number of sampled frames is chosen by `get_num_sample_frames`; a
+        numpy array is loaded without an explicit frame count. Any other input yields None.
+        """
+        if isinstance(video_path, str):
+            vid_len = len(decord.VideoReader(video_path))
+            num_sample_frames = self.get_num_sample_frames(nemo_config, vid_len)
+            frames = self.load_video(nemo_config, video_path, image_processor, num_sample_frames)
+        elif isinstance(video_path, np.ndarray):
+            frames = self.load_video(nemo_config, video_path, image_processor)
+        else:
+            return None
+
+        # Add the batch dimension and move to the runner's device in bfloat16.
+        return frames.unsqueeze(0).to(self.device, dtype=torch.bfloat16)
+
+    def process_image(self, image_file, image_processor, nemo_config, image_folder):
+        """Open (if needed), resize and preprocess a single image.
+
+        Args:
+            image_file: a file name/path string, or an already loaded image object.
+            image_processor: processor providing `image_mean` and `preprocess`.
+            nemo_config: configuration mapping; supplies the crop size and the aspect-ratio mode.
+            image_folder: optional directory that `image_file` is joined onto when it is a string.
+
+        Returns:
+            The first element of the processor's `pixel_values` output.
+        """
+        if isinstance(image_file, str):
+            if image_folder is not None:
+                image = Image.open(os.path.join(image_folder, image_file)).convert("RGB")
+            else:
+                image = Image.open(image_file).convert("RGB")
+        else:
+            # image is stored in bytearray
+            image = image_file
+
+        crop_size = nemo_config['mm_cfg']['vision_encoder']['crop_size']
+        crop_size = tuple(crop_size)
+        image = image.resize(crop_size)
+        if nemo_config['data']['image_aspect_ratio'] == 'pad':
+            # NOTE(review): `expand2square_pt` reads `images.shape`, but `image` here comes from
+            # `Image.open(...)`/`resize(...)` - confirm this branch works for PIL images.
+            image = self.expand2square_pt(image, tuple(int(x * 255) for x in image_processor.image_mean))
+            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
+        else:
+            image = image_processor.preprocess(image, return_tensors="pt")["pixel_values"][0]
+        return image
+
+    def process_vila_img(self, images):
+        """Preprocess every image; return one stacked tensor when all results share a shape, else the list."""
+        processed = [self.process_image(item, self.image_processor, self.nemo_config, None) for item in images]
+
+        shapes_agree = all(entry.shape == processed[0].shape for entry in processed)
+        return torch.stack(processed, dim=0) if shapes_agree else processed
+
+    def setup_inputs(self, input_text, raw_image, batch_size):
+        attention_mask = None  # never populated in this method; returned as None
+        image = None
+        # Build the prompts and image tensor for self.model_type (see the return statement for the tuple layout).
+        if self.model_type == "neva":
+            image_size = self.image_size
+            dtype = torch.float32
+            transform = transforms.Compose(
+                [
+                    transforms.Resize((image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC),
+                    transforms.ToTensor(),
+                    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
+                ]
+            )
+            image = transform(raw_image).to(dtype).unsqueeze(0)
+
+            if input_text is None:
+                input_text = "Hi! What is in this image?"
+
+            pre_prompt = "System\n\nUser\n"
+            post_prompt = f"\n{input_text}\nAssistant\n"
+        elif self.model_type == "video-neva":
+            image = self.video_preprocess(raw_image)  # shape (1, num_frames, 3, H, W)
+
+            if input_text is None:
+                input_text = "Hi! What is in this video?"
+
+            # SteerLM prompt template
+            pre_prompt = (
+                "System\nA chat between a curious user and an artificial intelligence assistant. "
+                "The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n"
+                "User"
+            )
+            post_prompt = (
+                f"\n{input_text}\nAssistant\n"
+                "quality:4,toxicity:0,humor:0,creativity:0,helpfulness:4,"
+                "correctness:4,coherence:4,complexity:4,verbosity:4\n"
+            )
+        elif self.model_type in ['vila', 'lita', 'vita']:
+            if self.model_type == "vila" or self.model_type == "lita":
+                pre_prompt = (
+                    "A chat between a curious user and an artificial intelligence assistant. "
+                    "The assistant gives helpful, detailed, and polite answers to the user's questions. USER: "
+                )
+                if input_text is None:
+                    input_text = "\n Please elaborate what you see in the images?"
+                post_prompt = input_text + " ASSISTANT:"
+
+            elif self.model_type == "vita":
+                # llama3 prompt template
+                pre_prompt = (
+                    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
+                    "You are a helpful language and vision assistant. "
+                    "You are able to understand the visual content that the user provides, "
+                    "and assist the user with a variety of tasks using natural language. "
+                    "<|start_header_id|>user<|end_header_id|>\n\n"
+                )
+                if input_text is None:
+                    input_text = "\n Please elaborate what you see in the images?"
+                post_prompt = input_text + "<|start_header_id|>assistant<|end_header_id|>\n\n"
+
+        else:
+            raise RuntimeError(f"Invalid model type {self.model_type}")
+
+        if self.model_type == 'lita' or self.model_type == 'vita':
+            image = self.process_lita_video(self.nemo_config, raw_image, self.image_processor)
+
+        if self.model_type == 'vila':
+            raw_image = [raw_image] * batch_size  # same image repeated once per batch entry
+            image = self.process_vila_img(raw_image)
+
+        # Repeat inputs to match batch size
+        pre_prompt = [pre_prompt] * batch_size
+        post_prompt = [post_prompt] * batch_size
+        if self.model_type not in ['vila', 'lita', 'vita']:
+            if image.dim() == 5:
+                image = image.expand(batch_size, -1, -1, -1, -1).contiguous()
+            else:
+                image = image.expand(batch_size, -1, -1, -1).contiguous()
+        image = image.to(self.device)  # NOTE(review): image may still be None for lita/vita here
+
+        decoder_input_ids = None  # always None for the model types handled here
+
+        return input_text, pre_prompt, post_prompt, image, decoder_input_ids, attention_mask
+
+    def run(
+        self,
+        input_text,
+        input_image,
+        max_new_tokens,
+        batch_size,
+        top_k,
+        top_p,
+        temperature,
+        repetition_penalty,
+        num_beams,
+        lora_uids=None,
+        run_profiling=False,
+        check_accuracy=False,
+    ):
+        """Prepare inputs, run one warmup generation, then generate and return the output text.
+
+        Generation is repeated ``self.profiling_iterations`` times when ``run_profiling``
+        is set and once otherwise; the text from the last iteration is returned.
+        Rank 0 passes the result to ``print_result``.
+        """
+        prepared = self.setup_inputs(input_text, input_image, batch_size)
+        input_text, pre_prompt, post_prompt, processed_image, decoder_input_ids, attention_mask = prepared
+
+        # Everything except ``warmup`` is shared by the warmup call and the measured calls.
+        positional = (pre_prompt, post_prompt, processed_image, decoder_input_ids, max_new_tokens)
+        sampling = dict(
+            batch_size=batch_size,
+            top_k=top_k,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            num_beams=num_beams,
+            lora_uids=lora_uids,
+        )
+
+        # The warmup result is discarded.
+        self.generate(
+            *positional,
+            attention_mask=attention_mask,
+            warmup=True,
+            **sampling,
+        )
+        iterations = self.profiling_iterations if run_profiling else 1
+        for _ in range(iterations):
+            output_text = self.generate(
+                *positional,
+                attention_mask=attention_mask,
+                warmup=False,
+                **sampling,
+            )
+
+        if self.runtime_rank == 0:
+            self.print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy)
+        return output_text
+
+    def print_result(self, input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy):
+        if not run_profiling and not check_accuracy:
+            return  # nothing is logged unless profiling or accuracy checking was requested
+        logger.info("---------------------------------------------------------")
+        if self.model_type != 'nougat':
+            logger.info(f"\n[Q] {input_text}")
+        logger.info(f"\n[A] {output_text[0]}")
+
+        if num_beams == 1:
+            output_ids = self.tokenizer(output_text[0][0], add_special_tokens=False)['input_ids']
+            logger.info(f"Generated {len(output_ids)} tokens")
+
+        if check_accuracy:
+            for i in range(batch_size - 1):
+                if not (output_text[i] == output_text[i + 1]):
+                    logger.info(f"Output {i} and {i + 1} do not match")
+                    assert False  # NOTE(review): assert statements are stripped under python -O
+
+                assert 'robot' in output_text[0][0].lower()  # NOTE(review): in-loop, skipped when batch_size == 1
+
+        if run_profiling:
+            msec_per_batch = lambda name: 1000 * profiler.elapsed_time_in_sec(name) / self.profiling_iterations
+            logger.info('Latencies per batch (msec)')
+            logger.info(f'TRT {self.modality} encoder: %.1f' % (msec_per_batch(self.modality.capitalize())))
+            logger.info('TRTLLM LLM generate: %.1f' % (msec_per_batch('LLM')))
+            logger.info('Multimodal generate: %.1f' % (msec_per_batch('Generate')))
+
+        logger.info("---------------------------------------------------------")
+
+    def load_test_media(self, input_media):
+        """Return the media object for a test run, depending on ``self.model_type``.
+
+        video-neva/lita/vita get ``input_media`` back unchanged; neva/vila get it opened as an RGB image.
+        """
+        if self.model_type in ("video-neva", "lita", "vita"):
+            return input_media
+        if self.model_type in ("neva", "vila"):
+            return Image.open(input_media).convert('RGB')
+        raise RuntimeError(f"Invalid model type {self.model_type}")
+
+
+class SpeechllmModelRunner(MultimodalModelRunner):
+    def __init__(self, perception_engine_dir, llm_engine_dir, modality):
+        """Initialize the SALM runner: parent setup, feature extractor, then the perception encoder.
+
+        perception_engine_dir: path to the perception engine directory; it should contain
+                               config.json nemo_config.yaml
+                               perception_encoder.engine : tensorrt engine
+                               feature_extractor.ts  : torchscript model
+        llm_engine_dir: path to the LLM engine directory
+        """
+        super().__init__(perception_engine_dir, llm_engine_dir, modality)
+        assert self.model_type == 'salm'
+        # init preprocessor
+        feature_extractor_path = os.path.join(perception_engine_dir, 'feature_extractor.ts')
+        self.feature_extractor = self.init_speech_preprocessor(feature_extractor_path)
+        self.init_modality_encoder(perception_engine_dir)
+
+    def init_modality_encoder(self, engine_dir):
+        """
+        Initialize the modality encoder session from the prebuilt engine directory
+        Args:
+            engine_dir: str, path to the engine directory
+        """
+        # pick the alphabetically first .engine file so the choice does not depend on listdir order
+        engine_file = None
+        for file in sorted(os.listdir(engine_dir)):
+            if file.endswith('.engine'):
+                engine_file = file
+                break
+        assert engine_file is not None, f"Engine file not found in {engine_dir}"
+        encoder_path = os.path.join(engine_dir, engine_file)
+        logger.info(f'Loading engine from {encoder_path}')
+        with open(encoder_path, 'rb') as f:
+            engine_buffer = f.read()
+        logger.info(f'Creating session from engine {encoder_path}')
+        self.modality_encoder_session = Session.from_serialized_engine(engine_buffer)
+
+    def init_speech_preprocessor(self, feature_extractor_path):
+        feature_extractor = torch.jit.load(feature_extractor_path)  # TorchScript module
+        feature_extractor.eval()
+        return feature_extractor
+
+    def process_audio(self, input_signal, input_signal_length):
+        """
+        Args:
+            input_signal: audio signal in numpy array
+            input_signal_length: length of the audio signal in numpy array
+
+        Returns:
+            processed_signal: torch.tensor [B, 80, T]
+            processed_signal_length [B]
+        """
+        input_signal = torch.tensor(input_signal, dtype=torch.float32)
+        input_signal_length = torch.tensor(input_signal_length, dtype=torch.int32)
+        processed_signal, processed_signal_length = self.feature_extractor(input_signal, input_signal_length)
+        return processed_signal, processed_signal_length
+
+    def setup_inputs(self, input_text, input_media, batch_size):
+        """
+        Args:
+            input_text: str or List[str] or None
+            input_media: Tuple[np.array, np.array]
+                input_signal: audio signal in numpy array [b, -1]
+                input_signal_length: length of the audio signal in numpy array [b]
+            batch_size: int
+        """
+        input_signal, input_signal_length = input_media
+        processed_signal, processed_signal_length = self.process_audio(input_signal, input_signal_length)
+        processed_signal = processed_signal.to(self.device)
+        processed_signal_length = processed_signal_length.to(self.device)
+        if input_text is None:
+            input_text = "Q: what's the transcription of the audio? A:"
+
+        if isinstance(input_text, str):
+            input_text = [input_text] * batch_size
+
+        assert len(input_text) == batch_size
+        pre_prompt = [''] * batch_size
+        post_prompt = input_text
+        decoder_input_ids = None
+        attention_mask = None
+        return (
+            input_text,
+            pre_prompt,
+            post_prompt,
+            processed_signal,
+            processed_signal_length,
+            decoder_input_ids,
+            attention_mask,
+        )
+
+    def load_test_media(self, input_media_path):
+        """
+        Args:
+            input_media_path: str, path to the audio file
+        Returns:
+            input_signal: np.array [1, -1]
+            input_signal_length: np.array [1]
+        """
+        waveform, _sample_rate = sf.read(input_media_path, dtype=np.float32)
+        input_signal = np.array([waveform], dtype=np.float32)
+        input_signal_length = np.array([len(waveform)], dtype=np.int32)
+        return input_signal, input_signal_length
+
+    def get_modality_encoder_features(self, modality_features, attention_mask):
+        """
+        Do inference on the modality encoder engine
+        Args:
+            modality_features: dict {'input1': torch.tensor, 'input2': torch.tensor, ..}
+            attention_mask: None
+        Returns: dict mapping each engine output name to a tensor allocated on self.device
+        """
+        if attention_mask is not None:
+            modality_features['attention_mask'] = attention_mask
+
+        tensor_info = [TensorInfo(k, torch_dtype_to_trt(t.dtype), t.shape) for k, t in modality_features.items()]
+
+        output_info = self.modality_encoder_session.infer_shapes(tensor_info)
+
+        outputs = {
+            t.name: torch.empty(tuple(t.shape), dtype=trt_dtype_to_torch(t.dtype), device=self.device)
+            for t in output_info
+        }
+
+        ok = self.modality_encoder_session.run(modality_features, outputs, self.stream.cuda_stream)
+        assert ok, "Runtime execution failed for modality encoder session"
+        self.stream.synchronize()
+
+        return outputs
+
+    def preprocess(self, warmup, pre_prompt, post_prompt, processed_features, attention_mask, batch_size):
+        """
+        Args:
+            warmup: bool
+            pre_prompt: List[str]
+            post_prompt: List[str]
+            processed_features: Tuple[torch.tensor, torch.tensor]
+                processed_signal: torch.tensor [B, 80, T]
+                processed_signal_length: torch.tensor [B]
+            attention_mask: None
+            batch_size: int
+        Returns:
+            input_ids: torch.tensor [B, L]
+            input_lengths: torch.tensor [B]
+            ptuning_args: List[torch.tensor]
+            encoded_features: torch.tensor [B, L, D]
+        """
+        assert self.model_type == 'salm', f"Invalid model type {self.model_type}"
+
+        processed_features = {
+            "processed_signal": processed_features[0],
+            "processed_signal_length": processed_features[1].to(torch.int32),
+        }
+        # Bracket the encoder call so the per-modality profiler entry measures the actual inference.
+        if not warmup:
+            profiler.start(self.modality.capitalize())
+        encoded_outputs = self.get_modality_encoder_features(processed_features, attention_mask)
+        if not warmup:
+            profiler.stop(self.modality.capitalize())
+        encoded_features, encoded_length = encoded_outputs['encoded'], encoded_outputs['encoded_length']
+        pre_input_ids = self.tokenizer(pre_prompt).input_ids
+        post_input_ids = self.tokenizer(post_prompt).input_ids
+        input_lengths = []
+        input_ids = []
+        encoded_length = encoded_length.cpu().numpy()
+        fake_id_start = self.model.vocab_size
+        for i in range(batch_size):
+            feat_len = encoded_length[i]
+            feat_fake_ids = np.arange(fake_id_start, fake_id_start + feat_len)
+            cur_input_ids = np.concatenate([pre_input_ids[i], feat_fake_ids, post_input_ids[i]])
+            fake_id_start += feat_len
+            input_lengths.append(len(cur_input_ids))
+            input_ids.append(cur_input_ids)
+
+        max_length = max(input_lengths)
+        # convert input_ids to torch tensor with padding
+        input_ids = [
+            np.pad(ids, (0, max_length - len(ids)), 'constant', constant_values=self.tokenizer.pad_token_id)
+            for ids in input_ids
+        ]
+        input_ids = torch.tensor(input_ids, dtype=torch.int32)
+        input_lengths = torch.tensor(input_lengths, dtype=torch.int32)
+        ptuning_args = self.ptuning_setup(encoded_features, input_ids, input_lengths)
+
+        return input_ids, input_lengths, ptuning_args, encoded_features
+
+    def run(
+        self,
+        input_text,
+        input_media=None,
+        max_new_tokens: int = 30,
+        batch_size: int = 1,
+        top_k: int = 1,
+        top_p: float = 0.0,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.0,
+        num_beams: int = 1,
+        run_profiling=False,
+        check_accuracy=False,
+        input_signal=None,
+        input_signal_length=None,
+        lora_uids=None,
+    ):
+        """Run one warmup generation, then generate and return text for the given audio and prompt.
+
+        Args:
+            input_text: str or List[str] or None
+            input_media: Tuple[np.array, np.array] or None
+                input_signal: audio signal in numpy array [b, -1]
+                input_signal_length: length of the audio signal in numpy array [b]
+            max_new_tokens: int
+            batch_size: int
+            top_k: int
+            top_p: float
+            temperature: float
+            repetition_penalty: float
+            num_beams: int
+            run_profiling: bool
+            check_accuracy: bool
+            input_signal, input_signal_length: used to build input_media when input_media is None
+            lora_uids: forwarded unchanged to generate
+        """
+        if input_media is None:
+            assert input_signal is not None and input_signal_length is not None
+            input_media = (input_signal, input_signal_length)
+
+        (
+            input_text,
+            pre_prompt,
+            post_prompt,
+            processed_signal,
+            processed_signal_length,
+            decoder_input_ids,
+            attention_mask,
+        ) = self.setup_inputs(input_text, input_media, batch_size)
+        processed_media = (processed_signal, processed_signal_length)
+
+        self.generate(
+            pre_prompt,
+            post_prompt,
+            processed_media,
+            decoder_input_ids,
+            max_new_tokens,
+            attention_mask=attention_mask,
+            warmup=True,
+            batch_size=batch_size,
+            top_k=top_k,
+            top_p=top_p,
+            temperature=temperature,
+            repetition_penalty=repetition_penalty,
+            num_beams=num_beams,
+            lora_uids=lora_uids,
+        )
+        num_iters = self.profiling_iterations if run_profiling else 1
+        for _ in range(num_iters):
+            output_text = self.generate(
+                pre_prompt,
+                post_prompt,
+                processed_media,
+                decoder_input_ids,
+                max_new_tokens,
+                attention_mask=attention_mask,
+                warmup=False,
+                batch_size=batch_size,
+                top_k=top_k,
+                top_p=top_p,
+                temperature=temperature,
+                repetition_penalty=repetition_penalty,
+                num_beams=num_beams,
+                lora_uids=lora_uids,
+            )
+        if self.runtime_rank == 0:
+            self.print_result(input_text, output_text, batch_size, num_beams, run_profiling, check_accuracy)
+        return output_text
diff --git a/nemo/export/onnx_llm_exporter.py b/nemo/export/onnx_llm_exporter.py
new file mode 100755
index 000000000000..e7ce4aeb49bc
--- /dev/null
+++ b/nemo/export/onnx_llm_exporter.py
@@ -0,0 +1,465 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import warnings
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import tensorrt as trt
+import torch
+import wrapt
+from transformers import AutoModel, AutoTokenizer
+
+from nemo.deploy import ITritonDeployable
+from nemo.export.utils import get_example_inputs, get_model_device_type, is_nemo2_checkpoint, validate_fp8_network
+from nemo.utils import logging
+
+
+@wrapt.decorator
+def noop_decorator(wrapped, instance, args, kwargs):
+    """Pass-through decorator used as the fallback for PyTriton's ``batch``.
+
+    wrapt invokes this with ``(wrapped, instance, args, kwargs)`` on every call of the
+    decorated callable, so it simply forwards the call unchanged.
+    """
+    return wrapped(*args, **kwargs)
+
+
+use_pytriton = True
+batch = noop_decorator  # fallback decorator, replaced below when PyTriton can be imported
+try:
+    from pytriton.decorators import batch
+except Exception:
+    logging.warning("PyTriton is not available.")
+    use_pytriton = False
+
+
+use_onnxruntime = True  # checked by OnnxLLMExporter._load_runtime before a session is created
+try:
+    import onnxruntime
+except Exception:
+    logging.warning("onnxruntime is not available.")
+    use_onnxruntime = False
+
+
+# pylint: disable=line-too-long
+class OnnxLLMExporter(ITritonDeployable):
+    """
+    Exports models to ONNX and run fast inference.
+
+    Example:
+        from nemo.export.onnx_llm_exporter import OnnxLLMExporter
+
+        onnx_llm_exporter = OnnxLLMExporter(
+            onnx_model_dir="/path/for/onnx_model/files",
+            model_name_or_path="/path/for/model/files",
+        )
+
+        onnx_llm_exporter.export(
+            input_names=["input_ids", "attention_mask", "dimensions"],
+            output_names=["embeddings"],
+        )
+
+        output = onnx_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
+        print("output: ", output)
+    """
+
+    def __init__(
+        self,
+        onnx_model_dir: str,
+        model: Optional[torch.nn.Module] = None,
+        tokenizer=None,
+        model_name_or_path: str = None,
+        load_runtime: bool = True,
+    ):
+        """
+        Initializes the ONNX Exporter.
+
+        Args:
+            onnx_model_dir (str): path for storing the ONNX model files.
+            model (Optional[torch.nn.Module]): torch model; must be None when model_name_or_path is given.
+            tokenizer (HF or NeMo tokenizer): tokenizer class.
+            model_name_or_path (str): a path for ckpt or HF model ID
+            load_runtime (bool): load ONNX runtime if there is any exported model available in
+                                 the onnx_model_dir folder.
+        """
+        self.onnx_model_dir = onnx_model_dir
+        self.model_name_or_path = model_name_or_path
+        self.onnx_model_path = str(Path(onnx_model_dir) / "model.onnx")
+        self.model = model
+        self.tokenizer = tokenizer
+        self.model_input_names = None
+        self.model_output_names = None
+        self.onnx_runtime_session = None
+        self.calibration_data = None
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.quant_max_batch_size = None
+
+        if self.model_name_or_path is not None:
+            if model is not None:
+                raise ValueError("A model was also passed but it will be overridden.")
+            # NOTE(review): a value that is not a local directory (e.g. a hub model ID) loads nothing here.
+            if Path(self.model_name_or_path).is_dir():
+                if is_nemo2_checkpoint(self.model_name_or_path):
+                    raise NotImplementedError("NeMo 2.0 checkpoint will be supported later.")
+                else:
+                    self._load_hf_model()
+
+        if load_runtime:
+            self._load_runtime()
+
+    def _load_runtime(self):
+        """Open an onnxruntime session and the saved tokenizer when an exported model already exists."""
+        if not use_onnxruntime or not Path(self.onnx_model_path).exists():
+            return
+        self.onnx_runtime_session = onnxruntime.InferenceSession(self.onnx_model_path)
+        self.model_input_names = [entry.name for entry in self.onnx_runtime_session.get_inputs()]
+        self.model_output_names = [entry.name for entry in self.onnx_runtime_session.get_outputs()]
+        tokenizer_dir = Path(self.onnx_model_dir) / "tokenizer"
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, trust_remote_code=True)
+
+    def _load_hf_model(self):
+        """Load the Hugging Face model (switched to eval mode) and its tokenizer from ``model_name_or_path``."""
+        source = self.model_name_or_path
+        hf_model = AutoModel.from_pretrained(source, trust_remote_code=True)
+        self.model = hf_model.eval()
+
+        self.tokenizer = AutoTokenizer.from_pretrained(source, trust_remote_code=True)
+
+    def export(
+        self,
+        input_names: list,
+        output_names: list,
+        example_inputs: dict = None,
+        opset: int = 20,
+        dynamic_axes_input: Optional[dict] = None,
+        dynamic_axes_output: Optional[dict] = None,
+        export_dtype: str = "fp32",
+        verbose: bool = False,
+    ):
+        """
+        Performs ONNX conversion from a PyTorch model, then calls ``_load_runtime`` for the exported model.
+
+        Args:
+            input_names (list): input names that the exported ONNX model will use.
+            output_names (list): output names that the exported ONNX model will use.
+            example_inputs (dict): example input for the model to build the engine.
+            opset (int): ONNX opset version. Default is 20.
+            dynamic_axes_input (dict): Variable length axes for the input.
+            dynamic_axes_output (dict): Variable length axes for the output.
+            export_dtype (str): Export dtype, fp16 or fp32.
+            verbose (bool): Enable verbose or not.
+        """
+
+        self._export_to_onnx(
+            input_names=input_names,
+            example_inputs=example_inputs,
+            output_names=output_names,
+            opset=opset,
+            dynamic_axes_input=dynamic_axes_input,
+            dynamic_axes_output=dynamic_axes_output,
+            export_dtype=export_dtype,
+            verbose=verbose,
+        )
+        self._load_runtime()
+
+    def _export_to_onnx(
+        self,
+        input_names: list,
+        output_names: list,
+        example_inputs: dict = None,
+        opset: int = 20,
+        dynamic_axes_input: Optional[dict] = None,
+        dynamic_axes_output: Optional[dict] = None,
+        export_dtype: Union[torch.dtype, str] = "fp32",
+        verbose: bool = False,
+    ):
+        """Cast the model to ``export_dtype``, export it to ``self.onnx_model_path`` and save the tokenizer."""
+        if example_inputs is None:
+            example_inputs = get_example_inputs(self.tokenizer)
+
+        if "dimensions" in input_names:
+            example_inputs["dimensions"] = torch.tensor([1] * example_inputs["input_ids"].shape[0])
+
+        if isinstance(export_dtype, str):
+            export_dtype = {"fp16": torch.float16, "fp32": torch.float32}[export_dtype]
+
+        self.model.to(export_dtype)
+
+        Path(self.onnx_model_dir).mkdir(parents=True, exist_ok=True)
+
+        with torch.autocast(device_type=get_model_device_type(self.model), dtype=export_dtype):
+            torch.onnx.export(
+                model=self.model,
+                args=(example_inputs,),
+                f=self.onnx_model_path,
+                input_names=input_names,
+                output_names=output_names,
+                dynamic_axes={**(dynamic_axes_input or {}), **(dynamic_axes_output or {})},  # both default to None
+                verbose=verbose,
+                opset_version=opset,
+            )
+        logging.info(f"Successfully exported PyTorch model to ONNX model {self.onnx_model_path}")
+
+        existing_directory_path = Path(self.onnx_model_dir) / "tokenizer"
+        existing_directory_path.mkdir(exist_ok=True)
+        self.tokenizer.save_pretrained(existing_directory_path)
+
+    def export_onnx_to_trt(
+        self,
+        trt_model_dir: str,
+        profiles=None,
+        override_layernorm_precision_to_fp32: bool = False,
+        override_layers_to_fp32: List = None,
+        trt_dtype: str = "fp16",
+        profiling_verbosity: str = "layer_names_only",
+        trt_builder_flags: List[trt.BuilderFlag] = None,
+    ) -> None:
+        """Performs TensorRT conversion from an ONNX model and writes ``model.plan`` into trt_model_dir.
+
+        Args:
+            trt_model_dir: path to store the TensorRT model.
+            profiles: iterable of mappings from each network input name to its (min, opt, max) shapes.
+            override_layernorm_precision_to_fp32 (bool): whether to force LayerNorm subgraphs to fp32.
+            override_layers_to_fp32 (List): Layer name prefixes to be converted to fp32.
+            trt_dtype (str): "fp16" or "fp8" set builder flags; any other value (e.g. "fp32") sets none.
+            profiling_verbosity (str): Profiling verbosity. Default is "layer_names_only".
+            trt_builder_flags (List[trt.BuilderFlag]): TRT specific flags.
+        """
+        logging.info(f"Building TRT engine from ONNX model ({self.onnx_model_path})")
+        trt_logger = trt.Logger(trt.Logger.WARNING)
+        builder = trt.Builder(trt_logger)
+        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
+        config = builder.create_builder_config()
+        parser = trt.OnnxParser(network, trt_logger)
+
+        # we use parse_from_file() instead of parse() because it can be used for both single
+        # file models as well as externally stored models (required when model >2GiB)
+        if not parser.parse_from_file(self.onnx_model_path):
+            logging.warning("ONNX model could not be parsed")
+            for error in range(parser.num_errors):
+                logging.error(parser.get_error(error))
+            return  # NOTE(review): parse errors are only logged; no engine is built and nothing is raised
+
+        if profiles:
+            for profile in profiles:
+                optimization_profile = builder.create_optimization_profile()
+
+                for i in range(network.num_inputs):
+                    in_tensor = network.get_input(i)
+                    optimization_profile.set_shape(
+                        in_tensor.name,
+                        min=profile[in_tensor.name][0],
+                        opt=profile[in_tensor.name][1],
+                        max=profile[in_tensor.name][2],
+                    )
+
+                config.add_optimization_profile(optimization_profile)
+
+        if trt_dtype == "fp16":
+            logging.info("Setting Build Flag FP16")
+            config.set_flag(trt.BuilderFlag.FP16)
+        elif trt_dtype == "fp8":
+            # With FP8 export we want to also enable FP16 layers as a fallback instead of FP32
+            logging.info("Setting Build Flag FP8 and FP16")
+            config.set_flag(trt.BuilderFlag.FP8)
+            config.set_flag(trt.BuilderFlag.FP16)
+            validate_fp8_network(network)
+
+        # patch network
+        if override_layernorm_precision_to_fp32:
+            logging.info("Overriding TensorRT network LayerNorm precision to float32.")
+            self._override_layernorm_precision_to_fp32(network)
+
+        if override_layers_to_fp32:
+            logging.info("Overriding some layers to float32.")
+            self._override_layers_to_fp32(network, override_layers_to_fp32)
+
+        try:
+            config.profiling_verbosity = {
+                "detailed": trt.ProfilingVerbosity.DETAILED,
+                "layer_names_only": trt.ProfilingVerbosity.LAYER_NAMES_ONLY,
+                "none": trt.ProfilingVerbosity.NONE,
+            }[profiling_verbosity]
+        except KeyError:
+            error_msg = "Unknown profiling verbosity value."
+            raise ValueError(error_msg)
+        logging.info(f"Setting Profiling Verbosity to {config.profiling_verbosity}")
+
+        if trt_builder_flags is not None:
+            for flag in trt_builder_flags:
+                config.set_flag(flag)
+
+        engine_string = builder.build_serialized_network(network, config)
+        if engine_string is None:
+            raise Exception("Failed to serialize the TensorRT Engine. Please check the " "TensorRT logs for details")
+
+        trt_model_path = Path(trt_model_dir)
+        trt_model_path.mkdir(parents=True, exist_ok=True)
+        trt_model_path = trt_model_path / "model.plan"
+        trt_model_path.write_bytes(engine_string)
+        logging.info(f"Successfully exported ONNX model ({self.onnx_model_path}) " f"to TRT engine ({trt_model_path})")
+
+    def _override_layer_precision_to_fp32(self, layer: trt.ILayer) -> None:
+        layer.precision = trt.float32  # request FP32 as this layer's precision
+        layer.set_output_type(0, trt.float32)  # only output index 0 is retyped
+
+    def _override_layers_to_fp32(self, network: trt.INetworkDefinition, fp32_layer_patterns: list[str]) -> None:
+        """Force FP32 on float layers whose name starts with any prefix in ``fp32_layer_patterns``."""
+        float_types = {trt.float32, trt.float16}
+        for i in range(network.num_layers):
+            layer = network.get_layer(i)
+            layer_name = layer.name
+            prefix_hit = any(layer_name.startswith(pattern) for pattern in fp32_layer_patterns)
+            if not prefix_hit or layer.precision not in float_types:
+                continue
+            if layer.type in {trt.LayerType.CAST}:
+                logging.info(f"Skipping overriding {layer.type} layer {i} {layer_name} dtype")
+                continue
+            float_input = any(layer.get_input(idx).dtype in float_types for idx in range(layer.num_inputs))
+            if float_input:
+                # Note: Assigning to layer.precision (even the same value) sets precision_is_set=True,
+                # which prevents TensorRT from changing this layer's precision
+                layer.precision = trt.float32
+                logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) precision to FP32")
+            # Output types are retyped independently of the input check above.
+            for j in range(layer.num_outputs):
+                if layer.get_output_type(j) in float_types:
+                    layer.set_output_type(j, trt.float32)
+                    logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) output type {j} to FP32")
+
+    def _override_layernorm_precision_to_fp32(self, network: trt.INetworkDefinition) -> None:
+        """Set the precision of LayerNorm subgraphs to FP32 to preserve accuracy.
+
+        - https://nvbugs/4478448 (Mistral)
+        - https://nvbugs/3802112 (T5)
+
+        LayerNorm subgraphs are located through their POW op: every POW layer is forced to
+        FP32, and so are the REDUCE, SQRT and SUM/DIV/PROD layers found in a fixed window of
+        layers around it. The window is clamped to the valid layer range, so a POW op close
+        to the start or the end of the network never indexes outside the layer list.
+
+        Args:
+            network: tensorrt.INetworkDefinition
+        """
+        # Logic originally from OSS T5 HF export script:
+        # https://gitlab-master.nvidia.com/TensorRT/Public/oss/-/blob/77495ec/demo/HuggingFace/T5/export.py
+        # Window around each POW op: START_OFFSET layers before it (to include the residual
+        # add and cast op) and END_OFFSET - 1 layers after it (the rest of the layer norm).
+        START_OFFSET = 4
+        END_OFFSET = 12
+        # Elementwise ops inside the window that are treated as part of the layer norm.
+        layernorm_elementwise_ops = (
+            trt.ElementWiseOperation.SUM,
+            trt.ElementWiseOperation.DIV,
+            trt.ElementWiseOperation.PROD,
+        )
+
+        pow_indices = []
+        for layer_index, layer in enumerate(network):
+            if layer.type == trt.LayerType.IDENTITY:
+                # Identity layers whose outputs are all explicitly FP32 and whose first input
+                # is FP32 get their precision pinned to FP32 as well.
+                all_fp32 = all(
+                    layer.output_type_is_set(o) and layer.get_output_type(o) == trt.float32
+                    for o in range(layer.num_outputs)
+                )
+                if all_fp32 and layer.get_input(0).dtype == trt.float32:
+                    layer.precision = trt.float32
+
+            if layer.type == trt.LayerType.ELEMENTWISE:
+                # Re-class the generic layer as IElementWiseLayer before reading its `op`.
+                layer.__class__ = getattr(trt, "IElementWiseLayer")
+                if layer.op == trt.ElementWiseOperation.POW:
+                    pow_indices.append(layer_index)
+                    self._override_layer_precision_to_fp32(layer)
+
+        for index in pow_indices:
+            # Clamp the window so get_layer() is never called with an out-of-range index.
+            for i in range(max(0, index - START_OFFSET), min(network.num_layers, index + END_OFFSET)):
+                layer = network.get_layer(i)
+                if layer.type == trt.LayerType.REDUCE:
+                    self._override_layer_precision_to_fp32(layer)
+                elif layer.type == trt.LayerType.ELEMENTWISE:
+                    layer.__class__ = getattr(trt, "IElementWiseLayer")
+                    if layer.op in layernorm_elementwise_ops:
+                        self._override_layer_precision_to_fp32(layer)
+                elif layer.type == trt.LayerType.UNARY:
+                    layer.__class__ = getattr(trt, "IUnaryLayer")
+                    if layer.op == trt.UnaryOperation.SQRT:
+                        self._override_layer_precision_to_fp32(layer)
+
+    def forward(self, inputs: Union[List, Dict], dimensions: Optional[List] = None):
+        """Run inference for a given input.
+
+        Args:
+            inputs (Union[List, Dict]): Input for the model. If list, it should be a list of strings.
+                If dict, it should be a dictionary with keys as the model input names.
+            dimensions (Optional[List]): The dimensions parameter of the model. A ValueError is raised if
+                the model was exported to accept dimensions, inputs is a list of strings and this is None.
+
+        Returns:
+            np.ndarray: First model output, or None (after a warning) if there is no ONNX Runtime session.
+        """
+        if self.onnx_runtime_session is None:
+            warnings.warn("ONNX Runtime is not available. Please install the onnxruntime-gpu and try again.")
+            return None
+
+        if isinstance(inputs, list):
+            if "dimensions" in self.model_input_names and dimensions is None:
+                raise ValueError("Dimensions should be provided for list input.")
+            inputs = dict(self.tokenizer(inputs))
+            if "dimensions" in self.model_input_names:  # only feed inputs the model declares
+                inputs["dimensions"] = dimensions
+
+        output = self.onnx_runtime_session.run(self.model_output_names, inputs)
+        return output[0]
+
+    @property
+    def get_model(self):
+        """Return ``self.model``; this is a property, so it is read without call parentheses."""
+
+        return self.model
+
+    @property
+    def get_tokenizer(self):
+        """Return ``self.tokenizer``; this is a property, so it is read without call parentheses."""
+
+        return self.tokenizer
+
+    @property
+    def get_model_input_names(self):
+        """Return ``self.model_input_names``; this is a property, read without call parentheses."""
+
+        return self.model_input_names
+
+    @property
+    def get_triton_input(self):
+        """Triton input definition; not implemented yet, accessing it raises NotImplementedError."""
+
+        raise NotImplementedError("This function will be implemented later.")
+
+    @property
+    def get_triton_output(self):
+        """Triton output definition; not implemented yet, accessing it raises NotImplementedError."""
+
+        raise NotImplementedError("This function will be implemented later.")
+
+    @batch
+    def triton_infer_fn(self, **inputs: np.ndarray):
+        """PyTriton inference function; not implemented yet, calling it raises NotImplementedError."""
+
+        raise NotImplementedError("This function will be implemented later.")
diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py
new file mode 100644
index 000000000000..87812e621bb6
--- /dev/null
+++ b/nemo/export/quantize/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .quantizer import Quantizer
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
new file mode 100644
index 000000000000..f9f2f2bcbf61
--- /dev/null
+++ b/nemo/export/quantize/quantizer.py
@@ -0,0 +1,257 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tarfile
+from contextlib import nullcontext
+from typing import Callable, Optional
+
+import torch
+import torch.distributed as dist
+from megatron.core import parallel_state
+from megatron.core.transformer.module import Float16Module
+from omegaconf.omegaconf import DictConfig, open_dict
+
+from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision
+from nemo.utils import logging
+from nemo.utils.distributed import temporary_directory
+from nemo.utils.model_utils import save_artifacts, unwrap_model
+
+try:  # nvidia-modelopt is optional here; a failed import is only reported by Quantizer.__init__
+    import modelopt.torch.quantization as mtq
+    from modelopt.torch.export import export_tensorrt_llm_checkpoint
+
+    QUANT_CFG_CHOICES = {  # maps the `algorithm` config value to a modelopt quantization preset
+        "int8": mtq.INT8_DEFAULT_CFG,
+        "int8_sq": mtq.INT8_SMOOTHQUANT_CFG,
+        "fp8": mtq.FP8_DEFAULT_CFG,
+        "int4_awq": mtq.INT4_AWQ_CFG,
+        "w4a8_awq": mtq.W4A8_AWQ_BETA_CFG,
+        "int4": mtq.INT4_BLOCKWISE_WEIGHT_ONLY_CFG,
+        "nvfp4": mtq.NVFP4_DEFAULT_CFG,
+    }
+
+    HAVE_MODELOPT = True
+
+except (ImportError, ModuleNotFoundError) as e:
+    HAVE_MODELOPT = False
+    HAVE_MODELOPT_ERROR = e  # kept so Quantizer.__init__ can chain it as the cause of its RuntimeError
+
+
+SUPPORTED_DTYPE = [16, "16", "bf16"]  # Default precision for non-quantized layers
+
+
+class Quantizer:
+    """Post-training quantization (PTQ) and TRT-LLM export of Nemo checkpoints.
+
+    PTQ converts selected model layers to low-precision format (e.g., INT4, FP8) for efficient serving.
+    The process consists of several steps:
+
+        1. Loading a Nemo model from disk using appropriate parallelism strategy
+        2. Calibrating the model to obtain appropriate algorithm-specific scaling factors
+        3. Producing output directory or .qnemo tarball with model config (json),
+           quantized weights (safetensors) and tokenizer config (yaml).
+
+    The output directory (or .qnemo file) produced is intended to be consumed by TensorRT-LLM toolbox
+    for efficient inference. This can be achieved using Nemo inference containers.
+
+    Currently supported and tested model family is Llama2. Model type needs to be specified in
+    the quantization command with decoder_type parameter on exporting (see below). Quantizing other
+    model families is experimental and might not be fully supported.
+
+    Available quantization methods are listed in `QUANT_CFG_CHOICES` dictionary above.
+    Please consult Model Optimizer documentation https://nvidia.github.io/TensorRT-Model-Optimizer/ for details.
+    You can also inspect different choices in examples/nlp/language_modeling/conf/megatron_gpt_ptq.yaml
+    for quantization algorithms and calibration data as well as recommended settings.
+
+    Quantization algorithm can also be conveniently set to 'null' to perform only weights export step
+    for TensorRT-LLM deployment. This is useful for getting baseline results for a full-precision model.
+    """
+
+    def __init__(self, quantization_config: Optional[DictConfig], export_config: Optional[DictConfig]):
+        """Initialize Quantizer with quantization and export configurations.
+
+        Expected keys in `quantization_config` (None, or `algorithm` None, skips quantization setup):
+            - algorithm: str
+            - decoder_type: str
+            - awq_block_size: int (only for awq algorithms)
+            - sq_alpha: float (only for smooth quant algorithms)
+            - enable_kv_cache: bool (default: None i.e. auto-detect based on algorithm and decoder_type)
+
+        Expected keys in `export_config`:
+            - dtype: str/int
+            - decoder_type: str
+            - inference_tensor_parallel: int
+            - inference_pipeline_parallel: int
+            - save_path: str
+        """
+        import copy  # function-scope import, only needed for the preset copy below
+
+        if not HAVE_MODELOPT:
+            raise RuntimeError("nvidia-modelopt is needed to use Quantizer") from HAVE_MODELOPT_ERROR
+
+        self.quantization_config = quantization_config
+        self.export_config = export_config
+        self.quant_cfg = None
+
+        # Quantization sanity checks
+        algorithm = None if quantization_config is None else quantization_config.algorithm
+        assert algorithm is None or algorithm in QUANT_CFG_CHOICES, f"Unsupported quantization algorithm: {algorithm}"
+        if algorithm is not None:
+            # Work on a private copy: the presets are module-level modelopt objects, so editing them
+            # in place would leak these settings into every later user of the same preset.
+            quant_cfg = copy.deepcopy(QUANT_CFG_CHOICES[algorithm])
+
+            if "awq" in algorithm:
+                weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"]
+                if isinstance(weight_quantizer, list):
+                    weight_quantizer = weight_quantizer[0]
+                weight_quantizer["block_sizes"][-1] = quantization_config.awq_block_size
+
+            # Always turn on FP8 kv cache to save memory footprint.
+            # For int8_sq, we use int8 kv cache.
+            # TODO: Investigate why enabling FP8 kv cache will cause accuracy regressions for Nemotron.
+            enable_quant_kv_cache = quantization_config.get("enable_kv_cache", None)
+            if enable_quant_kv_cache is None:
+                enable_quant_kv_cache = "int8" not in algorithm and quantization_config.decoder_type != "gpt"
+            logging.info(f'{"Enabled" if enable_quant_kv_cache else "Disabled"} KV cache quantization')
+            quant_cfg["quant_cfg"]["*output_quantizer"] = {
+                "num_bits": 8 if algorithm == "int8_sq" else (4, 3),
+                "axis": None,
+                "enable": enable_quant_kv_cache,
+            }
+            if algorithm == "int8_sq":
+                logging.info(f"Using int8_sq alpha = {quantization_config.sq_alpha}")
+                quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": quantization_config.sq_alpha}
+
+            self.quant_cfg = quant_cfg
+
+        # Export sanity checks
+        if export_config is not None:
+            assert export_config.dtype in SUPPORTED_DTYPE, f"Unsupported export dtype: {export_config.dtype}"
+
+    @staticmethod
+    def _setup(model: MegatronGPTModel):
+        """Clear the activation checkpointing method and set up the strategy environment if needed."""
+        try:
+            model.model.module.language_model.encoder.activations_checkpoint_method = None
+        except AttributeError:
+            pass  # best effort: the model does not expose this attribute path
+
+        if not parallel_state.is_initialized():
+
+            def dummy():
+                return
+
+            if model.trainer.strategy.launcher is not None:
+                model.trainer.strategy.launcher.launch(dummy, trainer=model.trainer)
+            model.trainer.strategy.setup_environment()
+
+    @staticmethod
+    def modify_model_config(model_cfg: DictConfig) -> DictConfig:
+        """Adjust `model_cfg` in place for quantization (no sequence parallelism or RoPE fusion) and return it."""
+        with open_dict(model_cfg):
+            if model_cfg.get("sequence_parallel", False):
+                logging.warning("Disabling sequence parallelism for quantization...")
+                model_cfg.sequence_parallel = False
+            model_cfg.name = "modelopt"
+            model_cfg.apply_rope_fusion = False
+
+        return model_cfg
+
+    @staticmethod
+    def _sample_output(model: MegatronGPTModel):
+        """Generate sample output for two fixed prompts and log the resulting sentences."""
+        logging.info("Generating sample output for the model...")
+
+        response = model.generate(
+            inputs=[
+                "Born in north-east France, Soyer trained as a",
+                "Born in California, Soyer trained as a",
+            ],
+            length_params={
+                "max_length": 100,
+                "min_length": 100,
+            },
+        )
+
+        logging.info(f'Example NeMo output before export: {response["sentences"]}')
+
+    def quantize(self, model: MegatronGPTModel, forward_loop: Callable[[MegatronGPTModel], None]):
+        """Quantize the model and calibrate using given forward loop.
+
+        Requires a configured quantization algorithm (AssertionError otherwise). Rank 0 prints
+        the quantization summary; the model returned by modelopt is passed back to the caller.
+        """
+        assert self.quant_cfg is not None, "Quantization algorithm is not set"
+
+        logging.info(f"Quantizing model to {self.quantization_config.algorithm}...")
+        self._setup(model)
+
+        model = mtq.quantize(model, self.quant_cfg, forward_loop)
+
+        if self.quantization_config.decoder_type == "gpt":
+            # We found squared_relu may have an under-calibration problem.
+            # Clamp the scaling_factor with a min threshold to avoid under-calibration.
+            maxbound = {"fp8": 448, "int8_sq": 127}.get(self.quantization_config.algorithm, 0)
+            model = mtq.postprocess_amax(
+                model, "*input_quantizer", lambda amax: torch.clamp(amax, min=0.01 * maxbound)
+            )
+
+        if dist.get_rank() == 0:
+            mtq.print_quant_summary(model)
+
+        return model
+
+    def export(self, model: MegatronGPTModel):
+        """Export model to '.qnemo' format for TensorRT-LLM engine build.
+
+        The checkpoint is written to `export_config.save_path`, either directly as a directory or,
+        when `export_config.compress` is set, as a tarball assembled by rank 0 from a temporary directory.
+        """
+        assert self.export_config is not None, "Export config is not set"
+        torch_dtype = torch_dtype_from_precision(self.export_config.dtype)
+
+        if self.export_config.get("sample_output", True):
+            self._sample_output(model)
+
+        if model.cfg.megatron_amp_O2:
+            model.model = unwrap_model(model.model, Float16Module)
+
+        # Setup model export handling: temporary directory for
+        # '.qnemo' tarball or directly write to export_config.save_path
+        save_path = self.export_config.save_path
+        compress = self.export_config.get("compress", False)
+        export_handler = temporary_directory() if compress else nullcontext(enter_result=save_path)
+
+        with export_handler as export_dir:
+            export_tensorrt_llm_checkpoint(
+                model=model,
+                decoder_type=self.export_config.decoder_type,
+                dtype=torch_dtype,
+                export_dir=export_dir,
+                inference_tensor_parallel=self.export_config.inference_tensor_parallel,
+                inference_pipeline_parallel=self.export_config.inference_pipeline_parallel,
+                use_nfs_workspace=model.trainer.num_nodes > 1,
+            )
+            dist.barrier()  # Wait until all ranks complete export_model_config step
+            logging.info(f"Exporting quantized weights, model artifacts, and tokenizer config to {save_path}...")
+            if dist.get_rank() == 0:
+                save_artifacts(model, export_dir)
+                if compress:
+                    # abspath() keeps dirname() non-empty when save_path is a bare file name.
+                    os.makedirs(os.path.dirname(os.path.abspath(save_path)), exist_ok=True)
+                    with tarfile.open(save_path, "w") as tar:
+                        tar.add(export_dir, arcname="./")
diff --git a/nemo/export/sentencepiece_tokenizer.py b/nemo/export/sentencepiece_tokenizer.py
new file mode 100644
index 000000000000..190400ed6215
--- /dev/null
+++ b/nemo/export/sentencepiece_tokenizer.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import sentencepiece
+import torch
+
+
+class SentencePieceTokenizer:
+    """
+    SentencePieceTokenizer https://github.com/google/sentencepiece
+
+    Args:
+        model_path: path to sentence piece tokenizer model.
+        special_tokens: either list of special tokens or dictionary of token name to token value
+        legacy: when set to True, the previous behavior of the SentencePiece wrapper will be restored,
+            including the possibility to add special tokens inside wrapper.
+        tokenizer: wraps an existing tokenizer
+    """
+
+    def __init__(
+        self,
+        model_path: Optional[str] = None,
+        special_tokens: Optional[Union[Dict[str, str], List[str]]] = None,
+        legacy: bool = False,
+        tokenizer: Optional[sentencepiece.SentencePieceProcessor] = None,
+    ):
+        """Wrap ``tokenizer`` or load one from ``model_path``; exactly one of the two must be given."""
+        model_path_provided = model_path is not None
+        tokenizer_provided = tokenizer is not None
+        if not (model_path_provided ^ tokenizer_provided):
+            raise ValueError("Exactly only one of the arguments 'model_path', 'tokenizer' should be provided")
+
+        if tokenizer_provided:
+            self.tokenizer = tokenizer
+        else:
+            if not model_path or not os.path.exists(model_path):
+                raise ValueError(f"model_path: {model_path} is invalid")
+            self.tokenizer = sentencepiece.SentencePieceProcessor()
+            self.tokenizer.Load(model_path)
+
+        # Ids at or above original_vocab_size belong to special tokens added by this wrapper.
+        self.original_vocab_size = self.tokenizer.get_piece_size()
+        self.vocab_size = self.tokenizer.get_piece_size()
+        self.legacy = legacy
+        self.special_token_to_id = {}
+        self.id_to_special_token = {}
+        if special_tokens:
+            if not self.legacy:
+                raise ValueError(
+                    "Special tokens must be None when legacy is set to False. Provide special tokens at train time."
+                )
+            self.add_special_tokens(special_tokens)
+        # True when tokenizing 'x y' as a whole differs from tokenizing 'x' and 'y' separately.
+        self.space_sensitive = self.text_to_tokens('x y') != self.text_to_tokens('x') + self.text_to_tokens('y')
+
+    def _split_on_special_tokens(self, text):
+        """Yield ``(chunk, special_token)`` pairs covering ``text`` from left to right.
+
+        ``chunk`` is the plain text preceding ``special_token``; the final pair carries the
+        remaining text and ``None``. Only tokens in ``special_token_to_id`` are split out.
+        """
+        idx = 0
+        while True:
+            # Absolute position of the next occurrence of each special token found at/after idx.
+            positions = {}
+            for token in self.special_token_to_id:
+                pos = text.find(token, idx)
+                if pos != -1:
+                    positions[token] = pos
+            if not positions:
+                break
+            next_token = min(positions, key=positions.get)
+            yield text[idx : positions[next_token]], next_token
+            idx = positions[next_token] + len(next_token)
+        yield text[idx:], None
+
+    def text_to_tokens(self, text):
+        """Split ``text`` into pieces; in legacy mode added special tokens are kept as single pieces."""
+        if not self.legacy:
+            return self.tokenizer.encode_as_pieces(text)
+
+        tokens = []
+        for chunk, special_token in self._split_on_special_tokens(text):
+            tokens.extend(self.tokenizer.encode_as_pieces(chunk))
+            if special_token is not None:
+                tokens.append(special_token)
+        return tokens
+
+    def encode(self, text):
+        """Convert ``text`` to ids; in legacy mode added special tokens map to their wrapper ids."""
+        if not self.legacy:
+            return self.tokenizer.encode_as_ids(text)
+
+        ids = []
+        for chunk, special_token in self._split_on_special_tokens(text):
+            ids.extend(self.tokenizer.encode_as_ids(chunk))
+            if special_token is not None:
+                ids.append(self.special_token_to_id[special_token])
+        return ids
+
+    def tokens_to_text(self, tokens):
+        """Join SentencePiece ``tokens`` (a list or numpy array of pieces) back into text."""
+        if isinstance(tokens, np.ndarray):
+            tokens = tokens.tolist()
+
+        return self.tokenizer.decode_pieces(tokens)
+
+    def batch_decode(self, ids):
+        """Decode ``ids`` to text; legacy mode emits added special tokens verbatim, space-separated."""
+        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
+            ids = ids.tolist()
+
+        if self.legacy:
+            text = ""
+            last_i = 0
+
+            for i, token_id in enumerate(ids):
+                if token_id in self.id_to_special_token:
+                    text += self.tokenizer.decode_ids(ids[last_i:i]) + " "
+                    text += self.id_to_special_token[token_id] + " "
+                    last_i = i + 1
+
+            text += self.tokenizer.decode_ids(ids[last_i:])
+            return text.strip()
+
+        return self.tokenizer.decode(ids)
+
+    def token_to_id(self, token):
+        """Return the id of ``token``, checking added special tokens first in legacy mode."""
+        if self.legacy and token in self.special_token_to_id:
+            return self.special_token_to_id[token]
+
+        return self.tokenizer.piece_to_id(token)
+
+    def ids_to_tokens(self, ids):
+        """Map ids to tokens; ids at or above ``original_vocab_size`` resolve to added special tokens."""
+        tokens = []
+        for token_id in ids:
+            if token_id >= self.original_vocab_size:
+                tokens.append(self.id_to_special_token[token_id])
+            else:
+                tokens.append(self.tokenizer.id_to_piece(token_id))
+        return tokens
+
+    def tokens_to_ids(self, tokens: Union[str, List[str]]) -> List[int]:
+        """Map a token or a list of tokens to ids; the result is always a list."""
+        if isinstance(tokens, str):
+            tokens = [tokens]
+        return [self.token_to_id(token) for token in tokens]
+
+    def _register_special_token(self, token):
+        """Assign the next free id to ``token`` if the model maps it to unk and it is not registered yet."""
+        if self.tokenizer.piece_to_id(token) == self.tokenizer.unk_id() and token not in self.special_token_to_id:
+            self.special_token_to_id[token] = self.vocab_size
+            self.id_to_special_token[self.vocab_size] = token
+            self.vocab_size += 1
+
+    def add_special_tokens(self, special_tokens):
+        """Register extra special tokens (legacy mode only).
+
+        Args:
+            special_tokens: list of tokens, or dict of attribute name to token; for a dict each
+                token is also stored on the instance under its name (e.g. ``self.pad_token``).
+
+        Raises:
+            AttributeError: if ``legacy`` is False.
+        """
+        if not self.legacy:
+            raise AttributeError("Special Token addition does not work when legacy is set to False.")
+
+        if isinstance(special_tokens, list):
+            for token in special_tokens:
+                self._register_special_token(token)
+        elif isinstance(special_tokens, dict):
+            for token_name, token in special_tokens.items():
+                setattr(self, token_name, token)
+                self._register_special_token(token)
+
+    @property
+    def pad_id(self):
+        """Padding token id; in legacy mode it is looked up from ``self.pad_token``."""
+        if self.legacy:
+            return self.tokens_to_ids([self.pad_token])[0]
+        return self.tokenizer.pad_id()
+
+    @property
+    def bos_token_id(self):
+        """Beginning-of-sequence token id; in legacy mode it is looked up from ``self.bos_token``."""
+        if self.legacy:
+            return self.tokens_to_ids([self.bos_token])[0]
+        return self.tokenizer.bos_id()
+
+    @property
+    def eos_token_id(self):
+        """End-of-sequence token id; in legacy mode it is looked up from ``self.eos_token``."""
+        if self.legacy:
+            return self.tokens_to_ids([self.eos_token])[0]
+        return self.tokenizer.eos_id()
+
+    @property
+    def sep_id(self):
+        """Id of ``self.sep_token``; only available in legacy mode, otherwise NameError is raised."""
+        if self.legacy:
+            return self.tokens_to_ids([self.sep_token])[0]
+        raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")
+
+    @property
+    def cls_id(self):
+        """Id of ``self.cls_token``; only available in legacy mode, otherwise NameError is raised."""
+        if self.legacy:
+            return self.tokens_to_ids([self.cls_token])[0]
+        raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")
+
+    @property
+    def mask_id(self):
+        """Id of ``self.mask_token``; only available in legacy mode, otherwise NameError is raised."""
+        if self.legacy:
+            return self.tokens_to_ids([self.mask_token])[0]
+        raise NameError("Use function token_to_id to retrieve special tokens other than unk, pad, bos, and eos.")
+
+    @property
+    def unk_id(self):
+        """Unknown-token id as reported by the wrapped SentencePiece tokenizer."""
+        return self.tokenizer.unk_id()
+
+    @property
+    def additional_special_tokens_ids(self):
+        """Ids of added special tokens other than bos, eos, pad, mask, cls and sep (e.g. T5 sentinels)."""
+        names = ("bos_token", "eos_token", "pad_token", "mask_token", "cls_token", "sep_token")
+        # These attributes only exist if they were registered by name; treat missing ones as absent.
+        reserved = {getattr(self, name, None) for name in names}
+        return [v for k, v in self.special_token_to_id.items() if k not in reserved]
+
+    @property
+    def vocab(self):
+        """List of all tokens ordered by id: SentencePiece pieces first, then added special tokens."""
+        main_vocab = [self.tokenizer.id_to_piece(piece_id) for piece_id in range(self.tokenizer.get_piece_size())]
+        special_tokens = [
+            self.id_to_special_token[self.original_vocab_size + i]
+            for i in range(self.vocab_size - self.original_vocab_size)
+        ]
+        return main_vocab + special_tokens
+
+    # Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False):
+        """Same as ``ids_to_tokens``; ``skip_special_tokens`` is accepted but currently ignored."""
+        return self.ids_to_tokens(ids)  # TODO: support skip_special_tokens
+
+    def convert_tokens_to_string(self, tokens: List[str]):
+        """Same as ``tokens_to_text``."""
+        return self.tokens_to_text(tokens)
+
+    def __len__(self):
+        """Vocabulary size including special tokens added by this wrapper."""
+        return self.vocab_size
+
+    @property
+    def is_fast(self):
+        """Always True."""
+        return True
+
+    def get_added_vocab(self):
+        """Always returns None."""
+        return None
diff --git a/nemo/export/tarutils.py b/nemo/export/tarutils.py
new file mode 100644
index 000000000000..ac608dc935ff
--- /dev/null
+++ b/nemo/export/tarutils.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fnmatch
+import logging
+import os
+import tarfile
+
+from typing import IO, Union
+
+LOGGER = logging.getLogger("NeMo")  # the logger named "NeMo", not a per-module logger
+
+try:
+    from zarr.storage import BaseStore
+
+    HAVE_ZARR = True
+except Exception as e:  # any failure while importing zarr, not only ImportError, ends up here
+    LOGGER.warning(f"Cannot import zarr, support for zarr-based checkpoints is not available. {type(e).__name__}: {e}")
+    BaseStore = object  # stand-in base class so that ZarrPathStore below can still be defined
+    HAVE_ZARR = False
+
+
+class TarPath:
+    """
+    A class that represents a path inside a TAR archive and behaves like pathlib.Path.
+
+    Expected use is to create a TarPath for the root of the archive first, and then derive
+    paths to other files or directories inside the archive like so:
+
+    with TarPath('/path/to/archive.tar') as archive:
+        myfile = archive / 'filename.txt'
+        if myfile.exists():
+            with myfile.open('rb') as f:
+                data = f.read()
+
+    Only read and enumeration operations are supported.
+    """
+
+    def __init__(self, tar: Union[str, tarfile.TarFile, 'TarPath'], *parts):
+        """
+        Args:
+            tar: Path of an archive to open, an already opened TarFile, or another TarPath
+                whose archive and relative path are used as the base.
+            *parts: Path components appended to the base path inside the archive.
+
+        Raises:
+            ValueError: If `tar` is of an unsupported type.
+        """
+        # Only an archive opened here is owned, and closed in __del__, by this object.
+        self._needs_to_close = False
+        self._relpath = ''
+        if isinstance(tar, TarPath):
+            self._tar = tar._tar
+            self._relpath = os.path.join(tar._relpath, *parts)
+        elif isinstance(tar, tarfile.TarFile):
+            self._tar = tar
+            if parts:
+                self._relpath = os.path.join(*parts)
+        elif isinstance(tar, str):
+            self._tar = tarfile.open(tar, 'r')
+            # Set only after a successful open, so that __del__ never runs without self._tar.
+            self._needs_to_close = True
+            if parts:
+                self._relpath = os.path.join(*parts)
+        else:
+            raise ValueError(f"Unexpected argument type for TarPath: {type(tar).__name__}")
+
+    def __del__(self):
+        """Closes the archive if it was opened by this object."""
+        if self._needs_to_close:
+            self._tar.close()
+
+    def __truediv__(self, key) -> 'TarPath':
+        """Returns a TarPath for `key` joined onto this path, backed by the same archive."""
+        return TarPath(self._tar, os.path.join(self._relpath, key))
+
+    def __str__(self) -> str:
+        """Returns the archive's file name joined with the path inside the archive."""
+        return os.path.join(self._tar.name, self._relpath)
+
+    @property
+    def tarobject(self):
+        """
+        Returns the wrapped tar object.
+        """
+        return self._tar
+
+    @property
+    def relpath(self):
+        """
+        Returns the path relative to the root of the archive.
+        """
+        return self._relpath
+
+    @property
+    def name(self):
+        """
+        Returns the last component of the path.
+        """
+        return os.path.split(self._relpath)[1]
+
+    @property
+    def suffix(self):
+        """
+        Returns the suffix of the name (from its last dot), or '' if the name has none.
+        """
+        name = self.name
+        i = name.rfind('.')
+        if 0 < i < len(name) - 1:
+            return name[i:]
+        else:
+            return ''
+
+    def __enter__(self):
+        """Enters the context of the wrapped TarFile and returns this path."""
+        self._tar.__enter__()
+        return self
+
+    def __exit__(self, *args):
+        """Delegates to the __exit__ of the wrapped TarFile."""
+        return self._tar.__exit__(*args)
+
+    def _member(self):
+        """
+        Returns the TarInfo of this path, or None if the archive has no such member.
+
+        The relative path is tried as-is first and then with a "./" prefix.
+        """
+        for name in (self._relpath, os.path.join('.', self._relpath)):
+            try:
+                return self._tar.getmember(name)
+            except KeyError:
+                pass
+        return None
+
+    def exists(self):
+        """
+        Checks if the path exists.
+        """
+        return self._member() is not None
+
+    def is_file(self):
+        """
+        Checks if the path exists and is a regular file.
+        """
+        member = self._member()
+        return member is not None and member.isreg()
+
+    def is_dir(self):
+        """
+        Checks if the path exists and is a directory.
+        """
+        member = self._member()
+        return member is not None and member.isdir()
+
+    def open(self, mode: str) -> IO[bytes]:
+        """
+        Opens a file in the archive.
+
+        Args:
+            mode: Must be 'r' or 'rb'; the returned stream is binary in both cases.
+
+        Raises:
+            NotImplementedError: For any other mode.
+            FileNotFoundError: If the path is missing or has no data to read.
+        """
+        if mode not in ('r', 'rb'):
+            raise NotImplementedError()
+
+        member = self._member()
+        # extractfile() returns None for members that carry no data (e.g. directories)
+        file = None if member is None else self._tar.extractfile(member)
+        if file is None:
+            raise FileNotFoundError()
+
+        return file
+
+    def _names_below(self):
+        """
+        Yields the names of all archive members located under this path, relative to it.
+        """
+        prefix = self._relpath + '/' if self._relpath else ''
+        for member in self._tar.getmembers():
+            # Remove the "./" prefix, if any
+            name = member.name[2:] if member.name.startswith('./') else member.name
+            # If we're in a subdirectory, make sure the member is too, and remove that subdir component
+            if name.startswith(prefix):
+                yield name[len(prefix) :]
+
+    def glob(self, pattern):
+        """
+        Returns an iterator over the members under this path whose relative name matches the pattern.
+
+        Matching is done by fnmatch, where "*" also matches "/", so nested members can match too.
+        """
+        for name in self._names_below():
+            if fnmatch.fnmatch(name, pattern):
+                yield TarPath(self._tar, os.path.join(self._relpath, name))
+
+    def rglob(self, pattern):
+        """
+        Returns an iterator over the members under this path for which any tail of the
+        relative name (split at "/") matches the pattern.
+        """
+        for name in self._names_below():
+            parts = name.split('/')
+            # See if any tail of the path matches the pattern, return full path if that's true
+            if any(fnmatch.fnmatch('/'.join(parts[i:]), pattern) for i in range(len(parts))):
+                yield TarPath(self._tar, os.path.join(self._relpath, name))
+
+    def iterdir(self):
+        """
+        Returns an iterator over the members under this path.
+        Equivalent to glob('*').
+        """
+        return self.glob('*')
+
+
+class ZarrPathStore(BaseStore):
+    """
+    Read-only zarr Store that serves its values from a pathlib.Path or TarPath object.
+    Item assignment and deletion raise NotImplementedError.
+    """
+
+    def __init__(self, tarpath: TarPath):
+        assert HAVE_ZARR, "Package zarr>=2.18.2,<3.0.0 is required to use ZarrPathStore"
+        self._erasable = False
+        self._writable = False
+        self._path = tarpath
+
+    def keys(self):
+        """Returns the iterator produced by iterdir() of the wrapped path."""
+        return self._path.iterdir()
+
+    def __iter__(self):
+        return self.keys()
+
+    def __len__(self):
+        return sum(1 for _ in self.keys())
+
+    def __contains__(self, key):
+        entry = self._path / key
+        return entry.is_file()
+
+    def __getitem__(self, key):
+        entry = self._path / key
+        with entry.open('rb') as stream:
+            return stream.read()
+
+    def __setitem__(self, key, value):
+        raise NotImplementedError()
+
+    def __delitem__(self, key):
+        raise NotImplementedError()
+
+
+def unpack_tarball(archive: str, dest_dir: str):
+    """
+    Unpacks a tarball into a destination directory, applying tarfile's "data" filter when available.
+
+    Args:
+        archive (str): The path to the tarball.
+        dest_dir (str): The path to the destination directory.
+    """
+    with tarfile.open(archive, mode="r") as tar:
+        tar.extractall(path=dest_dir, **({"filter": "data"} if hasattr(tarfile, "data_filter") else {}))
diff --git a/nemo/export/tensorrt_lazy_compiler.py b/nemo/export/tensorrt_lazy_compiler.py
new file mode 100644
index 000000000000..50b609087250
--- /dev/null
+++ b/nemo/export/tensorrt_lazy_compiler.py
@@ -0,0 +1,714 @@
+# Copyright (c) MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import inspect
+import os
+import tempfile
+import threading
+from collections import OrderedDict
+from logging import getLogger
+from pathlib import Path
+from types import MethodType
+from typing import Any, Dict, List, Sequence, Tuple, Union
+
+import torch
+
+from nemo.utils.export_utils import add_casts_around_norms, replace_for_export
+from nemo.utils.import_utils import safe_import
+
+polygraphy, polygraphy_imported = safe_import("polygraphy")
+if polygraphy_imported:
+    from polygraphy.backend.common import bytes_from_path
+    from polygraphy.backend.trt import (
+        CreateConfig,
+        Profile,
+        engine_bytes_from_network,
+        engine_from_bytes,
+        network_from_onnx_path,
+    )
+
+trt, trt_imported = safe_import("tensorrt")
+torch_tensorrt, _ = safe_import("torch_tensorrt")
+cudart, _ = safe_import("cuda.cudart")
+
+lock_sm = threading.Lock()
+
+
+def trt_to_torch_dtype_dict():
+    """
+    Returns a map of TRT dtype -> Torch dtype with the same numeric format
+    """
+    return {
+        trt.int32: torch.int32,
+        trt.float32: torch.float32,
+        trt.float16: torch.float16,
+        trt.bfloat16: torch.bfloat16,
+        trt.int64: torch.int64,
+        trt.int8: torch.int8,
+        trt.bool: torch.bool,
+    }
+
+
+def get_profile_shapes(input_shape: Sequence[int], dynamic_batchsize: Sequence[int] | None):
+    """
+    Given a sample input shape, calculate min/opt/max shapes for the batch range in dynamic_batchsize.
+
+    Without a batch range (None or empty) input_shape itself is returned three times. Otherwise each
+    result is a new list copied from input_shape with its first entry replaced by the batch size
+    found at index 0 (min), 1 (opt) or 2 (max) of dynamic_batchsize.
+    """
+    if not dynamic_batchsize:
+        return input_shape, input_shape, input_shape
+
+    def with_batch(batch_size: int):
+        shape = list(input_shape)
+        shape[0] = batch_size
+        return shape
+
+    sizes = dynamic_batchsize
+    return with_batch(sizes[0]), with_batch(sizes[1]), with_batch(sizes[2])
+
+
+def get_dynamic_axes(profiles):
+    """
+    Calculates the dynamic_axes argument to use in onnx.export().
+
+    Args:
+       profiles: list of profiles, each a map {input name: [min_shape, opt_shape, max_shape]}
+    Returns:
+       Map {input name: [indices of the axes whose min and max sizes differ]}.
+       Inputs without such an axis are omitted; when several profiles list the same input,
+       the last profile in which it has dynamic axes wins.
+    """
+    result: dict[str, list[int]] = {}
+    for shapes_by_input in profiles or []:
+        for input_name, shapes in shapes_by_input.items():
+            # Axes are enumerated over the min shape and compared against the max shape
+            varying = [axis for axis in range(len(shapes[0])) if shapes[0][axis] != shapes[2][axis]]
+            if varying:
+                result[input_name] = varying
+    return result
+
+
+def cuassert(cuda_ret):
+    """
+    Checks the result of a CUDA call and unwraps its value.
+    Args:
+     cuda_ret: sequence returned by the call: the status code first, optionally followed by a value.
+    Returns: the second item of cuda_ret if there is one, else None.
+    Raises: RuntimeError if the status code is not 0.
+    """
+    status = cuda_ret[0]
+    if status != 0:
+        raise RuntimeError(f"CUDA ERROR: {status}")
+    return cuda_ret[1] if len(cuda_ret) > 1 else None
+
+
+class ShapeError(Exception):
+    """
+    Raised when the input shapes of a TRT plan cannot be set.
+
+    TRTEngine.set_inputs() catches it to retry with the next optimization profile.
+    """
+
+
+class TRTEngine:
+    """
+    An auxiliary class that loads a serialized TRT engine and runs it on torch tensors.
+    Output tensors are allocated by allocate_buffers() and kept in self.tensors.
+    """
+
+    def __init__(self, plan_path, logger=None):
+        """
+        Loads serialized engine, creates execution context and collects its input/output names
+        Args:
+          plan_path: path to serialized TRT engine.
+          logger: optional logger object, defaults to getLogger("trt_compile")
+        """
+        self.plan_path = plan_path
+        self.logger = logger or getLogger("trt_compile")
+        self.logger.info(f"Loading TensorRT engine: {self.plan_path}")
+        self.engine = engine_from_bytes(bytes_from_path(self.plan_path))
+        self.tensors = OrderedDict()  # output name -> torch tensor, filled by allocate_buffers()
+        self.cuda_graph_instance = None  # instantiated CUDA graph, created on first infer() with use_cuda_graph
+        self.context = self.engine.create_execution_context()
+        self.input_names = []
+        self.output_names = []
+        self.dtypes = []  # torch dtypes of the outputs, parallel to self.output_names
+        self.cur_profile = 0  # index of the optimization profile currently in use
+        self.input_table = {}  # engine input name -> key in feed_dict; read by set_inputs(), assigned from outside
+        dtype_dict = trt_to_torch_dtype_dict()
+        for idx in range(self.engine.num_io_tensors):
+            binding = self.engine[idx]  # presumably the tensor name at this index; it is used as a name below
+            if self.engine.get_tensor_mode(binding) == trt.TensorIOMode.INPUT:
+                self.input_names.append(binding)
+            elif self.engine.get_tensor_mode(binding) == trt.TensorIOMode.OUTPUT:
+                self.output_names.append(binding)
+                dtype = dtype_dict[self.engine.get_tensor_dtype(binding)]
+                self.dtypes.append(dtype)
+        self.logger.info(
+            f"Loaded TensorRT engine: {self.plan_path}.\nInputs: {self.input_names}\nOutputs: {self.output_names}"
+        )
+
+    def allocate_buffers(self, device):
+        """
+        Allocates output tensors to run TRT engine and registers their addresses with the context
+        Args:
+            device: GPU device to allocate memory on
+        """
+        ctx = self.context
+
+        for i, binding in enumerate(self.output_names):
+            shape = list(ctx.get_tensor_shape(binding))
+            if binding not in self.tensors or list(self.tensors[binding].shape) != shape:  # new output or new shape
+                t = torch.empty(shape, dtype=self.dtypes[i], device=device).contiguous()
+                self.tensors[binding] = t
+                ctx.set_tensor_address(binding, t.data_ptr())
+
+    def set_inputs(self, feed_dict, stream):
+        """
+        Sets input shapes and addresses for TRT engine according to feed_dict
+        Args:
+           feed_dict: a dictionary [str->Tensor], looked up through self.input_table
+           stream: CUDA stream used when switching the optimization profile
+        """
+        e = self.engine
+        ctx = self.context
+
+        last_profile = self.cur_profile
+
+        def try_set_inputs():
+            for binding in self.input_names:
+                t = feed_dict.get(self.input_table[binding], None)
+                if t is not None:
+                    t = t.contiguous()  # NOTE(review): a copy made here is referenced only by this local name
+                    shape = t.shape
+                    ctx.set_input_shape(binding, shape)  # NOTE(review): result unchecked; no ShapeError raised here
+                    ctx.set_tensor_address(binding, t.data_ptr())
+
+        while True:
+            try:
+                try_set_inputs()
+                break
+            except ShapeError:
+                next_profile = (self.cur_profile + 1) % e.num_optimization_profiles
+                if next_profile == last_profile:  # every profile has been tried once: give up
+                    raise
+                self.cur_profile = next_profile
+                ctx.set_optimization_profile_async(self.cur_profile, stream)
+            except Exception:
+                raise
+        left = ctx.infer_shapes()  # presumably the names of inputs whose shape is still unknown — TODO confirm
+        assert len(left) == 0
+
+    def infer(self, stream, use_cuda_graph=False):
+        """
+        Runs TRT engine and returns self.tensors (output name -> tensor).
+        Args:
+            stream: CUDA stream to run on
+            use_cuda_graph: use CUDA graph. Note: requires all inputs to be the same GPU memory between calls.
+        """
+        if use_cuda_graph:
+            if self.cuda_graph_instance is not None:
+                cuassert(cudart.cudaGraphLaunch(self.cuda_graph_instance, stream))
+                cuassert(cudart.cudaStreamSynchronize(stream))
+            else:
+                # do inference before CUDA graph capture
+                noerror = self.context.execute_async_v3(stream)
+                if not noerror:
+                    raise ValueError("ERROR: inference failed.")
+                # capture cuda graph
+                cuassert(
+                    cudart.cudaStreamBeginCapture(
+                        stream, cudart.cudaStreamCaptureMode.cudaStreamCaptureModeThreadLocal
+                    )
+                )
+                self.context.execute_async_v3(stream)
+                graph = cuassert(cudart.cudaStreamEndCapture(stream))
+                self.cuda_graph_instance = cuassert(cudart.cudaGraphInstantiate(graph, 0))
+                self.logger.info("CUDA Graph captured!")  # NOTE(review): no stream sync on this capturing call
+        else:
+            noerror = self.context.execute_async_v3(stream)
+            cuassert(cudart.cudaStreamSynchronize(stream))
+            if not noerror:
+                raise ValueError("ERROR: inference failed.")
+
+        return self.tensors
+
+
+def make_tensor(d):
+    """Returns d unchanged if it is a torch.Tensor, otherwise a new CUDA tensor built from d."""
+    if isinstance(d, torch.Tensor):
+        return d
+    return torch.tensor(d).cuda()
+
+
+def unroll_input(input_names, input_example):
+    """
+    Simulates list/tuple unrolling during ONNX export: each item of a list/tuple input is stored as
+    "<name>_<index>", any other input keeps its name, and inputs that are None are left out.
+    """
+    flat = {}
+    for name in input_names:
+        value = input_example[name]
+        if isinstance(value, (list, tuple)):
+            # One entry per item, in order, each converted by make_tensor()
+            flat.update((f"{name}_{idx}", make_tensor(item)) for idx, item in enumerate(value))
+        elif value is not None:
+            flat[name] = make_tensor(value)
+    return flat
+
+
+def parse_groups(
+    ret: List[torch.Tensor], output_lists: List[List[int]]
+) -> Tuple[Union[torch.Tensor, List[torch.Tensor]], ...]:
+    """
+    Regroups a flat list of outputs according to the 'output_lists' arg of trt_compile().
+
+    Args:
+      ret: plain list of Tensors
+      output_lists: one entry per returned group, each either [] or [group_n]:
+          [] or group_n == 0 : the group is the next single item of ret
+          group_n > 0  :       the group is a list of the next group_n items of ret
+          group_n == -1:       the group is a dynamic list that takes every item not claimed by the
+                               entries after it (those are counted from the end of ret); allowed once.
+    Returns:
+       Tuple of Union[torch.Tensor, List[torch.Tensor]], according to the grouping in output_lists
+    Raises: ValueError if a negative group size follows a -1 entry
+    """
+    def group_size(spec: List[int]) -> int:
+        assert len(spec) == 0 or len(spec) == 1
+        return spec[0] if len(spec) > 0 else 0
+
+    collected = []
+    pos = 0
+    for index, spec in enumerate(output_lists):
+        size = group_size(spec)
+        if size == 0:
+            collected.append(ret[pos])
+            pos += 1
+        elif size > 0:
+            collected.append(ret[pos : pos + size])
+            pos += size
+        elif size == -1:
+            tail = []
+            end = len(ret)
+            for back_spec in reversed(output_lists[index + 1 :]):
+                back_size = group_size(back_spec)
+                if back_size == 0:
+                    end -= 1
+                    tail.append(ret[end])
+                elif back_size > 0:
+                    end -= back_size
+                    tail.append(ret[end : end + back_size])
+                else:
+                    raise ValueError("Two -1 lists in output")
+            collected.append(ret[pos:end])
+            collected.extend(reversed(tail))
+            break
+    return tuple(collected)
+
+
+class TrtCompiler:
+    """
+    This class implements:
+      - TRT lazy persistent export
+      - Running TRT with optional fallback to Torch
+        (for TRT engines with limited profiles)
+    """
+
+    def __init__(
+        self,
+        model,
+        plan_path,
+        precision="fp16",
+        method="onnx",
+        input_names=None,
+        output_names=None,
+        output_lists=None,
+        export_args=None,
+        build_args=None,
+        input_profiles=None,
+        dynamic_batchsize=None,
+        use_cuda_graph=False,
+        timestamp=None,
+        fallback=False,
+        forward_override=None,
+        logger=None,
+    ):
+        """
+        Saves the arguments for a lazy TRT build on the first forward() call; no engine is loaded or built here.
+        A plan file at plan_path that is older than `timestamp` is removed so that it will be rebuilt.
+        Args:
+            model: Model to "wrap".
+            plan_path : Path where to save persistent serialized TRT engine.
+            precision: TRT builder precision of the engine. Should be 'fp32'|'tf32'|'fp16'|'bf16'.
+            method: One of 'onnx'|'torch_trt'.
+                    Default is 'onnx' (torch.onnx.export()->TRT). This is the most stable and efficient option.
+                    'torch_trt' may not work for some nets. Also AMP must be turned off for it to work.
+            input_names: Optional list of input names. If None, will be read from the function signature.
+            output_names: Optional list of output names. Note: If not None, patched forward() will return a dictionary.
+            output_lists: Optional list of output group sizes: when forward() returns Lists/Tuples, this will be a list
+                          of their dimensions, like [[], [5], [-1]] for Tensor, list of 5 items and dynamic list.
+            export_args: Optional args to pass to export method. See onnx.export() and Torch-TensorRT docs for details.
+            build_args: Optional args to pass to TRT builder. See polygraphy.Config for details.
+            input_profiles: Optional list of profiles for TRT builder and ONNX export.
+                            Each profile is a map of the form : {"input id" : [min_shape, opt_shape, max_shape], ...}.
+            dynamic_batchsize: A sequence with three elements to define the input batch size range for the model to be
+                               converted. Should be a sequence like [MIN_BATCH, OPT_BATCH, MAX_BATCH].
+            [note]: If neither input_profiles nor dynamic_batchsize specified, static shapes will be used.
+            use_cuda_graph: Use CUDA Graph for inference. Note: inputs have to be the same GPU memory between calls!
+            timestamp: Optional timestamp to rebuild TRT engine (e.g. if config file changes).
+            fallback: Allow to fall back to Pytorch when TRT inference fails (e.g, shapes exceed max profile).
+            logger: Optional logger (default: getLogger("trt_compile")). forward_override is accepted but not used here.
+        """
+
+        method_vals = ["onnx", "torch_trt"]
+        precision_vals = ["fp32", "tf32", "fp16", "bf16"]
+        if method not in method_vals:
+            raise ValueError(f"trt_compile(): 'method' should be one of {method_vals}, got: {method}.")
+        if precision not in precision_vals:
+            raise ValueError(f"trt_compile(): 'precision' should be one of {precision_vals}, got: {precision}.")
+
+        self.logger = logger or getLogger("trt_compile")
+        self.plan_path = plan_path
+        self.method = method
+        self.precision = precision
+        self.use_cuda_graph = use_cuda_graph
+        self.fallback = fallback
+        self.dynamic_batchsize = dynamic_batchsize
+        # None (or empty) collections are replaced by fresh empty containers
+        self.profiles = input_profiles or []
+        self.export_args = export_args or {}
+        self.build_args = build_args or {}
+        self.output_lists = output_lists or []
+        self.output_names = output_names or []
+        # forward() returns a dictionary only when output names were passed in
+        self.return_dict = output_names is not None
+        # Runtime state: the engine is created lazily; `disabled` is set by forward() after a failed build
+        self.engine: TRTEngine | None = None
+        self.disabled = False
+
+        self.old_forward = model.forward
+        self.argspec = inspect.getfullargspec(model.forward)
+        # Normally we read input_names from forward() (all arguments but the first) but can be overridden
+        self.input_names = self.argspec.args[1:] if input_names is None else input_names
+
+        # Tensor versions of the non-None defaults of forward(), keyed by argument name
+        self.defaults = {}
+        if self.argspec.defaults is not None:
+            for arg_name, default in zip(reversed(self.argspec.args), reversed(self.argspec.defaults)):
+                if default is not None:
+                    self.defaults[arg_name] = make_tensor(default)
+
+        # Force engine rebuild if older than the timestamp
+        if timestamp is not None and os.path.exists(self.plan_path) and os.path.getmtime(self.plan_path) < timestamp:
+            os.remove(self.plan_path)
+
+    def _inputs_to_dict(self, input_example):
+        """Pairs positional inputs, in order, with the names in self.input_names."""
+        # Indexing (rather than zip) keeps the IndexError raised when there are more inputs than names
+        return {
+            self.input_names[position]: value for position, value in enumerate(input_example)
+        }
+
+    def _load_engine(self):
+        """
+        Tries to load the TRT plan at self.plan_path into a TRTEngine stored in self.engine.
+
+        Also fills the engine's input_table (engine input name -> forward() argument name): a leading
+        "__" is stripped from engine names that are not themselves in self.input_names.
+        Any exception is logged at info level and not re-raised.
+        """
+        try:
+            self.engine = TRTEngine(self.plan_path, self.logger)
+            known_names = self.input_names
+            self.engine.input_table = {
+                name: name[2:] if name.startswith("__") and name not in known_names else name
+                for name in self.engine.input_names
+            }
+            self.logger.info(f"Engine loaded, inputs:{self.engine.input_table}")
+        except Exception as e:
+            self.logger.info(f"Exception while loading the engine:\n{e}")
+
+    def forward(self, model, argv, kwargs):
+        """
+        Main forward method:
+         Builds TRT engine if not available yet.
+         Tries to run TRT engine
+         If exception thrown and self.fallback==True: falls back to original Pytorch
+
+        Args: model is the wrapped module; argv/kwargs are the positional/keyword args given to its forward()
+        Returns: the TRT outputs (a dict if output_names were given), or the original forward() result on fallback
+        """
+        # Work on a copy: updating self.defaults in place would leak one call's inputs into the next call
+        args = dict(self.defaults)
+        args.update(kwargs)
+        if len(argv) > 0:
+            args.update(self._inputs_to_dict(argv))
+
+        if self.engine is None and not self.disabled:
+            # Restore original forward for export
+            new_forward = model.forward
+            model.forward = self.old_forward
+            try:
+                self._load_engine()
+                if self.engine is None:
+                    build_args = args.copy()
+                    with torch.no_grad():
+                        self._build_and_save(model, build_args)
+                        # This will reassign input_names from the engine
+                    self._load_engine()
+                    assert self.engine is not None
+            except Exception as e:
+                if self.fallback:
+                    self.logger.info(f"Failed to build engine: {e}")
+                    self.disabled = True
+                else:
+                    raise
+            if not self.disabled and not self.fallback:
+                # NOTE(review): `del param` only unbinds the loop variable; the model still holds its parameters
+                for param in model.parameters():
+                    del param
+                # Call empty_cache to release GPU memory
+                torch.cuda.empty_cache()
+            # restore TRT hook
+            model.forward = new_forward
+        # Run the engine
+        try:
+            if self.engine is not None:
+                # forward_trt is not thread safe as we do not use per-thread execution contexts
+                with lock_sm:
+                    device = torch.cuda.current_device()
+                    stream = torch.cuda.Stream(device=device)
+                    self.engine.set_inputs(unroll_input(self.input_names, args), stream.cuda_stream)
+                    self.engine.allocate_buffers(device=device)
+                    # Need this to synchronize with Torch stream
+                    stream.wait_stream(torch.cuda.current_stream())
+                    ret = self.engine.infer(stream.cuda_stream, use_cuda_graph=self.use_cuda_graph)
+                    # if output_names is not None, return dictionary
+                    if not self.return_dict:
+                        ret = list(ret.values())
+                        if self.output_lists:
+                            ret = parse_groups(ret, self.output_lists)
+                        elif len(ret) == 1:
+                            ret = ret[0]
+                    return ret
+        except Exception as e:
+            if self.fallback:
+                self.logger.info(f"Exception: {e}\nFalling back to Pytorch ...")
+            else:
+                raise
+        return self.old_forward(*argv, **kwargs)
+
+    def _onnx_to_trt(self, onnx_path):
+        """
+        Builds a TRT engine from the ONNX file at onnx_path and returns it in serialized form.
+        The optimization profiles come from self.profiles and the builder options from self.build_args
+        plus the flags derived from self.precision; this method itself writes nothing to disk.
+        """
+
+        trt_profiles = []
+        for shapes_by_input in self.profiles:
+            trt_profile = Profile()
+            for input_name, shapes in shapes_by_input.items():
+                trt_profile.add(input_name, min=shapes[0], opt=shapes[1], max=shapes[2])
+            trt_profiles.append(trt_profile)
+
+        config_kwargs = self.build_args.copy()
+        config_kwargs["tf32"] = self.precision != "fp32"
+        if self.precision in ("fp16", "bf16"):
+            config_kwargs[self.precision] = True
+
+        self.logger.info(f"Building TensorRT engine for {onnx_path}: {self.plan_path}")
+        network = network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM])
+        return engine_bytes_from_network(network, config=CreateConfig(profiles=trt_profiles, **config_kwargs))
+
+    def _build_and_save(self, model, input_example):
+        """
+        If no TRT engine is loaded yet: converts the model (torch_tensorrt, or ONNX export followed by a TRT build)
+        and writes the serialized engine to self.plan_path. add_casts_around_norms()/replace_for_export() run on the model first.
+        Args:
+             model: module to convert. input_example: dict of example inputs (passed to onnx.export() on the ONNX path)
+        """
+        if self.engine is not None:
+            return
+
+        export_args = self.export_args
+        engine_bytes = None
+
+        add_casts_around_norms(model)
+        replace_for_export(model)
+
+        if self.method == "torch_trt":
+            enabled_precisions = [torch.float32]
+            if self.precision == "fp16":
+                enabled_precisions.append(torch.float16)
+            elif self.precision == "bf16":
+                enabled_precisions.append(torch.bfloat16)
+            inputs = list(input_example.values())
+
+            def get_torch_trt_input(input_shape, dynamic_batchsize):
+                min_input_shape, opt_input_shape, max_input_shape = get_profile_shapes(input_shape, dynamic_batchsize)
+                return torch_tensorrt.Input(
+                    min_shape=min_input_shape, opt_shape=opt_input_shape, max_shape=max_input_shape
+                )
+
+            tt_inputs = [get_torch_trt_input(i.shape, self.dynamic_batchsize) for i in inputs]
+            engine_bytes = torch_tensorrt.convert_method_to_trt_engine(
+                model,
+                "forward",
+                arg_inputs=tt_inputs,
+                enabled_precisions=enabled_precisions,
+                **export_args,
+            )
+        else:
+            dbs = self.dynamic_batchsize
+            if dbs:
+                if len(self.profiles) > 0:
+                    raise ValueError("ERROR: Both dynamic_batchsize and input_profiles set for TrtCompiler!")
+                if len(dbs) != 3:
+                    raise ValueError("dynamic_batchsize has to have len ==3 ")
+                profile = {}
+
+                def add_profile(name, val):
+                    sh = val.shape
+                    if len(sh) > 0:  # inputs with an empty shape get no profile entry
+                        sh = sh[1:]
+                        profile[name] = [[dbs[0], *sh], [dbs[1], *sh], [dbs[2], *sh]]
+
+                for name, val in input_example.items():
+                    if isinstance(val, (list, tuple)):
+                        for i, item in enumerate(val):
+                            add_profile(f"{name}_{i}", item)
+                    elif isinstance(val, torch.Tensor):
+                        add_profile(name, val)
+                self.profiles = [profile]
+
+            self.dynamic_axes = get_dynamic_axes(self.profiles)
+
+            if len(self.dynamic_axes) > 0:
+                export_args.update({"dynamic_axes": self.dynamic_axes})
+
+            # Use temporary directory for easy cleanup in case of external weights
+            with tempfile.TemporaryDirectory() as tmpdir:
+                if export_args.get("dynamo", False):
+                    input_names = None
+                else:
+                    input_names = list(unroll_input(self.input_names, input_example).keys())
+                onnx_path = str(Path(tmpdir) / "model.onnx")
+                self.logger.info(
+                    f"Exporting to {onnx_path}:\n"
+                    + f"output_names={self.output_names}\ninput_names={self.input_names}\nexport args: {export_args}"
+                )
+                torch.onnx.export(
+                    model,
+                    (input_example,),
+                    onnx_path,
+                    input_names=input_names,
+                    output_names=self.output_names,
+                    **export_args,
+                )
+                if polygraphy_imported:
+                    from polygraphy.backend.onnx.loader import fold_constants, onnx_from_path, save_onnx
+
+                    onnx_model = fold_constants(onnx_from_path(onnx_path), size_threshold=16 * 1000 * 1000)
+                    save_onnx(onnx_model, onnx_path)
+                self.logger.info("Export to ONNX successful.")
+                engine_bytes = self._onnx_to_trt(onnx_path)
+        if engine_bytes:
+            with open(self.plan_path, "wb") as plan_file:  # context manager closes the handle even if write() fails
+                plan_file.write(engine_bytes)
+
+
+def trt_forward(self, *argv, **kwargs):
+    """
+    Stand-in for a patched module's forward(): hands the call over to TrtCompiler.forward().
+    """
+    compiler = self._trt_compiler
+    return compiler.forward(self, argv, kwargs)
+
+
+def trt_compile(
+    model: torch.nn.Module,
+    base_path: str,
+    args: Dict[str, Any] | None = None,
+    submodule: Union[str, List[str]] | None = None,
+    logger: Any | None = None,
+) -> torch.nn.Module:
+    """
+    Attaches a TrtCompiler to the model (or to selected submodules) and swaps forward() for the TRT hook.
+    Note: TRT 10.3 is recommended for best performance. Some nets may even fail to work with TRT 8.x
+    Args:
+      model: module to instrument.
+      base_path: prefix of the plan file(s): each plan is stored at f"{base_path}[.{submodule}].plan".
+                 dirname(base_path) must exist, base_path itself need not.
+                 When base_path names an existing file (e.g. the associated checkpoint), that file is
+                 treated as a dependency and its mtime is merged into args["timestamp"] (the newer one wins).
+      args: Optional dict of keyword arguments for TrtCompiler(); overrides the built-in defaults per key.
+      submodule: Optional dotted id, or list of ids, of submodule(s) to patch, e.g. ['image_decoder.decoder'].
+                  With None the whole model is patched;
+                  otherwise only the named submodule(s) are.
+      logger: Optional logger for diagnostics.
+    Returns:
+      The very same model object that was passed in, which keeps config-driven usage simple.
+    """
+
+    # entries supplied by the caller replace these defaults key by key (top level only)
+    settings: Dict[str, Any] = {
+        "method": "onnx",
+        "precision": "fp16",
+        "build_args": {"builder_optimization_level": 5, "precision_constraints": "obey"},
+    }
+    settings.update(args or {})
+    args = settings
+
+    if not (trt_imported and polygraphy_imported and torch.cuda.is_available()):
+        logger = logger or getLogger("trt_compile")
+        logger.warning("TensorRT and/or polygraphy packages are not available! trt_compile() has no effect.")
+        return model
+
+    # a file already present at base_path (e.g. a checkpoint) is a dependency of the plan:
+    # the newer of its mtime and any timestamp given by the caller goes to TrtCompiler
+    if os.path.exists(base_path):
+        mtime = int(os.path.getmtime(base_path))
+        args["timestamp"] = max(int(args["timestamp"]), mtime) if "timestamp" in args else mtime
+
+    def wrap(module, plan_base):
+        # a module that already carries a compiler is left as it is
+        if hasattr(module, "_trt_compiler"):
+            return
+        module.orig_forward = module.forward
+        module._trt_compiler = TrtCompiler(module, plan_base + ".plan", logger=logger, **args)
+        module.forward = MethodType(trt_forward, module)
+
+    def find_sub(parent, submodule):
+        # resolve every dotted component except the last one, which is returned by name
+        *ancestors, leaf = submodule.split(".")
+        for attr_name in ancestors:
+            parent = getattr(parent, attr_name)
+        return parent, leaf
+
+    if submodule is None:
+        wrap(model, base_path)
+        return model
+
+    if isinstance(submodule, str):
+        targets = [submodule]
+    else:
+        targets = submodule
+    for dotted in targets:
+        parent, leaf = find_sub(model, dotted)
+        wrap(getattr(parent, leaf), base_path + "." + dotted)
+
+    return model
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
new file mode 100644
index 000000000000..de71aea86b23
--- /dev/null
+++ b/nemo/export/tensorrt_llm.py
@@ -0,0 +1,1805 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import json
+import logging
+import os
+import pickle
+import shutil
+import tempfile
+import warnings
+from glob import glob
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+import safetensors
+import tensorrt_llm
+import torch
+import torch.nn.functional as F
+import wrapt
+from tensorrt_llm._common import check_max_num_tokens
+from tensorrt_llm._utils import numpy_to_torch
+from tensorrt_llm.builder import BuildConfig
+from tensorrt_llm.commands.build import build as build_trtllm
+from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.models import (
+    BaichuanForCausalLM,
+    BertForQuestionAnswering,
+    BertForSequenceClassification,
+    BertModel,
+    BloomForCausalLM,
+    ChatGLMForCausalLM,
+    CogVLMForCausalLM,
+    CohereForCausalLM,
+    DbrxForCausalLM,
+    DeciLMForCausalLM,
+    DecoderModel,
+    DeepseekForCausalLM,
+    DeepseekV2ForCausalLM,
+    DiT,
+    EagleForCausalLM,
+    EncoderModel,
+    FalconForCausalLM,
+    GemmaForCausalLM,
+    GPTForCausalLM,
+    GPTJForCausalLM,
+    GPTNeoXForCausalLM,
+    GrokForCausalLM,
+    LLaMAForCausalLM,
+    MambaForCausalLM,
+    MedusaForCausalLm,
+    MLLaMAForCausalLM,
+    MPTForCausalLM,
+    OPTForCausalLM,
+    Phi3ForCausalLM,
+    PhiForCausalLM,
+    QWenForCausalLM,
+    RecurrentGemmaForCausalLM,
+    ReDrafterForCausalLM,
+    RobertaForQuestionAnswering,
+    RobertaForSequenceClassification,
+    RobertaModel,
+    WhisperEncoder,
+)
+from tensorrt_llm.plugin import PluginConfig
+from transformers import PreTrainedTokenizerBase
+
+from nemo.deploy import ITritonDeployable
+from nemo.export.tarutils import TarPath, unpack_tarball
+from nemo.export.trt_llm.converter.model_converter import determine_quantization_settings, model_to_trtllm_ckpt
+from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import dist_model_to_trt_llm_ckpt, get_layer_prefix
+from nemo.export.trt_llm.converter.utils import init_model_parallel_from_nemo
+from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import (
+    build_tokenizer,
+    get_model_type,
+    get_tokenizer,
+    get_weights_dtype,
+    load_nemo_model,
+)
+from nemo.export.trt_llm.qnemo import qnemo_to_tensorrt_llm
+from nemo.export.trt_llm.qnemo.tokenizer_utils import TOKENIZER_CONFIG_FILE, get_nmt_tokenizer
+from nemo.export.trt_llm.qnemo.utils import is_qnemo_checkpoint
+from nemo.export.trt_llm.tensorrt_llm_build import build_and_save_engine
+from nemo.export.trt_llm.tensorrt_llm_run import (
+    generate,
+    generate_streaming,
+    load,
+    load_distributed,
+    refit,
+    unload_engine,
+)
+from nemo.export.trt_llm.utils import is_rank
+from nemo.export.utils import is_nemo_tarfile, prepare_directory_for_export, torch_dtype_from_precision
+from nemo.export.utils.constants import TRTLLM_ENGINE_DIR
+
+use_deploy = True
+try:
+    from nemo.deploy.utils import cast_output, str_ndarray2list
+except Exception:
+    use_deploy = False
+
+LOGGER = logging.getLogger("NeMo")
+
+
+@wrapt.decorator
+def noop_decorator(func, instance, args, kwargs):
+    """
+    Pass-through decorator, used as the fallback for pytriton's ``batch`` when pytriton cannot be imported.
+
+    wrapt calls this with the wrapped callable, the instance it is bound to (unused) and the call arguments.
+    """
+    return func(*args, **kwargs)
+
+
+use_pytriton = True
+batch = noop_decorator
+try:
+    from pytriton.decorators import batch, first_value
+    from pytriton.model_config import Tensor
+except Exception:
+    use_pytriton = False
+
+
+# pylint: disable=line-too-long
+class TensorRTLLM(ITritonDeployable):
+    """
+    Exports nemo and huggingface checkpoints to TensorRT-LLM and run fast inference.
+
+    Example:
+        from nemo.export.tensorrt_llm import TensorRTLLM
+
+        trt_llm_exporter = TensorRTLLM(model_dir="/path/for/model/files")
+        trt_llm_exporter.export(
+            nemo_checkpoint_path="/path/for/nemo/checkpoint",
+            model_type="llama",
+            tensor_parallelism_size=1,
+        )
+
+        output = trt_llm_exporter.forward(["Hi, how are you?", "I am good, thanks, how about you?"])
+        print("output: ", output)
+
+    """
+
+    def __init__(
+        self,
+        model_dir: str,
+        lora_ckpt_list: List[str] = None,
+        load_model: bool = True,
+        use_python_runtime: bool = True,
+        enable_chunked_context: bool = None,
+        max_tokens_in_paged_kv_cache: int = None,
+        multi_block_mode: bool = False,
+    ):
+        """
+        Args:
+            model_dir (str): directory where the TensorRT-LLM model files are kept.
+            lora_ckpt_list (List[str]): paths of the lora checkpoints.
+            load_model (bool): load TensorRT-LLM model if the engine files exist in the model_dir (done via self._load()).
+            use_python_runtime (bool): True selects the python runtime, False the c++ runtime.
+            multi_block_mode (bool): enable faster decoding in multihead attention. Required for long context. Only available when using c++ runtime
+        """
+
+        cpp_only_option_given = enable_chunked_context is not None or max_tokens_in_paged_kv_cache is not None
+        if use_python_runtime and cpp_only_option_given:
+            raise Exception(
+                "enable_chunked_context and max_tokens_in_paged_kv_cache options "
+                "work only with the TensorRT-LLM C++ runtime. Please set "
+                "use_python_runtime=False to use these options."
+            )
+
+        # options handed in by the caller
+        self.model_dir = model_dir
+        self.engine_dir = os.path.join(model_dir, TRTLLM_ENGINE_DIR)
+        self.lora_ckpt_list = lora_ckpt_list
+        self.use_python_runtime = use_python_runtime
+        self.multi_block_mode = multi_block_mode
+        self.max_tokens_in_paged_kv_cache = max_tokens_in_paged_kv_cache
+        self.enable_chunked_context = False if enable_chunked_context is None else enable_chunked_context
+
+        # nothing is loaded yet; prompt-table / task bookkeeping starts out empty
+        self.model = self.tokenizer = self.config = self.p_table = None
+        self.ptuning_tables = []
+        self.task_vtoken_counts = []
+        self.task_ids = {}
+        self.task_vocab_size = 0
+
+        if load_model:
+            self._load()
+
+    def export(
+        self,
+        nemo_checkpoint_path: str,
+        model_type: Optional[str] = None,
+        delete_existing_files: bool = True,
+        tensor_parallelism_size: int = 1,
+        pipeline_parallelism_size: int = 1,
+        gpus_per_node: Optional[int] = None,
+        max_input_len: int = 256,
+        max_output_len: Optional[int] = None,
+        max_batch_size: int = 8,
+        max_prompt_embedding_table_size: Optional[int] = None,
+        use_parallel_embedding: bool = False,
+        use_embedding_sharing: bool = False,
+        paged_kv_cache: bool = True,
+        remove_input_padding: bool = True,
+        paged_context_fmha: bool = False,
+        dtype: Optional[str] = None,
+        load_model: bool = True,
+        use_lora_plugin: str = None,
+        lora_target_modules: List[str] = None,
+        max_lora_rank: int = 64,
+        max_num_tokens: Optional[int] = None,
+        opt_num_tokens: Optional[int] = None,
+        max_seq_len: Optional[int] = 512,
+        multiple_profiles: bool = False,
+        gpt_attention_plugin: str = "auto",
+        gemm_plugin: str = "auto",
+        use_mcore_path: bool = True,
+        reduce_fusion: bool = True,
+        fp8_quantized: Optional[bool] = None,
+        fp8_kvcache: Optional[bool] = None,
+        gather_context_logits: Optional[bool] = False,
+        gather_generation_logits: Optional[bool] = False,
+        build_rank: Optional[int] = 0,
+    ):
+        """
+        Exports nemo checkpoints to TensorRT-LLM; the conversion and engine build run only on the export rank (see build_rank).
+        Raises ValueError when model_type or dtype can neither be given nor inferred, or when model_type is not supported.
+        Args:
+            nemo_checkpoint_path (str): path for the nemo checkpoint.
+            model_type (Optional[str]): type of the model (optional for NeMo 2.0 and quantized checkpoints).
+            delete_existing_files (bool): if True, deletes all the files in model_dir.
+            tensor_parallelism_size (int): tensor parallelism.
+            pipeline_parallelism_size (int): pipeline parallelism.
+            gpus_per_node (int): number of gpus per node. Defaults to tensor_parallelism_size when None.
+            max_input_len (int): max input length.
+            max_output_len (int): deprecated max output length; added to max_input_len to derive max_seq_len only when max_seq_len is None.
+            max_batch_size (int): max batch size. Values below 4 are raised to 4 (with a warning).
+            max_prompt_embedding_table_size (int): max prompt embedding size. None is treated as 0.
+            use_parallel_embedding (bool): whether to use parallel embedding feature of TRT-LLM or not
+            use_embedding_sharing (bool): forwarded to model_to_trtllm_ckpt; only read when use_mcore_path is False.
+            paged_kv_cache (bool): if True, uses kv cache feature of the TensorRT-LLM.
+            paged_context_fmha (bool): whether to use paged context fmha feature of TRT-LLM or not
+            remove_input_padding (bool): enables removing input padding or not.
+            dtype (Optional[str]): Floating point type for model weights (supports 'bfloat16', 'float16' or 'float32').
+                If None, try to autodetect the type from model config.
+            load_model (bool): load TensorRT-LLM model after the export.
+            use_lora_plugin (str): use dynamic lora or not.
+            lora_target_modules (List[str]): list of the target lora modules.
+            max_lora_rank (int): maximum lora rank.
+            max_num_tokens (int): forwarded unchanged to the engine build.
+            opt_num_tokens (int): forwarded unchanged to the engine build.
+            max_seq_len (int): the maximum sequence length of a single request. If None: max_input_len + max_output_len when the latter is given, else 512.
+            multiple_profiles: (bool): enables multiple profiles feature of TRT-LLM. Default = False
+            gpt_attention_plugin (str): enable the gpt attention plugin. Default = "auto"
+            gemm_plugin (str): enable the gemm plugin. Default = "auto"
+            use_mcore_path (bool) : Use the more recent mcore path for export
+            reduce_fusion (bool): enables fusing extra kernels after custom TRT-LLM allReduce. Within this method it is only forwarded for qnemo checkpoints.
+            fp8_quantized (Optional[bool]): enables exporting to FP8 TRT-LLM checkpoints. If not set, autodetects the type.
+            fp8_kvcache (Optional[bool]): enables FP8 KV-cache quantization. If not set, autodetects the type.
+            gather_context_logits (Optional[bool]): if True, enables gather_context_logits while building trtllm engine. Default: False. Only forwarded when use_mcore_path is False.
+            gather_generation_logits (Optional[bool]): if True, enables gather_generation_logits while building trtllm engine. Default: False. Only forwarded when use_mcore_path is False.
+            build_rank (Optional[int]): rank to export the model on. If None, builds on all ranks.
+        """
+        if not use_mcore_path:
+            warnings.warn(
+                "Exporting models using the local codebase with use_mcore_path=False is deprecated."
+                " Please install megatron-core and set use_mcore_path to True.",
+                stacklevel=2,
+            )
+
+        gpus_per_node = tensor_parallelism_size if gpus_per_node is None else gpus_per_node
+        prepare_directory_for_export(
+            self.model_dir, delete_existing_files=delete_existing_files, subdir=TRTLLM_ENGINE_DIR
+        )
+
+        if max_prompt_embedding_table_size is None:
+            max_prompt_embedding_table_size = 0
+
+        self.model = None
+
+        if max_output_len is not None:
+            warnings.warn(
+                "Parameter max_output_len is deprecated and will be removed.", DeprecationWarning, stacklevel=2
+            )
+            max_output_len = max_output_len if max_output_len is not None else 256  # NOTE(review): no-op, max_output_len is never None inside this branch
+
+            if max_seq_len is None:
+                max_seq_len = max_input_len + max_output_len
+            else:
+                warnings.warn(
+                    f"Parameter max_output_len will be overwritten by max_seq_len={max_seq_len}.",
+                    DeprecationWarning,
+                    stacklevel=2,
+                )
+
+        max_seq_len = max_seq_len if max_seq_len is not None else 512
+
+        if max_batch_size < 4:
+            warnings.warn(
+                "TensorRT LLM may hit a runtime issue with batch size is smaller than 4 on some models."
+                " Force set to 4",
+                stacklevel=2,
+            )
+            max_batch_size = 4
+
+        is_export_rank = is_rank(build_rank)
+
+        if is_export_rank:
+            tmp_dir = tempfile.TemporaryDirectory()  # removed by tmp_dir.cleanup() further down; an exception raised in between skips that call
+            nemo_export_dir = Path(tmp_dir.name)
+
+            if is_qnemo_checkpoint(nemo_checkpoint_path):
+                if os.path.isdir(nemo_checkpoint_path):
+                    nemo_export_dir = nemo_checkpoint_path
+                else:
+                    unpack_tarball(nemo_checkpoint_path, tmp_dir.name)
+                    nemo_checkpoint_path = tmp_dir.name
+
+                if os.path.exists(os.path.join(nemo_checkpoint_path, TOKENIZER_CONFIG_FILE)):
+                    # Instantiate tokenizer for a legacy "Nemo 1" quantized checkpoint from a tokenizer config.
+                    # Note that using the config is deprecated and it will be removed in future releases.
+                    LOGGER.warning("Detected legacy tokenizer_config.yaml, using it to build tokenizer.")
+                    self.tokenizer = get_nmt_tokenizer(nemo_checkpoint_path)
+                else:
+                    self.tokenizer = get_tokenizer(nemo_checkpoint_path)
+
+                model_config = None  # qnemo path: with no model config the _export_to_nim_format step below is skipped
+
+                qnemo_to_tensorrt_llm(
+                    nemo_checkpoint_path=nemo_checkpoint_path,
+                    engine_dir=self.engine_dir,
+                    max_input_len=max_input_len,
+                    max_seq_len=max_seq_len,
+                    max_batch_size=max_batch_size,
+                    max_prompt_embedding_table_size=max_prompt_embedding_table_size,
+                    tensor_parallel_size=tensor_parallelism_size,
+                    pipeline_parallel_size=pipeline_parallelism_size,
+                    use_parallel_embedding=use_parallel_embedding,
+                    paged_kv_cache=paged_kv_cache,
+                    paged_context_fmha=paged_context_fmha,
+                    remove_input_padding=remove_input_padding,
+                    use_lora_plugin=use_lora_plugin,
+                    lora_target_modules=lora_target_modules,
+                    max_lora_rank=max_lora_rank,
+                    max_num_tokens=max_num_tokens,
+                    opt_num_tokens=opt_num_tokens,
+                    multiple_profiles=multiple_profiles,
+                    reduce_fusion=reduce_fusion,
+                )
+            else:
+                if model_type is None:
+                    # For NeMo 2.0 models we can get model_type from the model class name
+                    model_type = get_model_type(nemo_checkpoint_path)
+
+                if model_type is None:
+                    raise ValueError(
+                        "Parameter model_type needs to be provided and cannot be inferred from the checkpoint. "
+                        "Please specify it explicitely."
+                    )
+
+                if model_type not in self.get_supported_models_list:
+                    raise ValueError(
+                        f"Model {model_type} is not currently a supported model type. "
+                        f"Supported model types are: {self.get_supported_models_list}."
+                    )
+
+                if dtype is None:
+                    dtype = get_weights_dtype(nemo_checkpoint_path)
+
+                if dtype is None:
+                    raise ValueError(
+                        "Parameter dtype needs to be provided and cannot be inferred from the checkpoint. "
+                        "Please specify it explicitely."
+                    )
+
+                model, model_config, self.tokenizer = load_nemo_model(
+                    nemo_checkpoint_path, nemo_export_dir, use_mcore_path
+                )
+                if use_mcore_path:
+                    from megatron.core.export.data_type import DataType
+                    from megatron.core.export.export_config import ExportConfig
+                    from megatron.core.export.model_type import ModelType
+                    from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import (
+                        DEFAULT_CONVERSION_DICT,
+                    )
+                    from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+                    from tensorrt_llm.layers import MoeConfig
+
+                    share_embeddings_and_output_weights = model_config.get(
+                        "share_embeddings_and_output_weights", False
+                    )
+                    fp8_quantized, fp8_kvcache = determine_quantization_settings(
+                        model_config, fp8_quantized, fp8_kvcache
+                    )
+
+                    # We build the transformer config using the nemo model config.
+                    transformer_config = self.get_transformer_config(model_config)
+                    input_model_type = getattr(ModelType, model_type)
+
+                    # MCore export supports some default conversion dictionaries
+                    mcore_model_conversion_dict = DEFAULT_CONVERSION_DICT
+
+                    # MCore conversion dict keys look like "decoder.layers.4...", while NeMo keys carry a "model." prefix (or "module." for NeMo 2.0), so both prefixed variants are generated.
+                    nemo_model_conversion_dict = {
+                        f'model.{key}': value for key, value in mcore_model_conversion_dict.items()
+                    } | {  # Mapping for NeMo 2.0
+                        f'module.{key}': value for key, value in mcore_model_conversion_dict.items()
+                    }
+
+                    # TODO: Workaround: Gemma uses gated activation, while mcore does not handle openai-gelu
+                    # as a gated function. Remove once !11614 is merged.
+                    activation = model_config.get('activation', "gelu")
+                    if activation == "openai-gelu" and input_model_type.name == 'gemma':
+                        activation = "geglu"
+
+                    trtllm_helper = TRTLLMHelper(
+                        transformer_config=transformer_config,
+                        model_type=input_model_type,
+                        trtllm_conversion_dict=nemo_model_conversion_dict,
+                        position_embedding_type=model_config.get('position_embedding_type'),
+                        max_position_embeddings=model_config.get('max_position_embeddings'),
+                        rotary_percentage=model_config.get('rotary_percentage', 1.0),
+                        rotary_base=model_config.get('rotary_base', 10000),
+                        moe_tp_mode=model_config.get('moe_tp_mode', 2),
+                        multi_query_mode=model_config.get("multi_query_mode", False),
+                        activation=activation,
+                        seq_len_interpolation_factor=model_config.get("seq_len_interpolation_factor"),
+                        moe_renorm_mode=model_config.get(
+                            'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
+                        ),
+                        share_embeddings_and_output_weights=share_embeddings_and_output_weights,
+                    )
+
+                    input_dtype = getattr(DataType, dtype)
+                    export_config = ExportConfig(
+                        tensor_parallelism_size,
+                        pipeline_parallelism_size,
+                        use_parallel_embedding,
+                        share_embeddings_and_output_weights,
+                    )
+
+                    trtllm_model_weights_list, trtllm_model_config_list = (
+                        trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+                            model_state_dict=model,
+                            export_config=export_config,
+                            dtype=input_dtype,
+                            state_dict_split_by_layer_numbers=False,
+                            fp8_quantized=fp8_quantized,
+                            fp8_kvcache=fp8_kvcache,
+                        )
+                    )
+
+                    for trtllm_model_weights, trtllm_model_config in zip(
+                        trtllm_model_weights_list, trtllm_model_config_list
+                    ):
+                        trtllm_helper.build_and_save_engine(
+                            max_input_len=max_input_len,
+                            max_output_len=max_output_len,
+                            max_batch_size=max_batch_size,
+                            engine_dir=self.engine_dir,
+                            trtllm_model_weights=trtllm_model_weights,
+                            trtllm_model_config=trtllm_model_config,
+                            lora_ckpt_list=self.lora_ckpt_list,
+                            use_lora_plugin=use_lora_plugin,
+                            max_lora_rank=max_lora_rank,
+                            lora_target_modules=lora_target_modules,
+                            max_prompt_embedding_table_size=max_prompt_embedding_table_size,
+                            paged_kv_cache=paged_kv_cache,
+                            remove_input_padding=remove_input_padding,
+                            paged_context_fmha=paged_context_fmha,
+                            use_refit=False,
+                            max_num_tokens=max_num_tokens,
+                            max_seq_len=max_seq_len,
+                            opt_num_tokens=opt_num_tokens,
+                            max_beam_width=1,
+                            tokens_per_block=128,
+                            multiple_profiles=multiple_profiles,
+                            gpt_attention_plugin=gpt_attention_plugin,
+                            gemm_plugin=gemm_plugin,
+                        )
+                else:
+                    if model_type == "gpt" or model_type == "starcoder":
+                        model_type = "gptnext"
+
+                    if model_type == "mixtral":
+                        model_type = "llama"
+
+                    trtllm_model_weights_list, trtllm_model_config_list = model_to_trtllm_ckpt(
+                        model=model,
+                        nemo_model_config=model_config,
+                        nemo_export_dir=nemo_export_dir,
+                        decoder_type=model_type,
+                        dtype=dtype,
+                        tensor_parallel_size=tensor_parallelism_size,
+                        pipeline_parallel_size=pipeline_parallelism_size,
+                        gpus_per_node=gpus_per_node,
+                        use_parallel_embedding=use_parallel_embedding,
+                        use_embedding_sharing=use_embedding_sharing,
+                        fp8_quantized=fp8_quantized,
+                        fp8_kvcache=fp8_kvcache,
+                    )
+
+                    for trtllm_model_weights, trtllm_model_config in zip(
+                        trtllm_model_weights_list, trtllm_model_config_list
+                    ):
+                        build_and_save_engine(
+                            max_input_len=max_input_len,
+                            max_output_len=max_output_len,
+                            max_batch_size=max_batch_size,
+                            model_config=trtllm_model_config,
+                            model_weights=trtllm_model_weights,
+                            model_dir=self.engine_dir,
+                            model_type=model_type,
+                            lora_ckpt_list=self.lora_ckpt_list,
+                            use_lora_plugin=use_lora_plugin,
+                            max_lora_rank=max_lora_rank,
+                            lora_target_modules=lora_target_modules,
+                            max_prompt_embedding_table_size=max_prompt_embedding_table_size,
+                            paged_kv_cache=paged_kv_cache,
+                            remove_input_padding=remove_input_padding,
+                            paged_context_fmha=paged_context_fmha,
+                            max_num_tokens=max_num_tokens,
+                            opt_num_tokens=opt_num_tokens,
+                            max_seq_len=max_seq_len,
+                            multiple_profiles=multiple_profiles,
+                            gpt_attention_plugin=gpt_attention_plugin,
+                            gemm_plugin=gemm_plugin,
+                            gather_context_logits=gather_context_logits,
+                            gather_generation_logits=gather_generation_logits,
+                        )
+
+            tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
+            tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context")
+            vocab_path = os.path.join(nemo_export_dir, "vocab.json")
+            if isinstance(self.tokenizer, PreTrainedTokenizerBase):
+                self.tokenizer.save_pretrained(self.model_dir)
+            elif os.path.exists(tokenizer_path):
+                shutil.copy(tokenizer_path, self.model_dir)
+            elif os.path.exists(tokenizer_path_nemo2):
+                # Copy HF tokenizer files to root model directory
+                for path in glob(os.path.join(tokenizer_path_nemo2, "nemo_tokenizer", "*.json")):
+                    shutil.copy(path, self.model_dir)
+                # Copy SentencePiece tokenizer.model
+                for path in glob(os.path.join(tokenizer_path_nemo2, "*.model")):
+                    shutil.copy(path, os.path.join(self.model_dir, "tokenizer.model"))
+            elif os.path.exists(vocab_path):
+                shutil.copy(vocab_path, os.path.join(self.model_dir, "vocab.json"))
+
+            nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml")
+            if os.path.exists(nemo_model_config):
+                shutil.copy(nemo_model_config, self.model_dir)
+
+            tmp_dir.cleanup()
+
+        if is_export_rank and model_config is not None:
+            self._export_to_nim_format(model_config, model_type)
+
+        if tensorrt_llm.mpi_world_size() > 1:
+            tensorrt_llm.mpi_barrier()
+
+        if is_export_rank and load_model:
+            self._load()
+
+    def export_hf_model(
+        self,
+        hf_model_path: str,
+        max_batch_size: int = 8,
+        tensor_parallelism_size: int = 1,
+        max_input_len: int = 256,
+        max_output_len: int = 256,
+        max_num_tokens: Optional[int] = None,
+        opt_num_tokens: Optional[int] = None,
+        dtype: Optional[str] = None,
+        max_seq_len: Optional[int] = 512,
+        gemm_plugin: str = "auto",
+        remove_input_padding: bool = True,
+        paged_context_fmha: bool = False,
+        paged_kv_cache: bool = True,
+        tokens_per_block: int = 128,
+        multiple_profiles: bool = False,
+        reduce_fusion: bool = False,
+        max_beam_width: int = 1,
+        use_refit: bool = False,
+        model_type: Optional[str] = None,
+        delete_existing_files: bool = True,
+    ):
+        """
+        Export a Hugging Face model checkpoint to TensorRT-LLM format.
+
+        One engine is built per tensor-parallel rank and saved to ``self.engine_dir``. The ``*.json`` and
+        ``*.model`` files found in ``hf_model_path`` are then copied to ``self.model_dir`` and the exported
+        model is loaded through ``self._load()``.
+
+        Args:
+            hf_model_path (str): Path to the Hugging Face model directory
+            max_batch_size (int, optional): Maximum batch size for inference. Values below 4 are
+                raised to 4. Defaults to 8.
+            tensor_parallelism_size (int, optional): Size of tensor parallelism. Defaults to 1.
+            max_input_len (int, optional): Maximum input sequence length. Defaults to 256.
+            max_output_len (int, optional): Maximum output sequence length. Defaults to 256.
+            max_num_tokens (int, optional): Maximum number of tokens. Defaults to None.
+            opt_num_tokens (int, optional): Optimal number of tokens. Defaults to None.
+            dtype (str, optional): Data type for model weights. If None, inferred from model config.
+            max_seq_len (int, optional): Maximum total sequence length. NOTE: the passed value is currently
+                ignored; it is overwritten with ``max_input_len + max_output_len``. Defaults to 512.
+            gemm_plugin (str, optional): GEMM plugin type. Defaults to "auto".
+            remove_input_padding (bool, optional): Whether to remove input padding. Defaults to True.
+            paged_context_fmha (bool, optional): Whether to use paged context FMHA. Defaults to False.
+            paged_kv_cache (bool, optional): Whether to use paged KV cache. Defaults to True.
+            tokens_per_block (int, optional): Number of tokens per block for paged KV cache. Defaults to 128.
+            multiple_profiles (bool, optional): Whether to use multiple TensorRT profiles. Defaults to False.
+            reduce_fusion (bool, optional): Whether to reduce operator fusion. Defaults to False.
+            max_beam_width (int, optional): Maximum beam width for beam search. Defaults to 1.
+            use_refit (bool, optional): Whether to use TensorRT refitting. Defaults to False.
+            model_type (str, optional): Type of the model architecture. Must be a key of
+                ``self.get_supported_hf_model_mapping``; the default of None is rejected.
+            delete_existing_files (bool, optional): Whether to delete existing files in export dir. Defaults to True.
+
+        Raises:
+            ValueError: If model_type is not supported or dtype cannot be determined
+        """
+        LOGGER.info("Starting HF export to TRT-LLM")
+        # Validate the architecture before touching the file system.
+        if model_type not in self.get_supported_hf_model_mapping:
+            raise ValueError(
+                f"Model {model_type} is not currently a supported model type. "
+                f"Supported model types are: {self.get_supported_hf_model_mapping.keys()}."
+            )
+
+        # Fall back to the dtype recorded in the checkpoint's config.json.
+        if dtype is None:
+            dtype = self.get_hf_model_dtype(hf_model_path)
+            if dtype is None:
+                raise ValueError("No dtype found in hf model config. Please specify a dtype.")
+
+        prepare_directory_for_export(
+            self.model_dir, delete_existing_files=delete_existing_files, subdir=TRTLLM_ENGINE_DIR
+        )
+
+        # Small batch sizes are bumped to 4 (see the message below for the rationale).
+        if max_batch_size < 4:
+            print("TensorRT-LLM may hit runtime issue with batch size is smaller than 4. Force set to 4")
+            max_batch_size = 4
+
+        # Translate the keyword arguments into a TensorRT-LLM plugin configuration.
+        plugin_config = PluginConfig()
+        plugin_config.gemm_plugin = gemm_plugin
+        if paged_kv_cache:
+            plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block)
+        else:
+            plugin_config.paged_kv_cache = False
+        plugin_config.remove_input_padding = remove_input_padding
+        plugin_config.use_paged_context_fmha = paged_context_fmha
+        plugin_config.multiple_profiles = multiple_profiles
+        plugin_config.reduce_fusion = reduce_fusion
+        # NOTE(review): this discards the caller-supplied max_seq_len argument — confirm this is intended.
+        max_seq_len = max_input_len + max_output_len
+        max_num_tokens, opt_num_tokens = check_max_num_tokens(
+            max_num_tokens=max_num_tokens,
+            opt_num_tokens=opt_num_tokens,
+            max_seq_len=max_seq_len,
+            max_batch_size=max_batch_size,
+            max_input_len=max_input_len,
+            max_beam_width=max_beam_width,
+            remove_input_padding=remove_input_padding,
+            enable_context_fmha=plugin_config.context_fmha,
+            tokens_per_block=tokens_per_block,
+            multiple_profiles=multiple_profiles,
+        )
+        build_dict = {
+            'max_input_len': max_input_len,
+            'max_output_len': max_output_len,
+            'max_batch_size': max_batch_size,
+            'max_beam_width': max_beam_width,
+            'max_seq_len': max_seq_len,
+            'max_num_tokens': max_num_tokens,
+            'opt_num_tokens': opt_num_tokens,
+            'strongly_typed': False,
+            'builder_opt': None,
+            'multiple_profiles': multiple_profiles,
+            'use_refit': use_refit,
+        }
+        build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
+        # Build and save one engine per rank; the mapping's world size equals the tensor-parallel size.
+        for rank in range(tensor_parallelism_size):
+            LOGGER.info(f"Iterating over rank:{rank}")
+            mapping = Mapping(world_size=tensor_parallelism_size, rank=rank, tp_size=tensor_parallelism_size)
+            trtllm_model_class = self.get_supported_hf_model_mapping[model_type]
+            model = trtllm_model_class.from_hugging_face(
+                hf_model_path,
+                dtype,
+                mapping=mapping,
+            )
+            engine = build_trtllm(model, build_config)
+            engine.save(self.engine_dir)
+        # Copy HF tokenizer files to root model directory
+        for path in glob(os.path.join(hf_model_path, "*.json")):
+            shutil.copy(path, self.model_dir)
+        # Copy sentencepiece model to model directory
+        for path in glob(os.path.join(hf_model_path, "*.model")):
+            shutil.copy(path, self.model_dir)
+        LOGGER.info(f"Generarated TRT-LLM checkpoint at dir:{self.model_dir}")
+        LOGGER.info(f"Loading the TRT-LLM checkpoint:{self.model_dir}")
+        self._load()
+
+    def get_hf_model_dtype(self, model_dir: str) -> Optional[str]:
+        """
+        Read the config file from a Hugging Face model directory and identify the model's data type.
+
+        The lookup order is ``torch_dtype``, ``dtype``, ``pretrained_config.dtype`` and finally the boolean
+        ``fp16`` / ``bf16`` flags. Entries whose value is ``null`` are skipped so that a later entry can
+        still provide the dtype.
+
+        Args:
+            model_dir (str): Path to the Hugging Face model directory
+
+        Returns:
+            Optional[str]: The model's data type if found in config, None otherwise
+
+        Raises:
+            FileNotFoundError: If ``config.json`` does not exist in ``model_dir``
+            ValueError: If ``config.json`` is not valid JSON
+            RuntimeError: If the file cannot be read or has an unexpected structure
+        """
+        config_path = Path(model_dir) / 'config.json'
+
+        if not config_path.exists():
+            raise FileNotFoundError(f"Config file not found at {config_path}")
+
+        try:
+            # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
+            with open(config_path, 'r', encoding='utf-8') as f:
+                config = json.load(f)
+
+            # Check for dtype in different possible locations in the config
+            for key in ('torch_dtype', 'dtype'):
+                if key in config and config[key] is not None:
+                    return config[key]
+            if 'pretrained_config' in config and 'dtype' in config['pretrained_config']:
+                nested_dtype = config['pretrained_config']['dtype']
+                if nested_dtype is not None:
+                    return nested_dtype
+
+            # If no explicit dtype found, check for other indicators
+            if 'fp16' in config and config['fp16']:
+                return 'float16'
+            if 'bf16' in config and config['bf16']:
+                return 'bfloat16'
+
+            return None
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in config file at {config_path}") from e
+        except Exception as e:
+            # Keep the historical RuntimeError contract, but preserve the original cause for debugging.
+            raise RuntimeError(f"Error reading config file: {str(e)}") from e
+
+    def _export_to_nim_format(self, model_config: Dict[str, Any], model_type: str):
+        """
+        Write the extra files that NIM expects next to the exported model.
+
+        Two things happen, in this order:
+
+        1. If ``nemo_context/artifacts/generation_config.json`` exists under the model directory, it is
+           copied to the root of the model directory.
+        2. A minimal, dummy Hugging Face ``config.json`` is written to the model directory, derived from
+           ``model_config`` and ``model_type``.
+
+        Args:
+            model_config (dict): A dictionary containing the model configuration parameters.
+            model_type (str): The type of the model (e.g., "llama").
+        """
+        root_dir = self.model_dir
+
+        source_generation_config = os.path.join(root_dir, "nemo_context", "artifacts", "generation_config.json")
+        if os.path.isfile(source_generation_config):
+            shutil.copy(source_generation_config, root_dir)
+
+        # RoPE scaling is only emitted when the NeMo config carries an interpolation factor.
+        interpolation_factor = model_config.get("seq_len_interpolation_factor")
+        if interpolation_factor is None:
+            rope_scaling = None
+        else:
+            rope_scaling = {
+                "factor": interpolation_factor,
+                "rope_type": "default",
+            }
+
+        # Fields "architectures" and "model_type" are required by HF but not relevant for NIM
+        dummy_hf_config = {
+            "max_position_embeddings": model_config.get("encoder_seq_length"),
+            "architectures": ["LLaMAForCausalLM"],
+            "rope_scaling": rope_scaling,
+            "model_type": model_type,
+        }
+
+        config_path = os.path.join(root_dir, "config.json")
+        with open(config_path, "w") as config_file:
+            json.dump(dummy_hf_config, config_file, indent=2)
+            config_file.write("\n")
+
+    def get_transformer_config(self, nemo_model_config):
+        """Build a Megatron-Core ``TransformerConfig`` from the values stored in a NeMo model config.
+
+        Args:
+            nemo_model_config: Mapping-like NeMo model config. ``num_attention_heads`` must be present
+                (it is read with item access); every other field is read with ``.get``.
+
+        Returns:
+            TransformerConfig: The config populated from ``nemo_model_config``.
+        """
+        from megatron.core.transformer.transformer_config import TransformerConfig
+
+        cfg = nemo_model_config
+
+        # NeMo normalization names -> TransformerConfig settings:
+        #   'layernorm1p' forces zero-centered gamma, 'rmsnorm' selects RMSNorm, anything else is LayerNorm.
+        nemo_normalization = cfg.get('normalization', 'layernorm')
+        zero_centered_gamma = cfg.get('layernorm_zero_centered_gamma', False)
+        mcore_normalization = 'LayerNorm'
+        if nemo_normalization == 'layernorm1p':
+            zero_centered_gamma = True
+        elif nemo_normalization == 'rmsnorm':
+            mcore_normalization = 'RMSNorm'
+
+        expert_count = cfg.get('num_moe_experts', 0)
+
+        # Keyword arguments are assembled in the same order in which they are handed to TransformerConfig.
+        config_kwargs = {
+            'num_layers': cfg.get('num_layers'),
+            'moe_router_topk': cfg.get('moe_router_topk', 0),
+            'num_attention_heads': cfg.get('num_attention_heads'),
+            'num_query_groups': cfg.get('num_query_groups', cfg['num_attention_heads']),
+            'kv_channels': cfg.get("kv_channels", None),
+            'hidden_size': cfg.get('hidden_size'),
+            'ffn_hidden_size': cfg.get('ffn_hidden_size'),
+            'layernorm_epsilon': cfg.get('layernorm_epsilon'),
+            'add_bias_linear': cfg.get('bias'),
+            # A non-positive expert count means "no MoE", which is passed on as None.
+            'num_moe_experts': expert_count if expert_count > 0 else None,
+            'normalization': mcore_normalization,
+            'layernorm_zero_centered_gamma': zero_centered_gamma,
+            'gated_linear_unit': cfg.get('gated_linear_unit', False),
+        }
+        return TransformerConfig(**config_kwargs)
+
+    def convert_to_safe_tensors(
+        self,
+        nemo_checkpoint_path: str,
+        model_type: Optional[str] = None,
+        delete_existing_files: bool = True,
+        tensor_parallelism_size: int = 1,
+        pipeline_parallelism_size: int = 1,
+        gpus_per_node: int = None,
+        use_parallel_embedding: bool = False,
+        use_embedding_sharing: bool = False,
+        dtype: str = "bfloat16",
+    ):
+        """
+        Convert a NeMo checkpoint into per-rank ``.safetensors`` weight files in ``self.model_dir``.
+
+        The conversion itself runs on MPI rank 0 only. Besides the ``rank{N}.safetensors`` files it writes
+        ``config.json`` (from the first converted model config), the tokenizer, and ``model_config.yaml``
+        when the latter is present in the unpacked checkpoint.
+
+        Args:
+            nemo_checkpoint_path (str): Path of the NeMo checkpoint to convert.
+            model_type (str, optional): Model type. "gpt" and "starcoder" are treated as "gptnext",
+                "mixtral" as "llama".
+            delete_existing_files (bool): If True, existing content of ``self.model_dir`` is removed first;
+                if False and the directory is not empty, an exception is raised.
+            tensor_parallelism_size (int): Tensor parallel size passed to the converter.
+            pipeline_parallelism_size (int): Pipeline parallel size passed to the converter.
+            gpus_per_node (int, optional): GPUs per node; defaults to ``tensor_parallelism_size`` when None.
+            use_parallel_embedding (bool): Forwarded to ``model_to_trtllm_ckpt``.
+            use_embedding_sharing (bool): Forwarded to ``model_to_trtllm_ckpt``.
+            dtype (str): Data type forwarded to ``model_to_trtllm_ckpt``.
+
+        Raises:
+            Exception: If ``self.model_dir`` is not empty and cannot be (or must not be) cleared.
+        """
+        gpus_per_node = tensor_parallelism_size if gpus_per_node is None else gpus_per_node
+
+        # Make sure the output directory exists and is empty before writing anything.
+        if Path(self.model_dir).exists():
+            if delete_existing_files and len(os.listdir(self.model_dir)) > 0:
+                for files in os.listdir(self.model_dir):
+                    path = os.path.join(self.model_dir, files)
+                    # rmtree handles directories; it raises OSError on plain files, which are removed instead.
+                    try:
+                        shutil.rmtree(path)
+                    except OSError:
+                        os.remove(path)
+
+                if len(os.listdir(self.model_dir)) > 0:
+                    raise Exception("Couldn't delete all files.")
+            elif len(os.listdir(self.model_dir)) > 0:
+                raise Exception("There are files in this folder. Try setting delete_existing_files=True.")
+        else:
+            Path(self.model_dir).mkdir(parents=True, exist_ok=True)
+
+        # Map model type aliases onto the decoder types understood by the converter.
+        if model_type == "gpt" or model_type == "starcoder":
+            model_type = "gptnext"
+
+        if model_type == "mixtral":
+            model_type = "llama"
+
+        # Only rank 0 converts; the other MPI ranks wait at the barrier at the end of this method.
+        if tensorrt_llm.mpi_rank() == 0:
+            # NOTE(review): the explicit tmp_dir.cleanup() below is skipped if anything in between raises.
+            tmp_dir = tempfile.TemporaryDirectory()
+            nemo_export_dir = Path(tmp_dir.name)
+
+            model, model_config, self.tokenizer = load_nemo_model(nemo_checkpoint_path, nemo_export_dir)
+            weights_dicts, model_configs = model_to_trtllm_ckpt(
+                model=model,
+                nemo_model_config=model_config,
+                nemo_export_dir=nemo_export_dir,
+                decoder_type=model_type,
+                dtype=dtype,
+                tensor_parallel_size=tensor_parallelism_size,
+                pipeline_parallel_size=pipeline_parallelism_size,
+                gpus_per_node=gpus_per_node,
+                use_parallel_embedding=use_parallel_embedding,
+                use_embedding_sharing=use_embedding_sharing,
+            )
+
+            # The loop variable rebinds ``model_config``; from here on it no longer refers to the NeMo config.
+            for weight_dict, model_config in zip(weights_dicts, model_configs):
+                # NOTE(review): the file name uses the tensor-parallel rank only. With
+                # pipeline_parallelism_size > 1 several configs presumably share a tp_rank and would
+                # overwrite each other's file — confirm.
+                rank = model_config.mapping.tp_rank
+                # safetensors.torch.save_file is given torch tensors, so numpy arrays are converted first.
+                for k, v in weight_dict.items():
+                    if isinstance(v, np.ndarray):
+                        weight_dict[k] = numpy_to_torch(v)
+                    else:
+                        weight_dict[k] = v
+
+                safetensors.torch.save_file(weight_dict, os.path.join(self.model_dir, f'rank{rank}.safetensors'))
+            model_configs[0].to_json_file(os.path.join(self.model_dir, 'config.json'))
+
+            # Prefer the tokenizer.model unpacked from the checkpoint; otherwise save the loaded tokenizer.
+            tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
+            if os.path.exists(tokenizer_path):
+                shutil.copy(tokenizer_path, self.model_dir)
+            else:
+                if self.tokenizer is not None:
+                    self.tokenizer.save_pretrained(self.model_dir)
+
+            nemo_model_config = os.path.join(nemo_export_dir, "model_config.yaml")
+            if os.path.exists(nemo_model_config):
+                shutil.copy(nemo_model_config, self.model_dir)
+
+            tmp_dir.cleanup()
+
+        if tensorrt_llm.mpi_world_size() > 1:
+            tensorrt_llm.mpi_barrier()
+
+    def gather_and_reshard_model(self, model_config, model, storage_dtype):
+        """
+        Accumulate all virtual-pipeline (vp) model chunks together and, if required, reshard the model
+        by gathering the layers of all pipeline-parallel (pp) ranks. Returns the final model state dict.
+
+        Args:
+            model_config: Model config; ``model_config["num_layers"]`` is read.
+            model: The model. When the virtual pipeline size is > 1 it is iterated as a sequence of model
+                chunks, otherwise ``model.state_dict()`` is called directly.
+            storage_dtype: torch dtype used to allocate receive buffers for broadcast tensors.
+
+        Returns:
+            dict: State dict with the model-level parameters plus the (possibly gathered and renumbered)
+            transformer layer parameters.
+
+        Raises:
+            NotImplementedError: If the requested inference parallelism differs from the training
+                parallelism in any way other than PP>1 -> PP=1 with unchanged TP.
+            ValueError: If a layer parameter name does not contain a "layers" component.
+        """
+
+        def _get_layer_index(split_key):
+            # Position of the layer number: the component right after "layers" in the dotted name.
+            for index, key in enumerate(split_key):
+                if key == "layers":
+                    return index + 1
+            raise ValueError(f"Unknown layer name format: {split_key}")
+
+        def rename_layer_num(param_name, layer_num):
+            # Return param_name with its layer number replaced by layer_num.
+            split_key = param_name.split(".")
+            layer_index = int(_get_layer_index(split_key))
+            split_key[layer_index] = str(layer_num)
+            return ".".join(split_key)
+
+        def get_layer_num(param_name):
+            # Extract the integer layer number from a dotted parameter name.
+            split_key = param_name.split(".")
+            layer_index = int(_get_layer_index(split_key))
+            return int(split_key[layer_index])
+
+        from megatron.core import parallel_state
+
+        # Parallel layout the model currently runs with.
+        tp_size = parallel_state.get_tensor_model_parallel_world_size()
+        pp_rank = parallel_state.get_pipeline_model_parallel_rank()
+        pp_first_rank = parallel_state.get_pipeline_model_parallel_first_rank()
+        pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank()
+        pp_size = parallel_state.get_pipeline_model_parallel_world_size()
+        pp_group = parallel_state.get_pipeline_model_parallel_group()
+        vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
+        if not vp_size:
+            vp_size = 1
+
+        # Parallel layout requested for inference (set on self by build()).
+        inference_tp_size = self.tp_size
+        inference_pp_size = self.pp_size
+        reshard_model = False
+        if inference_tp_size != tp_size or inference_pp_size != pp_size:
+            LOGGER.info("Training/Generation model parallelism resharding enabled")
+            if inference_pp_size == 1 and pp_size > 1 and inference_tp_size == tp_size:
+                reshard_model = True
+            else:
+                raise NotImplementedError(
+                    "NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
+                )
+
+        num_layers = model_config["num_layers"]
+        layers_per_pp = num_layers // pp_size
+        layers_per_chunk = layers_per_pp // vp_size
+
+        # tl_params: transformer-layer parameters; model_level_params: everything else.
+        tl_params = {}
+        model_level_params = {}
+        if vp_size > 1:  # consolidate params across model chunks
+            for idx, model_chunk in enumerate(model):
+                for key, val in model_chunk.state_dict().items():
+                    # TODO: currently fp8 is not supported
+                    if torch.is_tensor(val) and '_extra_state' not in key:
+                        if 'layers' in key:
+                            # Shift the chunk-local layer number by the chunk's offset.
+                            key2 = rename_layer_num(key, get_layer_num(key) + idx * pp_size * layers_per_chunk)
+                            tl_params[key2] = val
+                        else:
+                            model_level_params[key] = val
+        else:
+            for key, val in model.state_dict().items():
+                # TODO: currently fp8 is not supported
+                if torch.is_tensor(val) and '_extra_state' not in key:
+                    if 'decoder.layers' in key:
+                        tl_params[key] = val
+                    else:
+                        model_level_params[key] = val
+
+        if vp_size > 1 or reshard_model:
+            # gather layers across pp ranks
+            gathered_params = {}
+            for key, val in tl_params.items():
+                # One receive buffer per pp rank; weight_list[idx] receives the tensor of pp rank idx.
+                weight_list = [torch.zeros_like(val) for _ in range(pp_size)]
+                torch.distributed.all_gather(weight_list, val, group=pp_group)
+                for idx in range(pp_size):
+                    layer_num = get_layer_num(key) + idx * layers_per_chunk
+                    key2 = rename_layer_num(key, layer_num)
+                    if not reshard_model:  # Save only layers of 1 single PP stage
+                        layers_start = layers_per_pp * pp_rank
+                        layers_end = layers_per_pp * (pp_rank + 1) - 1
+                        if layer_num >= layers_start and layer_num <= layers_end:
+                            # Renumber to a stage-local index.
+                            key2 = rename_layer_num(key, layer_num % layers_per_pp)
+                            gathered_params[key2] = weight_list[idx]
+                    else:
+                        gathered_params[key2] = weight_list[idx]
+            tl_params = gathered_params
+
+        # model_state_dict aliases model_level_params; the layer parameters are merged into it.
+        model_state_dict = model_level_params
+        model_state_dict.update(tl_params)
+
+        def get_tensor_if_available(key, pp_src_idx, group):
+            # Broadcast the tensor stored under `key` from rank pp_src_idx. The shape is broadcast
+            # first so that receiving ranks can allocate a buffer; returns None when the source rank
+            # does not hold the tensor.
+            tensor = model_state_dict.get(key)
+            if tensor is not None:
+                tensor_shape = [tensor.shape]
+            else:
+                tensor_shape = [None]
+
+            torch.distributed.broadcast_object_list(tensor_shape, pp_src_idx, group=group)
+
+            if tensor_shape[0] is None:
+                return None
+            if torch.distributed.get_rank() != pp_src_idx:
+                tensor = torch.empty(tensor_shape[0], dtype=storage_dtype).cuda()
+
+            # NOTE: this uses the enclosing pp_group rather than the `group` argument; every call below
+            # passes pp_group, so the two are the same here.
+            torch.distributed.broadcast(tensor.contiguous(), pp_src_idx, group=pp_group)
+            return tensor
+
+        if reshard_model:
+            # Share the model-level tensors that are requested from the first / last pp rank.
+            key = 'decoder.final_layernorm.weight'
+            tensor = get_tensor_if_available(key, pp_last_rank, pp_group)
+            if tensor is not None:
+                model_state_dict[key] = tensor
+
+            key = 'decoder.final_layernorm.bias'
+            tensor = get_tensor_if_available(key, pp_last_rank, pp_group)
+            if tensor is not None:
+                model_state_dict[key] = tensor
+
+            key = 'embedding.word_embeddings.weight'
+            tensor = get_tensor_if_available(key, pp_first_rank, pp_group)
+            if tensor is not None:
+                model_state_dict[key] = tensor
+
+            key = 'output_layer.weight'
+            tensor = get_tensor_if_available(key, pp_last_rank, pp_group)
+            if tensor is not None:
+                model_state_dict[key] = tensor
+
+        return model_state_dict
+
+    def get_input_dtype(self, storage_dtype):
+        """
+        Translate a torch dtype into the matching MCore export ``DataType``.
+
+        Only bfloat16, float32 and float16 are mapped; for any other value the method returns None.
+        """
+        from megatron.core.export.data_type import DataType
+
+        # torch and DataType use the same member names for the supported precisions.
+        for dtype_name in ('bfloat16', 'float32', 'float16'):
+            if storage_dtype == getattr(torch, dtype_name):
+                return getattr(DataType, dtype_name)
+        return None
+
+    @staticmethod
+    def get_nemo_to_trtllm_conversion_dict(model_state_dict):
+        """Return MCore's default TRT-LLM conversion dict with keys adapted to this model's state dict.
+
+        The keys of MCore's default conversion dict start at "decoder.layers...", while NeMo state dict
+        keys sometimes carry a leading prefix such as "model.". The prefix reported by ``get_layer_prefix``
+        (if any) is prepended to every key; the values are left untouched.
+        """
+        from megatron.core.export.trtllm.model_to_trllm_mapping.default_conversion_dict import DEFAULT_CONVERSION_DICT
+
+        prefix, _ = get_layer_prefix(layer_names=model_state_dict.keys(), is_mcore=True)
+
+        if not prefix:
+            # No prefix detected: hand back a fresh copy with the keys unchanged.
+            return {name: target for name, target in DEFAULT_CONVERSION_DICT.items()}
+        return {f'{prefix}{name}': target for name, target in DEFAULT_CONVERSION_DICT.items()}
+
+    def build(
+        self,
+        model,
+        model_config,
+        model_type,
+        gpus_per_node,
+        tokenizer,
+        max_input_len: int = 1024,
+        max_output_len: int = 1024,
+        max_batch_size: int = 4,
+        use_refit: bool = True,
+        reshard_model: bool = False,
+        use_mcore_path: bool = True,
+    ):
+        """
+        Convert a model parallel nemo model to TensorRT-LLM.
+
+        Must be called from every rank of an initialized ``torch.distributed`` job whose ranks match the
+        TensorRT-LLM MPI ranks. Each rank builds and saves an engine, writes its engine config to
+        ``config_<rank>.json`` in the model directory and then loads the engine via ``load_distributed``.
+
+        Args:
+            model: The in-memory NeMo model (or model chunks) to convert.
+            model_config: NeMo model config; read via ``.precision`` and ``.get(...)``.
+            model_type: Model type. On the MCore path it must name a member of MCore's ``ModelType``;
+                otherwise it is passed on as the decoder type.
+            gpus_per_node: Number of GPUs per node.
+            tokenizer: Tokenizer, passed through ``build_tokenizer`` and stored on ``self.tokenizer``.
+            max_input_len (int): Maximum input length. The engine's max sequence length is
+                ``max_input_len + max_output_len``.
+            max_output_len (int): Maximum output length.
+            max_batch_size (int): Maximum batch size.
+            use_refit (bool): Whether the engine is built with refit support.
+            reshard_model (bool): Whether to reshard to pipeline parallel size 1 for inference.
+            use_mcore_path (bool): Use the Megatron-Core export helper (True) or the legacy NeMo converter.
+        """
+        assert tensorrt_llm.mpi_rank() == torch.distributed.get_rank()
+        self.use_refit, self.model_type, self.gpus_per_node = use_refit, model_type, gpus_per_node
+        self.mp_rank, self.dp_rank, self.tp_size, self.pp_size, self.dp_size = init_model_parallel_from_nemo(
+            reshard_model
+        )
+        self.tokenizer = build_tokenizer(tokenizer)
+
+        # With data parallelism each dp rank gets its own sub-directory.
+        if self.dp_size > 1:
+            self.model_dir = os.path.join(self.model_dir, f"dp_rank{self.dp_rank}")
+
+        if use_mcore_path:
+            from megatron.core.export.model_type import ModelType
+            from megatron.core.export.trtllm.trtllm_helper import TRTLLMHelper
+            from tensorrt_llm.layers import MoeConfig
+
+            storage_dtype = torch_dtype_from_precision(model_config.precision)
+            model_state_dict = self.gather_and_reshard_model(model_config, model, storage_dtype)
+            # We build the transformer config using the nemo model config.
+            transformer_config = self.get_transformer_config(model_config)
+            input_model_type = getattr(ModelType, model_type)
+
+            nemo_model_conversion_dict = self.get_nemo_to_trtllm_conversion_dict(model_state_dict)
+            # The helper is kept on self because refit() reuses its weights converter.
+            self.trtllm_helper = TRTLLMHelper(
+                transformer_config=transformer_config,
+                model_type=input_model_type,
+                trtllm_conversion_dict=nemo_model_conversion_dict,
+                position_embedding_type=model_config.get('position_embedding_type'),
+                max_position_embeddings=model_config.get('max_position_embeddings'),
+                rotary_percentage=model_config.get('rotary_percentage', 1.0),
+                rotary_base=model_config.get('rotary_base', 10000),
+                moe_tp_mode=model_config.get('moe_tp_mode', 2),
+                multi_query_mode=model_config.get("multi_query_mode", False),
+                activation=model_config.get('activation', "gelu"),
+                seq_len_interpolation_factor=model_config.get("seq_len_interpolation_factor"),
+                moe_renorm_mode=model_config.get(
+                    'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
+                ),
+                share_embeddings_and_output_weights=model_config.get("share_embeddings_and_output_weights", False),
+            )
+
+            input_dtype = self.get_input_dtype(storage_dtype)
+
+            trtllm_model_weights_list, trtllm_model_config_list = (
+                self.trtllm_helper.get_trtllm_pretrained_config_and_model_weights(
+                    model_state_dict=model_state_dict,
+                    dtype=input_dtype,
+                    state_dict_split_by_layer_numbers=True,
+                    on_device_distributed_conversion=True,
+                    vocab_size=self.tokenizer.vocab_size,
+                    gpus_per_node=gpus_per_node,
+                )
+            )
+            # Only the first entry of each returned list is used by this rank.
+            trtllm_model_config = trtllm_model_config_list[0]
+            trtllm_model_weights = trtllm_model_weights_list[0]
+
+            if reshard_model:
+                assert self.pp_size == 1, 'Reshard is true, but pp size is not one'
+                # MCORE Export will use parallel_state to determine pp .
+                # Since we reshard to pp = 1, we need to modify the config and mapping
+                world_size = self.tp_size * self.pp_size
+                trtllm_model_config.pp_size = self.pp_size
+                trtllm_model_config.world_size = world_size
+                trtllm_model_config.mapping = tensorrt_llm.Mapping(
+                    world_size=world_size,
+                    rank=self.mp_rank,
+                    tp_size=self.tp_size,
+                    pp_size=self.pp_size,
+                )
+
+            engine = self.trtllm_helper.build_and_save_engine(
+                max_input_len=max_input_len,
+                max_output_len=max_output_len,
+                max_seq_len=max_input_len + max_output_len,
+                max_batch_size=max_batch_size,
+                trtllm_model_config=trtllm_model_config,
+                trtllm_model_weights=trtllm_model_weights,
+                engine_dir=self.model_dir,
+                use_refit=use_refit,
+            )
+        else:
+            # Legacy path: convert with the NeMo converter. Note that `model_config` is rebound to the
+            # converter's returned configs here.
+            weights, model_config = model_to_trtllm_ckpt(
+                model=model,
+                nemo_model_config=model_config,
+                nemo_export_dir=self.model_dir,
+                decoder_type=model_type,
+                tensor_parallel_size=self.tp_size,
+                pipeline_parallel_size=self.pp_size,
+                gpus_per_node=gpus_per_node,
+                use_parallel_embedding=True,
+                use_distributed_convert=True,
+                model_parallel_rank=self.mp_rank,
+                vocab_size=self.tokenizer.vocab_size,
+            )
+
+            engine = build_and_save_engine(
+                max_input_len=max_input_len,
+                max_output_len=max_output_len,
+                max_seq_len=max_input_len + max_output_len,
+                max_batch_size=max_batch_size,
+                model_config=model_config[0],
+                model_weights=weights[0],
+                model_dir=self.model_dir,
+                model_type=model_type,
+                use_refit=use_refit,
+            )
+
+        # Wait until every rank has finished building before configs are written and engines loaded.
+        torch.distributed.barrier()
+
+        cfg_path = Path(os.path.join(self.model_dir, f'config_{torch.distributed.get_rank()}.json'))
+        with open(cfg_path, "w", encoding="utf-8") as f:
+            json.dump(engine.config.to_dict(), f, indent=4)
+
+        load_distributed(self.model_dir, self.mp_rank, gpus_per_node)
+
+    def refit(self, model, model_config, use_mcore_path=True):
+        """
+        Refits a TensorRT engine using an instantiated nemo model.
+        This function should only be used after calling build()
+
+        Args:
+            model: The in-memory NeMo model providing the new weights.
+            model_config: NeMo model config; ``model_config.precision`` is read on the MCore path.
+            use_mcore_path (bool): Convert the weights with the MCore helper created by ``build()`` (True)
+                or with the legacy ``dist_model_to_trt_llm_ckpt`` converter (False). The MCore path relies
+                on ``self.trtllm_helper``, which ``build()`` only sets when it also ran with
+                ``use_mcore_path=True``.
+        """
+        weights_dict = None
+        if use_mcore_path:
+            storage_dtype = torch_dtype_from_precision(model_config.precision)
+
+            model_state_dict = self.gather_and_reshard_model(model_config, model, storage_dtype)
+
+            nemo_model_conversion_dict = self.get_nemo_to_trtllm_conversion_dict(model_state_dict)
+            # convert() stores its result on the converter; it is read back right after the call.
+            self.trtllm_helper.weights_converter.convert(
+                model_state_dict=model_state_dict,
+                tokenizer_vocab_size=self.tokenizer.vocab_size,
+                trtllm_conversion_dict=nemo_model_conversion_dict,
+            )
+            weights_dict = self.trtllm_helper.weights_converter.trtllm_model_weights
+
+        else:
+            weights_dict = dist_model_to_trt_llm_ckpt(
+                model=model,
+                nemo_model_config=model_config,
+                inference_tp_size=self.tp_size,
+                inference_pp_size=self.pp_size,
+                tokenizer_vocab_size=self.tokenizer.vocab_size,
+            )
+        # Reload the engine, free Python and CUDA cached memory, then apply the new weights.
+        load_distributed(self.model_dir, self.mp_rank, self.gpus_per_node)
+        gc.collect()
+        torch.cuda.empty_cache()
+        refit(weights_dict)
+
+    def forward(
+        self,
+        input_texts: List[str],
+        max_output_len: int = 64,
+        top_k: int = 1,
+        top_p: float = 0.0,
+        temperature: float = 1.0,
+        stop_words_list: List[str] = None,
+        bad_words_list: List[str] = None,
+        no_repeat_ngram_size: int = None,
+        task_ids: List[str] = None,
+        lora_uids: List[str] = None,
+        prompt_embeddings_table=None,
+        prompt_embeddings_checkpoint_path: str = None,
+        streaming: bool = False,
+        output_log_probs: bool = False,
+        output_context_logits: bool = False,
+        output_generation_logits: bool = False,
+        **sampling_kwargs,
+    ):
+        """
+        Generates text for the given prompts with the loaded TensorRT-LLM model.
+
+        Args:
+            input_texts (List(str)): list of sentences.
+            max_output_len (int): max generated tokens.
+            top_k (int): limits us to a certain number (K) of the top tokens to consider.
+            top_p (float): limits us to the top tokens within a certain probability mass (p).
+            temperature (float): A parameter of the softmax function, which is the last layer in the network.
+            stop_words_list (List(str)): list of stop words.
+            bad_words_list (List(str)): list of bad words.
+            no_repeat_ngram_size (int): no repeat ngram size.
+            task_ids (List(str)): list of the task ids for the prompt tables.
+            lora_uids (List(str)): lora uids, forwarded unchanged to the generate functions.
+            prompt_embeddings_table (List(float)): prompt embeddings table.
+            prompt_embeddings_checkpoint_path (str): path for the nemo checkpoint for the prompt embedding table.
+            streaming (bool): if True generate_streaming is called instead of generate.
+            output_log_probs, output_context_logits (bool): forwarded to generate; not used when streaming.
+            output_generation_logits (bool): if True returns generation_logits in the output of generate method.
+            sampling_kwargs: Additional kwargs to set in the SamplingConfig.
+        """
+
+        if self.model is None:
+            raise Exception(
+                "A nemo checkpoint should be exported to TensorRT-LLM and "
+                "then it should be loaded first to run inference."
+            )
+        else:
+            if prompt_embeddings_table is not None or prompt_embeddings_checkpoint_path is not None:
+                prompt_table = self._get_prompt_embedding_table(
+                    prompt_embeddings_table, prompt_embeddings_checkpoint_path
+                )
+                tv_size = prompt_table.size(dim=0)
+                task_vtoken_counts = [tv_size]
+            elif len(self.ptuning_tables) > 0:
+                prompt_table = self.p_table
+                tv_size = self.task_vocab_size
+                task_vtoken_counts = self.task_vtoken_counts
+            else:
+                prompt_table = None
+                tv_size = None
+                task_vtoken_counts = None
+
+            if task_ids is None:
+                assert prompt_table is None, "There is a prompt embedding table and task_ids cannot be None"
+                input_task_ids = None
+            else:
+                if prompt_table is None:
+                    input_task_ids = None
+                else:
+                    if len(task_ids) > 1:
+                        assert len(task_ids) == len(input_texts), (
+                            "Either len of the task_ids has to be 1 or it needs to match with len of input_texts."
+                        )
+
+                    if len(task_ids) == 1:
+                        assert task_ids[0] in self.task_ids.keys(), "Task: {0} doesn't exist in the task list.".format(
+                            task_ids[0]
+                        )
+                        input_task_ids = [self.task_ids[task_ids[0]] for i in range(len(input_texts))]
+                    else:
+                        input_task_ids = []
+                        for i in range(len(input_texts)):
+                            assert (
+                                task_ids[i] in self.task_ids.keys()
+                            ), "Task: {0} doesn't exist in the task list.".format(task_ids[i])
+                            input_task_ids.append(self.task_ids[task_ids[i]])
+            if not streaming:
+                multiprocessed_env = torch.distributed.is_initialized() or tensorrt_llm.mpi_world_size() > 1
+
+                return generate(
+                    input_texts=input_texts,
+                    max_output_len=max_output_len,
+                    host_context=self.model,
+                    top_k=top_k,
+                    top_p=top_p,
+                    temperature=temperature,
+                    prompt_table=prompt_table,
+                    task_vocab_size=tv_size,
+                    task_vtoken_counts=task_vtoken_counts,
+                    task_ids=input_task_ids,
+                    lora_uids=lora_uids,
+                    stop_words_list=stop_words_list,
+                    bad_words_list=bad_words_list,
+                    no_repeat_ngram_size=no_repeat_ngram_size,
+                    output_log_probs=output_log_probs,
+                    multiprocessed_env=multiprocessed_env,
+                    output_context_logits=output_context_logits,
+                    output_generation_logits=output_generation_logits,
+                    **sampling_kwargs,
+                )
+            else:
+                return generate_streaming(
+                    input_texts=input_texts,
+                    max_output_len=max_output_len,
+                    host_context=self.model,
+                    top_k=top_k,
+                    top_p=top_p,
+                    temperature=temperature,
+                    prompt_table=prompt_table,
+                    task_vocab_size=tv_size,
+                    task_vtoken_counts=task_vtoken_counts,
+                    task_ids=input_task_ids,
+                    lora_uids=lora_uids,
+                    stop_words_list=stop_words_list,
+                    bad_words_list=bad_words_list,
+                    no_repeat_ngram_size=no_repeat_ngram_size,
+                    **sampling_kwargs,
+                )
+
+    def add_prompt_table(self, task_name: str, prompt_embeddings_checkpoint_path: str):
+        """Register the prompt table of a nemo checkpoint under task_name and persist the list of tables."""
+        if self.model is None:
+            raise Exception(
+                "A nemo checkpoint should be exported to TensorRT-LLM and "
+                "then it should be loaded first to run inference."
+            )
+
+        # Task names have to be unique among the registered tables.
+        if any(entry["task_name"] == task_name for entry in self.ptuning_tables):
+            raise Exception("Task name: {0} has already added. Please pass a unique task name.".format(task_name))
+
+        table = self._get_prompt_embedding_table(prompt_embeddings_checkpoint_path=prompt_embeddings_checkpoint_path)
+        self.ptuning_tables.append({"table": table, "task_name": task_name})
+
+        # Save the whole list into model_dir before the derived lookup state is rebuilt.
+        tables_file = os.path.join(self.model_dir, 'prompt_tables.pkl')
+        with open(tables_file, 'wb') as f:
+            pickle.dump(self.ptuning_tables, f)
+
+        self._prep_ptuning_table()
+
+    def remove_prompt_table(self, task_name: str):
+        """Remove the prompt table registered as task_name, persist the list and rebuild the derived state."""
+        if self.ptuning_tables is not None:
+            for i, entry in enumerate(self.ptuning_tables):
+                if entry["task_name"] == task_name:
+                    self.ptuning_tables.pop(i)
+                    with open(os.path.join(self.model_dir, 'prompt_tables.pkl'), 'wb') as f:
+                        pickle.dump(self.ptuning_tables, f)
+                    break  # leave the loop (not the method) so the derived state is rebuilt below
+            self._prep_ptuning_table()
+
+    def _pad_logits(self, logits_tensor):
+        """
+        Zero-pads, in place, every tensor of the list to the largest first-dimension length and returns the list.
+        """
+        target_rows = max(entry.shape[0] for entry in logits_tensor)
+        for idx, entry in enumerate(logits_tensor):
+            missing_rows = target_rows - entry.shape[0]
+            if missing_rows <= 0:
+                continue
+            # missing_rows rows of zeros are added at the bottom
+            logits_tensor[idx] = F.pad(entry, (0, 0, 0, missing_rows), mode='constant', value=0)
+        return logits_tensor
+
+    @property
+    def get_supported_models_list(self):
+        """Return the list of supported model type names."""
+        # gpt and gptnext are the same. Keeping the gptnext due to backward compatibility.
+        return ["gpt", "gptnext", "llama", "falcon", "starcoder", "mixtral", "gemma"]
+
+    @property
+    def get_supported_hf_model_mapping(self):
+        """Return the mapping from HF model architecture names to the model classes used for them."""
+        # Several architecture names share one model class; the dict is rebuilt on every access.
+        return {
+            'GPT2LMHeadModel': GPTForCausalLM,
+            'GPT2LMHeadCustomModel': GPTForCausalLM,
+            'GPTBigCodeForCausalLM': GPTForCausalLM,
+            'Starcoder2ForCausalLM': GPTForCausalLM,
+            'JAISLMHeadModel': GPTForCausalLM,
+            'GPTForCausalLM': GPTForCausalLM,
+            'NemotronForCausalLM': GPTForCausalLM,
+            'OPTForCausalLM': OPTForCausalLM,
+            'BloomForCausalLM': BloomForCausalLM,
+            'RWForCausalLM': FalconForCausalLM,
+            'FalconForCausalLM': FalconForCausalLM,
+            'PhiForCausalLM': PhiForCausalLM,
+            'Phi3ForCausalLM': Phi3ForCausalLM,
+            'Phi3VForCausalLM': Phi3ForCausalLM,
+            'Phi3SmallForCausalLM': Phi3ForCausalLM,
+            'PhiMoEForCausalLM': Phi3ForCausalLM,
+            'MambaForCausalLM': MambaForCausalLM,
+            'GPTNeoXForCausalLM': GPTNeoXForCausalLM,
+            'GPTJForCausalLM': GPTJForCausalLM,
+            'MptForCausalLM': MPTForCausalLM,
+            'MPTForCausalLM': MPTForCausalLM,
+            'GLMModel': ChatGLMForCausalLM,
+            'ChatGLMModel': ChatGLMForCausalLM,
+            'ChatGLMForCausalLM': ChatGLMForCausalLM,
+            'ChatGLMForConditionalGeneration': ChatGLMForCausalLM,
+            'LlamaForCausalLM': LLaMAForCausalLM,
+            'LlavaLlamaModel': LLaMAForCausalLM,
+            'ExaoneForCausalLM': LLaMAForCausalLM,
+            'MistralForCausalLM': LLaMAForCausalLM,
+            'MixtralForCausalLM': LLaMAForCausalLM,
+            'ArcticForCausalLM': LLaMAForCausalLM,
+            'Grok1ModelForCausalLM': GrokForCausalLM,
+            'InternLMForCausalLM': LLaMAForCausalLM,
+            'InternLM2ForCausalLM': LLaMAForCausalLM,
+            'InternLMXComposer2ForCausalLM': LLaMAForCausalLM,
+            'GraniteForCausalLM': LLaMAForCausalLM,
+            'GraniteMoeForCausalLM': LLaMAForCausalLM,
+            'MedusaForCausalLM': MedusaForCausalLm,
+            'MedusaLlamaForCausalLM': MedusaForCausalLm,
+            'ReDrafterForCausalLM': ReDrafterForCausalLM,
+            'BaichuanForCausalLM': BaichuanForCausalLM,
+            'BaiChuanForCausalLM': BaichuanForCausalLM,
+            'SkyworkForCausalLM': LLaMAForCausalLM,
+            'GEMMA': GemmaForCausalLM,
+            'GEMMA2': GemmaForCausalLM,
+            'QWenLMHeadModel': QWenForCausalLM,
+            'QWenForCausalLM': QWenForCausalLM,
+            'Qwen2ForCausalLM': QWenForCausalLM,
+            'Qwen2MoeForCausalLM': QWenForCausalLM,
+            'Qwen2ForSequenceClassification': QWenForCausalLM,
+            'Qwen2VLForConditionalGeneration': QWenForCausalLM,
+            'Qwen2VLModel': QWenForCausalLM,
+            'WhisperEncoder': WhisperEncoder,
+            'EncoderModel': EncoderModel,
+            'DecoderModel': DecoderModel,
+            'DbrxForCausalLM': DbrxForCausalLM,
+            'RecurrentGemmaForCausalLM': RecurrentGemmaForCausalLM,
+            'CogVLMForCausalLM': CogVLMForCausalLM,
+            'DiT': DiT,
+            'DeepseekForCausalLM': DeepseekForCausalLM,
+            'DeciLMForCausalLM': DeciLMForCausalLM,
+            'DeepseekV2ForCausalLM': DeepseekV2ForCausalLM,
+            'EagleForCausalLM': EagleForCausalLM,
+            'CohereForCausalLM': CohereForCausalLM,
+            'MLLaMAModel': MLLaMAForCausalLM,
+            'MllamaForConditionalGeneration': MLLaMAForCausalLM,
+            'BertForQuestionAnswering': BertForQuestionAnswering,
+            'BertForSequenceClassification': BertForSequenceClassification,
+            'BertModel': BertModel,
+            'RobertaModel': RobertaModel,
+            'RobertaForQuestionAnswering': RobertaForQuestionAnswering,
+            'RobertaForSequenceClassification': RobertaForSequenceClassification,
+        }
+
+    @property
+    def get_hidden_size(self):
+        """Hidden size stored in the pretrained_config section of self.config, or None without a config."""
+        # Guard first: self.config is None as long as no config has been assigned.
+        if self.config is not None:
+            return self.config["pretrained_config"]["hidden_size"]
+        return None
+
+    @property
+    def get_triton_input(self):
+        """Get triton input: the tuple of Tensor specs, one per named input of the triton infer functions."""
+        # Only the prompts and the two logits switches are declared as non-optional.
+        return (
+            Tensor(name="prompts", shape=(-1,), dtype=bytes),
+            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="random_seed", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="stop_words_list", shape=(-1,), dtype=bytes, optional=True),
+            Tensor(name="bad_words_list", shape=(-1,), dtype=bytes, optional=True),
+            Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True),
+            Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
+            Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False),
+            Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False),
+        )
+
+    @property
+    def get_triton_output(self):
+        """Get triton output: the tuple of Tensor specs for the texts and the two kinds of logits."""
+        return (
+            Tensor(name="outputs", shape=(-1,), dtype=bytes),
+            Tensor(name="generation_logits", shape=(-1,), dtype=np.single),
+            Tensor(name="context_logits", shape=(-1,), dtype=np.single),
+        )
+
+    @batch
+    @first_value(
+        "max_output_len",
+        "top_k",
+        "top_p",
+        "temperature",
+        "random_seed",
+        "no_repeat_ngram_size",
+        "output_generation_logits",
+        "output_context_logits",
+    )
+    def triton_infer_fn(self, **inputs: np.ndarray):
+        """Triton infer function for non-streaming requests; returns a dict with the texts and requested logits."""
+        output_dict = {}
+        context_logits_available = False
+        generation_logits_available = False
+        prompts = str_ndarray2list(inputs.pop("prompts"))
+        infer_input = {"input_texts": prompts}
+        try:
+            if "max_output_len" in inputs:
+                infer_input["max_output_len"] = inputs.pop("max_output_len")
+            if "top_k" in inputs:
+                infer_input["top_k"] = inputs.pop("top_k")
+            if "top_p" in inputs:
+                infer_input["top_p"] = inputs.pop("top_p")
+            if "temperature" in inputs:
+                infer_input["temperature"] = inputs.pop("temperature")
+            if "random_seed" in inputs:
+                infer_input["random_seed"] = inputs.pop("random_seed")
+            if "stop_words_list" in inputs:
+                stop_words_list = str_ndarray2list(inputs.pop("stop_words_list"))
+                infer_input["stop_words_list"] = [[stop_word] for stop_word in stop_words_list]
+            if "bad_words_list" in inputs:
+                bad_words_list = str_ndarray2list(inputs.pop("bad_words_list"))
+                infer_input["bad_words_list"] = [[bad_word] for bad_word in bad_words_list]
+            if "no_repeat_ngram_size" in inputs:
+                infer_input["no_repeat_ngram_size"] = inputs.pop("no_repeat_ngram_size")
+            if "task_id" in inputs:
+                task_id = np.char.decode(inputs.pop("task_id").astype("bytes"), encoding="utf-8")
+                infer_input["task_ids"] = task_id[0]
+            if "lora_uids" in inputs:
+                lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8")
+                infer_input["lora_uids"] = lora_uids[0].tolist()
+            if "output_generation_logits" in inputs:
+                generation_logits_available = inputs["output_generation_logits"]
+                infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")
+            if "output_context_logits" in inputs:
+                context_logits_available = inputs["output_context_logits"]
+                infer_input["output_context_logits"] = inputs.pop("output_context_logits")
+
+            if generation_logits_available:
+                # generation_logits is a 4d torch tensor of dim [BS,1,#generated_tokens,vocab_size]
+                output_texts, generation_logits = self.forward(**infer_input)
+                # Convert generation_logits to a numpy array. NOTE(review): since generation_logits is returned
+                # as one torch tensor, the number of tokens presumably does not vary across sequences (TRTLLM
+                # likely takes care of the padding), hence no additional padding is applied here - confirm.
+                output_dict["generation_logits"] = np.array(
+                    [generation_logit.cpu().numpy() for generation_logit in generation_logits]
+                )
+
+            elif context_logits_available:  # only reached when generation logits were not requested
+                output_texts, context_logits = self.forward(**infer_input)
+                # context_logits is a list of tensors shaped [#tokens, vocab_size] and the len of the list is BS.
+                # In case of batched inputs (i.e. multiple prompts sent as a list) the returned context_logits can
+                # have different seq_len; they are padded because the numpy conversion can fail otherwise.
+                context_logits = self._pad_logits(context_logits)
+                # Convert context_logits to a numpy array of shape [BS, 1, padding_len, vocab_size].
+                context_logits = np.array([logit_tensor.unsqueeze(0).cpu().numpy() for logit_tensor in context_logits])
+                output_dict["context_logits"] = context_logits
+            else:
+                output_texts = self.forward(**infer_input)
+            output_dict["outputs"] = cast_output(output_texts, np.bytes_)
+        except Exception as error:  # any failure is reported as the output text of every prompt
+            err_msg = "An error occurred: {0}".format(str(error))
+            output_dict["outputs"] = cast_output([err_msg] * len(prompts), np.bytes_)
+
+        return output_dict
+
+    @batch
+    @first_value("max_output_len", "top_k", "top_p", "temperature", "random_seed", "no_repeat_ngram_size")
+    def triton_infer_fn_streaming(self, **inputs: np.ndarray):
+        """Triton infer function for streaming: a generator that yields the outputs produced so far per step."""
+        try:
+            infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
+            if "max_output_len" in inputs:
+                infer_input["max_output_len"] = inputs.pop("max_output_len")
+            if "top_k" in inputs:
+                infer_input["top_k"] = inputs.pop("top_k")
+            if "top_p" in inputs:
+                infer_input["top_p"] = inputs.pop("top_p")
+            if "temperature" in inputs:
+                infer_input["temperature"] = inputs.pop("temperature")
+            if "random_seed" in inputs:
+                infer_input["random_seed"] = inputs.pop("random_seed")
+            if "stop_words_list" in inputs:
+                stop_words_list = str_ndarray2list(inputs.pop("stop_words_list"))
+                infer_input["stop_words_list"] = [[stop_word] for stop_word in stop_words_list]
+            if "bad_words_list" in inputs:
+                bad_words_list = str_ndarray2list(inputs.pop("bad_words_list"))
+                infer_input["bad_words_list"] = [[bad_word] for bad_word in bad_words_list]
+            if "no_repeat_ngram_size" in inputs:
+                infer_input["no_repeat_ngram_size"] = inputs.pop("no_repeat_ngram_size")
+            if "task_id" in inputs:
+                task_id = np.char.decode(inputs.pop("task_id").astype("bytes"), encoding="utf-8")
+                infer_input["task_ids"] = task_id[0]
+            if "lora_uids" in inputs:
+                lora_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8")
+                infer_input["lora_uids"] = lora_uids[0].tolist()
+
+            partial_outputs = self.forward(**infer_input, streaming=True)
+            # On each request to this generator, run the model for one step and return a dict
+            # with full outputs generated until this step.
+            for output_texts in partial_outputs:
+                yield {"outputs": cast_output(output_texts, np.bytes_)}
+        except Exception as error:
+            # A generator has to yield the error message: a value given to `return` is never iterated.
+            err_msg = "An error occurred: {0}".format(str(error))
+            yield {"outputs": cast_output([err_msg], np.bytes_)}
+
+    def _prep_ptuning_table(self):
+        """Rebuild task ids, per-task vtoken counts and the combined prompt table from self.ptuning_tables."""
+        # Row count of the largest registered table; 0 when no table is registered.
+        self.task_vocab_size = max((entry["table"].size(dim=0) for entry in self.ptuning_tables), default=0)
+
+        # Zero-pad every table to task_vocab_size rows and remember the original task vtoken counts.
+        padded_tables = []
+        self.task_vtoken_counts = []
+        self.task_ids = {}
+        for task_index, entry in enumerate(self.ptuning_tables):
+            source_table = entry["table"]
+            row_count = source_table.size(dim=0)
+            padded = torch.zeros((self.task_vocab_size, self.get_hidden_size), dtype=source_table.dtype)
+            padded[:row_count, :] = source_table
+            padded_tables.append(padded)
+            # A task id is the position of the task in the list of tables.
+            self.task_ids[entry["task_name"]] = task_index
+            self.task_vtoken_counts.append(row_count)
+
+        if not padded_tables:
+            self.p_table = None
+            return
+
+        # Stack the padded tables and flatten them into one two dimensional table.
+        self.p_table = torch.stack(padded_tables, dim=0).view(-1, self.get_hidden_size)
+
+        max_prompt_embedding_table_size = self.config['build_config']['max_prompt_embedding_table_size']
+        actual_prompt_table_size = self.p_table.shape[0]
+
+        if actual_prompt_table_size > max_prompt_embedding_table_size:
+            raise Exception(
+                f"The size of the combined prompt embedding table ({actual_prompt_table_size}) is greater than max_prompt_embedding_table_size ({max_prompt_embedding_table_size})."
+            )
+
+    def _load_prompt_tables(self):  # restores self.ptuning_tables from <model_dir>/prompt_tables.pkl
+        if self.model_dir is not None:
+            pt_path = Path(os.path.join(self.model_dir, 'prompt_tables.pkl'))
+            if pt_path.exists():
+                with open(pt_path, 'rb') as f:
+                    self.ptuning_tables = pickle.load(f)  # NOTE(review): unpickling is only safe for trusted files
+                self._prep_ptuning_table()  # derive the lookup state from the list that was just loaded
+            else:
+                self.ptuning_tables = []  # no saved tables in model_dir
+
+    def _get_prompt_embedding_table_ckpt(self, prompt_embeddings_checkpoint_path):
+        """
+        Read the prompt embedding table from the model weights of a nemo checkpoint archive.
+
+        Returns the table as a detached CPU tensor.
+        """
+        # Keys under which the table may be stored; they are probed in this order.
+        embedding_key = "model.embedding.adapter_layer.ptuning_adapter.inference_table"
+        language_model_key = (
+            "model.language_model.adapter_layer.ptuning_adapter.inference_table.prompt_table.taskname.prompt_embeddings.weight"
+        )
+        nested_key = "prompt_table.taskname.prompt_embeddings.weight"
+
+        with TarPath(prompt_embeddings_checkpoint_path) as checkpoint_archive:
+            mw_path = checkpoint_archive / "model_weights.ckpt"
+            if not mw_path.exists():
+                mw_path = checkpoint_archive / "mp_rank_00/model_weights.ckpt"
+                if not mw_path.exists():
+                    raise FileNotFoundError(
+                        "File: {0} could not be found in the nemo checkpoint. "
+                        "Please check the nemo checkpoint format for the prompt "
+                        "embedding table.".format(mw_path)
+                    )
+
+            with mw_path.open('rb') as mw_file:
+                weights = torch.load(mw_file)
+
+            if embedding_key in weights:
+                table = weights[embedding_key]
+            elif language_model_key in weights:
+                table = weights[language_model_key]
+            elif 'prompt_table' in weights and nested_key in weights['prompt_table']:
+                table = weights['prompt_table'][nested_key]
+            else:
+                raise Exception(
+                    "Could not find the embedding table in the {0}. Please check the nemo file format".format(
+                        prompt_embeddings_checkpoint_path
+                    )
+                )
+
+            return table.cpu().detach()
+
+    def _get_prompt_embedding_table(
+        self,
+        prompt_embeddings_table=None,
+        prompt_embeddings_checkpoint_path=None,
+    ):
+        """
+        Build the prompt embedding table tensor from a numpy array or, without one, from a nemo checkpoint.
+
+        Returns the table cast to the dtype named in the engine config and moved to the GPU, or None when
+        neither source is given. Raises TypeError/Exception for unsupported input or a hidden size mismatch.
+        """
+        if prompt_embeddings_table is None and prompt_embeddings_checkpoint_path is None:
+            # Nothing to load: a plain None keeps the return type the same on every path.
+            return None
+
+        if prompt_embeddings_table is not None:
+            if prompt_embeddings_checkpoint_path is not None:
+                LOGGER.warning(
+                    "prompt_embeddings_table will be used and "
+                    "prompt_embeddings_checkpoint_path will be "
+                    "ignored for ptuning."
+                )
+            if not isinstance(prompt_embeddings_table, np.ndarray):
+                raise TypeError("Only numpy array is allowed for the prompt embeddings table.")
+            if len(prompt_embeddings_table.shape) != 2:
+                raise Exception("A two dimensional prompt embeddings table for a single task is only supported.")
+            prompt_embeddings_table = torch.from_numpy(prompt_embeddings_table)
+        else:
+            if not is_nemo_tarfile(prompt_embeddings_checkpoint_path):
+                raise TypeError(prompt_embeddings_checkpoint_path + " is not a nemo file.")
+            prompt_embeddings_table = self._get_prompt_embedding_table_ckpt(prompt_embeddings_checkpoint_path)
+
+        dtype = self.config['pretrained_config']['dtype']
+        prompt_embeddings_table = prompt_embeddings_table.to(
+            dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype)
+        ).cuda()
+
+        if prompt_embeddings_table.size(dim=1) != self.config["pretrained_config"]["hidden_size"]:
+            raise Exception(
+                "Hidden dimension of the model is {0} and does not match with the dimension of the prompt table.".format(
+                    self.config["pretrained_config"]["hidden_size"]
+                )
+            )
+
+        return prompt_embeddings_table
+
+    def _load_config_file(self):
+        """Read <engine_dir>/config.json into self.config; raise FileNotFoundError when the file is missing."""
+        config_path = Path(self.engine_dir) / 'config.json'
+        if not config_path.exists():
+            raise FileNotFoundError(f"File: {config_path} could not be found.")
+        with open(config_path, 'r') as f:
+            self.config = json.load(f)
+
+    def _load(self):
+        """Reset the state, then load config, tokenizer, engine and prompt tables when model_dir is not empty."""
+        self.model = None
+        self.tokenizer = None
+        self.config = None
+        self.ptuning_tables = []
+        # Nothing to load from a missing or empty model directory.
+        if not Path(self.model_dir).exists() or not os.listdir(self.model_dir):
+            return
+
+        try:
+            self._load_config_file()
+            self.tokenizer = get_tokenizer(self.model_dir)
+            self.model = load(
+                tokenizer=self.tokenizer,
+                engine_dir=self.engine_dir,
+                lora_ckpt_list=self.lora_ckpt_list,
+                use_python_runtime=self.use_python_runtime,
+                enable_chunked_context=self.enable_chunked_context,
+                max_tokens_in_paged_kv_cache=self.max_tokens_in_paged_kv_cache,
+                multi_block_mode=self.multi_block_mode,
+            )
+            self._load_prompt_tables()
+        except Exception as error:
+            raise RuntimeError(
+                "Files in the TensorRT-LLM folder are corrupted and the model needs to be exported again."
+            ) from error
+
+    def unload_engine(self):
+        """Unload engine by delegating to the module level unload_engine() function."""
+        unload_engine()
diff --git a/nemo/export/tensorrt_mm_exporter.py b/nemo/export/tensorrt_mm_exporter.py
new file mode 100644
index 000000000000..7eeb0ca2721f
--- /dev/null
+++ b/nemo/export/tensorrt_mm_exporter.py
@@ -0,0 +1,367 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import shutil
+import tempfile
+from pathlib import Path
+from typing import List
+
+import numpy as np
+import wrapt
+from tensorrt_llm.runtime import MultimodalModelRunner as TRTLLMRunner
+
+from nemo.deploy import ITritonDeployable
+from nemo.export.multimodal.build import (
+    build_mllama_engine,
+    build_perception_engine,
+    build_trtllm_engine,
+    build_visual_engine,
+    extract_lora_ckpt,
+)
+from nemo.export.multimodal.run import MultimodalModelRunner, SpeechllmModelRunner
+from nemo.export.tarutils import unpack_tarball
+
+use_deploy = True
+try:
+    from nemo.deploy.utils import cast_output, ndarray2img, str_ndarray2list
+except Exception:
+    use_deploy = False
+
+
+@wrapt.decorator
+def noop_decorator(func, instance, args, kwargs):
+    """Pass-through decorator: invoke the wrapped callable with its original arguments.
+
+    ``wrapt.decorator`` calls the wrapper as ``(wrapped, instance, args, kwargs)``, so all four
+    parameters must be accepted here even though ``instance`` is unused.
+    """
+    return func(*args, **kwargs)
+
+
+use_pytriton = True
+batch, first_value = noop_decorator, (lambda *keys, **opts: noop_decorator)  # fallbacks if pytriton is absent
+try:
+    from pytriton.decorators import batch, first_value
+    from pytriton.model_config import Tensor
+except Exception:
+    use_pytriton = False
+
+
+LOGGER = logging.getLogger("NeMo")
+
+
+class TensorRTMMExporter(ITritonDeployable):
+    """
+    Exports NeMo multimodal checkpoints to TensorRT and runs fast inference with them.
+
+    Example:
+        from nemo.export import TensorRTMMExporter
+
+        exporter = TensorRTMMExporter(model_dir="/path/for/model/files")
+        exporter.export(
+            visual_checkpoint_path="/path/for/nemo/checkpoint",
+            model_type="neva",
+            tensor_parallel_size=1,
+        )
+
+        output = exporter.forward("Hi! What is in this image?", "/path/for/input_media")
+        print("output: ", output)
+
+    """
+
+    def __init__(
+        self,
+        model_dir: str,
+        load_model: bool = True,
+        modality: str = "vision",
+    ):
+        self.model_dir = model_dir
+        self.runner = None
+        # "vision" covers both image and video inputs; "audio" is the only other accepted value
+        assert modality in ["vision", "audio"]
+        self.modality = modality
+
+        if load_model:
+            self._load()
+
+    def export(
+        self,
+        visual_checkpoint_path: str,
+        llm_checkpoint_path: str = None,
+        model_type: str = "neva",
+        llm_model_type: str = "llama",
+        tensor_parallel_size: int = 1,
+        max_input_len: int = 4096,
+        max_output_len: int = 256,
+        max_batch_size: int = 1,
+        vision_max_batch_size: int = 1,
+        max_multimodal_len: int = 3072,
+        dtype: str = "bfloat16",
+        delete_existing_files: bool = True,
+        load_model: bool = True,
+        use_lora_plugin: str = None,
+        lora_target_modules: List[str] = None,
+        lora_checkpoint_path: str = None,
+        max_lora_rank: int = 64,
+    ):
+        """Build the TRT-LLM engines for a multimodal checkpoint in ``self.model_dir``; optionally load them."""
+        if Path(self.model_dir).exists():
+            if delete_existing_files and len(os.listdir(self.model_dir)) > 0:
+                for files in os.listdir(self.model_dir):
+                    path = os.path.join(self.model_dir, files)
+                    try:
+                        shutil.rmtree(path)
+                    except OSError:
+                        os.remove(path)
+
+                if len(os.listdir(self.model_dir)) > 0:
+                    raise Exception("Couldn't delete all files.")
+            elif len(os.listdir(self.model_dir)) > 0:
+                raise Exception("There are files in this folder. Try setting delete_existing_files=True.")
+        else:
+            Path(self.model_dir).mkdir(parents=True, exist_ok=True)
+
+        if model_type == "mllama":
+            build_mllama_engine(
+                model_dir=self.model_dir,
+                checkpoint_path=visual_checkpoint_path,
+                tensor_parallelism_size=tensor_parallel_size,
+                max_input_len=max_input_len,
+                max_output_len=max_output_len,
+                max_batch_size=max_batch_size,
+                vision_max_batch_size=vision_max_batch_size,
+                max_multimodal_len=max_multimodal_len,
+                dtype=dtype,
+            )
+        else:
+            # Scratch space for LoRA extraction. It is released in ``finally`` so a failing
+            # build step does not leave the temporary directory behind.
+            tmp_dir = llm_lora_path = lora_dir = None
+            try:
+                if lora_checkpoint_path is not None:
+                    tmp_dir = tempfile.TemporaryDirectory()
+                    if os.path.isdir(lora_checkpoint_path):
+                        lora_dir = lora_checkpoint_path
+                    else:
+                        lora_dir = os.path.join(tmp_dir.name, "unpacked_lora")
+                        unpack_tarball(lora_checkpoint_path, lora_dir)
+
+                    llm_lora_path = [extract_lora_ckpt(lora_dir, tmp_dir.name)]
+
+                llm_dir = os.path.join(self.model_dir, "llm_engine")
+                build_trtllm_engine(
+                    model_dir=llm_dir,
+                    visual_checkpoint_path=visual_checkpoint_path,
+                    llm_checkpoint_path=llm_checkpoint_path,
+                    model_type=model_type,
+                    llm_model_type=llm_model_type,
+                    tensor_parallelism_size=tensor_parallel_size,
+                    max_input_len=max_input_len,
+                    max_output_len=max_output_len,
+                    max_batch_size=max_batch_size,
+                    max_multimodal_len=max_multimodal_len,
+                    dtype=dtype,
+                    use_lora_plugin=use_lora_plugin,
+                    lora_target_modules=lora_target_modules,
+                    max_lora_rank=max_lora_rank,
+                    lora_ckpt_list=llm_lora_path,
+                )
+
+                if model_type == "salm":
+                    perception_dir = os.path.join(self.model_dir, "perception_engine")
+                    build_perception_engine(perception_dir, visual_checkpoint_path, model_type, vision_max_batch_size)
+                else:
+                    visual_dir = os.path.join(self.model_dir, "visual_engine")
+                    build_visual_engine(
+                        visual_dir,
+                        visual_checkpoint_path if lora_dir is None else lora_dir,
+                        model_type,
+                        vision_max_batch_size,
+                    )
+            finally:
+                if tmp_dir is not None:
+                    tmp_dir.cleanup()
+
+        if load_model:
+            self._load()
+
+    def forward(
+        self,
+        input_text: str,
+        input_media: str,
+        batch_size: int = 1,
+        max_output_len: int = 30,
+        top_k: int = 1,
+        top_p: float = 0.0,
+        temperature: float = 1.0,
+        repetition_penalty: float = 1.0,
+        num_beams: int = 1,
+        lora_uids: List[str] = None,
+    ):
+        """Generate text for ``input_text`` and ``input_media`` with the loaded runner; raises if none is loaded."""
+        if self.runner is None:
+            raise Exception(
+                "A nemo checkpoint should be exported and " "then it should be loaded first to run inference."
+            )
+
+        if isinstance(self.runner, TRTLLMRunner):
+            self.runner.args.image_path = input_media
+            self.runner.args.batch_size = batch_size
+            self.runner.args.top_k = top_k
+            self.runner.args.top_p = top_p
+            self.runner.args.temperature = temperature
+            self.runner.args.repetition_penalty = repetition_penalty
+            self.runner.args.num_beams = num_beams
+            raw_image = self.runner.load_test_data(input_media)
+            return self.runner.run(
+                input_text,
+                raw_image,
+                max_output_len,
+            )[1]
+        else:
+            input_media = self.runner.load_test_media(input_media)
+            return self.runner.run(
+                input_text,
+                input_media,
+                max_output_len,
+                batch_size,
+                top_k,
+                top_p,
+                temperature,
+                repetition_penalty,
+                num_beams,
+                lora_uids,
+            )
+
+    def get_input_media_tensors(self):
+        """Return the input ``Tensor`` specs for the media input of the configured modality."""
+        if self.modality == "vision":
+            return [Tensor(name="input_media", shape=(-1, -1, -1, 3), dtype=np.uint8)]
+        elif self.modality == "audio":
+            return [
+                Tensor(name="input_signal", shape=(-1,), dtype=np.single),
+                Tensor(name="input_signal_length", shape=(1,), dtype=np.intc),
+            ]
+        return []
+
+    @property
+    def get_triton_input(self):
+        """Input specs as a tuple: ``input_text``, the modality's media tensors, then the
+        optional generation and LoRA inputs.
+        """
+        text_spec = [Tensor(name="input_text", shape=(-1,), dtype=bytes)]
+        media_specs = self.get_input_media_tensors()
+        optional_specs = [
+            Tensor(name="batch_size", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="repetition_penalty", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="num_beams", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
+        ]
+        return tuple(text_spec + media_specs + optional_specs)
+
+    @property
+    def get_triton_output(self):
+        """Output specs: a one-element tuple holding the ``outputs`` bytes tensor of shape ``(-1,)``."""
+        return (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
+
+    @batch
+    @first_value("batch_size", "max_output_len", "top_k", "top_p", "temperature", "repetition_penalty", "num_beams")
+    def triton_infer_fn(self, **inputs: np.ndarray):
+        """Translate the request tensors into runner arguments, run generation and return ``outputs``.
+
+        Exceptions are caught and returned to the caller as the output text rather than raised.
+        """
+        # Request tensor name -> keyword used for the runner; only keys present in the request are forwarded.
+        sampling_keys = (
+            ("batch_size", "batch_size"),
+            ("max_output_len", "max_new_tokens"),
+            ("top_k", "top_k"),
+            ("top_p", "top_p"),
+            ("temperature", "temperature"),
+            ("repetition_penalty", "repetition_penalty"),
+            ("num_beams", "num_beams"),
+        )
+        try:
+            if self.runner is None:
+                raise Exception(
+                    "A nemo checkpoint should be exported and then it should be loaded first to run inference."
+                )
+
+            infer_input = {"input_text": str_ndarray2list(inputs.pop("input_text")[0])}
+            model_type = self.runner.model_type
+            if model_type in ["neva", "vila", "mllama"]:
+                infer_input["input_image"] = ndarray2img(inputs.pop("input_media")[0])[0]
+            elif model_type in ["video-neva", "lita", "vita"]:
+                infer_input["input_image"] = inputs.pop("input_media")[0]
+            elif model_type == "salm":
+                infer_input["input_signal"] = inputs.pop("input_signal")
+                infer_input["input_signal_length"] = inputs.pop("input_signal_length")[:, 0]
+            for request_key, runner_key in sampling_keys:
+                if request_key in inputs:
+                    infer_input[runner_key] = inputs.pop(request_key)
+            if "lora_uids" in inputs:
+                decoded_uids = np.char.decode(inputs.pop("lora_uids").astype("bytes"), encoding="utf-8")
+                infer_input["lora_uids"] = decoded_uids[0].tolist()
+
+            if isinstance(self.runner, TRTLLMRunner):
+                runner_args = self.runner.args
+                for name in ("batch_size", "top_k", "top_p", "temperature", "repetition_penalty", "num_beams"):
+                    setattr(runner_args, name, infer_input.pop(name))
+                output_texts = self.runner.run(**infer_input)[1]
+            else:
+                output_texts = self.runner.run(**infer_input)
+            output = cast_output(output_texts, np.bytes_)
+        except Exception as error:
+            err_msg = "An error occurred: {0}".format(str(error))
+            output = cast_output([err_msg], np.bytes_)
+
+        return {"outputs": output}
+
+    def _load(self):
+        """Create ``self.runner`` for the exported engines; a missing ``llm_engine`` folder leaves it untouched."""
+        llm_dir = os.path.join(self.model_dir, "llm_engine")
+        if not os.path.exists(llm_dir):
+            return
+        if self.modality == "audio":
+            perception_dir = os.path.join(self.model_dir, "perception_engine")
+            self.runner = SpeechllmModelRunner(perception_dir, llm_dir, self.modality)
+        elif self.modality == "vision":
+            import json
+            from types import SimpleNamespace
+
+            visual_dir = os.path.join(self.model_dir, "visual_engine")
+            with open(os.path.join(visual_dir, "config.json"), "r") as cfg_file:
+                builder_cfg = json.load(cfg_file)["builder_config"]
+            if builder_cfg["model_type"] != "mllama":
+                self.runner = MultimodalModelRunner(visual_dir, llm_dir, self.modality)
+                return
+            runner_args = SimpleNamespace(
+                visual_engine_dir=visual_dir,
+                visual_engine_name="visual_encoder.engine",
+                llm_engine_dir=llm_dir,
+                hf_model_dir='meta-llama/Llama-3.2-11B-Vision-Instruct',
+                use_py_session=True,
+                cross_kv_cache_fraction=0.5,
+                enable_context_fmha_fp32_acc=None,
+                enable_chunked_context=False,
+                kv_cache_free_gpu_memory_fraction=0.9,
+                multi_block_mode=True,
+            )
+            self.runner = TRTLLMRunner(runner_args)
diff --git a/nemo/export/tiktoken_tokenizer.py b/nemo/export/tiktoken_tokenizer.py
new file mode 100644
index 000000000000..d599620256fa
--- /dev/null
+++ b/nemo/export/tiktoken_tokenizer.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import json
+from pathlib import Path
+from typing import Dict, Optional
+
+import numpy as np
+import tiktoken
+import torch
+
+PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
+SPECIAL_TOKENS = ["<unk>", "<s>", "</s>"]
+SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"
+
+
+def reload_mergeable_ranks(
+    path: str,
+    max_vocab: Optional[int] = None,
+) -> Dict[bytes, int]:
+    """
+    Reload the tokenizer JSON file and convert it to Tiktoken format.
+    """
+    assert path.endswith(".json")
+
+    # reload vocab
+    with open(path, "r", encoding='utf-8') as f:
+        vocab = json.load(f)
+    assert isinstance(vocab, list)
+    print(f"Vocab size: {len(vocab)}")
+    if max_vocab is not None:
+        vocab = vocab[:max_vocab]
+        print(f"Cutting vocab to first {len(vocab)} tokens.")
+
+    # build ranks
+    ranks: Dict[bytes, int] = {}
+    for i, x in enumerate(vocab):
+        assert x.keys() == {"rank", "token_bytes", "token_str"}
+        assert x["rank"] == i
+        merge = base64.b64decode(x["token_bytes"])
+        assert i >= 256 or merge == bytes([i])
+        ranks[merge] = x["rank"]
+
+    # sanity check
+    assert len(ranks) == len(vocab)
+    assert set(ranks.values()) == set(range(len(ranks)))
+
+    return ranks
+
+
+class TiktokenTokenizer:
+    def __init__(self, vocab_file: str):
+        """Build the tiktoken encoding from the JSON vocabulary at ``vocab_file``."""
+        self.num_special_tokens = 1000
+        vocab_size = DEFAULT_TIKTOKEN_MAX_VOCAB
+        pattern = PATTERN_TIKTOKEN
+        special_tokens = SPECIAL_TOKENS.copy()
+        inner_vocab_size = vocab_size - self.num_special_tokens
+
+        token2id = reload_mergeable_ranks(vocab_file, max_vocab=inner_vocab_size)
+        self.tokenizer = tiktoken.Encoding(
+            name=Path(vocab_file).parent.name,
+            pat_str=pattern,
+            mergeable_ranks=token2id,
+            special_tokens={},  # special tokens are handled manually
+        )
+
+        # BOS / EOS token IDs are their positions in SPECIAL_TOKENS (pad_id reuses the EOS id)
+        self._bos_id = special_tokens.index("<s>")
+        self._eos_id = special_tokens.index("</s>")
+
+    def encode(self, text):
+        tokens = self.tokenizer.encode(text)
+        tokens = [t + self.num_special_tokens for t in tokens]
+        return tokens
+
+    def decode(self, tokens):
+        # Filter out special tokens and adjust the remaining tokens
+        adjusted_tokens = [
+            t - self.num_special_tokens
+            for t in tokens
+            if t not in {self._bos_id, self._eos_id} and t >= self.num_special_tokens
+        ]
+
+        # Decode only if there are tokens left after filtering
+        if adjusted_tokens:
+            return self.tokenizer.decode(adjusted_tokens)
+        else:
+            return ""  # Return an empty string if all tokens were filtered out
+
+    def batch_decode(self, ids):
+        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
+            ids = ids.tolist()
+
+        if isinstance(ids[0], list):
+            ids = ids[0]
+
+        return self.decode(ids)
+
+    @property
+    def pad_id(self):
+        return self._eos_id
+
+    @property
+    def bos_token_id(self):
+        return self._bos_id
+
+    @property
+    def eos_token_id(self):
+        return self._eos_id
diff --git a/nemo/export/trt_llm/__init__.py b/nemo/export/trt_llm/__init__.py
new file mode 100644
index 000000000000..4fc50543f1d2
--- /dev/null
+++ b/nemo/export/trt_llm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/export/trt_llm/converter/__init__.py b/nemo/export/trt_llm/converter/__init__.py
new file mode 100644
index 000000000000..4fc50543f1d2
--- /dev/null
+++ b/nemo/export/trt_llm/converter/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py
new file mode 100755
index 000000000000..aef3c44e6cac
--- /dev/null
+++ b/nemo/export/trt_llm/converter/model_converter.py
@@ -0,0 +1,307 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typing import Any, Dict, List, Optional, Tuple
+
+import numpy as np
+import tensorrt_llm
+import torch
+from tensorrt_llm._utils import pad_vocab_size
+from tensorrt_llm.functional import non_gated_version
+from tensorrt_llm.layers import MoeConfig
+from tensorrt_llm.models.modeling_utils import PretrainedConfig
+
+from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import (
+    convert_model_to_trt_llm_ckpt,
+    dist_model_to_trt_llm_ckpt,
+)
+from nemo.export.trt_llm.converter.utils import DECODER_MODEL_TYPE, split
+
+LOGGER = logging.getLogger("NeMo")
+
+
+def get_config(decoder_type, config):
+    """Instantiate the TensorRT-LLM config class registered for ``decoder_type`` from the ``config`` dict."""
+    known_configs = {
+        "llama": tensorrt_llm.models.llama.config.LLaMAConfig,
+        "gpt": tensorrt_llm.models.gpt.config.GPTConfig,
+        "gptnext": tensorrt_llm.models.gpt.config.GPTConfig,
+        "falcon": tensorrt_llm.models.falcon.config.FalconConfig,
+        "gemma": tensorrt_llm.models.GemmaConfig,
+    }
+    # Decoder types without a dedicated config class use the generic PretrainedConfig.
+    return known_configs.get(decoder_type, PretrainedConfig)(**config)
+
+
+def prompt_convert(prompt_config, prompt_weights):
+    """Build the prompt embedding table from the prompt config and weights.
+
+    With ``task_templates`` in ``prompt_config``, the per-task tables found in
+    ``prompt_weights["prompt_table"]`` are zero-padded to the longest table and stacked.
+    Otherwise ``prompt_weights["prompt_embeddings_weights"]`` is returned unchanged.
+    """
+    if "task_templates" not in prompt_config:
+        return prompt_weights["prompt_embeddings_weights"]
+
+    task_tables = []
+    for prompt_task in prompt_config["task_templates"]:
+        task_name = prompt_task["taskname"]
+        # The logged index counts only the tasks whose weights were found so far.
+        LOGGER.info(f"Task {len(task_tables)}: {task_name}")
+        table = prompt_weights["prompt_table"].get(
+            f"prompt_table.{task_name}.prompt_embeddings.weight"
+        )
+        if table is not None:
+            task_tables.append(table)
+
+    max_vtoken_len = max(t.shape[0] for t in task_tables)
+    embedding_dim = task_tables[0].shape[1]
+
+    # pad tasks to longest task embedding table
+    padded_tables = []
+    for table in task_tables:
+        padded = torch.zeros((max_vtoken_len, embedding_dim))
+        padded[: table.shape[0], :] = table
+        padded_tables.append(padded)
+
+    return torch.stack(padded_tables)
+
+
+def determine_quantization_settings(
+    nemo_model_config: Dict[str, Any], fp8_quantized: Optional[bool] = None, fp8_kvcache: Optional[bool] = None
+) -> Tuple[bool, bool]:
+    """
+    Resolve the FP8 flags used for the exported model.
+
+    Each flag left as ``None`` falls back to the ``fp8`` entry of the NeMo
+    config (``False`` when absent); an explicit value always wins.
+
+    Args:
+        nemo_model_config (dict): NeMo model configuration
+        fp8_quantized (optional, bool): User-specified quantization flag
+        fp8_kvcache (optional, bool): User-specified cache quantization flag
+    Returns:
+        Tuple[bool, bool]:
+            - Model quantization flag
+            - Model kv-cache quantization flag
+    """
+    config_default = nemo_model_config.get('fp8', False)
+    # Explicit ``is None`` checks so that a caller-supplied ``False`` is not overridden.
+    quantized = config_default if fp8_quantized is None else fp8_quantized
+    kvcache = config_default if fp8_kvcache is None else fp8_kvcache
+
+    return quantized, kvcache
+
+
+def model_to_trtllm_ckpt(
+    model,
+    nemo_model_config,
+    nemo_export_dir,
+    decoder_type: str,
+    dtype: str = "bfloat16",
+    tensor_parallel_size: int = 1,
+    pipeline_parallel_size: int = 1,
+    gpus_per_node: int = None,
+    use_parallel_embedding: bool = False,
+    use_embedding_sharing: bool = False,
+    use_distributed_convert: bool = False,
+    model_parallel_rank: int = None,
+    vocab_size: Optional[int] = None,
+    fp8_quantized: Optional[bool] = None,
+    fp8_kvcache: Optional[bool] = None,
+) -> Tuple[List[Dict], List[PretrainedConfig]]:
+    """Convert a NeMo model into TensorRT-LLM checkpoint weights and configs.
+
+    Returns ``(weights_dicts, model_configs)``: one entry per rank, or one entry total for distributed conversion.
+    """
+    if nemo_model_config.get("share_embeddings_and_output_weights", False) and not use_embedding_sharing:
+        LOGGER.info(
+            "Found share_embeddings_and_output_weights is True in NeMo config, set use_embedding_sharing = True"
+        )
+        use_embedding_sharing = True
+
+    fp8_quantized, fp8_kvcache = determine_quantization_settings(nemo_model_config, fp8_quantized, fp8_kvcache)
+    # If the model has been sharded with model parallelism, convert the model in a gpu-distributed manner
+    if use_distributed_convert:
+        weights_dict = dist_model_to_trt_llm_ckpt(
+            model=model,
+            nemo_model_config=nemo_model_config,
+            inference_tp_size=tensor_parallel_size,
+            inference_pp_size=pipeline_parallel_size,
+            tokenizer_vocab_size=vocab_size,
+            fp8_quantized=fp8_quantized,
+            fp8_kvcache=fp8_kvcache,
+        )
+        vocab_size_padded = vocab_size
+    else:
+        weights_dict = convert_model_to_trt_llm_ckpt(
+            model=model,
+            nemo_model_config=nemo_model_config,
+            nemo_export_dir=nemo_export_dir,
+            inference_tp_size=tensor_parallel_size,
+            processes=1,
+            storage_type=dtype,
+            use_parallel_embedding=use_parallel_embedding,
+            decoder_type=decoder_type,
+            fp8_quantized=fp8_quantized,
+            fp8_kvcache=fp8_kvcache,
+        )
+
+        has_lm_head = "lm_head.weight" in weights_dict
+        if has_lm_head:
+            lm_head_weight = weights_dict["lm_head.weight"]
+        if vocab_size is None:
+            vocab_size = weights_dict["transformer.vocab_embedding.weight"].shape[0]
+        vocab_size_padded = pad_vocab_size(vocab_size, tensor_parallel_size) if has_lm_head else vocab_size
+
+        if has_lm_head and vocab_size_padded != vocab_size:
+            pad_width = vocab_size_padded - vocab_size
+            lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), "constant", constant_values=0)
+
+    world_size = tensor_parallel_size * pipeline_parallel_size
+    hidden_act = nemo_model_config.get('activation')
+    hidden_act = (
+        hidden_act.split("-")[-1] if nemo_model_config.get('num_moe_experts', 0) else non_gated_version(hidden_act)
+    )
+
+    config = {
+        'architecture': DECODER_MODEL_TYPE[decoder_type],
+        'dtype': dtype,
+        'num_hidden_layers': nemo_model_config.get('num_layers'),
+        'num_attention_heads': nemo_model_config.get('num_attention_heads'),
+        'num_key_value_heads': nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']),
+        'head_size': nemo_model_config.get('kv_channels'),
+        'hidden_size': nemo_model_config.get('hidden_size'),
+        'intermediate_size': nemo_model_config.get('ffn_hidden_size'),
+        'norm_epsilon': nemo_model_config.get('layernorm_epsilon'),
+        'vocab_size': vocab_size_padded,
+        'position_embedding_type': (
+            "rope_gpt_neox" if nemo_model_config.get('position_embedding_type') == "rope" else "learned_absolute"
+        ),
+        'max_position_embeddings': nemo_model_config.get('max_position_embeddings'),
+        'hidden_act': hidden_act,
+        'use_parallel_embedding': use_parallel_embedding,
+        'embedding_sharding_dim': 0,
+        'share_embedding_table': use_embedding_sharing,
+        'quantization': {
+            'quant_algo': "FP8" if fp8_quantized else None,
+            'kv_cache_quant_algo': "FP8" if fp8_kvcache else None,
+        },
+        'bias': nemo_model_config.get('bias'),
+        'apply_query_key_layer_scaling': False,
+        'rotary_pct': nemo_model_config.get('rotary_percentage', 1.0),
+        'rotary_base': nemo_model_config.get('rotary_base', 10000),
+        'moe_num_experts': nemo_model_config.get('num_moe_experts', 0),
+        'moe_top_k': nemo_model_config.get('moe_router_topk', 0),
+        'moe_normalization_mode': nemo_model_config.get(
+            'moe_renorm_mode', MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE
+        ),
+        'moe_tp_mode': nemo_model_config.get('moe_tp_mode', 2),  # was MoeConfig.ParallelismMode.TENSOR_PARALLEL
+        'logits_dtype': 'float32',
+        'world_size': world_size,
+        'tp_size': tensor_parallel_size,
+        'pp_size': pipeline_parallel_size,
+    }
+    model_configs, weights_dicts = [], []
+    num_layers = nemo_model_config.get('num_layers')
+    rotary_scaling = nemo_model_config.get("seq_len_interpolation_factor")
+
+    if decoder_type == "falcon":
+        config["new_decoder_architecture"] = False if num_layers == 32 else True
+        config["parallel_attention"] = True
+    if rotary_scaling is not None:
+        config["rotary_scaling"] = {"type": "linear", "factor": float(rotary_scaling)}
+    config["gpus_per_node"] = gpus_per_node
+
+    if use_distributed_convert:
+        model_configs.append(get_config(decoder_type, config))
+        model_configs[0].mapping = tensorrt_llm.Mapping(
+            world_size=world_size,
+            rank=model_parallel_rank,
+            tp_size=tensor_parallel_size,
+            pp_size=pipeline_parallel_size,
+        )
+        weights_dicts.append(weights_dict)
+        return weights_dicts, model_configs
+
+    pp_key = {
+        "transformer.vocab_embedding.weight",
+        "transformer.position_embedding.weight",
+        "lm_head.weight",
+        "transformer.ln_f.weight",
+        "transformer.ln_f.bias",
+    }
+
+    for i in range(world_size):
+        mapping = tensorrt_llm.Mapping(
+            world_size=world_size,
+            rank=i,
+            tp_size=tensor_parallel_size,
+            pp_size=pipeline_parallel_size,
+        )
+        layers_range = mapping.pp_layers(num_layers)
+
+        weights_dict_local = {}
+        for k, v in weights_dict.items():
+            if k in pp_key:
+                continue
+            new_key = k
+            if new_key.endswith(".bin"):  # TP split
+                if new_key.endswith(f".{mapping.tp_rank}.bin"):  # leading dot: rank 1 must not match ".11.bin"
+                    new_key = new_key.replace(f".{mapping.tp_rank}.bin", "")
+                else:
+                    continue
+            if "layers" in new_key:  # PP
+                layer_num = int(new_key.split(".")[2])
+                if layer_num in layers_range:
+                    new_key = new_key.replace(f"layers.{layer_num}", f"layers.{layer_num-layers_range[0]}")
+                else:
+                    continue
+            if config.get("new_decoder_architecture", False) and "post_layernorm" in new_key:
+                new_key = new_key.replace("post_layernorm", "mlp_layernorm")
+            weights_dict_local[new_key] = v
+
+        if mapping.is_first_pp_rank():
+            embedding_weight = (
+                split(weights_dict["transformer.vocab_embedding.weight"], mapping.tp_size, mapping.tp_rank)
+                if use_parallel_embedding
+                else weights_dict["transformer.vocab_embedding.weight"]
+            )
+
+            weights_dict_local["transformer.vocab_embedding.weight"] = embedding_weight
+
+            pos_embedding_weight = weights_dict.get("transformer.position_embedding.weight")
+            if pos_embedding_weight is not None:
+                if use_parallel_embedding:
+                    pos_embedding_weight = split(pos_embedding_weight, mapping.tp_size, mapping.tp_rank)
+                weights_dict_local["transformer.position_embedding.weight"] = pos_embedding_weight
+
+        if mapping.is_last_pp_rank():
+            if has_lm_head:
+                weights_dict_local["lm_head.weight"] = split(
+                    lm_head_weight, mapping.tp_size, mapping.tp_rank
+                ).contiguous()
+            weights_dict_local["transformer.ln_f.weight"] = weights_dict["transformer.ln_f.weight"]
+
+            ln_f_bias = weights_dict.get("transformer.ln_f.bias")
+            if ln_f_bias is not None:
+                weights_dict_local["transformer.ln_f.bias"] = ln_f_bias
+
+        model_config = get_config(decoder_type, config)
+        model_config.mapping = mapping
+        model_configs.append(model_config)
+        weights_dicts.append(weights_dict_local)
+
+    return weights_dicts, model_configs
diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
new file mode 100644
index 000000000000..cb505f634490
--- /dev/null
+++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
@@ -0,0 +1,494 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+import multiprocessing
+from collections import defaultdict
+from pathlib import Path
+
+import torch
+from tensorrt_llm._utils import pad_vocab_size, str_dtype_to_torch
+from tqdm import tqdm
+
+from nemo.export.trt_llm.converter.utils import save_scaling_factor, save_val, split_and_save_weight, weights_dict
+from nemo.export.utils import torch_dtype_from_precision
+
+LOGGER = logging.getLogger("NeMo")
+
+layer_names = {  # layer type -> key suffix that get_layer_name() appends to the model prefix
+    "position_embedding": "embedding.position_embeddings.weight",
+    "word_embedding": "embedding.word_embeddings.weight",
+    "output_layer": "output_layer.weight",
+    "final_layernorm.weight": "final_layernorm.weight",
+    "final_layernorm.bias": "final_layernorm.bias",
+}
+
+
+def extract_layers_with_prefix(model_, prefix):
+    """Return the entries whose key starts with ``prefix``, with the prefix removed from each key."""
+    trim, state = len(prefix), model_.get("state_dict", model_)
+    return {name[trim:]: tensor for name, tensor in state.items() if name.startswith(prefix)}
+
+
+def get_layer_name(layer_type: str, prefix: str):
+    """Return ``prefix`` followed by the key suffix registered for ``layer_type`` in ``layer_names``."""
+    suffix = layer_names.get(layer_type)
+    if suffix is None:
+        raise ValueError(f"Unknown layer type {layer_type}")
+    return prefix + suffix
+
+
+def get_layer_prefix(layer_names, is_mcore):
+    """Derive ``(model_prefix, transformer_layer_prefix)`` from checkpoint key names.
+
+    The first key that does not start with 'optimizer' and contains 'self_attention' is used: the
+    text before 'layers' is the transformer layer prefix, and the text before 'decoder' (mcore) or
+    'encoder' (otherwise) within it is the model prefix. Fails an assertion if no such key exists.
+    """
+    candidates = (k for k in layer_names if not k.startswith('optimizer') and 'self_attention' in k)
+    attention_key = next(candidates, None)
+    # The message must not use a loop variable: it would be unbound when ``layer_names`` is empty.
+    assert attention_key is not None, "Cannot extract transformer layer prefix: no 'self_attention' layer found"
+    transformer_layer_prefix = attention_key.split('layers')[0]
+    model_prefix = transformer_layer_prefix.split('decoder' if is_mcore else 'encoder')[0]
+    assert model_prefix is not None, f"Cannot extract model prefix from {transformer_layer_prefix}"
+    return model_prefix, transformer_layer_prefix
+
+
+def rename_key(new_key: str):
+    """Apply the ordered substring renames below to ``new_key`` and return the result."""
+    # Order matters: the later patterns are spelled with the already-renamed "attention" prefix.
+    replacements = (
+        ("self_attention", "attention"),
+        ("attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
+        ("attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
+        ("mlp.linear_fc1.layer_norm_weight", "post_attention_layernorm.weight"),
+        ("mlp.linear_fc1.layer_norm_bias", "post_attention_layernorm.bias"),
+    )
+    for old, new in replacements:
+        new_key = new_key.replace(old, new)
+    return new_key
+
+
+def rename_key_dist_ckpt(old_key: str, layer: int):
+    """Insert ``layer`` as the second dotted component of a "layers." key, then apply ``rename_key``."""
+    if "layers." not in old_key:
+        return rename_key(old_key)
+    # e.g. "layers.mlp.x" with layer=3 becomes "layers.3.mlp.x" before the renames are applied.
+    parts = old_key.split(".")
+    parts.insert(1, str(layer))
+    return rename_key(".".join(parts))
+
+
+def is_scaling_factor(key: str) -> bool:  # True for keys that contain "extra_state"
+    return "extra_state" in key
+
+
+def load_scaling_factors(model: dict, num_layers: int, export_config: dict) -> dict:
+    """Collect per-layer scaling factors from the "extra_state" entries when 'fp8_quantized' is set."""
+    scaling_factors = {}
+    if not export_config.get('fp8_quantized', False):
+        return scaling_factors
+    for name, tensor in model.items():
+        if not is_scaling_factor(name):
+            continue
+        for layer_idx in range(num_layers):
+            layer_key = rename_key_dist_ckpt(name, layer_idx)
+            scaling_factors = save_scaling_factor(scaling_factors, layer_key, tensor[layer_idx], export_config)
+    return scaling_factors
+
+
+@torch.no_grad()
+def convert_model_to_trt_llm_ckpt(
+    nemo_model_config,
+    model,
+    nemo_export_dir,
+    storage_type,
+    inference_tp_size,
+    decoder_type,
+    use_parallel_embedding,
+    processes,
+    fp8_quantized=False,
+    fp8_kvcache=False,
+):
+    """Convert a NeMo checkpoint state dict into a dict of TRT-LLM named weights.
+
+    Layer weights go through ``split_and_save_weight``; embeddings and the LM head are added directly.
+
+    Args:
+        nemo_model_config: NeMo model configuration (mapping-like; read with ``[]`` and ``.get``).
+        model: the state dict, or a dict holding it under the "state_dict" key.
+        nemo_export_dir: export directory; created when missing.
+        storage_type: dtype name understood by ``str_dtype_to_torch``.
+        inference_tp_size: tensor-parallel size for inference; used as split factor and for vocab padding.
+        decoder_type: decoder family name, e.g. "gptnext" or "gemma".
+        use_parallel_embedding: pad the vocabulary to a size accepted by ``pad_vocab_size`` when needed.
+        processes: number of worker processes; values above 1 use a ``multiprocessing.Pool``.
+        fp8_quantized: when true, scaling factors are loaded from the checkpoint.
+        fp8_kvcache: stored in the export config handed to the per-weight conversion.
+
+    Returns:
+        dict mapping weight names to converted values, including any scaling factors.
+    """
+    out_dir = create_export_dir(nemo_export_dir)
+    storage_type = str_dtype_to_torch(storage_type)
+    is_mcore = nemo_model_config.get("mcore_gpt", False)
+
+    # The checkpoint may wrap the weights in a "state_dict" entry.
+    model_state_dict = model.get("state_dict", model)
+
+    prefix, transformer_layer_prefix = get_layer_prefix(model_state_dict.keys(), is_mcore)
+
+    has_position_embedding = get_layer_name("position_embedding", prefix) in model_state_dict
+    has_lm_head = get_layer_name("output_layer", prefix) in model_state_dict
+
+    num_layers = nemo_model_config["num_layers"]
+    training_tp_size = 1
+    training_pp_size = 1
+    num_kv_heads = nemo_model_config.get("num_query_groups", 0)
+    multi_query_mode = nemo_model_config.get("multi_query_mode", False)
+    num_attention_heads = nemo_model_config["num_attention_heads"]
+    kv_channels = nemo_model_config.get("kv_channels", None)
+
+    if num_kv_heads == 0:
+        if multi_query_mode:
+            num_kv_heads = 1
+        else:
+            num_kv_heads = num_attention_heads
+
+    export_config = {
+        "apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p"
+        or nemo_model_config.get("layernorm_zero_centered_gamma", False),
+        "tp_size": training_tp_size,
+        "split_gated_activation": nemo_model_config.get("activation", "gelu")
+        in ["swiglu", "geglu", "fast-swiglu", "fast-geglu", "openai-gelu"]
+        and (decoder_type == "gptnext" or is_mcore),
+        "num_attention_heads": num_attention_heads,
+        "num_kv_heads": num_kv_heads,
+        "kv_channels": kv_channels,
+        "use_attention_nemo_shape": True,
+        "transpose_weights": True,
+        "use_parallel_embedding": use_parallel_embedding,
+        "fp8_quantized": fp8_quantized,
+        "fp8_kvcache": fp8_kvcache,
+    }
+
+    # split_factor: in how many parts a TP training node is split
+    split_factor = inference_tp_size
+    model_level_weights = defaultdict(list)
+
+    def handle_model_level_weights(model, tp_idx: int, pp_idx: int):
+        # Always read through the unwrapped state dict: the presence checks above were made on it.
+        state_dict = model.get("state_dict", model)
+        if tp_idx == 0 and pp_idx == 0:
+            if has_position_embedding:
+                val = state_dict[get_layer_name("position_embedding", prefix)]
+                val = val.to(storage_type).cpu()
+                model_level_weights["transformer.position_embedding.weight"].append(val)
+        if pp_idx == 0:
+            val = state_dict[get_layer_name("word_embedding", prefix)]
+
+            vocab_size = val.shape[0]
+            if use_parallel_embedding:
+                # Pad vocab_size first
+                if vocab_size % inference_tp_size != 0:
+                    vocab_size_padded = pad_vocab_size(vocab_size, inference_tp_size)
+                    pad_width = vocab_size_padded - vocab_size
+                    val = torch.nn.functional.pad(val, (0, 0, 0, pad_width), value=0)
+
+            val = val.to(storage_type).cpu()
+            model_level_weights["transformer.vocab_embedding.weight"].append(val)
+        if has_lm_head and pp_idx == training_pp_size - 1 and decoder_type != "gemma":
+            val = state_dict[get_layer_name("output_layer", prefix)]
+            val = val.to(storage_type).cpu()
+            model_level_weights["lm_head.weight"].append(val)
+
+    weights_dict = {}
+    tp_rank = 0
+
+    handle_model_level_weights(model, 0, 0)
+    model = extract_layers_with_prefix(model, transformer_layer_prefix)
+    scaling_factors = load_scaling_factors(model, num_layers, export_config)
+
+    starmap_args = []
+    for key, val in model.items():
+        if "_extra_state" in key:
+            continue
+        # 1-D values are passed whole under layer index 0; anything else is indexed along its first
+        # dimension so that entry ``i`` is saved as the weights/biases of layer ``i``.
+        per_layer_vals = [val] if len(val.size()) == 1 else [val[i] for i in range(num_layers)]
+        for i, layer_val in enumerate(per_layer_vals):
+            # rename_key_dist_ckpt maps the key to the per-layer name used by older checkpoints.
+            starmap_args.append(
+                (
+                    tp_rank,
+                    out_dir,
+                    split_factor,
+                    rename_key_dist_ckpt(key, i),
+                    [layer_val],
+                    storage_type,
+                    None,
+                    export_config,
+                    scaling_factors,
+                )
+            )
+
+    starmap_args = tqdm(starmap_args, desc="saving weights")
+
+    if processes > 1:
+        with multiprocessing.Pool(processes) as pool:
+            weights_dicts = pool.starmap(split_and_save_weight, starmap_args)
+    else:
+        # simpler for debug situations; every result is kept (also fine when there is nothing to do)
+        weights_dicts = [split_and_save_weight(*starmap_arg) for starmap_arg in starmap_args]
+
+    for converted in weights_dicts:
+        weights_dict.update(converted)
+
+    for key, values in model_level_weights.items():
+        model_level_weights[key] = torch.concatenate(values, axis=0)
+        weights_dict[key] = model_level_weights[key]
+
+    weights_dict.update(scaling_factors)
+    return weights_dict
+
+
+def _get_layer_index(split_key):  # index right after the first component equal to "layers"
+    for position, component in enumerate(split_key, start=1):
+        if component == "layers":
+            return position
+    raise ValueError(f"Unknown layer name format: {split_key}")
+
+
+def rename_layer_num(param_name, layer_num):
+    """Return ``param_name`` with the component that follows "layers" replaced by ``layer_num``."""
+    parts = param_name.split(".")
+    parts[int(_get_layer_index(parts))] = str(layer_num)
+    return ".".join(parts)
+
+
+def get_layer_num(param_name):
+    """Return, as an int, the component of ``param_name`` that follows "layers"."""
+    parts = param_name.split(".")
+    return int(parts[int(_get_layer_index(parts))])
+
+
+@torch.no_grad()
+def dist_model_to_trt_llm_ckpt(  # converts a live megatron-core model held by the current rank
+    model,
+    nemo_model_config,
+    inference_tp_size,
+    inference_pp_size,
+    tokenizer_vocab_size,
+    fp8_quantized=False,
+    fp8_kvcache=False,
+):
+    from megatron.core import parallel_state
+    from megatron.core.tensor_parallel.utils import VocabUtility
+    # Parallel layout of the current process, as reported by megatron's parallel_state.
+    tp_rank = parallel_state.get_tensor_model_parallel_rank()
+    tp_size = parallel_state.get_tensor_model_parallel_world_size()
+    tp_group = parallel_state.get_tensor_model_parallel_group()
+    pp_rank = parallel_state.get_pipeline_model_parallel_rank()
+    pp_first_rank = parallel_state.get_pipeline_model_parallel_first_rank()
+    pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank()
+    pp_size = parallel_state.get_pipeline_model_parallel_world_size()
+    pp_group = parallel_state.get_pipeline_model_parallel_group()
+    pp_is_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True)
+    pp_is_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True)
+    vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
+    if not vp_size:
+        vp_size = 1
+    # Only PP>1 -> PP=1 resharding with an unchanged TP size is supported; other mismatches raise.
+    reshard_model = False
+    if inference_tp_size != tp_size or inference_pp_size != pp_size:
+        LOGGER.info("Training/Generation model parallelism resharding enabled")
+        if inference_pp_size == 1 and pp_size > 1 and inference_tp_size == tp_size:
+            reshard_model = True
+        else:
+            raise NotImplementedError(
+                "NeMo currently only supports PP>1 -> PP=1 resharding,"
+                " other types of resharding will come in future releases."
+            )
+
+    num_layers = nemo_model_config["num_layers"]
+    is_mcore = nemo_model_config.get("mcore_gpt", False)
+    storage_type = torch_dtype_from_precision(nemo_model_config.precision)
+    sample_state_dict = model[0].state_dict() if vp_size > 1 else model.state_dict()
+    prefix, transformer_layer_prefix = get_layer_prefix(sample_state_dict, is_mcore)
+    assert is_mcore, "Only megatron-core inflight model conversion is supported"
+
+    export_config = {
+        "apply_layernorm_1p": nemo_model_config.get("normalization", "") == "layernorm1p",
+        "tp_size": tp_size,
+        "split_gated_activation": nemo_model_config.get("activation", "gelu")
+        in ["swiglu", "geglu", "fast-swiglu", "fast-geglu", "openai-gelu"],
+        "num_attention_heads": nemo_model_config["num_attention_heads"],
+        "num_kv_heads": nemo_model_config.get('num_query_groups', nemo_model_config['num_attention_heads']),
+        "convert_on_device": True,
+        "use_attention_nemo_shape": True,
+        "transpose_weights": True,
+        "fp8_quantized": fp8_quantized,
+        "fp8_kvcache": fp8_kvcache,
+    }
+    # Keyword arguments shared by every split_and_save_weight call queued in starmap_args.
+    starmap_config = {
+        "tp_rank": None,
+        "saved_dir": None,  # unused
+        "split_factor": 0,
+        "storage_type": storage_type,
+        "act_range": None,
+        "config": export_config,
+    }
+
+    tl_params = {}
+    model_level_params = {}
+    starmap_args = []
+    layers_per_pp = num_layers // pp_size
+    layers_per_chunk = layers_per_pp // vp_size
+    # tl_params collects the per-layer tensors, model_level_params every other tensor.
+    if vp_size > 1:  # consolidate params across model chunks
+        for idx, model_chunk in enumerate(model):
+            for key, val in model_chunk.state_dict().items():
+                if torch.is_tensor(val):
+                    if 'layers' in key:
+                        key2 = rename_layer_num(key, get_layer_num(key) + idx * pp_size * layers_per_chunk)
+                        tl_params[key2] = val
+                    else:
+                        model_level_params[key] = val
+    else:
+        for key, val in model.state_dict().items():
+            if torch.is_tensor(val):
+                if 'decoder.layers' in key:
+                    tl_params[key] = val
+                else:
+                    model_level_params[key] = val
+
+    if vp_size > 1 or reshard_model:
+        # gather layers across pp ranks
+        gathered_params = {}
+        for key, val in tl_params.items():
+            weight_list = [torch.zeros_like(val) for _ in range(pp_size)]
+            torch.distributed.all_gather(weight_list, val, group=pp_group)
+            for idx in range(pp_size):
+                layer_num = get_layer_num(key) + idx * layers_per_chunk
+                key2 = rename_layer_num(key, layer_num)
+                if not reshard_model:  # Save only layers of 1 single PP stage
+                    layers_start = layers_per_pp * pp_rank
+                    layers_end = layers_per_pp * (pp_rank + 1) - 1
+                    if layer_num >= layers_start and layer_num <= layers_end:
+                        key2 = rename_layer_num(key, layer_num % layers_per_pp)
+                        gathered_params[key2] = weight_list[idx]
+                else:
+                    gathered_params[key2] = weight_list[idx]
+        tl_params = gathered_params
+
+    # ----------------Convert layer level weights----------------
+    layer_params = extract_layers_with_prefix(tl_params, transformer_layer_prefix)
+    layer_params = {k: v for k, v in layer_params.items() if k.startswith("layers.")}
+    for key, val in layer_params.items():
+        starmap_args.append(starmap_config | {'key': rename_key(key), 'vals': val})
+
+    def broadcast_item(item, group, src_rank):
+        item = [item]
+        torch.distributed.broadcast_object_list(item, src_rank, group=group)
+        return item[0]
+    # Fetch a model-level tensor on rank pp_src_idx; when resharding, broadcast it over the PP group.
+    def try_get_model_level_weight(src_key_or_tensor, pp_src_idx):
+        have_tensor = False
+        if torch.distributed.get_rank() == pp_src_idx:
+            if isinstance(src_key_or_tensor, str):
+                tensor = model_level_params.get(src_key_or_tensor, None)
+                have_tensor = torch.is_tensor(tensor)
+            else:
+                assert torch.is_tensor(src_key_or_tensor)
+                tensor = src_key_or_tensor
+                have_tensor = True
+        if reshard_model:
+            have_tensor = broadcast_item(have_tensor, pp_group, pp_src_idx)
+        if not have_tensor:
+            return None
+
+        if reshard_model:  # Broadcast tensor to all PP groups
+            if torch.distributed.get_rank() == pp_src_idx:
+                shape = tensor.shape
+            else:
+                shape = [None]
+            shape = broadcast_item(shape, pp_group, pp_src_idx)
+            if torch.distributed.get_rank() != pp_src_idx:
+                tensor = torch.zeros(shape, dtype=storage_type).cuda()
+            torch.distributed.broadcast(tensor.contiguous(), pp_src_idx, group=pp_group)
+        return tensor
+
+    # ----------------Convert Final Layernorm----------------
+    if pp_is_last or reshard_model:
+        ln_f = try_get_model_level_weight(
+            get_layer_name("final_layernorm.weight", transformer_layer_prefix), pp_last_rank
+        )
+        if ln_f is not None:
+            starmap_args.append(starmap_config | {'key': "final_layernorm.weight", 'vals': ln_f})
+
+        ln_f_bias = try_get_model_level_weight(
+            get_layer_name("final_layernorm.bias", transformer_layer_prefix), pp_last_rank
+        )
+        if ln_f_bias is not None:
+            starmap_args.append(starmap_config | {'key': "final_layernorm.bias", 'vals': ln_f_bias})
+
+    # ----------------Convert Embeddings----------------
+    def get_remove_vocab_padding(tensor_name):
+        tensor = model_level_params.get(tensor_name, None)
+        if tensor is None:
+            return None
+
+        if tp_size > 1:  # Gather padded tensor chunks
+            vocab_size_padded = tensor.shape[0] * tp_size
+            vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size(
+                vocab_size_padded, tp_rank, tp_size
+            )
+            dim_size = list(tensor.size())
+            dim_size[0] = vocab_size_padded
+            gathered_tensor = torch.zeros(dim_size, dtype=tensor.dtype, device=torch.cuda.current_device())
+            gathered_tensor[vocab_start_index:vocab_end_index] = tensor
+            torch.distributed.all_reduce(gathered_tensor, group=tp_group)
+            tensor = gathered_tensor
+        unpadded = tensor[:tokenizer_vocab_size]
+        if tp_size > 1:  # Split gathered tensor for tensor parallel embedding
+            vocab_start_index, vocab_end_index = VocabUtility.vocab_range_from_global_vocab_size(
+                tokenizer_vocab_size, tp_rank, tp_size
+            )
+            unpadded = unpadded[vocab_start_index:vocab_end_index]
+        return unpadded.T  # TRTLLM expects (vocab_size, hidden_size) so need extra transpose
+
+    if pp_is_first or reshard_model:
+        vocab_embed = get_remove_vocab_padding(get_layer_name("word_embedding", prefix))
+        vocab_embed = try_get_model_level_weight(vocab_embed, pp_first_rank)
+        save_val(vocab_embed, dir=None, key='transformer.vocab_embedding.weight', tp_num=None)
+
+    if pp_is_last or reshard_model:
+        lm_head = get_remove_vocab_padding(get_layer_name("output_layer", prefix))
+        lm_head = try_get_model_level_weight(lm_head, pp_last_rank)
+        save_val(lm_head, dir=None, key='lm_head.weight', tp_num=None)
+    # NOTE(review): the queued conversions presumably fill the imported weights_dict returned below.
+    for starmap_arg in tqdm(starmap_args, desc="saving weights"):
+        split_and_save_weight(**starmap_arg)
+
+    return weights_dict
+
+
+def create_export_dir(nemo_export_dir):
+    """Create ``nemo_export_dir`` (with missing parents) if needed and return it as a ``Path``."""
+    out_dir = Path(nemo_export_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)  # one call: no exists()/mkdir() race between callers
+    return out_dir
diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py
new file mode 100755
index 000000000000..aaa1b2b5cbfe
--- /dev/null
+++ b/nemo/export/trt_llm/converter/utils.py
@@ -0,0 +1,598 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List, Optional, Tuple, Union
+import numpy as np
+import tensorrt_llm
+import torch
+from tensorrt_llm._utils import mpi_comm, torch_to_numpy
+
+# A global dict to store exported weights.
+# This is set to be a global variable to avoid extra code modification from tensorrt_llm.
+weights_dict = {}
+
+
+DECODER_MODEL_TYPE = {  # decoder type -> model class name (presumably TRT-LLM architectures; confirm)
+    "gptj": 'GPTForCausalLM',
+    "gptnext": 'GPTForCausalLM',
+    "llama": 'LlamaForCausalLM',
+    "gemma": 'GemmaForCausalLM',
+    "falcon": 'FalconForCausalLM',
+}
+# Substrings used to recognise NeMo parameter names; get_trt_llm_infix maps them to TRT-LLM infixes.
+post_layernorm_keys = [
+    "post_attention_layernorm.weight",
+    "post_attention_layernorm.bias",
+    "post_self_attn_layernorm.weight",
+]
+mlp_proj_bias_keys = ["mlp.linear_fc2.bias", "mlp.dense_4h_to_h.bias"]
+attention_dense_bias_keys = ["attention.linear_proj.bias", "attention.dense.bias"]
+input_layernorm_keys = ["input_layernorm.weight", "input_layernorm.bias"]
+pre_layernorm_keys = ["pre_mlp_layernorm.weight", "pre_mlp_layernorm.bias"]
+attention_dense_weight_keys = ["attention.linear_proj.weight", "attention.dense.weight"]
+mlp_proj_weight_keys = ["mlp.linear_fc2.weight", "mlp.dense_4h_to_h.weight"]
+mlp_fc_keys = ["mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias", "mlp.linear_fc1.weight", "mlp.linear_fc1.bias"]
+attention_qkv_bias_keys = ["attention.query_key_value.bias", "attention.linear_qkv.bias"]
+attention_qkv_weight_keys = ["attention.query_key_value.weight", "attention.linear_qkv.weight"]
+mlp_router_keys = ["mlp.router.weight"]
+mlp_fc_expert_keys = ["experts.linear_fc1.weight"]
+mlp_proj_experts_keys = ["experts.linear_fc2.weight"]
+final_layernorm_keys = ["final_layernorm.weight", "final_layernorm.bias"]
+mlp_dense_2_keys = ["mlp.dense_h_to_4h_2.weight", "mlp.dense_h_to_4h_2.bias"]
+attention_not_mapped_keys = [
+    "attention.query.weight",
+    "attention.query.bias",
+    "attention.key_value.weight",
+    "attention.key_value.bias",
+]
+# Suffixes appended to a TRT-LLM base key by get_scaling_factor_keys to name its scaling factors.
+weight_scaling_suffix = '.weights_scaling_factor'
+activation_scaling_suffix = '.activation_scaling_factor'
+
+
+def save_val(val, dir, key, tp_num=None):
+    """Store ``val`` in ``weights_dict`` under ``key``, suffixed ".{tp_num}.bin" when ``tp_num`` is given."""
+    global weights_dict
+    name = key if tp_num is None else f"{key}.{tp_num}.bin"
+    # Transpose linear layer weights to the correct shape.
+    if torch.is_tensor(val):
+        val = val.detach().contiguous()
+        if len(val.shape) >= 2:
+            val = torch.transpose(val.reshape(val.shape[0], -1), 0, 1)
+        # Test the full (suffixed) name: the bare ``key`` never matches a ".N.bin" entry.
+        if name not in weights_dict:
+            weights_dict[name] = torch.empty(
+                val.size(), dtype=val.dtype, layout=val.layout, device="cpu", pin_memory=True
+            )
+        weights_dict[name].copy_(val, non_blocking=True)
+    else:
+        if len(val.shape) >= 2:
+            val = np.ascontiguousarray(np.transpose(val.reshape(val.shape[0], -1), [1, 0]))
+        weights_dict[name] = val
+
+
+def save_split(split_vals, dir, key, i, split_factor):  # chunk j is saved with index i * split_factor + j
+    for j, val in enumerate(split_vals):
+        save_val(val, dir, key, i * split_factor + j)
+
+
+def save_expert_split(split_vals, dir, key, i, split_factor):
+    """Store each chunk as-is in ``weights_dict`` under "{key}.{i * split_factor + j}.bin".
+
+    The chunks are neither reshaped, transposed nor copied here; ``dir`` is unused.
+    """
+    global weights_dict
+    weights_dict.update({f"{key}.{i * split_factor + j}.bin": val for j, val in enumerate(split_vals)})
+
+
+def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False):
+    """Compute int8 weights (scaled per-tensor and per-column) together with the scaling factors.
+
+    Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ.
+    CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W.
+    CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor.
+
+    Here is the list of what we need (T means per-tensor, C per-column):
+    - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8).
+    Used before the GEMM. (T)
+    - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T)
+    - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C)
+    - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32)
+    to quant range (int8) (used for CUBLAS) (T, C)
+
+    Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have
+    per-GPU scaling factors too, but then the model would change depending on the number of GPUs used.
+
+    For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV
+    projection, we consider it as three different matrices: Q, K, and V. So per-tensor actually means
+    one scaling factor for each Q, K and V.
+
+    ``act_range`` is indexed with "w", "x" and "y"; the result is a dict of numpy arrays keyed as in
+    the return statement. Raises ValueError for QKV weights when ``multi_query_mode`` is set.
+    """
+    # compute weight scaling factors for fp->int8 and int8->fp
+    if is_qkv and not multi_query_mode:
+        scale_w_orig_quant_t = 127.0 / act_range["w"].reshape(3, -1).max(dim=-1, keepdims=True)[0].cpu().numpy()
+        scale_w_orig_quant_c = 127.0 / act_range["w"].reshape(3, -1).cpu().numpy()
+    elif is_qkv and multi_query_mode:
+        raise ValueError("Multi-query w/ int8 quant has not been supported yet")
+    else:
+        scale_w_orig_quant_t = 127.0 / act_range["w"].max().cpu().numpy()
+        scale_w_orig_quant_c = 127.0 / act_range["w"].cpu().numpy()
+    scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t
+    scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c
+
+    # compute the rest of needed scaling factors
+    scale_x_orig_quant_t = np.array(127.0 / act_range["x"].max().item())
+    scale_y_orig_quant_t = np.array(127.0 / act_range["y"].max().item())
+    scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.0)
+    scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * scale_w_orig_quant_t)
+    scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * scale_w_orig_quant_c)
+    if is_qkv:
+        scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, scale_w_orig_quant_c.shape)
+        scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, scale_w_orig_quant_c.shape)
+
+    def to_i8(x):
+        return x.round().clip(-127, 127).astype(np.int8)
+
+    return {
+        "weight.int8": to_i8(weights * scale_w_orig_quant_t),
+        "weight.int8.col": to_i8(weights * scale_w_orig_quant_c),
+        "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32),
+        "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32),
+        "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32),
+        "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32),
+        "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32),
+        "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32),
+    }
+
+
+def write_int8(vals, dir, base_key, split_dim, tp_rank, split_factor, kv_cache_only=False):
+    """Save the int8 weights and scaling factors held in ``vals`` under names built from ``base_key``.
+
+    Chunked entries go through ``save_split`` (one entry per chunk); entries that are stored once go
+    through ``save_val`` and are only written when ``tp_rank`` is 0.
+
+    Args:
+        vals: dict of arrays, read with the keys that ``generate_int8`` returns.
+        dir: passed through unchanged to ``save_split`` / ``save_val``.
+        base_key: prefix of every saved name.
+        split_dim: axis used by ``np.split``; when it is -1 the per-column scaling factors are
+            chunked as well, otherwise they are stored once.
+        tp_rank: rank index passed to ``save_split``.
+        split_factor: number of chunks per split entry.
+        kv_cache_only: when true, only "scale_y_quant_orig" is saved.
+    """
+    # Names saved in chunks vs. names saved once; both lists are filled depending on the mode.
+    split_keys = []
+    saved_keys_once = ["scale_y_quant_orig"]
+    if not kv_cache_only:
+        split_keys += ["weight.int8", "weight.int8.col"]
+        saved_keys_once += ["scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant"]
+        # per-column scaling factors are loaded per-gpu for ColumnParallel GEMMs (QKV, FC1)
+        per_column_keys = ["scale_w_quant_orig.col", "scale_y_accum_quant.col"]
+        if split_dim == -1:
+            split_keys += per_column_keys
+        else:
+            saved_keys_once += per_column_keys
+
+    # Chunk j of each entry is stored by save_split under its own rank-indexed name.
+    for split_key in split_keys:
+        save_split(
+            np.split(vals[split_key], split_factor, axis=split_dim),
+            dir,
+            f"{base_key}.{split_key}",
+            tp_rank,
+            split_factor,
+        )
+
+    # Entries that are not chunked are written by rank 0 only.
+    if tp_rank == 0:
+        for save_key in saved_keys_once:
+            save_val(vals[save_key], dir, f"{base_key}.{save_key}")
+
+
+def get_suffix(key: str) -> str:  # last dotted component of ``key`` with a leading dot, e.g. ".weight"
+    return '.' + key.rsplit('.', 1)[-1]
+
+
+def get_trt_llm_prefix(key: str) -> str:
+    """Return "transformer.layers.<n>", where <n> is the second dotted component of ``key``."""
+    return f'transformer.layers.{key.split(".", 2)[1]}'
+
+
+def any_word_in_key(key: str, words: List[str]) -> bool:  # True if any of ``words`` occurs in ``key``
+    return any(word in key for word in words)
+
+
+def sequential_key_map(key: str, mapping: List[Tuple[List[str], str]]) -> Optional[str]:
+    """Return the value paired with the first keyword list that has a word occurring in ``key``.
+
+    Entries of ``mapping`` are tried in order; ``None`` is returned when nothing matches.
+    """
+    return next((mapped for keywords, mapped in mapping if any_word_in_key(key, keywords)), None)
+
+
+def get_trt_llm_infix(key: str) -> Optional[str]:  # infix mapped to ``key``, or None if no entry matches
+    mapping = [  # tried in order by sequential_key_map; the first matching entry wins
+        (post_layernorm_keys, '.post_layernorm'),
+        (mlp_proj_bias_keys, '.mlp.proj'),
+        (attention_dense_bias_keys, '.attention.dense'),
+        (input_layernorm_keys, '.input_layernorm'),
+        (pre_layernorm_keys, '.post_layernorm'),  # pre_mlp_layernorm names share the post_layernorm infix
+        (attention_dense_weight_keys, '.attention.dense'),
+        (mlp_proj_weight_keys, '.mlp.proj'),
+        (mlp_fc_keys, '.mlp.fc'),
+        (attention_qkv_bias_keys + attention_qkv_weight_keys, '.attention.qkv'),
+        (mlp_router_keys, '.mlp.router'),
+        (mlp_fc_expert_keys, '.mlp.fc'),
+        (mlp_proj_experts_keys, '.mlp.proj'),
+    ]
+    return sequential_key_map(key, mapping)
+
+
+def get_trt_llm_keyname(key: str) -> str:
+    """Translate a NeMo parameter name into its TRT-LLM name; unmapped keys are returned unchanged."""
+    if any_word_in_key(key, final_layernorm_keys):
+        return key.replace("final_layernorm", "transformer.ln_f")
+
+    infix = get_trt_llm_infix(key)
+    # A missing (None) infix means no mapping entry matched, so the key is passed through as is.
+    return get_trt_llm_prefix(key) + infix + get_suffix(key) if infix else key
+
+
+def is_scaling_factor(key: str) -> bool:  # True for keys that contain "scale_fwd"
+    return "scale_fwd" in key
+
+
+def get_scaling_factor_keys(key: str) -> Tuple[Tuple[str, str], Tuple[str, str]]:
+    """Build the TRT-LLM scaling-factor key names that belong to the scaling-factor entry ``key``.
+
+    Returns:
+        ``((weight_key, activation_key), (gate_activation_key, gate_weight_key))``; note that the two
+        pairs are ordered differently.
+    """
+    # Swap the last two name components for '.weight' so the NeMo -> TRT LLM weight-name mapping is reused.
+    weight_name = '.'.join(key.split('.')[:-2]) + '.weight'
+    trt_llm_base = '.'.join(get_trt_llm_keyname(weight_name).split('.')[:-1])
+    # The gate keys sit directly under the layer prefix.
+    gate_base = get_trt_llm_prefix(key) + '.mlp.gate'
+
+    return (
+        (trt_llm_base + weight_scaling_suffix, trt_llm_base + activation_scaling_suffix),
+        (gate_base + activation_scaling_suffix, gate_base + weight_scaling_suffix),
+    )
+
+
+def save_scaling_factor(scaling_factors: dict, key: str, val: torch.Tensor, config: dict):
+    """Record the reciprocals of ``val[0]`` (activation) and ``val[1]`` (weight) under their TRT-LLM keys.
+
+    Entries whose name lacks "scale_fwd" are ignored. ``scaling_factors`` is updated in place and returned.
+    """
+    if "scale_fwd" in key:
+        activation_factor, weights_factor = (1 / val[idx].view(1) for idx in (0, 1))
+        (weights_key, activation_key), (gate_activation_key, gate_weight_key) = get_scaling_factor_keys(key)
+        updates = {activation_key: activation_factor, weights_key: weights_factor}
+
+        # With a split gated activation, the same factors are also stored under the gate keys.
+        is_fc1 = any(name in key for name in ("mlp.dense_h_to_4h", "mlp.linear_fc1"))
+        if config.get("split_gated_activation", False) and is_fc1:
+            updates[gate_activation_key] = activation_factor
+            updates[gate_weight_key] = weights_factor
+        scaling_factors.update(updates)
+
+    return scaling_factors
+
+
+def cast_val_datatype(vals, trt_llm_key, storage_type, is_fp8_model, scaling_factors):
+    """Cast ``vals`` to ``storage_type``; in an FP8 model, values whose key contains a weight-scaled prefix are
+    first divided by that scale in float32 and then stored as ``torch.float8_e4m3fn`` instead.
+    """
+    if is_fp8_model:
+        fp8_storage_type = torch.float8_e4m3fn
+        for scale_key in scaling_factors:
+            if not scale_key.endswith(weight_scaling_suffix):
+                continue
+            prefix = scale_key.split(weight_scaling_suffix)[0]
+            if prefix in trt_llm_key:
+                scale = scaling_factors[prefix + weight_scaling_suffix]
+                vals = [val.to(torch.float32) / scale for val in vals]
+                storage_type = fp8_storage_type
+                break  # only the first matching prefix is applied
+    return [val.to(storage_type) for val in vals]
+
+
+def split_val_gate(vals: List[np.ndarray], convert_on_device: bool):
+    """Halve the last axis of ``vals`` (only ``vals[0]`` on device); returns ``[first_halves, second_halves]``."""
+    if not convert_on_device:
+        halves = (np.split(arr, 2, axis=-1) for arr in vals)
+        return list(zip(*halves))
+    return [[half] for half in torch.chunk(vals[0], 2, axis=-1)]
+
+
+# Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head
+# are not split as there is only one head per key/value.
+@torch.no_grad()
+def split_and_save_weight(
+    tp_rank, saved_dir, split_factor, key, vals, storage_type, act_range, config, scaling_factors=None
+):
+    """Convert one checkpoint weight (or its per-rank shards) and pass the result to the save helpers.
+
+    ``key`` selects the branch: shared layernorm/bias weights, dense and MLP projections, fused QKV bias or
+    weight, MoE router and experts; a key matching no branch only prints a warning. ``vals`` may be a single
+    value or a list, ``config`` supplies the switches read below and ``scaling_factors`` (optional) is used
+    by the FP8 cast. Returns the module-level ``weights_dict``.
+    """
+    use_attention_nemo_shape = config.get("use_attention_nemo_shape", False)
+    split_gated_activation = config.get("split_gated_activation", False)
+    num_attention_heads = config.get("num_attention_heads", 0)
+    tp_size = config.get("tp_size", 1)
+    int8_outputs = config.get("int8_outputs", None)
+    multi_query_mode = config.get("multi_query_mode", False)
+    num_kv_heads = config.get("num_kv_heads", num_attention_heads)
+    size_per_head = config.get("kv_channels", None)
+    convert_on_device = config.get("convert_on_device", False)
+    is_fp8_model = config.get("fp8_quantized", False)
+    use_fp8_kv_cache = config.get("fp8_kvcache", False)
+    save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only"
+
+    trt_llm_key = get_trt_llm_keyname(key)
+    vals = vals if isinstance(vals, list) else [vals]
+
+    if config.get("transpose_weights", False) and vals[0].ndim == 2:
+        vals = [val.T for val in vals]
+    if "layernorm.weight" in key and config.get("apply_layernorm_1p", False):
+        vals = [val.float() + 1.0 for val in vals]
+
+    vals = cast_val_datatype(vals, trt_llm_key, storage_type, is_fp8_model, scaling_factors or {})
+    if convert_on_device:
+        assert len(vals) == 1  # Should only convert a single device param per call
+        assert torch.is_tensor(vals[0])
+    elif torch.is_tensor(vals[0]):
+        vals = [torch_to_numpy(val.cpu()) for val in vals]
+
+    if any_word_in_key(
+        key,
+        input_layernorm_keys
+        + pre_layernorm_keys
+        + attention_dense_bias_keys
+        + post_layernorm_keys
+        + mlp_proj_bias_keys
+        + final_layernorm_keys,
+    ) and (tp_rank == 0 or convert_on_device):
+        # shared weights, only need to convert the weights of rank 0
+        save_val(vals[0], saved_dir, trt_llm_key)
+
+    elif any_word_in_key(key, attention_dense_weight_keys + mlp_proj_weight_keys):
+        if convert_on_device:
+            save_val(vals[0], saved_dir, trt_llm_key)
+        else:
+            cat_dim = 0
+            val = np.concatenate(vals, axis=cat_dim)
+            split_vals = np.split(val, split_factor, axis=cat_dim)
+            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
+
+        # NOTE(review): `val`/`cat_dim` are bound only on the host path above (also in both MLP branches below).
+        if act_range is not None and int8_outputs == "all":
+            base_key = trt_llm_key.replace(".weight", "")
+            vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode)
+            write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor)
+
+    elif any_word_in_key(key, mlp_fc_keys):
+        if split_gated_activation:
+            vals, gates = split_val_gate(vals, convert_on_device)
+
+        if convert_on_device:
+            save_val(vals[0], saved_dir, trt_llm_key)
+        else:
+            cat_dim = -1
+            val = np.concatenate(vals, axis=cat_dim)
+            split_vals = np.split(val, split_factor, axis=cat_dim)
+            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
+
+        if act_range is not None and int8_outputs == "all":
+            base_key = trt_llm_key.replace(".weight", "")
+            vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode)
+            write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor)
+
+        if split_gated_activation:
+            assert not save_int8
+            layer_prefix = get_trt_llm_prefix(key)
+            gate_key = layer_prefix + '.mlp.gate' + get_suffix(trt_llm_key)
+            if convert_on_device:
+                save_val(gates[0], saved_dir, gate_key)
+            else:
+                gate = np.concatenate(gates, axis=cat_dim)
+                split_vals = np.split(gate, split_factor, axis=cat_dim)
+                save_split(split_vals, saved_dir, gate_key, tp_rank, split_factor)
+
+    elif any_word_in_key(key, mlp_dense_2_keys):
+        if convert_on_device:
+            save_val(vals[0], saved_dir, trt_llm_key)
+        else:
+            cat_dim = -1
+            val = np.concatenate(vals, axis=cat_dim)
+            split_vals = np.split(val, split_factor, axis=cat_dim)
+            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
+
+        if act_range is not None and int8_outputs == "all":
+            base_key = trt_llm_key.replace(".weight", "")
+            vals_i8 = generate_int8(val, act_range, multi_query_mode=multi_query_mode)
+            write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor)
+
+    elif any_word_in_key(key, attention_qkv_bias_keys):
+        qkv_hidden_dim = vals[0].shape[0]
+        size_per_head = qkv_hidden_dim // (num_attention_heads + 2 * num_kv_heads)
+        q_num = num_attention_heads // num_kv_heads
+
+        # We first concat all sub weights per tp rank together.
+        len_vals = len(vals)
+        if convert_on_device:
+            val = vals[0]
+        else:
+            val = np.concatenate(vals, axis=0)
+        val = val.reshape(num_kv_heads * len_vals // tp_size, q_num + 2, size_per_head)
+
+        # Split the QKV to separate variables.
+        if convert_on_device:
+            qkv = torch.split(val, [q_num, 1, 1], dim=1)
+            split_vals = torch.concatenate([qkv[0].reshape(-1), qkv[1].reshape(-1), qkv[2].reshape(-1)], dim=0)
+            save_val(split_vals, saved_dir, trt_llm_key)
+        else:
+            qkv = np.split(val, [q_num, q_num + 1], axis=1)
+            q_split = np.split(qkv[0], split_factor, axis=0)
+            k_split = np.split(qkv[1], split_factor, axis=0)
+            v_split = np.split(qkv[2], split_factor, axis=0)
+
+            # Concatenate Q, K, and V together
+            split_vals = [
+                np.concatenate([q_split[i].reshape(-1), k_split[i].reshape(-1), v_split[i].reshape(-1)], axis=0)
+                for i in range(split_factor)
+            ]
+            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
+
+    elif any_word_in_key(key, attention_qkv_weight_keys):
+        assert use_attention_nemo_shape, "Only support NEMO shape for QKV weights"
+        hidden_dim = vals[0].shape[0]
+        if size_per_head is None:
+            size_per_head = hidden_dim // num_attention_heads
+        q_num = num_attention_heads // num_kv_heads
+
+        # When the merge factor exceeds 1, the 'vals' list will have multiple entries.
+        # Depending on the format, 'vals' can look like either [QQQQ..KV, QQQQ..KV, ...](for GQA) or [QKV, QKV, ...](for MHA).
+        # We first concat all sub weights per tp rank together.
+        if convert_on_device:
+            val = vals[0].reshape(hidden_dim, num_kv_heads // tp_size, q_num + 2, size_per_head)
+            qkv = torch.split(val, [q_num, 1, 1], dim=2)
+            split_vals = torch.concatenate(
+                [qkv[0].reshape(hidden_dim, -1), qkv[1].reshape(hidden_dim, -1), qkv[2].reshape(hidden_dim, -1)], dim=1
+            )
+            save_val(split_vals, saved_dir, trt_llm_key)
+        else:
+            len_vals = len(vals)
+            val = np.concatenate(vals, axis=1)
+            val = val.reshape(hidden_dim, num_kv_heads * len_vals // tp_size, q_num + 2, size_per_head)
+
+            # Split the QKV to separate variables.
+            qkv = np.split(val, [q_num, q_num + 1], axis=2)
+
+            query_groups_shape = qkv[0].shape
+            if len(query_groups_shape) > 1:
+                if (query_groups_shape[1] % split_factor) != 0:
+                    raise Exception(
+                        "Number of query groups of the models is {0}. Please select tensor parallelism size "
+                        "that can split the number of query groups to equal number of query matrices in the "
+                        "each GPU.".format(query_groups_shape[1])
+                    )
+
+            q_split = np.split(qkv[0], split_factor, axis=1)
+            k_split = np.split(qkv[1], split_factor, axis=1)
+            v_split = np.split(qkv[2], split_factor, axis=1)
+
+            # Concatenate Q, K, and V together
+            split_vals = [
+                np.concatenate(
+                    [
+                        q_split[i].reshape(hidden_dim, -1),
+                        k_split[i].reshape(hidden_dim, -1),
+                        v_split[i].reshape(hidden_dim, -1),
+                    ],
+                    axis=1,
+                )
+                for i in range(split_factor)
+            ]
+            save_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
+
+        if save_int8:
+            cat_dim = -1  # previously unbound in this branch; the int8 QKV data is split along the last axis
+            base_key = trt_llm_key.replace(".weight", "")
+            vals_i8 = generate_int8(val, act_range, is_qkv=True, multi_query_mode=multi_query_mode)
+            kv_cache_only = int8_outputs == "kv_cache_only"
+            write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, split_factor, kv_cache_only=kv_cache_only)
+
+        if use_fp8_kv_cache:
+            base_key = trt_llm_key.replace('.qkv.weight', '')
+            scaling_factor = torch.FloatTensor([1.0])
+            save_val(scaling_factor, saved_dir, base_key + '.kv_cache_scaling_factor')
+
+    elif any_word_in_key(key, attention_not_mapped_keys):
+        pass
+
+    elif any_word_in_key(key, mlp_router_keys):
+        val = np.concatenate(vals, axis=1)
+        save_val(val, saved_dir, trt_llm_key)
+
+    elif any_word_in_key(key, mlp_fc_expert_keys):
+        cat_dim = -1
+        val = np.concatenate(vals, axis=cat_dim)
+        w1, w3 = np.split(val, 2, axis=1)
+        # w1 splits
+        split_w1s = np.split(w1, split_factor, axis=1)
+        # w3 splits
+        split_w3s = np.split(w3, split_factor, axis=1)
+
+        split_vals = [np.concatenate(item, axis=1) for item in zip(split_w3s, split_w1s)]
+        save_expert_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
+
+    elif any_word_in_key(key, mlp_proj_experts_keys):
+        cat_dim = -1
+        val = np.concatenate(vals, axis=cat_dim)
+        split_vals = np.split(val, split_factor, axis=cat_dim)
+        save_expert_split(split_vals, saved_dir, trt_llm_key, tp_rank, split_factor)
+    else:
+        print(f"[WARNING] {key} not handled by converter")
+
+    return weights_dict
+
+
+def split(v: Union[np.ndarray, torch.Tensor], tp_size: int, idx: int, dim: int = 0):
+    """Split ``v`` along ``dim`` for ``tp_size`` ranks and return slice ``idx`` in contiguous form."""
+    if tp_size == 1:
+        return v
+
+    # One-dimensional inputs are always split along axis 0, whatever ``dim`` says.
+    axis = 0 if len(v.shape) == 1 else dim
+    if not torch.is_tensor(v):
+        return np.ascontiguousarray(np.split(v, tp_size, axis=axis)[idx])
+    return torch.split(v, v.size(axis) // tp_size, dim=axis)[idx].contiguous()
+
+
+def init_model_parallel_from_nemo(reshard_model):
+    """Split the MPI communicators according to the current Megatron parallel state.
+
+    When ``reshard_model`` is set and the pipeline-parallel size exceeds 1, the pipeline dimension is
+    folded into data parallelism. Returns ``(mp_rank, dp_rank, tp_size, pp_size, dp_size)``.
+    """
+    from megatron.core import parallel_state as mcore_state
+
+    pp_size = mcore_state.get_pipeline_model_parallel_world_size()
+    tp_size = mcore_state.get_tensor_model_parallel_world_size()
+    dp_size = mcore_state.get_data_parallel_world_size()
+    tp_rank = mcore_state.get_tensor_model_parallel_rank()
+    pp_rank = mcore_state.get_pipeline_model_parallel_rank()
+    dp_rank = mcore_state.get_data_parallel_rank()
+    if reshard_model and pp_size > 1:
+        dp_rank = torch.distributed.get_rank() // tp_size
+        dp_size, pp_size, pp_rank = dp_size * pp_size, 1, 0
+    mp_rank = tp_size * pp_rank + tp_rank
+    # TensorRT-LLM NCCL plugins refer to the locally split comm, so the cpp MPI World Comm is split first:
+    # MpiComm::split -> MpiComm::setSession -> LOCAL_COMM_SESSION (used in allReducePlugin.cpp)
+    tensorrt_llm.bindings.MpiComm.split(dp_rank, mp_rank)
+    # The python mpi communicator is split the same way and becomes the global world communicator
+    local_comm = mpi_comm().Split(color=dp_rank, key=mp_rank)
+    from mpi4py import MPI
+
+    MPI.COMM_WORLD = local_comm
+    return mp_rank, dp_rank, tp_size, pp_size, dp_size
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
new file mode 100644
index 000000000000..d9155f923f18
--- /dev/null
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
new file mode 100644
index 000000000000..34cb8f1eca19
--- /dev/null
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -0,0 +1,706 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import functools
+import json
+import logging
+import os
+import pickle
+import shutil
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import numpy as np
+import torch
+import yaml
+from transformers import AutoTokenizer, GPT2Tokenizer, PreTrainedTokenizer
+
+from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
+from nemo.export.tarutils import TarPath
+from nemo.export.tiktoken_tokenizer import TiktokenTokenizer
+from nemo.export.utils import load_model_weights, nemo_to_path, torch_dtype_from_precision
+
+try:
+    from nemo.lightning import io
+
+    HAVE_NEMO2 = True
+except (ImportError, ModuleNotFoundError):
+    HAVE_NEMO2 = False
+
+LOGGER = logging.getLogger("NeMo")
+EXTRA_STATE = "extra_state"
+
+
+def load_extra_state_from_bytes(val: Optional[Union[torch.Tensor, BytesIO]]) -> Optional[dict]:
+    """Deserialize one ``extra_state`` entry from its byte storage.
+
+    Args:
+        val (torch.Tensor | BytesIO | None): Byte storage holding the serialized extra_state.
+    Returns:
+        Optional[dict]: The deserialized extra_state; None when ``val`` is None or an empty tensor.
+    """
+    if val is None:
+        return None
+
+    if not isinstance(val, torch.Tensor):
+        # Former storage format: a BytesIO buffer, rewound and read through torch.load.
+        val.seek(0)
+        return torch.load(val, weights_only=True)
+
+    # TransformerEngine moved from _io.BytesIO to torch.Tensor for the extra_state byte storage.
+    if val.numel() == 0:
+        return None
+    # SECURITY: pickle.loads can execute arbitrary code; only load checkpoints from trusted sources.
+    return pickle.loads(val.detach().numpy(force=True).tobytes())
+
+
+def preprocess_scaling_factors_for_local_export(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Replaces the serialized extra states of a state dictionary with stacked per-layer ``scale_fwd`` tensors.
+    Extra states whose key contains 'core_attention' are dropped without being read.
+    Used only for local (non-mcore) path.
+
+    Args:
+        state_dict (dict): Model state dictionary
+    Returns:
+        dict: A new dictionary without extra-state entries, extended with the stacked scales.
+    """
+    layer_num_idx = 3  # Keys are structured as "model.decoder.layers.{layer_number}.{rest}"
+
+    # Single pass: keep regular entries as they are, read ``scale_fwd`` out of the extra states.
+    weights, scales = {}, {}
+    for name, entry in state_dict.items():
+        if EXTRA_STATE not in name:
+            weights[name] = entry
+        elif 'core_attention' not in name:
+            extra_state = load_extra_state_from_bytes(entry)
+            if extra_state is not None and 'scale_fwd' in extra_state:
+                scales[name + '.scale_fwd'] = extra_state['scale_fwd'].cpu()
+
+    combined_scales = {}
+    for first_layer_key in scales:
+        if '.decoder.layers.0' not in first_layer_key:
+            continue
+
+        # Collect this scale for layers 0, 1, 2, ... until the first layer without such an entry.
+        parts = first_layer_key.split('.')
+        per_layer = []
+        while True:
+            parts[layer_num_idx] = str(len(per_layer))
+            scale = scales.get('.'.join(parts))
+            if scale is None:
+                break
+            per_layer.append(scale)
+
+        # The stack is stored under the key with the layer number removed:
+        # "model.decoder.layers.{rest}"
+        del parts[layer_num_idx]
+        combined_scales['.'.join(parts)] = torch.stack(per_layer)
+
+    return weights | combined_scales
+
+
+def rename_extra_states(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Re-keys sharded extra states by layer number, the form used for Megatron export.
+    Extra states without a numeric layer number in their shard key are dropped.
+
+    Args:
+        state_dict (dict): Model state dictionary
+    Returns:
+        dict: A new dictionary with the regular entries followed by the re-keyed extra states.
+    """
+    regular_entries = {}
+    mcore_extra_states = {}
+
+    for name, entry in state_dict.items():
+        if EXTRA_STATE not in name:
+            regular_entries[name] = entry
+            continue
+
+        # Extra-state keys have the form "{base}/{shard_key}", e.g.
+        # "{prefix}.layers.{rest}._extra_state/shard_{layer_number}_{...}"
+        key_base, shard_key = name.split('/')
+        shard_fields = shard_key.split('_')
+        if len(shard_fields) < 2 or not shard_fields[1].isnumeric():
+            continue
+
+        # The layer number is inserted right after "layers":
+        # "{prefix}.layers.{layer_number}.{rest}._extra_state"
+        mcore_key = key_base.replace("layers", f"layers.{shard_fields[1]}")
+        # A list value contributes only its first element.
+        if isinstance(entry, list):
+            entry = entry[0]
+        mcore_extra_states[mcore_key] = entry
+
+    return regular_entries | mcore_extra_states
+
+
+def torch_to_numpy_state_dict(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Converts every tensor of a model state dictionary to a numpy array, in place.
+
+    Args:
+        state_dict (dict): Model state dictionary; its values are overwritten.
+    Returns:
+        dict: The same dictionary object, now holding numpy arrays.
+    """
+    for name, tensor in state_dict.items():
+        if tensor.dtype != torch.bfloat16:
+            state_dict[name] = tensor.numpy()
+            continue
+        # bfloat16 goes through an int16 view that is then reinterpreted with TensorRT-LLM's np_bfloat16.
+        from tensorrt_llm._utils import np_bfloat16
+
+        state_dict[name] = tensor.view(torch.int16).numpy().view(np_bfloat16)
+    return state_dict
+
+
+def update_tokenizer_paths(tokenizer_config: Dict, unpacked_checkpoints_dir):
+    """Point the tokenizer file entries of ``tokenizer_config`` at files found in the unpacked checkpoint.
+
+    For "model", "vocab_file" and "merge_file": a file located by ``get_tokenizer_file_path`` replaces the
+    configured path; otherwise a configured path that does not exist is reset to None. Edits in place.
+    """
+    patterns = {"model": "*.model", "vocab_file": "*vocab*", "merge_file": "*merge*.txt"}
+    for key, file_pattern in patterns.items():
+        configured = tokenizer_config.get(key, None)
+        if configured is None:
+            continue
+        old_path = Path(configured)
+        new_path = unpacked_checkpoints_dir.get_tokenizer_file_path("tokenizer", key, file_pattern)
+        if new_path:
+            LOGGER.debug(f"Update tokenizer {key} {old_path} -> {new_path}")
+            tokenizer_config[key] = new_path
+        elif not old_path.exists():
+            LOGGER.warning(f"Tokenizer {key}'s path {old_path} does not exists: set it to None")
+            tokenizer_config[key] = None
+
+    return tokenizer_config
+
+
+def copy_tokenizer_files(config, out_dir):
+    """Copy the tokenizer files named in ``config`` into ``out_dir`` and point ``config`` at the copies.
+
+    The "model", "vocab_file" and "merge_file" entries are written as "tokenizer", "vocab" and "merges"
+    plus the suffix of the source file. Unset entries and files that do not exist are skipped.
+    ``config`` is modified in place and returned.
+    """
+    basenames = {
+        "model": "tokenizer",
+        "vocab_file": "vocab",
+        "merge_file": "merges",
+    }
+    for key, basename in basenames.items():
+        path = config.get(key, None)
+        if path is None:
+            continue
+        if isinstance(path, str):
+            path = Path(path)
+        if not path.exists():
+            LOGGER.debug(f"Tokenizer {key}: {path} file not found")
+            continue
+
+        dst_path = out_dir / f"{basename}{path.suffix}"
+        config[key] = str(dst_path)
+        LOGGER.debug(f"Copy tokenizer {key}: {path}->{dst_path}")
+
+        # shutil.copy(...) is not used because 'path' may be a TarPath: copy the bytes by hand
+        with path.open('rb') as infile, open(dst_path, 'wb') as outfile:
+            outfile.write(infile.read())
+
+    return config
+
+
+def get_tokenizer_from_nemo2_context(model_context_dir: Path):
+    """
+    Instantiate the tokenizer described by a NeMo 2.0 model context directory.
+
+    Args:
+        model_context_dir (Path): Path to the model context directory.
+
+    Returns:
+        The instantiated tokenizer (various classes possible).
+
+    Raises:
+        ValueError: When ``nemo.lightning`` is unavailable and the configured tokenizer class is unsupported.
+    """
+    if HAVE_NEMO2:
+        # Use NeMo tokenizer loaded from the NeMo 2.0 model context
+        tokenizer_spec = io.load_context(model_context_dir, subpath="model.tokenizer")
+        return build_tokenizer(tokenizer_spec)
+
+    # Otherwise use the local nemo.export SentencePieceTokenizer implementation
+    # or directly a HuggingFace tokenizer, based on the model config
+    with (model_context_dir / "model.yaml").open("r") as stream:
+        tokenizer_config = yaml.safe_load(stream)["tokenizer"]
+
+    tokenizer_module = "nemo.collections.common.tokenizers."
+    target_class = tokenizer_config["_target_"]
+    # Only targets inside the NeMo common tokenizers package are expected here.
+    assert target_class.startswith(tokenizer_module)
+    target_class = target_class.removeprefix(tokenizer_module)
+
+    if target_class == "sentencepiece_tokenizer.SentencePieceTokenizer":
+        return SentencePieceTokenizer(
+            model_path=str(model_context_dir / tokenizer_config["model_path"]),
+            special_tokens=tokenizer_config.get("special_tokens", None),
+            legacy=tokenizer_config.get("legacy", False),
+        )
+    if target_class == "huggingface.auto_tokenizer.AutoTokenizer":
+        return AutoTokenizer.from_pretrained(
+            str(model_context_dir / tokenizer_config["pretrained_model_name"])
+        )
+
+    raise ValueError(f"Unsupported tokenizer type: {tokenizer_module}{target_class}.")
+
+
+def get_tokenizer(tokenizer_dir_or_path: Union[str, Path]) -> PreTrainedTokenizer:
+    """Load the tokenizer found in a decoded NeMo weights directory, or at a tokenizer file path.
+
+    Checked in this order: a "nemo_context" subdirectory, "tokenizer_config.json" (HuggingFace),
+    "vocab.json" (tiktoken) and, as the fallback, a sentencepiece "tokenizer.model" or the given file.
+    """
+    location = Path(tokenizer_dir_or_path)
+    if (location / "nemo_context").exists():
+        return get_tokenizer_from_nemo2_context(location / "nemo_context")
+    if (location / "tokenizer_config.json").exists():
+        return AutoTokenizer.from_pretrained(location)
+    if os.path.exists(os.path.join(location, "vocab.json")):
+        vocab_path = location / "vocab.json" if location.is_dir() else location
+        return build_tokenizer({"library": "tiktoken", "vocab_file": str(vocab_path)})
+
+    model_path = location / "tokenizer.model" if location.is_dir() else location
+    return build_tokenizer({"library": "sentencepiece", "model": str(model_path)})
+
+
+def build_tokenizer(tokenizer):
+    """Builds tokenizer for trt-llm export.
+
+    A config dict is turned into a SentencePiece, Tiktoken or GPT2 tokenizer (ValueError if it names none of
+    them); a NeMo ``TokenizerSpec`` object is patched with an HF-style interface; anything else is returned as-is.
+    """
+    if isinstance(tokenizer, dict):
+        library = tokenizer.get("library")
+        if library == "sentencepiece":
+            return SentencePieceTokenizer(model_path=tokenizer["model"])
+        if library == "tiktoken":
+            return TiktokenTokenizer(vocab_file=tokenizer["vocab_file"])
+        if "GPT2" not in (tokenizer.get("type") or ""):
+            raise ValueError(f'Tokenizer type {library} not handled')
+        gpt2_tokenizer = GPT2Tokenizer(tokenizer["vocab_file"], tokenizer["merge_file"])
+        # An empty string is not a usable special token: fall back to the conventional markers.
+        if gpt2_tokenizer.bos_token_id is None:
+            gpt2_tokenizer.add_special_tokens({"bos_token": "<s>"})
+        if gpt2_tokenizer.eos_token_id is None:
+            gpt2_tokenizer.add_special_tokens({"eos_token": "</s>"})
+        return gpt2_tokenizer
+
+    # For NeMo tokenizers, monkey patch encode & batch_decode methods for unified interface
+    import nemo.collections.common.tokenizers as nemo_tokenizers
+
+    if not isinstance(tokenizer, nemo_tokenizers.TokenizerSpec):
+        return tokenizer
+    if isinstance(tokenizer, nemo_tokenizers.AutoTokenizer):
+        batch_decode = tokenizer.tokenizer.batch_decode  # original methods of the wrapped HF tokenizer
+        encode = tokenizer.tokenizer.encode
+    elif isinstance(tokenizer, nemo_tokenizers.SentencePieceTokenizer):
+        def batch_decode(self, ids):  # HF equivalent based on available SP methods
+            if torch.is_tensor(ids):
+                ids = ids.cpu().numpy()
+            if isinstance(ids, np.ndarray):
+                ids = ids.tolist()
+            return self.tokenizer.decode(ids)
+
+        encode = tokenizer.tokenizer.encode_as_ids
+    else:
+        raise NotImplementedError(f"Patching tokenizer methods for {type(tokenizer)} is not available")
+    tokenizer.bos_token_id, tokenizer.eos_token_id = tokenizer.bos_id, tokenizer.eos_id
+    nemo_tokenizers.TokenizerSpec.encode = encode
+    nemo_tokenizers.TokenizerSpec.batch_decode = batch_decode
+    return tokenizer
+
+
+def load_nemo_config(nemo_ckpt: Union[str, Path]) -> Dict[Any, Any]:
+    """
+    Read the model configuration stored in a NeMo checkpoint.
+
+    A checkpoint containing both 'weights' and 'context' entries is treated as NeMo 2.0 and its
+    configuration is read from 'context/model.yaml'. Anything else is assumed to be NeMo 1.0 and is
+    loaded through UnpackedNemoCheckpointDir.
+
+    Args:
+        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file or directory.
+    Returns:
+        Dict[Any, Any]: The configuration dictionary.
+    """
+    # Directories are browsed directly, anything else is wrapped in a TarPath.
+    ckpt_root = Path(nemo_ckpt) if Path(nemo_ckpt).is_dir() else TarPath(nemo_ckpt)
+
+    is_nemo2 = (ckpt_root / "weights").exists() and (ckpt_root / "context").exists()
+    if not is_nemo2:
+        unpacked_checkpoint_dir = UnpackedNemoCheckpointDir(ckpt_root, load_checkpoints_to_cpu=True)
+        return unpacked_checkpoint_dir.model_config
+
+    # Structure of NeMo 2.0 checkpoints
+    with (ckpt_root / "context" / "model.yaml").open("r") as stream:
+        config = yaml.safe_load(stream)
+
+    return config
+
+
+def get_model_type(nemo_ckpt: Union[str, Path]) -> Optional[str]:
+    """
+    Look up the TensorRT-LLM model type matching the model class recorded in a NeMo checkpoint.
+
+    Args:
+        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file.
+    Returns:
+        Optional[str]: The model type, or None when the configuration has no "_target_" entry.
+    """
+    model_class = load_nemo_config(nemo_ckpt).get("_target_")
+    if not model_class:
+        LOGGER.warning(f"Parameter model_type cannot be determined for {nemo_ckpt} checkpoint.")
+        return None
+
+    # NeMo 2.0 case
+    NEMO2_TO_MODEL_TYPE = {
+        "nemo.collections.llm.gpt.model.base.GPTModel": "gpt",
+        "nemo.collections.llm.gpt.model.llama.LlamaModel": "llama",
+        "nemo.collections.llm.gpt.model.mistral.MistralModel": "llama",
+        "nemo.collections.llm.gpt.model.mixtral.MixtralModel": "llama",
+        "nemo.collections.llm.gpt.model.starcoder.StarcoderModel": "gpt",
+        "nemo.collections.llm.gpt.model.starcoder2.Starcoder2Model": "gpt",
+        "nemo.collections.llm.gpt.model.nemotron.NemotronModel": "gpt",
+        "nemo.collections.llm.gpt.model.gemma.GemmaModel": "gemma",
+        "nemo.collections.llm.gpt.model.phi3mini.Phi3Model": "phi3",
+        "nemo.collections.llm.gpt.model.baichuan.Baichuan2Model": "baichuan",
+        "nemo.collections.llm.gpt.model.chatglm.ChatGLMModel": "chatglm",
+        "nemo.collections.llm.gpt.model.qwen2.Qwen2Model": "qwen",
+    }
+    try:
+        model_type = NEMO2_TO_MODEL_TYPE[model_class]
+    except KeyError:
+        # Log the supported classes, then let the KeyError propagate to the caller.
+        LOGGER.error(
+            f"Model {model_class} not found in the NEMO2_TO_MODEL_TYPE mapping, "
+            "try providing the model_type explicitely for exporting:\n"
+            f"{json.dumps(NEMO2_TO_MODEL_TYPE, indent=2)}"
+        )
+        raise
+
+    LOGGER.info(f"Determined model_type='{model_type}' for {nemo_ckpt} checkpoint.")
+    return model_type
+
+
+def get_weights_dtype(nemo_ckpt: Union[str, Path]) -> Optional[str]:
+    """Determine the weights data type from a NeMo checkpoint for TensorRT-LLM engine build.
+
+    Args:
+        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint file.
+    Returns:
+        Optional[str]: The dtype with any "torch." prefix removed if it can be determined, otherwise None.
+    """
+    # The loaded config may be None when no config was found; treat that as an empty config.
+    model_config = load_nemo_config(nemo_ckpt) or {}
+    torch_dtype = None
+    dtype = None
+    if "_target_" in model_config:
+        # NeMo 2.0 case: the dtype is read from the "_target_" of the params_dtype entry.
+        torch_dtype = model_config["config"]["params_dtype"]["_target_"]
+    elif precision := model_config.get("precision", None):
+        torch_dtype = str(torch_dtype_from_precision(precision))
+
+    if torch_dtype is not None:
+        dtype = torch_dtype.removeprefix("torch.")
+        LOGGER.info(f"Determined weights dtype='{dtype}' for {nemo_ckpt} checkpoint.")
+    else:
+        LOGGER.warning(
+            f"Parameter dtype for model weights cannot be determined for {nemo_ckpt} checkpoint. "
+            "There is no 'precision' field specified in the model_config.yaml file."
+        )
+
+    return dtype
+
+
+def load_distributed_model_weights(
+    nemo_checkpoint: Union[str, Path], mcore_scales_format: bool, torch_tensor: bool = True
+) -> Dict[str, Any]:
+    """
+    Load the `torch_dist` model weights (extra states included) from a NeMo checkpoint.
+    For local export (mcore_scales_format=False) the scaling factors are preprocessed as well.
+
+    Args:
+        nemo_checkpoint (str | Path): Path to the nemo checkpoint.
+        mcore_scales_format (bool): Flag for local vs megatron.core export.
+        torch_tensor (bool): If set to False, the weights are passed through torch_to_numpy_state_dict.
+    Returns:
+        dict: Model state dictionary.
+    """
+    weights = load_model_weights(nemo_checkpoint, load_extra_states=True)
+    weights = weights if torch_tensor else torch_to_numpy_state_dict(weights)
+    weights = rename_extra_states(weights)
+    if mcore_scales_format:
+        return weights
+
+    # Extra states stored as lists are replaced by their first element before preprocessing.
+    unwrapped = {key: val[0] for key, val in weights.items() if EXTRA_STATE in key and isinstance(val, list)}
+    weights.update(unwrapped)
+    return preprocess_scaling_factors_for_local_export(weights)
+
+
+def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Path], mcore_scales_format: bool = True):
+    """Unified model loading for trt-llm export.
+
+    Args:
+        nemo_ckpt (Union[str, Path]): Path to the NeMo checkpoint.
+        nemo_export_dir (Union[str, Path]): Export directory; the model context is copied to "nemo_context" in it.
+        mcore_scales_format (bool): Passed through to load_distributed_model_weights.
+    Returns:
+        tuple: (model weights, model config, tokenizer); the tokenizer is None for "weights" checkpoints.
+    Raises:
+        TypeError: If nemo_ckpt does not exist.
+        Exception: If the checkpoint has neither a "model_weights" nor a "weights" directory.
+    """
+    if not os.path.exists(nemo_ckpt):
+        raise TypeError(f"{nemo_ckpt} does not exist")
+    nemo_dir = nemo_to_path(nemo_ckpt)
+    tokenizer = None
+    try:
+        unpacked_checkpoint_dir = UnpackedNemoCheckpointDir(nemo_dir, load_checkpoints_to_cpu=True)
+        if (nemo_dir / "model_weights").exists():
+            model = load_distributed_model_weights(nemo_ckpt, mcore_scales_format)
+            nemo_model_config = unpacked_checkpoint_dir.model_config
+            if nemo_model_config["tokenizer"].get("library", None) == "huggingface":
+                tokenizer = AutoTokenizer.from_pretrained(
+                    nemo_model_config["tokenizer"]["type"],
+                    use_fast=nemo_model_config["tokenizer"].get("use_fast", False),
+                )
+            else:
+                tokenizer_config = update_tokenizer_paths(nemo_model_config["tokenizer"], unpacked_checkpoint_dir)
+                tokenizer_config = copy_tokenizer_files(tokenizer_config, nemo_export_dir)
+                tokenizer = build_tokenizer(tokenizer_config)
+        elif (nemo_dir / "weights").exists():
+            model = load_distributed_model_weights(nemo_ckpt, mcore_scales_format)
+            io_folder = nemo_dir / "context"
+            if (io_folder / "model.yaml").exists():
+                with open(io_folder / "model.yaml", 'r') as stream:
+                    config = yaml.safe_load(stream)
+
+                nemo_model_config = {}
+                for k, v in config["config"].items():
+                    if isinstance(v, (float, int, str, bool)):
+                        nemo_model_config[k] = v
+                    elif k == "activation_func":
+                        nemo_model_config["activation"] = v["_target_"].rsplit('.', 1)[-1]
+            else:
+                assert HAVE_NEMO2, "nemo_toolkit>=2.0.0 is required to load the model context."
+                config = io.load_context(io_folder, subpath="model.config")
+                nemo_model_config = {}
+                for k, v in config.__dict__.items():
+                    if isinstance(v, (float, int, str, bool)):
+                        nemo_model_config[k] = v
+                    elif k == "activation_func":
+                        if isinstance(v, torch.jit.ScriptFunction):
+                            nemo_model_config["activation"] = v.name
+                        else:
+                            nemo_model_config["activation"] = v.__name__
+
+            if nemo_model_config.get("num_moe_experts") is None:
+                nemo_model_config["num_moe_experts"] = 0
+                nemo_model_config["moe_router_topk"] = 0
+            if nemo_model_config["activation"] == "silu":
+                nemo_model_config["activation"] = "fast-swiglu"
+            elif nemo_model_config["activation"] == "openai_gelu":
+                nemo_model_config["activation"] = "openai-gelu"
+            elif nemo_model_config["activation"] == "squared_relu":
+                nemo_model_config["activation"] = "squared-relu"
+
+            if nemo_model_config.get("add_bias_linear"):
+                nemo_model_config["bias"] = True
+            nemo_model_config["mcore_gpt"] = True
+            nemo_model_config["max_position_embeddings"] = nemo_model_config.get("seq_length", 4096)
+            nemo_model_config["rotary_percentage"] = nemo_model_config.get("rotary_percent", 1.0)
+            # nemo_export_dir may be a plain string, so it is wrapped in Path before joining.
+            shutil.copytree(io_folder, Path(nemo_export_dir) / "nemo_context")
+        else:
+            raise Exception("Not a supported NeMo file format: only distributed MCore NeMo checkpoints are supported.")
+    finally:
+        if isinstance(nemo_dir, TarPath):
+            nemo_dir.tarobject.close()
+    return model, nemo_model_config, tokenizer
+
+
+def cpu_map_location(storage, loc):
+    """Maps storage to CPU regardless of `loc` (used as a `map_location` callable for torch.load)."""
+    return storage.cpu()
+
+
+def gpu_map_location(storage, loc):
+    """Place storage on a GPU for "cuda:<idx>" location tags and on the CPU for "cpu" tags."""
+    if loc.startswith("cpu"):
+        return storage.cpu()
+    if not loc.startswith("cuda"):
+        raise ValueError(f"Not handled {loc}")
+
+    # Wrap the saved device index around the number of GPUs visible to this process.
+    saved_idx = int(loc.split(":")[1])
+    return storage.cuda(saved_idx % torch.cuda.device_count())
+
+
+class UnpackedNemoCheckpointDir:
+    """
+    Caches model config and tokenizer file path when loading from a packed NeMo checkpoint directory.
+    """
+
+    def __init__(
+        self,
+        checkpoints_dir: Union[Path, TarPath],
+        load_checkpoints_to_cpu: bool = False,
+    ):
+        assert isinstance(checkpoints_dir, (Path, TarPath))
+        self._checkpoints_dir = checkpoints_dir
+        self._load_checkpoints_to_cpu = load_checkpoints_to_cpu
+        self._tokenizer_file_paths = {}  # per-instance memo used by get_tokenizer_file_path
+
+    # cached_property keeps the value on the instance instead of in a module-level cache keyed on self.
+    @functools.cached_property
+    def model_config(self):
+        """Returns model config dictionary, or None (with a warning) when no config is found."""
+        model_config = None
+        model_config_filename = "model_config.yaml"
+        model_configs_paths = list(self._checkpoints_dir.rglob(model_config_filename))
+        if model_configs_paths:
+            if len(model_configs_paths) > 1:
+                LOGGER.debug(f"There are more than single {model_config_filename} in" f" {self._checkpoints_dir}")
+            model_config_path = model_configs_paths[0]
+            LOGGER.debug("Loading model config from %s", model_config_path)
+            with model_config_path.open("r") as model_config_file:
+                model_config = yaml.load(model_config_file, Loader=yaml.SafeLoader)
+        else:
+            LOGGER.debug("Searching model config in checkpoints")
+            # try to obtain from checkpoint
+            checkpoint_name = self.checkpoint_name
+            checkpoints_paths = sorted(self._checkpoints_dir.rglob(checkpoint_name))
+            if checkpoints_paths:
+                # assume that parallel ranks 0 checkpoint should have model config embedded
+                checkpoint_path = checkpoints_paths[0]
+                map_location_fn = cpu_map_location if self._load_checkpoints_to_cpu else gpu_map_location
+                # NOTE(review): torch.load may unpickle arbitrary objects; only load trusted checkpoints.
+                model_00 = torch.load(checkpoint_path, map_location=map_location_fn)
+                if "hyper_parameters" in model_00 and "cfg" in model_00["hyper_parameters"]:
+                    model_config = model_00["hyper_parameters"]["cfg"]
+                    LOGGER.debug("Loaded model config from checkpoint %s", checkpoint_path)
+                else:
+                    LOGGER.debug("Could not find model config in checkpoint %s", checkpoint_path)
+
+                del model_00
+
+        if model_config is None:
+            LOGGER.warning("Could not find checkpoint with NeMo model config in %s", self._checkpoints_dir)
+
+        LOGGER.debug("Loaded model config %s", model_config)
+        return model_config
+
+    @property
+    def checkpoints_dir(self):
+        """Returns path to checkpoints directory."""
+        return self._checkpoints_dir
+
+    def get_checkpoints_paths(self, tensor_model_parallel_size=1, pipeline_model_parallel_size=1):
+        """Injects tensor/pipeline model parallel ranks into the filepath.
+        Does nothing if not using model parallelism.
+        """
+        checkpoint_path_without_rank = self.checkpoints_dir / self.checkpoint_name
+
+        def _inject_parallel_ranks(tp_rank, pp_rank):
+            if tensor_model_parallel_size > 1 or pipeline_model_parallel_size > 1:
+                if pipeline_model_parallel_size is None or pipeline_model_parallel_size == 1:
+                    checkpoint_path = (
+                        checkpoint_path_without_rank.parent
+                        / f"mp_rank_{tp_rank:02d}"
+                        / checkpoint_path_without_rank.name
+                    )
+                else:
+                    checkpoint_path = (
+                        checkpoint_path_without_rank.parent
+                        / f"tp_rank_{tp_rank:02d}_pp_rank_{pp_rank:03d}"
+                        / checkpoint_path_without_rank.name
+                    )
+                return checkpoint_path
+            else:
+                return checkpoint_path_without_rank
+
+        return [
+            [
+                _inject_parallel_ranks(tp_rank=tp_rank, pp_rank=pp_rank)
+                for pp_rank in range(pipeline_model_parallel_size)
+            ]
+            for tp_rank in range(tensor_model_parallel_size)
+        ]
+
+    @functools.cached_property
+    def checkpoint_name(self):
+        """Returns the name of the checkpoint file; raises ValueError when none is found."""
+        patterns = [
+            "model_weights.ckpt",  # older megatron checkpoints
+            "*last.ckpt",  # newer format of checkpoints
+        ]
+        for pattern in patterns:
+            model_files = sorted(self._checkpoints_dir.rglob(pattern))
+            if model_files:
+                return model_files[0].name
+
+        raise ValueError(f"Could not find checkpoint files in {self._checkpoints_dir}")
+
+    def get_tokenizer_file_path(self, tokenizer_key, file_key, default_filename_pattern):
+        """Returns path to tokenizer file, or None when it cannot be located (memoized per instance)."""
+        memo_key = (tokenizer_key, file_key, default_filename_pattern)
+        if memo_key in self._tokenizer_file_paths:
+            return self._tokenizer_file_paths[memo_key]
+
+        # A missing (None) model config is treated as empty, which yields "no tokenizer file".
+        model_config = self.model_config or {}
+        file_property = None
+        if tokenizer_key in model_config and file_key in model_config[tokenizer_key]:
+            file_property = model_config[tokenizer_key][file_key]
+        elif file_key in model_config:
+            file_property = model_config[file_key]
+
+        LOGGER.debug("model_config[%s][%s]=%s", tokenizer_key, file_key, file_property)
+        if file_property and file_property.startswith("nemo:"):
+            filename = file_property.split("nemo:")[1]
+            filename_pattern = f"*{filename}"
+        elif file_property and file_property.startswith("/artifacts/"):
+            filename = Path(file_property).name
+            filename_pattern = f"*{filename}"
+        elif file_property is None or file_property == "None":
+            filename_pattern = None
+        else:
+            filename_pattern = default_filename_pattern
+            LOGGER.warning(
+                f"Tokenizer file from config: {tokenizer_key}.{file_key}={file_property} "
+                f"looks like unsupported path. Pattern {filename_pattern} will be used."
+            )
+
+        file_path = None
+        if filename_pattern is not None:
+            files_paths = list(self._checkpoints_dir.glob(filename_pattern))
+            if files_paths:
+                assert len(files_paths) == 1
+                file_path = files_paths[0]
+        self._tokenizer_file_paths[memo_key] = file_path
+        return file_path
diff --git a/nemo/export/trt_llm/qnemo/__init__.py b/nemo/export/trt_llm/qnemo/__init__.py
new file mode 100644
index 000000000000..59b9eb8ae6a6
--- /dev/null
+++ b/nemo/export/trt_llm/qnemo/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm
diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
new file mode 100644
index 000000000000..7fd554a66d14
--- /dev/null
+++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+import subprocess
+import warnings
+from typing import List, Optional
+
+from tensorrt_llm.models import PretrainedConfig
+
+from nemo.export.trt_llm.qnemo.utils import CONFIG_NAME, WEIGHTS_NAME
+
+
+def qnemo_to_tensorrt_llm(
+    nemo_checkpoint_path: str,
+    engine_dir: str,
+    max_input_len: int,
+    max_seq_len: Optional[int],
+    max_batch_size: int,
+    max_prompt_embedding_table_size: int,
+    tensor_parallel_size: Optional[int] = None,
+    pipeline_parallel_size: Optional[int] = None,
+    use_parallel_embedding: bool = False,
+    paged_kv_cache: bool = True,
+    paged_context_fmha: bool = False,
+    remove_input_padding: bool = True,
+    use_lora_plugin: Optional[str] = None,
+    lora_target_modules: Optional[List[str]] = None,
+    max_lora_rank: int = 64,
+    max_num_tokens: Optional[int] = None,
+    opt_num_tokens: Optional[int] = None,
+    max_beam_width: int = 1,
+    multiple_profiles: bool = False,
+    reduce_fusion: bool = True,
+):
+    """Build TensorRT-LLM engine with trtllm-build command in a subprocess.
+
+    The command is executed as an argument list without a shell, so paths containing spaces
+    or shell metacharacters are passed to trtllm-build verbatim.
+
+    Raises:
+        AssertionError: If lora_target_modules is set or no TensorRT-LLM weight files are found.
+        subprocess.CalledProcessError: If trtllm-build exits with a non-zero status.
+    """
+    import shlex  # local import, only used to render the command for display
+
+    assert not lora_target_modules, f"LoRA is not supported for quantized checkpoints, got {lora_target_modules}"
+    warnings.warn(
+        "Note that setting tensor_parallel_size, pipeline_parallel_size and use_parallel_embedding "
+        " parameters for quantized models is done on the calibration step (in PTQ workflow)."
+        " These parameters are ignored when building and running TensorRT-LLM engine below.",
+        UserWarning,
+        stacklevel=3,
+    )
+    num_build_workers = len(glob.glob(os.path.join(nemo_checkpoint_path, WEIGHTS_NAME.format("*"))))
+    assert num_build_workers, f"No TensorRT-LLM weight files found in {nemo_checkpoint_path}"
+    config = PretrainedConfig.from_json_file(os.path.join(nemo_checkpoint_path, CONFIG_NAME))
+    quant_algo = config.quantization.quant_algo
+
+    # For AutoQuant, fc and gate might not be quantized at the same time
+    # TODO: relax this limitation on the TRT-LLM side
+    excluded_modules = config.quantization.exclude_modules or []
+    use_fused_mlp = not any("gate" in name or "fc" in name for name in excluded_modules)
+    use_fused_mlp = use_fused_mlp and 'RecurrentGemma' not in config.architecture
+    use_qdq = quant_algo in ["FP8", "W8A8_SQ_PER_CHANNEL"]
+    speculative_decoding_mode = "medusa" if "Medusa" in config.architecture else None
+
+    def _toggle(enabled):
+        return "enable" if enabled else "disable"
+
+    options = [
+        ("--checkpoint_dir", nemo_checkpoint_path),
+        ("--log_level", "warning"),
+        ("--output_dir", engine_dir),
+        ("--workers", num_build_workers),
+        ("--max_batch_size", max_batch_size),
+        ("--max_input_len", max_input_len),
+        ("--max_beam_width", max_beam_width),
+        ("--max_prompt_embedding_table_size", max_prompt_embedding_table_size),
+        ("--paged_kv_cache", _toggle(paged_kv_cache)),
+        ("--use_paged_context_fmha", _toggle(paged_context_fmha)),
+        ("--remove_input_padding", _toggle(remove_input_padding)),
+        ("--multiple_profiles", _toggle(multiple_profiles)),
+        ("--reduce_fusion", _toggle(reduce_fusion)),
+        ("--use_fused_mlp", _toggle(use_fused_mlp)),
+    ]
+    if not use_qdq:
+        options.append(("--gemm_plugin", "auto"))
+    if max_seq_len is not None:
+        options.append(("--max_seq_len", max_seq_len))
+    if max_num_tokens is None:
+        max_num_tokens = max_batch_size * max_input_len
+    options.append(("--max_num_tokens", max_num_tokens))
+    if opt_num_tokens is not None:
+        options.append(("--opt_num_tokens", opt_num_tokens))
+    if speculative_decoding_mode:
+        options.append(("--speculative_decoding_mode", speculative_decoding_mode))
+
+    # One "flag value" pair per displayed line; values are quoted for display only.
+    rendered = [f"{flag} {shlex.quote(str(value))}" for flag, value in options]
+    print("trtllm-build command:")
+    print(" \\\n  ".join(["trtllm-build", *rendered]))
+
+    build_cmd = ["trtllm-build"]
+    for flag, value in options:
+        build_cmd += [flag, str(value)]
+    subprocess.run(build_cmd, check=True)
diff --git a/nemo/export/trt_llm/qnemo/tokenizer_utils.py b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
new file mode 100644
index 000000000000..37b45521dcca
--- /dev/null
+++ b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+
+from omegaconf import OmegaConf
+from transformers import AutoTokenizer
+
+from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
+from nemo.export.tiktoken_tokenizer import TiktokenTokenizer
+
+# TODO: use get_nmt_tokenizer helper below to instantiate tokenizer once environment / dependencies get stable
+# from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
+
+TOKENIZER_CONFIG_FILE = "tokenizer_config.yaml"
+TOKENIZER_DIR = "tokenizer"
+LOGGER = logging.getLogger("NeMo")
+
+
+def get_nmt_tokenizer(nemo_checkpoint_path: str):
+    """Build tokenizer from Nemo tokenizer config.
+
+    Supported libraries are 'huggingface', 'sentencepiece' and 'tiktoken'; others raise NotImplementedError.
+    """
+    LOGGER.info(f"Initializing tokenizer from {TOKENIZER_CONFIG_FILE}")
+    tokenizer_cfg = OmegaConf.load(os.path.join(nemo_checkpoint_path, TOKENIZER_CONFIG_FILE))
+    library = tokenizer_cfg.library
+    legacy = tokenizer_cfg.get("sentencepiece_legacy", library == "sentencepiece")
+
+    if library == "huggingface":
+        LOGGER.info(f"Getting HuggingFace AutoTokenizer with pretrained_model_name: {tokenizer_cfg.type}")
+        tokenizer = AutoTokenizer.from_pretrained(tokenizer_cfg["type"], use_fast=tokenizer_cfg.get("use_fast", False))
+    elif library == "sentencepiece":
+        LOGGER.info(f"Getting SentencePieceTokenizer with model: {tokenizer_cfg.model}")
+        tokenizer = SentencePieceTokenizer(
+            model_path=os.path.join(nemo_checkpoint_path, tokenizer_cfg.model), legacy=legacy
+        )
+    elif library == "tiktoken":
+        LOGGER.info(f"Getting TiktokenTokenizer with file: {tokenizer_cfg.vocab_file}")
+        tokenizer = TiktokenTokenizer(vocab_file=os.path.join(nemo_checkpoint_path, tokenizer_cfg.vocab_file))
+    else:
+        raise NotImplementedError("Currently we only support 'huggingface', 'sentencepiece' and 'tiktoken' tokenizer libraries.")
+    return tokenizer
diff --git a/nemo/export/trt_llm/qnemo/utils.py b/nemo/export/trt_llm/qnemo/utils.py
new file mode 100644
index 000000000000..a2bd74d3ff4c
--- /dev/null
+++ b/nemo/export/trt_llm/qnemo/utils.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pathlib import Path
+
+from nemo.export.tarutils import TarPath
+
+CONFIG_NAME = "config.json"  # config file whose presence is_qnemo_checkpoint checks for
+WEIGHTS_NAME = "rank{}.safetensors"  # weights file name template; "{}" is filled in via str.format
+
+
+def is_qnemo_checkpoint(path: str) -> bool:
+    """Detect if a given path is a TensorRT-LLM a.k.a. "qnemo" checkpoint based on config & tensor data presence."""
+    root = Path(path) if os.path.isdir(path) else TarPath(path)
+    try:
+        return (root / CONFIG_NAME).exists() and (root / WEIGHTS_NAME.format(0)).exists()
+    finally:
+        # The tar handle opened for a non-directory path is closed explicitly so it does not leak.
+        if isinstance(root, TarPath):
+            root.tarobject.close()
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
new file mode 100755
index 000000000000..a0c8d52b9895
--- /dev/null
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -0,0 +1,133 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+import tensorrt_llm
+from tensorrt_llm._common import check_max_num_tokens
+from tensorrt_llm.builder import BuildConfig
+from tensorrt_llm.commands.build import build as build_trtllm
+from tensorrt_llm.logger import logger
+from tensorrt_llm.lora_manager import LoraConfig
+from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights
+from tensorrt_llm.plugin import PluginConfig
+
+MODEL_NAME = "NeMo"
+
+LOGGER = logging.getLogger("NeMo")
+
+
+def build_and_save_engine(
+    max_input_len=1024,
+    max_output_len=1024,
+    max_batch_size=4,
+    model_dir=None,
+    model_weights=None,
+    model_config=None,
+    model_type='gpt',
+    lora_ckpt_list=None,
+    use_lora_plugin=None,
+    max_lora_rank=64,
+    lora_target_modules=None,
+    max_prompt_embedding_table_size=0,
+    paged_kv_cache: bool = True,
+    remove_input_padding: bool = True,
+    paged_context_fmha: bool = False,
+    use_refit: bool = False,
+    max_num_tokens: int = None,
+    max_seq_len: int = None,
+    opt_num_tokens: int = None,
+    max_beam_width: int = 1,
+    tokens_per_block: int = 128,
+    multiple_profiles: bool = False,
+    gpt_attention_plugin: str = "auto",
+    gemm_plugin: str = "auto",
+    reduce_fusion: bool = False,
+    gather_context_logits: bool = False,
+    gather_generation_logits: bool = False,
+):
+    """Build a TensorRT-LLM engine from the given weights and config and save it to model_dir.
+
+    Returns the built engine. Raises AttributeError when tensorrt_llm.models provides no class
+    for the architecture named in model_config.
+    """
+    architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture
+    try:
+        model_cls = getattr(tensorrt_llm.models, architecture)
+    except Exception as err:
+        raise AttributeError(f"Could not find TRTLLM model class: {architecture}!") from err
+
+    logger.set_level("info")
+    plugin_config = PluginConfig()
+    plugin_config.gpt_attention_plugin = gpt_attention_plugin
+    plugin_config.gemm_plugin = gemm_plugin
+    if paged_kv_cache:
+        plugin_config.enable_paged_kv_cache(tokens_per_block=tokens_per_block)
+    else:
+        plugin_config.paged_kv_cache = False
+    plugin_config.remove_input_padding = remove_input_padding
+    plugin_config.use_paged_context_fmha = paged_context_fmha
+    plugin_config.multiple_profiles = multiple_profiles
+    plugin_config.reduce_fusion = reduce_fusion
+
+    max_num_tokens, opt_num_tokens = check_max_num_tokens(
+        max_num_tokens=max_num_tokens,
+        opt_num_tokens=opt_num_tokens,
+        max_seq_len=max_seq_len,
+        max_batch_size=max_batch_size,
+        max_input_len=max_input_len,
+        max_beam_width=max_beam_width,
+        remove_input_padding=remove_input_padding,
+        enable_context_fmha=plugin_config.context_fmha,
+        tokens_per_block=tokens_per_block,
+        multiple_profiles=multiple_profiles,
+    )
+
+    build_dict = {
+        'max_input_len': max_input_len,
+        'max_output_len': max_output_len,
+        'max_batch_size': max_batch_size,
+        'max_beam_width': max_beam_width,
+        'max_seq_len': max_seq_len,
+        'max_num_tokens': max_num_tokens,
+        'opt_num_tokens': opt_num_tokens,
+        'max_prompt_embedding_table_size': max_prompt_embedding_table_size,
+        'gather_context_logits': gather_context_logits,
+        'gather_generation_logits': gather_generation_logits,
+        'strongly_typed': False,
+        'builder_opt': None,
+        'use_refit': use_refit,
+        'multiple_profiles': multiple_profiles,
+    }
+    build_config = BuildConfig.from_dict(build_dict, plugin_config=plugin_config)
+
+    if use_lora_plugin is not None:
+        # build_config.plugin_config.set_lora_plugin(use_lora_plugin)
+        build_config.plugin_config._lora_plugin = use_lora_plugin
+        lora_config = LoraConfig(lora_dir=lora_ckpt_list, lora_ckpt_source='nemo', max_lora_rank=max_lora_rank)
+        if lora_target_modules is not None:
+            lora_config.lora_target_modules = lora_target_modules
+        build_config.lora_config = lora_config
+
+    model = model_cls.from_config(model_config)
+    model = optimize_model(
+        model,
+        use_parallel_embedding=model_config.use_parallel_embedding,
+        share_embedding_table=model_config.share_embedding_table,
+    )
+    preprocess_weights(model_weights, model_config)
+    model.load(model_weights)
+    engine = build_trtllm(model, build_config)
+    engine.save(model_dir)
+    return engine
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
new file mode 100644
index 000000000000..b3d504cd86ea
--- /dev/null
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -0,0 +1,931 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import csv
+import json
+import logging
+import os
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional
+
+import numpy as np
+import tensorrt as trt
+import tensorrt_llm
+import torch
+from mpi4py.futures import MPIPoolExecutor
+from tensorrt_llm.builder import Engine
+from tensorrt_llm.lora_manager import LoraManager
+from tensorrt_llm.quantization import QuantMode
+from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig
+from transformers import PreTrainedTokenizer
+
+LOGGER = logging.getLogger("NeMo")
+# Set to False below when `tensorrt_llm.bindings.GptJsonConfig` cannot be imported.
+use_trtllm_bindings = True
+try:
+    from tensorrt_llm.bindings import GptJsonConfig
+except Exception:
+    use_trtllm_bindings = False
+# Set to False below when this TRT-LLM build does not export DISABLE_TORCH_DEVICE_SET.
+TRTLLM_SUPPORTS_DEVICE_DISABLE = True
+try:
+    from tensorrt_llm.runtime.generation import DISABLE_TORCH_DEVICE_SET
+except (ImportError, ModuleNotFoundError):
+    TRTLLM_SUPPORTS_DEVICE_DISABLE = False
+
+
+@dataclass
+class TensorrtLLMHostContext:
+    """The host side context for TRT LLM inference, as built and returned by `load`."""
+
+    executor: MPIPoolExecutor = None  # worker pool created by `load`; None when the engine runs in-process
+    world_size: int = 1  # `pretrained_config.mapping.world_size` from the engine's config.json
+    tokenizer: PreTrainedTokenizer = None  # used by `generate` to encode prompts and decode outputs
+    max_batch_size: int = 0  # `build_config.max_batch_size` from the engine's config.json
+    max_input_len: int = 0  # `build_config.max_input_len` from the engine's config.json
+    add_bos: bool = False  # whether `prepare_input_tensors` prepends the tokenizer's BOS token id
+
+
+@dataclass
+class TensorrtLLMWorkerContext:
+    """The MPI worker side context for TRT LLM inference."""
+
+    decoder: ModelRunner | ModelRunnerCpp = None  # runner stored by `_load` or `load_distributed`
+    sampling_config: SamplingConfig = None  # stored by `_load`; `_forward` reads end_id, pad_id, num_beams from it
+    lora_manager: LoraManager = None  # NOTE(review): not assigned anywhere in this module - confirm it is needed
+    max_batch_size: int = 0  # engine batch-size limit asserted by `_forward`
+    max_input_len: int = 0  # engine input-length limit asserted by `_forward`
+
+
+# Module-level worker state: filled in by `_load` / `load_distributed`, read by `_forward` and `refit`.
+tensorrt_llm_worker_context = TensorrtLLMWorkerContext()
+
+
+def _read_config(config_path: Path):
+    """Translate a builder-style engine ``config.json`` into runtime settings.
+
+    Returns:
+        tuple: (model_config, world_size, tensor_parallel_size, pipeline_parallel_size, dtype,
+        max_input_len, max_batch_size), where ``model_config`` is a runtime ``ModelConfig``.
+    """
+    with open(config_path, "r") as f:
+        config = json.load(f)
+    builder_cfg = config["builder_config"]
+
+    tensor_parallel_size = builder_cfg["tensor_parallel"]
+    pipeline_parallel_size = builder_cfg["pipeline_parallel"]
+    world_size = tensor_parallel_size * pipeline_parallel_size
+    assert world_size <= torch.cuda.device_count(), f"Not enough GPUs, requesting {world_size}"
+
+    total_heads = builder_cfg["num_heads"]
+    total_kv_heads = builder_cfg.get("num_kv_heads", total_heads)
+    head_size = builder_cfg["head_size"]
+    # Per tensor-parallel rank sizes: KV heads are rounded up, heads and hidden size are floored.
+    hidden_size = builder_cfg["hidden_size"] // tensor_parallel_size
+    num_heads = total_heads // tensor_parallel_size
+    num_kv_heads = (total_kv_heads + tensor_parallel_size - 1) // tensor_parallel_size
+
+    plugin_cfg = config["plugin_config"]
+    tokens_per_block = (plugin_cfg if "tokens_per_block" in plugin_cfg else builder_cfg)["tokens_per_block"]
+
+    # "quantization" (dict) exists for quantized Nemo checkpoints; regular ones use "quant_mode" (default: 0).
+    if quantization := builder_cfg.get("quantization"):
+        quant_mode = QuantMode.from_quant_algo(quantization['quant_algo'], quantization['kv_cache_quant_algo'])
+    else:
+        quant_mode = QuantMode(builder_cfg["quant_mode"])
+
+    model_config = ModelConfig(
+        model_name=builder_cfg["name"],
+        max_batch_size=builder_cfg["max_batch_size"],
+        max_beam_width=builder_cfg["max_beam_width"],
+        vocab_size=builder_cfg["vocab_size"],
+        num_layers=builder_cfg["num_layers"],
+        num_heads=num_heads,
+        num_kv_heads=num_kv_heads,
+        hidden_size=hidden_size,
+        head_size=head_size,
+        gpt_attention_plugin=plugin_cfg["gpt_attention_plugin"],
+        remove_input_padding=plugin_cfg["remove_input_padding"],
+        paged_kv_cache=plugin_cfg["paged_kv_cache"],
+        tokens_per_block=tokens_per_block,
+        max_prompt_embedding_table_size=builder_cfg["max_prompt_embedding_table_size"],
+        dtype=builder_cfg["precision"],
+        lora_plugin=plugin_cfg["lora_plugin"],
+        lora_target_modules=builder_cfg["lora_target_modules"],
+        quant_mode=quant_mode,
+        use_context_fmha_for_generation=plugin_cfg["use_context_fmha_for_generation"],
+        gather_context_logits=builder_cfg["gather_context_logits"],
+        gather_generation_logits=builder_cfg["gather_generation_logits"],
+    )
+
+    limits = (builder_cfg["precision"], builder_cfg["max_input_len"], builder_cfg["max_batch_size"])
+    return (model_config, world_size, tensor_parallel_size, pipeline_parallel_size, *limits)
+
+
+def _load(
+    tokenizer: PreTrainedTokenizer,
+    engine_dir,
+    lora_ckpt_list=None,
+    num_beams=1,
+    use_python_runtime: bool = True,
+    enable_chunked_context: bool = False,
+    max_tokens_in_paged_kv_cache: int = None,
+    multi_block_mode: bool = False,
+):
+    """Create the TRT-LLM runner for this process and store it in the module-level worker context.
+
+    Reads the ``build_config`` limits from ``<engine_dir>/config.json``, builds a runner for the current
+    MPI rank and records it, a ``SamplingConfig`` (end/pad id = tokenizer EOS id) and the batch/input
+    limits for later `_forward` calls. Any exception is printed and re-raised.
+
+    Args:
+        use_python_runtime: build a Python ``ModelRunner`` when True, a ``ModelRunnerCpp`` otherwise.
+        enable_chunked_context, max_tokens_in_paged_kv_cache, multi_block_mode: only forwarded to
+            the ``ModelRunnerCpp`` runner.
+    """
+    try:
+        tensorrt_llm.logger.set_level("info")
+
+        engine_dir = Path(engine_dir)
+        with open(engine_dir / "config.json", "r") as f:
+            build_config = json.load(f)["build_config"]
+        max_batch_size = build_config["max_batch_size"]
+        max_input_len = build_config["max_input_len"]
+        max_beam_width = build_config["max_beam_width"]
+
+        # Arguments shared by both runner flavours.
+        runner_kwargs = dict(
+            engine_dir=engine_dir,
+            lora_dir=lora_ckpt_list,
+            lora_ckpt_source="nemo",
+            rank=tensorrt_llm.mpi_rank(),
+            debug_mode=False,
+        )
+
+        if not use_python_runtime:
+            decoder = ModelRunnerCpp.from_dir(
+                max_batch_size=max_batch_size,
+                max_input_len=max_input_len,
+                max_beam_width=max_beam_width,
+                enable_chunked_context=enable_chunked_context,
+                max_tokens_in_paged_kv_cache=max_tokens_in_paged_kv_cache,
+                multi_block_mode=multi_block_mode,
+                **runner_kwargs,
+            )
+        else:
+            # The Python runner does not receive these two options, hence the warnings.
+            if enable_chunked_context:
+                logging.warning("enable_chunked_context is disabled when using python runtime")
+            if multi_block_mode:
+                logging.warning("multi_block_mode is disabled when using python runtime")
+            decoder = ModelRunner.from_dir(**runner_kwargs)
+
+        eos_id = tokenizer.eos_token_id
+        sampling_config = SamplingConfig(end_id=eos_id, pad_id=eos_id, num_beams=num_beams)
+
+        # Initialize the global context so it can be used during `run` API.
+        worker_ctx = tensorrt_llm_worker_context
+        worker_ctx.decoder = decoder
+        worker_ctx.sampling_config = sampling_config
+        worker_ctx.max_batch_size = max_batch_size
+        worker_ctx.max_input_len = max_input_len
+
+    except Exception as e:
+        print(e)
+        raise e
+
+
+def _forward(
+    input_tensors: List[torch.IntTensor],
+    max_output_len: int,
+    top_k: int = 1,
+    top_p: float = 0.0,
+    temperature: float = 1.0,
+    prompt_table=None,
+    task_vocab_size=None,
+    task_ids: List[int] = None,
+    lora_uids: List[str] = None,
+    stop_words_list=None,
+    bad_words_list=None,
+    no_repeat_ngram_size=None,
+    streaming: bool = False,
+    multiprocessed_env=False,
+    **sampling_kwargs,
+) -> Optional[torch.IntTensor]:
+    """The impl of `forward` API for on a single GPU worker with tensor as IO.
+
+    Returns:
+        Whatever ``decoder.generate(..., return_dict=True)`` returned, on MPI rank 0 or when
+        ``multiprocessed_env`` is set; ``None`` on every other rank.
+
+    Raises:
+        TypeError: If a key of ``sampling_kwargs`` is not an attribute of the stored sampling config.
+    """
+    try:
+        # Loading the global context initialized from the `load` API.
+        decoder = tensorrt_llm_worker_context.decoder
+        assert decoder is not None, "Invalid worker context, decoder is not loaded."
+        sampling_config = tensorrt_llm_worker_context.sampling_config
+        max_batch_size = tensorrt_llm_worker_context.max_batch_size
+        max_input_len = tensorrt_llm_worker_context.max_input_len
+
+        batch_size = len(input_tensors)
+        assert batch_size <= max_batch_size, f"batch size {batch_size} exceedng max batch size {max_batch_size}"
+        max_length = max(t.shape[0] for t in input_tensors)
+        assert max_length <= max_input_len, f"input length {max_length} exceedng max input length {max_input_len}"
+        pad_id = sampling_config.pad_id
+        end_id = sampling_config.end_id
+        num_beams = sampling_config.num_beams
+
+        for k in sampling_kwargs:
+            if not hasattr(sampling_config, k):
+                raise TypeError(f"Unknown sampling args '{k}'")
+
+        tmp_dir = None
+        try:
+            with torch.no_grad():
+                prompt_tasks = None if task_ids is None else ",".join(str(task) for task in task_ids)
+                if prompt_table is not None:
+                    prompt_table = prompt_table.reshape(1, *prompt_table.shape)
+                    tmp_dir = tempfile.TemporaryDirectory()
+                    prompt_table_path = os.path.join(tmp_dir.name, 'prompt_table.npy')
+                    np.save(prompt_table_path, prompt_table.cpu().float().numpy())
+                    prompt_table = prompt_table_path
+                outputs = decoder.generate(
+                    input_tensors,
+                    max_new_tokens=max_output_len,
+                    end_id=end_id,
+                    pad_id=pad_id,
+                    temperature=temperature,
+                    top_k=top_k,
+                    top_p=top_p,
+                    num_beams=num_beams,
+                    stop_words_list=stop_words_list,
+                    bad_words_list=bad_words_list,
+                    lora_uids=lora_uids,
+                    prompt_table_path=prompt_table,
+                    prompt_table=prompt_table,
+                    prompt_tasks=prompt_tasks,
+                    streaming=streaming,
+                    output_sequence_lengths=True,
+                    return_dict=True,
+                    **sampling_kwargs,
+                )
+                torch.cuda.synchronize()
+        finally:
+            # Remove the temporary prompt table file on success and on failure alike.
+            if tmp_dir is not None:
+                tmp_dir.cleanup()
+
+        if tensorrt_llm.mpi_rank() == 0 or multiprocessed_env:
+            return outputs
+        return None
+
+    except Exception as e:
+        print(e)
+        raise e
+
+
+def load(
+    tokenizer: PreTrainedTokenizer,
+    engine_dir: str,
+    lora_ckpt_list: List[str] = None,
+    num_beams: int = 1,
+    use_python_runtime: bool = True,
+    enable_chunked_context: bool = False,
+    max_tokens_in_paged_kv_cache: int = None,
+    multi_block_mode: bool = False,
+) -> TensorrtLLMHostContext:
+    """Load the compiled TRT-LLM engine in every participating process and return the host context.
+
+    The world size is read from ``pretrained_config.mapping.world_size`` in ``<engine_dir>/config.json``:
+
+    * world size 1: the engine is loaded in this process;
+    * world size > 1 while this process already runs with several MPI ranks: the engine is loaded in
+      this process, followed by an MPI barrier;
+    * otherwise: an ``MPIPoolExecutor`` with ``world_size`` workers is created and each submission loads it.
+
+    All three paths pass the same arguments, including ``multi_block_mode``, to `_load`.
+
+    Args:
+        tokenizer: tokenizer stored in the returned context; `_load` takes the EOS id from it.
+        engine_dir: directory holding ``config.json`` and the engine files.
+        lora_ckpt_list: LoRA checkpoint locations handed to the runner as ``lora_dir``.
+        num_beams: beam count stored in the worker's sampling config.
+        use_python_runtime: select the Python ``ModelRunner`` instead of ``ModelRunnerCpp``.
+
+    Returns:
+        TensorrtLLMHostContext: executor (``None`` unless the pool path was taken), world size, tokenizer,
+        engine batch/input limits and whether a BOS token is prepended for this architecture.
+    """
+    config_path = os.path.join(engine_dir, "config.json")
+    with open(config_path, "r") as f:
+        config = json.load(f)
+    world_size = config["pretrained_config"]["mapping"]["world_size"]
+
+    # One argument tuple for every path, so that no option is silently dropped for MPI workers.
+    load_args = (
+        tokenizer,
+        engine_dir,
+        lora_ckpt_list,
+        num_beams,
+        use_python_runtime,
+        enable_chunked_context,
+        max_tokens_in_paged_kv_cache,
+        multi_block_mode,
+    )
+    executor = None
+    if world_size == 1:
+        _load(*load_args)
+    elif tensorrt_llm.mpi_world_size() > 1:
+        _load(*load_args)
+        tensorrt_llm.mpi_barrier()
+    else:
+        executor = MPIPoolExecutor(max_workers=world_size)
+        futures = [executor.submit(_load, *load_args) for _ in range(world_size)]
+        for future in futures:
+            # Re-raises any exception that occurred in a worker.
+            future.result()
+
+    max_batch_size = config["build_config"]["max_batch_size"]
+    max_input_len = config["build_config"]["max_input_len"]
+    architectures_that_need_bos_token = [
+        "GemmaForCausalLM",
+        "LLaMAForCausalLM",
+        "MistralForCausalLM",
+        "MixtralForCausalLM",
+    ]
+    add_bos = config["pretrained_config"]["architecture"] in architectures_that_need_bos_token
+
+    return TensorrtLLMHostContext(
+        executor=executor,
+        world_size=world_size,
+        tokenizer=tokenizer,
+        max_batch_size=max_batch_size,
+        max_input_len=max_input_len,
+        add_bos=add_bos,
+    )
+
+
+def forward(
+    input_tensors: List[torch.IntTensor],
+    max_output_len: int,
+    host_context: TensorrtLLMHostContext,
+    top_k: int = 1,
+    top_p: float = 0.0,
+    temperature: float = 1.0,
+    prompt_table=None,
+    task_vocab_size=None,
+    task_ids: List[int] = None,
+    lora_uids: List[str] = None,
+    stop_words_list=None,
+    bad_words_list=None,
+    no_repeat_ngram_size=None,
+    streaming: bool = False,
+    multiprocessed_env=False,
+    **sampling_kwargs,
+) -> Optional[torch.IntTensor]:
+    """Run `_forward` for the engine described by ``host_context``.
+
+    Checks the batch size and the longest input against the host context limits, then either calls
+    `_forward` in this process (world size 1 or ``multiprocessed_env``) or submits it once per rank
+    to the context's executor and returns the first worker result that is not ``None``.
+
+    Args:
+        input_tensors: token tensors, one per prompt; ``shape[0]`` is taken as the input length.
+        max_output_len: forwarded to `_forward`, which passes it on as ``max_new_tokens``.
+        host_context: context returned by `load`.
+        top_k, top_p, temperature: sampling settings forwarded to `_forward`.
+        multiprocessed_env: when true, always run in this process.
+        **sampling_kwargs: forwarded unchanged to `_forward`, like all remaining arguments.
+
+    Raises:
+        RuntimeError: If every worker returned ``None``.
+    """
+    batch_size = len(input_tensors)
+    max_batch_size = host_context.max_batch_size
+    assert batch_size <= max_batch_size, f"batch size {batch_size} exceedng max batch size {max_batch_size}"
+    max_length = max([t.shape[0] for t in input_tensors])
+    max_input_len = host_context.max_input_len
+    assert max_length <= max_input_len, f"input length {max_length} exceedng max input length {max_input_len}"
+
+    # Keyword arguments common to the in-process call and the per-worker submissions.
+    worker_kwargs = dict(
+        input_tensors=input_tensors,
+        max_output_len=max_output_len,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        prompt_table=prompt_table,
+        task_vocab_size=task_vocab_size,
+        task_ids=task_ids,
+        lora_uids=lora_uids,
+        stop_words_list=stop_words_list,
+        bad_words_list=bad_words_list,
+        no_repeat_ngram_size=no_repeat_ngram_size,
+        streaming=streaming,
+        **sampling_kwargs,
+    )
+
+    world_size = host_context.world_size
+    if world_size == 1 or multiprocessed_env:
+        return _forward(multiprocessed_env=multiprocessed_env, **worker_kwargs)
+
+    # Fan the request out to the executor; ranks other than 0 return None from `_forward`.
+    executor = host_context.executor
+    futures = [executor.submit(_forward, **worker_kwargs) for _ in range(world_size)]
+    for future in futures:
+        result = future.result()
+        if result is not None:
+            return result
+
+    raise RuntimeError("Internal error")
+
+
+def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
+    """Load this rank's TRT-LLM engine into the module-level worker context.
+
+    Does nothing when a Python ``ModelRunner`` is already loaded. Otherwise reads
+    ``config_<torch.distributed rank>.json`` and ``rank<model_parallel_rank>.engine`` from
+    ``engine_dir`` and builds a ``ModelRunner`` whose rank is the current CUDA device.
+
+    Args:
+        engine_dir: directory containing the per-rank config and engine files.
+        model_parallel_rank: selects the engine file and is passed to ``Engine.from_buffer`` as rank.
+        gpus_per_node: upper bound for the tensor-parallel size read from the config.
+
+    Raises:
+        RuntimeError: If ``GptJsonConfig`` could not be imported, if TRT-LLM lacks
+            ``DISABLE_TORCH_DEVICE_SET``, or if that flag is falsy.
+    """
+    if isinstance(tensorrt_llm_worker_context.decoder, ModelRunner):
+        return
+    if not use_trtllm_bindings:
+        # Fail with a clear message instead of a NameError on GptJsonConfig below.
+        raise RuntimeError("load_distributed requires tensorrt_llm.bindings.GptJsonConfig, which failed to import.")
+
+    config_path = Path(engine_dir) / f"config_{torch.distributed.get_rank()}.json"
+    json_config = GptJsonConfig.parse_file(config_path)
+    model_config = json_config.model_config
+    max_batch_size = model_config.max_batch_size
+    max_input_len = model_config.max_input_len
+
+    tp_size = json_config.tensor_parallelism
+    assert tp_size <= gpus_per_node, "Multinode TP is not supported"
+
+    # TRTLLM asserts that rank equals the device num, which is not true for the megatron mapping of TP->DP->PP.
+    # So the engine is built with the model-parallel rank while the runner below gets the current device.
+    # TRTLLM is expected to fix this in future releases
+    # TODO: check if API exists (copied from gptJsonConfig.cpp)
+    # https://github.com/terrykong/TensorRT-LLM/blob/05316d3313360012536ace46c781518f5afae75e/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp#L478
+    engine_filename = f"rank{model_parallel_rank}.engine"
+    serialize_path = Path(engine_dir) / engine_filename
+    with open(serialize_path, "rb") as f:
+        engine_data = bytearray(f.read())
+    with open(config_path) as f:
+        json_config_str = f.read()
+
+    engine = Engine.from_buffer(engine_buffer=engine_data, json_config_str=json_config_str, rank=model_parallel_rank)
+
+    if not TRTLLM_SUPPORTS_DEVICE_DISABLE:
+        raise RuntimeError(
+            "TensorRT-LLM does not support torch device disabling. "
+            "Please upgrade TensorRT-LLM to make use of this feature."
+        )
+    elif not DISABLE_TORCH_DEVICE_SET:
+        raise RuntimeError(
+            "To use TensorRT-LLM's python ModelRunner API in load_distributed(...) "
+            "you must set the env var DISABLE_TORCH_DEVICE_SET=1"
+        )
+
+    default_kwargs = {
+        "max_output_len": None,
+        "lora_dir": None,
+        "debug_mode": False,
+        "lora_ckpt_source": "hf",
+        "medusa_choices": None,
+        "stream": None,
+        "gpu_weights_percent": 1.0,
+        "enable_context_fmha_fp32_acc": False,
+        "multi_block_mode": True,
+    }
+
+    decoder = ModelRunner.from_engine(
+        engine=engine,
+        # We want the engine to have the mp_rank,
+        # but the python runtime to not reassign the device of the current process
+        # So we will set it to the current device
+        rank=torch.cuda.current_device(),
+        **default_kwargs,
+    )
+
+    tensorrt_llm_worker_context.decoder = decoder
+    tensorrt_llm_worker_context.max_batch_size = max_batch_size
+    tensorrt_llm_worker_context.max_input_len = max_input_len
+
+
+def maybe_cast_to_trt_dtype(dtype):
+    """
+    Return ``dtype`` as a TensorRT dtype, converting from a torch dtype when necessary.
+
+    Args:
+        dtype: Input dtype (torch.dtype or trt.DataType)
+
+    Returns:
+        trt.DataType: ``dtype`` itself if it already is one, else its TensorRT counterpart
+
+    Raises:
+        NotImplementedError: If ``dtype`` is neither a trt.DataType nor a torch.dtype
+    """
+    if not isinstance(dtype, (trt.DataType, torch.dtype)):
+        raise NotImplementedError(f"Expects the type to be a tensorrt.DataType or torch.dtype, but got {type(dtype)=}")
+    return dtype if isinstance(dtype, trt.DataType) else tensorrt_llm._utils.torch_dtype_to_trt(dtype)
+
+
+def refit(weights_dict: dict):
+    """
+    Refit TensorRT-LLM by hot-swapping its engine weights.
+
+    Args:
+        weights_dict: Mapping from TRT weight name to the torch tensor holding the new values
+
+    Raises:
+        ValueError: If the decoder is not a ModelRunner, a weight cannot be set, or the refit fails
+    """
+    decoder = tensorrt_llm_worker_context.decoder
+    if not isinstance(decoder, ModelRunner):
+        raise ValueError(
+            f"Refit is only supported with ModelRunner, but export has been configured with {type(decoder)=}"
+        )
+
+    engine = decoder.session.runtime.engine
+    # The session dtype plumbs the model_config's dtype
+    model_dtype = maybe_cast_to_trt_dtype(decoder.session.dtype)
+    assert engine.refittable, "Tried refitting engine without refit enabled"
+    refitter = trt.Refitter(engine=engine, logger=trt.Logger(trt.Logger.ERROR))
+    remaining_refit_weights = set(refitter.get_all_weights())
+    skipped_weights = []
+    for trt_name, weight in weights_dict.items():
+        if trt_name not in remaining_refit_weights:
+            skipped_weights.append(trt_name)
+            continue
+        trt_weight = trt.Weights(model_dtype, weight.data_ptr(), torch.numel(weight))
+        trt_wt_location = trt.TensorLocation.DEVICE if weight.is_cuda else trt.TensorLocation.HOST
+        assert (
+            model_dtype == refitter.get_weights_prototype(trt_name).dtype == maybe_cast_to_trt_dtype(weight.dtype)
+        ), (
+            f"Expected all three of these dtypes to be the same:\n"
+            f"  {model_dtype=}\n"
+            f"  {refitter.get_weights_prototype(trt_name).dtype=}\n"
+            f"  weight.dtype={maybe_cast_to_trt_dtype(weight.dtype)}"
+        )
+        # set_named_weights reports failure through its return value, so check it explicitly.
+        if not refitter.set_named_weights(trt_name, trt_weight, trt_wt_location):
+            raise ValueError(f"Unable to set {trt_name=} {trt_weight=} {trt_wt_location=}")
+        remaining_refit_weights.remove(trt_name)
+    if skipped_weights:
+        logging.warning(
+            f"These weights were ignored during refit since they are not present in engine: {skipped_weights}"
+        )
+    if remaining_refit_weights:
+        logging.warning(f"Weights dict did not contain weights for these named TRT weights: {remaining_refit_weights}")
+
+    if not refitter.refit_cuda_engine():
+        raise ValueError("Refit failed!")
+
+
+def unload_engine():
+    """Drop the worker context's ``ModelRunner``, which should free up device memory.
+
+    Raises ``ValueError`` when the stored decoder is not a ``ModelRunner``.
+    """
+    worker_ctx = tensorrt_llm_worker_context
+    decoder = worker_ctx.decoder
+    if not isinstance(decoder, ModelRunner):
+        raise ValueError(
+            f"unload_engine is only supported with ModelRunner, but export has been configured with {type(decoder)=}"
+        )
+    logging.info("Unloading engine...")
+    del worker_ctx.decoder
+    worker_ctx.decoder = None
+    logging.info("Engine unloaded!")
+
+
+def prepare_input_tensors(
+    input_texts: List[str],
+    host_context: TensorrtLLMHostContext,
+    prompt_table=None,
+    task_vtoken_counts: List[int] = None,
+    task_ids: List[int] = None,
+):
+    """
+    Tokenize the input texts into one integer tensor per prompt.
+
+    The BOS token id is prepended when ``host_context.add_bos`` is set. When a prompt table is given
+    (p-tuning), every prompt is additionally prefixed with the virtual token ids of its task; in that
+    case ``task_ids`` and ``task_vtoken_counts`` are indexed for every prompt and must be provided.
+
+    Args:
+        input_texts: List of input text strings
+        host_context: Context providing the tokenizer and the ``add_bos`` flag
+        prompt_table: p-tuning prompt table; only compared against ``None`` here
+        task_vtoken_counts: Optional list of vtoken counts, indexed by task id
+        task_ids: Optional list with the task id of each input text
+
+    Returns:
+        list: One ``torch.IntTensor`` per input text
+    """
+
+    tokenizer = host_context.tokenizer
+
+    # Optional BOS prefix shared by every prompt.
+    bos_tokens = [tokenizer.bos_token_id] if host_context.add_bos else []
+
+    # Tokenize every prompt behind the optional BOS prefix.
+    input_tokens = [bos_tokens + tokenizer.encode(t) for t in input_texts]
+
+    # If p-tuning is used, we need to prepend vtokens to each input.
+    if prompt_table is not None:
+        # The number of vtokens could be different for each task.
+        for prompt_index, real_tokens in enumerate(input_tokens):
+            task_id = task_ids[prompt_index]
+            num_vtokens = task_vtoken_counts[task_id]
+
+            # Virtual token ids start right after the vocabulary, e.g. 32000, 32001, 32002... when
+            # vocab_size=32000. TRT-LLM will convert each vtoken into its corresponding embedding row
+            # from the prompt table.
+            first_vtoken = tokenizer.vocab_size
+            vtokens = list(range(first_vtoken, first_vtoken + num_vtokens))
+
+            # Concatenate the vtokens with the real tokens
+            input_tokens[prompt_index] = vtokens + real_tokens
+
+    # Convert input token lists to tensors
+    input_tensors = [torch.IntTensor(token_list) for token_list in input_tokens]
+
+    return input_tensors
+
+
+def generate(
+    input_texts: List[str],
+    max_output_len: int,
+    host_context: TensorrtLLMHostContext,
+    top_k: int = 1,
+    top_p: float = 0.0,
+    temperature: float = 1.0,
+    prompt_table=None,
+    task_vocab_size=None,
+    task_vtoken_counts: List[int] = None,
+    task_ids: List[int] = None,
+    lora_uids: List[str] = None,
+    stop_words_list=None,
+    bad_words_list=None,
+    no_repeat_ngram_size=None,
+    streaming: bool = False,
+    output_log_probs=False,
+    multiprocessed_env=False,
+    output_context_logits=False,
+    output_generation_logits=False,
+    **sampling_kwargs,
+) -> Optional[List[List[str]]]:
+    """Generate text completions for ``input_texts`` with the loaded engine.
+
+    Stop/bad word lists are converted with `to_word_list_format` and moved to the current CUDA device.
+    `forward` is always called with ``streaming=False``; the ``streaming`` argument is not used.
+
+    Returns:
+        ``None`` on MPI ranks other than 0; otherwise a 2D string list with shape [batch_size, num_beams],
+        paired with the generation (checked first) or context logits when the matching flag is set.
+    """
+    tokenizer = host_context.tokenizer
+    input_tensors = prepare_input_tensors(input_texts, host_context, prompt_table, task_vtoken_counts, task_ids)
+
+    def _word_list_tensor(words):
+        """Return ``words`` as an int32 tensor on the current CUDA device, or None when not given."""
+        if words is None:
+            return None
+        word_arrays = to_word_list_format(words, tokenizer)
+        return torch.Tensor(word_arrays).to(torch.int32).to(torch.cuda.current_device()).contiguous()
+
+    stop_words_list_tensors = _word_list_tensor(stop_words_list)
+    bad_words_list_tensors = _word_list_tensor(bad_words_list)
+
+    if no_repeat_ngram_size is not None:
+        no_repeat_ngram_size = torch.IntTensor(no_repeat_ngram_size).to(torch.cuda.current_device())
+
+    outputs = forward(
+        input_tensors=input_tensors,
+        max_output_len=max_output_len,
+        host_context=host_context,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        prompt_table=prompt_table,
+        task_vocab_size=task_vocab_size,
+        task_ids=task_ids,
+        lora_uids=lora_uids,
+        stop_words_list=stop_words_list_tensors,
+        bad_words_list=bad_words_list_tensors,
+        no_repeat_ngram_size=no_repeat_ngram_size,
+        streaming=False,
+        output_log_probs=output_log_probs,
+        multiprocessed_env=multiprocessed_env,
+        **sampling_kwargs,
+    )
+
+    assert outputs is not None
+    if tensorrt_llm.mpi_rank() != 0:
+        return None
+
+    output_ids = outputs['output_ids']
+    sequence_lengths = outputs['sequence_lengths']
+    input_lengths = [t.shape[0] for t in input_tensors]
+    output_lines_list = [
+        tokenizer.batch_decode(output_ids[b, :, input_lengths[b] : sequence_lengths[b][0]])
+        for b in range(output_ids.shape[0])
+    ]
+
+    if output_generation_logits:
+        return output_lines_list, outputs['generation_logits']
+    if output_context_logits:
+        return output_lines_list, outputs['context_logits']
+    return output_lines_list
+
+
+def generate_streaming(
+    input_texts: List[str],
+    max_output_len: int,
+    host_context: TensorrtLLMHostContext,
+    top_k: int = 1,
+    top_p: float = 0.0,
+    temperature: float = 1.0,
+    prompt_table=None,
+    task_vocab_size=None,
+    task_vtoken_counts: List[int] = None,
+    task_ids: List[int] = None,
+    lora_uids: List[str] = None,
+    stop_words_list=None,
+    bad_words_list=None,
+    no_repeat_ngram_size=None,
+    **sampling_kwargs,
+) -> Optional[List[List[str]]]:
+    """Stream partial text completions for ``input_texts`` (this function is a generator).
+
+    Prompts are tokenized with `prepare_input_tensors` and `forward` is called with ``streaming=True``.
+
+    Args:
+        input_texts: prompts to complete.
+        max_output_len: forwarded to `forward`.
+        host_context: context returned by `load`.
+        top_k, top_p, temperature: sampling settings forwarded to `forward`.
+        prompt_table, task_vtoken_counts, task_ids: p-tuning inputs passed to `prepare_input_tensors`.
+        stop_words_list, bad_words_list: words that are tokenizer-encoded and repeated per batch entry.
+        **sampling_kwargs: forwarded unchanged to `forward`.
+
+    Yields:
+        A list with one decoded string per input text, one more generated token per yielded item.
+    """
+    tokenizer = host_context.tokenizer
+    input_tensors = prepare_input_tensors(input_texts, host_context, prompt_table, task_vtoken_counts, task_ids)
+
+    batch_size = len(input_texts)
+
+    def _encode_words(words):
+        """Encode ``words`` and repeat the tensor once per batch entry on the current CUDA device."""
+        if words is None:
+            return None
+        encoded = torch.IntTensor([tokenizer.encode(t) for t in words])
+        return encoded.unsqueeze(0).repeat(batch_size, 1, 1).to(torch.cuda.current_device())
+
+    stop_words_list_tensors = _encode_words(stop_words_list)
+    bad_words_list_tensors = _encode_words(bad_words_list)
+
+    if no_repeat_ngram_size is not None:
+        no_repeat_ngram_size = torch.IntTensor(no_repeat_ngram_size).to(torch.cuda.current_device())
+
+    outputs = forward(
+        input_tensors=input_tensors,
+        max_output_len=max_output_len,
+        host_context=host_context,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        prompt_table=prompt_table,
+        task_vocab_size=task_vocab_size,
+        task_ids=task_ids,
+        lora_uids=lora_uids,
+        stop_words_list=stop_words_list_tensors,
+        bad_words_list=bad_words_list_tensors,
+        no_repeat_ngram_size=no_repeat_ngram_size,
+        streaming=True,
+        **sampling_kwargs,
+    )
+    assert outputs is not None
+
+    input_lengths = [t.shape[0] for t in input_tensors]
+
+    # 'outputs' is a generator that yields one generator, not sure why... Unwrap that.
+    for output in outputs:
+        generated_tokens = 0
+        # Now iterate over the partial outputs, decode and yield each intermediate result.
+        for partial_outputs in output['output_ids']:
+            if partial_outputs is None:
+                break
+            # partial_outputs is a tensor with shape=(len(input_texts), 1, output_length),
+            # where the last dimension contains a progressively increasing number of valid, generated tokens.
+            assert partial_outputs.shape[0] == len(input_texts)
+            generated_tokens += 1
+
+            # For each input in the batch, extract the generated part of the output tensor and decode it.
+            yield [
+                tokenizer.batch_decode(
+                    partial_outputs[i, :, input_lengths[i] : input_lengths[i] + generated_tokens]
+                )[0]
+                for i in range(len(input_texts))
+            ]
+        # See above - 'outputs' yields just one item.
+        break
+
+
+def unload(host_context: TensorrtLLMHostContext):
+    """Shut down the context's executor if there is one, otherwise reset this process's worker context."""
+    global tensorrt_llm_worker_context
+    pool = host_context.executor
+    if pool is None:
+        tensorrt_llm_worker_context.decoder = None
+        tensorrt_llm_worker_context = TensorrtLLMWorkerContext()
+    else:
+        pool.shutdown(wait=True)
+        host_context.executor = None
+
+
+def to_word_list_format(
+    word_dict: List[List[str]],
+    tokenizer=None,
+    ref_str="",
+):
+    '''
+    Convert per-batch word lists into a padded int32 array of token ids and offsets.
+
+    Format of word_dict:
+        len(word_dict) should be the same as batch_size
+        word_dict[i] means the words for batch i
+        len(word_dict[i]) must be 1, which means it only contains 1 string (or bytes object)
+        This string can contain several sentences separated by ",".
+        For example, if word_dict[2] = " I am happy, I am sad", then this function will return
+        the ids for two short sentences " I am happy" and " I am sad".
+
+    Returns:
+        int32 array of shape [len(word_dict), 2, pad_to]: index 0 of the middle axis holds the token ids
+        padded with 0, index 1 the cumulative phrase end offsets padded with -1.
+    '''
+    assert tokenizer is not None, "need to set tokenizer"
+
+    # The encoding of a single word can't always be trusted. See
+    #   https://github.com/NVIDIA/NeMo/blob/bb575b72fd0be51ae10cc77d9f89ddb9e9d3b96d/nemo/collections/nlp/modules/common/text_generation_strategy.py#L229  # pylint: disable=C0301
+    ids_ref = tokenizer.encode(ref_str)
+    prefix_len = len(ids_ref)
+
+    flat_ids = []
+    offsets = []
+    for word_dict_item in word_dict:
+        if isinstance(word_dict_item[0], bytes):
+            word_dict_item = [word_dict_item[0].decode()]
+
+        item_flat_ids = []
+        item_offsets = []
+        for word in list(csv.reader(word_dict_item))[0]:
+            ids = tokenizer.encode(f"{ref_str}{word}")
+            if ids[:prefix_len] == ids_ref:
+                # It worked! We can obtain the token(s) associated to `word` by stripping the prefix tokens.
+                ids = ids[prefix_len:]
+            else:
+                # The prefix was merged with `word`; fall back to the plain encoding (expected to be rare).
+                ids = tokenizer.encode(word)
+                logging.warning(f"The encoding of word '{word}' into tokens {ids} might be incorrect")
+            if len(ids) > 0:
+                item_flat_ids.extend(ids)
+                item_offsets.append(len(ids))
+
+        flat_ids.append(np.array(item_flat_ids))
+        offsets.append(np.cumsum(np.array(item_offsets)))
+
+    pad_to = max(1, max(len(ids) for ids in flat_ids))
+    flat_ids = [np.pad(ids, (0, pad_to - len(ids)), constant_values=0) for ids in flat_ids]
+    offsets = [np.pad(offs, (0, pad_to - len(offs)), constant_values=-1) for offs in offsets]
+
+    return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))
diff --git a/nemo/export/trt_llm/utils.py b/nemo/export/trt_llm/utils.py
new file mode 100644
index 000000000000..bb30048b96c7
--- /dev/null
+++ b/nemo/export/trt_llm/utils.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import tensorrt_llm
+
+
+def is_rank(rank: Optional[int]) -> bool:
+    """
+    Tell whether this process runs on the requested MPI rank.
+
+    Args:
+        rank (Optional[int]): Rank to compare against; None matches every rank.
+
+    Returns:
+        bool: True when rank is None or equals ``tensorrt_llm.mpi_rank()``; other types raise ValueError.
+    """
+    this_rank = tensorrt_llm.mpi_rank()
+    if rank is not None and not isinstance(rank, int):
+        raise ValueError(f"Invalid rank argument {rank} of type {type(rank)}.")
+    is_wildcard = rank is None
+    # A None rank acts as a wildcard, so every process matches it.
+    return is_wildcard or this_rank == rank
diff --git a/nemo/export/utils/__init__.py b/nemo/export/utils/__init__.py
new file mode 100644
index 000000000000..12442fca30a4
--- /dev/null
+++ b/nemo/export/utils/__init__.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo.export.utils.lora_converter import convert_lora_nemo_to_canonical
+from nemo.export.utils.model_loader import (
+    load_model_weights,
+    load_sharded_metadata_torch_dist,
+    load_sharded_metadata_zarr,
+    nemo_to_path,
+)
+from nemo.export.utils.utils import (
+    get_example_inputs,
+    get_model_device_type,
+    is_nemo2_checkpoint,
+    is_nemo_tarfile,
+    prepare_directory_for_export,
+    torch_dtype_from_precision,
+    validate_fp8_network,
+)
+
+__all__ = [
+    "convert_lora_nemo_to_canonical",
+    "load_model_weights",
+    "load_sharded_metadata_torch_dist",
+    "load_sharded_metadata_zarr",
+    "nemo_to_path",
+    "is_nemo2_checkpoint",
+    "is_nemo_tarfile",
+    "prepare_directory_for_export",
+    "torch_dtype_from_precision",
+    "get_model_device_type",
+    "get_example_inputs",
+    "validate_fp8_network",
+]
diff --git a/nemo/export/utils/_mock_import.py b/nemo/export/utils/_mock_import.py
new file mode 100644
index 000000000000..0eabda79a926
--- /dev/null
+++ b/nemo/export/utils/_mock_import.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib
+import logging
+import sys
+import types
+from contextlib import contextmanager
+
+LOGGER = logging.getLogger("NeMo")
+
+"""
+Utility to mock imports of unavailable modules.
+
+Created for the purpose of using NeMo checkpoints produced with nvcr.io/nvidia/nemo:25.02.rc2
+containers (or later) and used in the environments where Megatron-Core is not available. This
+currently includes NIM containers.
+"""
+
+
+@contextmanager
+def _mock_import(module: str):
+    """
+    Context manager to mock the import of a specified module if it is not available.
+
+    When ``module`` cannot be imported, it and its parent packages are temporarily replaced in
+    ``sys.modules`` by a dummy module; prior entries are restored on exit, even on error.
+
+    Args:
+        module (str): The name of the module to mock.
+
+    Yields:
+        Yields control back to the caller.
+    """
+
+    class DummyModule(types.ModuleType):
+        """Module stand-in that returns a fresh empty class for any attribute not found."""
+
+        def __getattr__(self, name):
+            class Dummy:
+                """Dummy."""
+
+            return Dummy
+
+    try:
+        importlib.import_module(module)
+    except ModuleNotFoundError:
+        LOGGER.warning(f"Module '{module}' is not available, mocking with a dummy module.")
+    else:
+        yield
+        return
+
+    dummy_module = DummyModule("dummy")
+    # Every dotted prefix must be registered: "a.b.c" -> "a", "a.b", "a.b.c".
+    parts = module.split(".")
+    modules_mocked = [".".join(parts[: i + 1]) for i in range(len(parts))]
+    sys_modules_backup = {name: sys.modules[name] for name in modules_mocked if name in sys.modules}
+    for module_name in modules_mocked:
+        sys.modules[module_name] = dummy_module
+    try:
+        yield
+    finally:
+        # Restore the original sys.modules entries even if the managed block raised.
+        for module_name in modules_mocked:
+            if module_name in sys_modules_backup:
+                sys.modules[module_name] = sys_modules_backup[module_name]
+            else:
+                sys.modules.pop(module_name, None)
diff --git a/nemo/export/utils/constants.py b/nemo/export/utils/constants.py
new file mode 100644
index 000000000000..b7360e5f1f22
--- /dev/null
+++ b/nemo/export/utils/constants.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Export
+TRTLLM_ENGINE_DIR = "trtllm_engine"
diff --git a/nemo/export/utils/lora_converter.py b/nemo/export/utils/lora_converter.py
new file mode 100644
index 000000000000..cd229317bf23
--- /dev/null
+++ b/nemo/export/utils/lora_converter.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import re
+import tarfile
+import tempfile
+from pathlib import Path
+from typing import Any, Dict, List, Tuple
+
+import torch
+import yaml
+
+from nemo.export.tarutils import TarPath
+
+
+def replace_number_add_offset(key, offset_value):
+    """Shift every literal ``layers.<N>`` index in a state dict key by ``offset_value``.
+
+    Args:
+        key (str): State dict key, e.g. ``model.decoder.layers.3.mlp.weight``.
+        offset_value (int): Amount added to each layer index; 0 returns ``key`` untouched.
+    Returns:
+        str: The key with shifted layer indices; text such as ``layers_3`` is left alone.
+    """
+    if offset_value == 0:
+        return key
+    return re.sub(r'layers\.(\d+)', lambda m: f"layers.{int(m.group(1)) + offset_value}", key)
+
+
+def rename_qkv_keys(key):
+    """Return the q, k and v unfused-adapter keys (in that order) for a fused ``lora_kqv_adapter`` key."""
+    q_key = key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.q_adapter.")
+    k_key = key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.k_adapter.")
+    v_key = key.replace(".lora_kqv_adapter.", ".lora_unfused_kqv_adapter.v_adapter.")
+    return [q_key, k_key, v_key]
+
+
+def reformat_module_names_to_hf(tensors: Dict[str, torch.Tensor]) -> Tuple[Dict[str, torch.Tensor], List[str]]:
+    """Rename canonical NeMo LoRA tensor names to their vLLM / Hugging Face counterparts.
+
+    Args:
+        tensors (Dict[str, torch.Tensor]): Canonical (unfused) LoRA weights keyed by NeMo module name.
+    Returns:
+        Tuple[Dict[str, torch.Tensor], List[str]]: The renamed weights and the target modules found
+        in them, always listed in the same order so that the exported config is reproducible.
+    """
+    known_module_names = ["q_proj", "k_proj", "v_proj", "o_proj", "down_proj", "gate_proj", "up_proj"]
+    new_tensors = {}
+    for module_name, module_weight in tensors.items():
+        # map linear_in and linear_out to lora_a/lora_b counterparts
+        new_module_name = "base_model." + module_name.replace("linear_in", "lora_A").replace("linear_out", "lora_B")
+        # map target modules to HF names ("v_adapter" also turns ".lora_unfused_kqv_adapter" into "..._kqv_proj")
+        new_module_name = new_module_name.replace("q_adapter", "q_proj")
+        new_module_name = new_module_name.replace("k_adapter", "k_proj")
+        new_module_name = new_module_name.replace("v_adapter", "v_proj")
+        new_module_name = new_module_name.replace("lora_dense_attention_adapter", "o_proj")
+        new_module_name = new_module_name.replace("lora_4htoh_adapter", "down_proj")
+        new_module_name = new_module_name.replace("gate_adapter", "gate_proj")
+        new_module_name = new_module_name.replace("up_adapter", "up_proj")
+        # map other parts of the module names to fit vLLM/huggingface
+        new_module_name = new_module_name.replace(".adapter_layer", "")
+        new_module_name = new_module_name.replace(".lora_unfused_kqv_proj", "")
+        new_module_name = new_module_name.replace(".lora_unfused_hto4h_adapter", "")
+        new_module_name = new_module_name.replace("self_attention", "self_attn")
+        new_module_name = new_module_name.replace("decoder", "model")
+        new_tensors[new_module_name] = module_weight
+    # A set would be ordered by randomized string hashes, so filter the fixed list instead.
+    module_names = [kmn for kmn in known_module_names if any(f'.{kmn}' in name for name in new_tensors)]
+    return (new_tensors, module_names)
+
+
+def convert_lora_weights_to_canonical(
+    config: Dict[str, Any], lora_weights: Dict[str, torch.Tensor]
+) -> Dict[str, torch.Tensor]:
+    """Convert NeMo-style (fused) LoRA weights to canonical (unfused) LoRA weights by splitting the
+    fused QKV and H-to-4H adapters. ``lora_weights`` is modified in place and returned.
+
+    Returns:
+        Dict[str, torch.Tensor]: The same dictionary, with the fused keys replaced by unfused ones.
+    """
+
+    hidden_size = int(config["hidden_size"])
+    num_heads = int(config["num_attention_heads"])
+    head_size = hidden_size // num_heads
+    num_query_groups = int(config.get("num_query_groups", num_heads))  # num_kv_heads
+
+    heads_per_group = num_heads // num_query_groups
+    qkv_total_dim = num_heads + 2 * num_query_groups
+
+    adapter_size = config['peft']['lora_tuning']['adapter_dim']
+    # Head indices in the fused QKV output: each query group is heads_per_group Q heads, then one K, then one V.
+    q_slice = torch.cat(
+        [
+            torch.arange((heads_per_group + 2) * group_idx, (heads_per_group + 2) * group_idx + heads_per_group)
+            for group_idx in range(num_query_groups)
+        ]
+    )
+    k_slice = torch.arange(heads_per_group, qkv_total_dim, heads_per_group + 2)
+    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, heads_per_group + 2)
+    # Collect the fused keys first: the loops below add entries to and pop entries from lora_weights.
+    qkv_keys_to_update = []
+    hto4h_keys_to_update = []
+    for key in lora_weights.keys():
+        if "lora_kqv_adapter" in key:
+            qkv_keys_to_update.append(key)
+        if "lora_hto4h_adapter" in key:
+            hto4h_keys_to_update.append(key)
+
+    # unfuse QKV layer: linear_in is reused as-is for q/k/v, linear_out is sliced per projection
+    for key in qkv_keys_to_update:
+        if "linear_in" in key:
+            assert lora_weights[key].size(0) == adapter_size
+            for new_key in rename_qkv_keys(key):
+                lora_weights[new_key] = lora_weights[key]
+                assert len(lora_weights[new_key].size()) == 2
+        elif "linear_out" in key:
+            assert lora_weights[key].size(1) == adapter_size
+            for new_key, size in zip(rename_qkv_keys(key), [q_slice, k_slice, v_slice]):
+                lora_weights[new_key] = (
+                    lora_weights[key]
+                    .reshape((qkv_total_dim, head_size, adapter_size))[size]
+                    .reshape((-1, adapter_size))
+                )
+                assert len(lora_weights[new_key].size()) == 2
+        lora_weights.pop(key)
+
+    # This maps to gate_up_proj in HF, but we need to split it up into gate_proj and up_proj
+    for key in hto4h_keys_to_update:
+        gate_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.gate_adapter.")
+        up_proj_key = key.replace(".lora_hto4h_adapter.", ".lora_unfused_hto4h_adapter.up_adapter.")
+
+        module_weight = lora_weights[key]
+        if "linear_in" in key:
+            # lora_a gets duplicated (both new keys reference the same tensor)
+            lora_weights[gate_proj_key] = module_weight
+            lora_weights[up_proj_key] = module_weight
+        elif "linear_out" in key:
+            # lora_b gets split along dim 0 into two halves: gate first, then up
+            split_size = module_weight.shape[0]
+            gate_up_split = module_weight.split(split_size // 2)
+            lora_weights[gate_proj_key] = gate_up_split[0]
+            lora_weights[up_proj_key] = gate_up_split[1]
+        lora_weights.pop(key)
+    return lora_weights
+
+
+def convert_lora_nemo_to_canonical(lora_nemo, save_path, hf_format=False, donor_hf_config=None):
+    """Convert a NeMo LoRA ``.nemo`` archive to canonical (unfused) LoRA weights and save them to ``save_path``.
+
+    With ``hf_format`` the output is a directory holding ``adapter_model.bin`` and ``adapter_config.json``
+    (seeded from the optional ``donor_hf_config`` JSON file); otherwise it is a new ``.nemo`` tar archive.
+    Returns the converted state dict and the updated LoRA config.
+    """
+    with TarPath(lora_nemo) as archive:
+        with (archive / "model_config.yaml").open("r") as config_file:
+            lora_config = yaml.load(config_file, Loader=yaml.SafeLoader)
+
+        tp_size = lora_config.get('tensor_model_parallel_size', 1)
+        pp_size = lora_config.get('pipeline_model_parallel_size', 1)
+        lora_state_dict = [{} for _ in range(tp_size)]  # independent dict per TP rank (no aliasing)
+        for pp in range(pp_size):
+            for tp in range(tp_size):
+                if tp_size == 1:
+                    ckpt_file = archive / "model_weights.ckpt"
+                elif pp_size == 1:
+                    ckpt_file = archive / f"mp_rank_{tp:02d}/model_weights.ckpt"
+                else:
+                    ckpt_file = archive / f"tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt"
+                # NOTE: torch.load unpickles the file; only convert checkpoints from trusted sources.
+                with ckpt_file.open("rb") as f:
+                    weights = torch.load(f, map_location=torch.device('cpu'))
+                if pp == 0:
+                    lora_state_dict[tp] = weights
+                else:
+                    layer_offset = lora_config['num_layers'] // pp_size * pp  # offset of this stage's first layer
+                    for key, value in weights.items():
+                        new_key = replace_number_add_offset(key, layer_offset)
+                        lora_state_dict[tp][new_key] = value
+
+        # TODO: currently support tp=1
+        lora_state_dict = lora_state_dict[0]
+        if lora_config['peft']['lora_tuning'].get('variant', 'nemo') == "nemo":
+            lora_config['peft']['lora_tuning']['variant'] = "canonical"
+            lora_state_dict = convert_lora_weights_to_canonical(lora_config, lora_state_dict)
+
+        if hf_format:
+            lora_state_dict, target_modules = reformat_module_names_to_hf(lora_state_dict)
+            Path(save_path).mkdir(parents=True, exist_ok=True)
+            torch.save(lora_state_dict, f"{save_path}/adapter_model.bin")
+            if donor_hf_config is not None:
+                with open(donor_hf_config) as hf_config_file:
+                    adapter_config = json.load(hf_config_file)
+            else:
+                adapter_config = {}
+            adapter_config['peft_type'] = "LORA"
+            adapter_config['r'] = lora_config['peft']['lora_tuning']['adapter_dim']
+            adapter_config['lora_alpha'] = lora_config['peft']['lora_tuning']['alpha']
+            adapter_config['target_modules'] = target_modules
+            with open(f"{save_path}/adapter_config.json", "w") as f:
+                json.dump(adapter_config, f, indent=4)
+        else:
+            with tempfile.TemporaryDirectory() as tmpdir:
+                with open(f"{tmpdir}/model_config.yaml", "w") as f:
+                    yaml.dump(lora_config, f)
+                torch.save(lora_state_dict, f"{tmpdir}/model_weights.ckpt")
+                os.makedirs(os.path.dirname(save_path) or ".", exist_ok=True)  # "" (bare file name) -> cwd
+                with tarfile.open(save_path, "w:") as tar:
+                    tar.add(tmpdir, arcname=".")
+
+    return lora_state_dict, lora_config
diff --git a/nemo/export/utils/model_loader.py b/nemo/export/utils/model_loader.py
new file mode 100644
index 000000000000..39fbc26505d7
--- /dev/null
+++ b/nemo/export/utils/model_loader.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os.path
+from io import BytesIO
+from pathlib import Path
+from typing import Any, Dict, Union
+
+import numpy
+
+# tensorstore is needed to register 'bfloat16' dtype with numpy for zarr compatibility
+import tensorstore  # noqa: F401 pylint: disable=unused-import
+import torch
+from torch.distributed.checkpoint import FileSystemReader, load
+from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
+
+from nemo.export.tarutils import TarPath, ZarrPathStore
+from nemo.export.utils._mock_import import _mock_import
+
+LOGGER = logging.getLogger("NeMo")
+
+
+def nemo_to_path(nemo_checkpoint: Union[Path, str]) -> Union[Path, TarPath]:
+    """
+    Wraps a NeMo checkpoint location in an object that can be used to browse its content.
+
+    Args:
+        nemo_checkpoint (Path, str): Location of the NeMo checkpoint.
+    Returns:
+        Path | TarPath: A plain Path for a directory, a TarPath for anything else.
+    """
+    location = str(nemo_checkpoint)
+    # Existing directories get a regular Path; everything else is handed to TarPath.
+    path_type = Path if os.path.isdir(location) else TarPath
+
+    return path_type(location)
+
+
+class TarFileSystemReader(FileSystemReader):
+    """FileSystemReader variant whose checkpoint directory may be a Path or a TarPath.
+
+    The parent initializer is given a string in place of a TarPath; the TarPath itself is
+    stored on the reader afterwards.
+    """
+
+    def __init__(self, path: Union[Path, TarPath]) -> None:
+        """Initializes the parent with a str for TarPath input, then keeps the TarPath as ``self.path``."""
+        is_tar = isinstance(path, TarPath)
+        super().__init__(str(path) if is_tar else path)
+        if is_tar:
+            self.path = path  # replaces the path set in super().__init__ call
+
+
+def load_sharded_metadata_torch_dist(
+    checkpoint_dir: Union[Path, TarPath], load_extra_states: bool = False
+) -> Dict[str, Any]:
+    """
+    Reads a model state dictionary stored in the torch_dist checkpoint format.
+
+    Args:
+        checkpoint_dir (Path | TarPath): Directory holding the model weights.
+        load_extra_states (bool): When True, entries stored as bytes (extra states) are loaded too.
+    Returns:
+        dict: The loaded state dictionary; weights are torch tensors.
+    """
+    reader = TarFileSystemReader(checkpoint_dir)
+    entries = reader.read_metadata().state_dict_metadata
+
+    # Pre-allocate a destination for every entry that should be read from the checkpoint.
+    state_dict = {}
+    for name, entry in entries.items():
+        if isinstance(entry, TensorStorageMetadata):
+            state_dict[name] = torch.empty(entry.size, dtype=entry.properties.dtype)
+    if load_extra_states:
+        for name, entry in entries.items():
+            if isinstance(entry, BytesStorageMetadata):
+                state_dict[name] = []
+
+    # load() reads the checkpoint content into the pre-allocated entries.
+    load(state_dict, storage_reader=reader)
+    return state_dict
+
+
+def load_sharded_pickle_extra_state_scale(dir: Union[Path, TarPath]) -> Dict[str, BytesIO]:
+    """
+    Reads the extra states kept in the ``shard_*_*.pt`` files of a directory.
+
+    Args:
+        dir (Path | TarPath): Directory containing the sharded extra states.
+    Returns:
+        dict: Loaded extra states, keyed by ``<directory name>/<shard file name up to its first dot>``.
+    """
+
+    def read_shard(shard_file):
+        with shard_file.open('rb') as stream:
+            return torch.load(stream, weights_only=True)
+
+    prefix = dir.name + '/'
+    # ``name.split('.')[0]`` keeps the part of the file name that precedes its first dot.
+    return {prefix + shard.name.split('.')[0]: read_shard(shard) for shard in list(dir.glob('shard_*_*.pt'))}
+
+
+def contains_extra_states(subdir: Union[Path, TarPath]) -> bool:
+    """
+    Tells whether a zarr checkpoint subdirectory stores extra states.
+
+    Args:
+        subdir (Path | TarPath): Directory inside the zarr checkpoint.
+    Returns:
+        bool: True when at least one file matching ``shard_0_*.pt`` is present.
+    """
+    return len(list(subdir.glob('shard_0_*.pt'))) > 0
+
+
+def load_sharded_metadata_zarr(
+    checkpoint_dir: Union[Path, TarPath], load_extra_states: bool = False
+) -> Dict[str, Any]:
+    """
+    Reads a model state dictionary stored in the zarr checkpoint format.
+
+    Args:
+        checkpoint_dir (Path | TarPath): Directory whose subdirectories hold the zarr arrays.
+        load_extra_states (bool): When True, extra states kept in ``.pt`` shards are loaded as well.
+    Returns:
+        dict: Model state dictionary.
+    """
+    if load_extra_states:
+        # Register BytesIO as an allowed global for the weights_only=True loads of the extra states.
+        torch.serialization.add_safe_globals([BytesIO])
+
+    state = {}
+    for entry in checkpoint_dir.iterdir():
+        if not entry.is_dir():
+            continue
+        if load_extra_states and contains_extra_states(entry):
+            state.update(load_sharded_pickle_extra_state_scale(entry))
+            continue
+        if not (entry / '.zarray').exists():
+            continue
+
+        import zarr
+
+        array = zarr.open(ZarrPathStore(entry), 'r')
+        values = array[:]
+        if array.dtype.name == "bfloat16":
+            # reinterpret the raw 16-bit values: numpy int16 view -> torch tensor -> bfloat16 view
+            state[entry.name] = torch.from_numpy(values.view(numpy.int16)).view(torch.bfloat16)
+        else:
+            state[entry.name] = torch.from_numpy(values)
+
+    return state
+
+
+def nemo_weights_directory(nemo_path: Union[Path, TarPath]) -> Union[Path, TarPath]:
+    """
+    Locates the weights directory of a NeMo checkpoint.
+
+    Args:
+        nemo_path (Path | TarPath): Root of the NeMo checkpoint.
+    Returns:
+        Path | TarPath: ``model_weights`` if present, else ``weights`` if present, else ``nemo_path``.
+    """
+    # Candidates are probed in priority order.
+    for dirname in ("model_weights", "weights"):
+        candidate = nemo_path / dirname
+        if candidate.exists():
+            return candidate
+
+    return nemo_path
+
+
+def load_model_weights(checkpoint_path: Union[str, Path], load_extra_states: bool = False) -> Dict[str, Any]:
+    """
+    Reads the state dictionary of a NeMo checkpoint, with weights stored as torch.Tensor.
+
+    Args:
+        checkpoint_path (str | Path): Location of the NeMo checkpoint.
+        load_extra_states (bool): When True, the BytesIO objects holding extra states are loaded too.
+    Returns:
+        dict: Model state dictionary.
+    """
+    weights_dir = nemo_weights_directory(nemo_to_path(checkpoint_path))
+
+    # The 'sharded_backend' entry of metadata.json selects the loader below.
+    with (weights_dir / 'metadata.json').open(mode='r') as metadata_file:
+        backend = json.load(metadata_file)['sharded_backend']
+
+    if backend == 'torch_dist':
+        # TODO: Remove mocking imports once MCore is available in NIM containers
+        with _mock_import("megatron.core.dist_checkpointing.strategies.torch"):
+            return load_sharded_metadata_torch_dist(weights_dir, load_extra_states=load_extra_states)
+
+    if backend != 'zarr':
+        raise NotImplementedError(f'Distributed checkpoint backend {backend} not supported')
+
+    return load_sharded_metadata_zarr(weights_dir, load_extra_states=load_extra_states)
diff --git a/nemo/export/utils/utils.py b/nemo/export/utils/utils.py
new file mode 100755
index 000000000000..cdbc5658c052
--- /dev/null
+++ b/nemo/export/utils/utils.py
@@ -0,0 +1,155 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import shutil
+from collections import Counter
+from pathlib import Path
+from typing import Dict, Optional, Union
+
+import torch
+
+
+def is_nemo2_checkpoint(checkpoint_path: str) -> bool:
+    """
+    Tells whether a checkpoint uses the NeMo 2.0 layout.
+    Args:
+        checkpoint_path (str): Location of the checkpoint.
+    Returns:
+        bool: True when ``checkpoint_path`` contains a ``context`` directory, False otherwise.
+    """
+    # The 'context' subdirectory is what identifies the NeMo 2.0 format here.
+    context_dir = Path(checkpoint_path).joinpath('context')
+    return context_dir.is_dir()
+
+
+def prepare_directory_for_export(
+    model_dir: Union[str, Path], delete_existing_files: bool, subdir: Optional[str] = None
+) -> None:
+    """
+    Gets the model_dir path ready for the TensorRT-LLM / vLLM export.
+    On return the target directory exists; a pre-existing model_dir is either wiped or required to be empty.
+
+    Args:
+        model_dir (str): Target directory for the export.
+        delete_existing_files (bool): Remove a pre-existing model_dir, content included, first.
+        subdir (Optional[str]): Subdirectory to create inside the model_dir.
+
+    Raises:
+        RuntimeError: If model_dir is not empty and delete_existing_files is False.
+    """
+    model_path = Path(model_dir)
+    already_there = model_path.exists()
+
+    if already_there and delete_existing_files:
+        shutil.rmtree(model_path)
+    elif already_there and any(model_path.iterdir()):
+        raise RuntimeError(f"There are files in {model_path} folder: try setting delete_existing_files=True.")
+
+    # The (sub)directory is created together with any missing parents.
+    target = model_path if subdir is None else model_path / subdir
+    target.mkdir(parents=True, exist_ok=True)
+
+
+def is_nemo_tarfile(path: str) -> bool:
+    """
+    Tells whether the path exists and looks like a packed NeMo 1 checkpoint.
+
+    Args:
+        path (str): Candidate checkpoint location.
+    Returns:
+        bool: True when the path exists and has the '.nemo' suffix.
+    """
+    candidate = Path(path)
+    return candidate.suffix == '.nemo' if candidate.exists() else False
+
+
+# Copied from nemo.collections.nlp.parts.utils_funcs to avoid introducing extra NeMo dependencies:
+def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: bool = True) -> torch.dtype:
+    """
+    Translates a PyTorch Lightning (PTL) precision setting into the matching PyTorch parameter dtype.
+
+    Args:
+        precision (Union[int, str]): The PTL precision setting.
+        megatron_amp_O2 (bool): Whether Megatron AMP O2 is enabled; when False the result is torch.float32.
+
+    Returns:
+        torch.dtype: The dtype selected by ``precision``; a ValueError is raised for unknown settings.
+    """
+    if not megatron_amp_O2:
+        return torch.float32
+    dtype_by_precision = (
+        (('bf16', 'bf16-mixed'), torch.bfloat16),
+        ((16, '16', '16-mixed'), torch.float16),
+        ((32, '32', '32-true'), torch.float32),
+    )
+    for aliases, dtype in dtype_by_precision:
+        if precision in aliases:
+            return dtype
+    raise ValueError(f"Could not parse the precision of '{precision}' to a valid torch.dtype")
+
+
+def get_model_device_type(module: torch.nn.Module) -> str:
+    """Return the one device type shared by the module's parameters and buffers ('cpu' if it has none)."""
+    devices_of_params = {tensor.device.type for tensor in module.parameters()}
+    devices_of_buffers = {tensor.device.type for tensor in module.buffers()}
+    all_device_types = devices_of_params | devices_of_buffers
+
+    if not all_device_types:
+        return "cpu"
+    # Exactly one device type is acceptable; a mix is reported as an error.
+    if len(all_device_types) == 1:
+        return all_device_types.pop()
+    raise ValueError(
+        f"Model parameters and buffers are on multiple device types: {all_device_types}. "
+        "Ensure all parameters and buffers are on the same device type."
+    )
+
+
+def get_example_inputs(tokenizer) -> Dict[str, torch.Tensor]:
+    """Builds sample inputs (two query/passage pairs) to feed to the model during ONNX export.
+
+    Args:
+        tokenizer: Callable taking the queries, the passages and ``return_tensors="pt"``.
+
+    Returns:
+        The tokenizer output converted to a plain dictionary.
+    """
+    queries = ["example query one", "example query two"]
+    passages = ["example passage one", "example passage two"]
+    encoded = tokenizer(queries, passages, return_tensors="pt")
+
+    # dict() turns the tokenizer output into a regular dictionary.
+    return dict(encoded)
+
+
+def validate_fp8_network(network) -> None:
+    """Checks the network to ensure it's compatible with fp8 precision.
+
+    Raises:
+        ValueError if network doesn't contain Q/DQ FP8 layers
+    """
+
+    import tensorrt as trt
+
+    qdq_layer_types = {trt.LayerType.QUANTIZE, trt.LayerType.DEQUANTIZE}
+    qdq_layers = [layer for layer in network if layer.type in qdq_layer_types]
+    if len(qdq_layers) == 0:
+        raise ValueError("No Quantize/Dequantize layers found")
+
+    # Tally the precisions of the Q/DQ layers; at least one of them has to be FP8.
+    precisions = Counter(layer.precision for layer in qdq_layers)
+    if trt.DataType.FP8 in precisions:
+        return
+
+    raise ValueError("Found Quantize/Dequantize layers. But none with FP8 precision.")
diff --git a/nemo/export/vllm/__init__.py b/nemo/export/vllm/__init__.py
new file mode 100644
index 000000000000..d9155f923f18
--- /dev/null
+++ b/nemo/export/vllm/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/nemo/export/vllm/engine.py b/nemo/export/vllm/engine.py
new file mode 100644
index 000000000000..c3776b842b83
--- /dev/null
+++ b/nemo/export/vllm/engine.py
@@ -0,0 +1,140 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from pathlib import Path
+
+from sentencepiece import SentencePieceProcessor
+from transformers import PreTrainedTokenizerBase
+from vllm import LLMEngine
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+
+from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
+from nemo.export.tarutils import TarPath
+from nemo.export.vllm.tokenizer_group import NemoTokenizerGroup
+
+LOGGER = logging.getLogger("NeMo")
+
+
+class vLLMTokenizerGroup(TokenizerGroup):
+    """
+    Custom vLLM tokenizer group wrapping a single, already constructed tokenizer; LoRA requests are ignored.
+    """
+
+    def __init__(self, tokenizer):  # NOTE: TokenizerGroup.__init__ is not called here
+        self.tokenizer = tokenizer
+
+    def get_lora_tokenizer(self, lora_request):
+        return self.tokenizer  # same tokenizer regardless of lora_request
+
+    async def get_lora_tokenizer_async(self, lora_request):
+        return self.tokenizer  # same tokenizer regardless of lora_request
+
+    @property
+    def max_input_length(self, lora_request=None):
+        return None  # always None; lora_request is unused
+
+
+class NemoLLMEngine(LLMEngine):
+    """
+    Overrides some functionality from vllm.LLMEngine to use our custom tokenizer
+    instead of one from Transformers.
+    """
+
+    def _init_tokenizer(self, **tokenizer_init_kwargs):
+        # Determine if the model needs a bos token (which is not stored in Nemo checkpoints)
+        add_bos_token = self.model_config.model_converter.requires_bos_token()
+        tokenizer_config = self.model_config.nemo_model_config.get('tokenizer', {})
+
+        if not isinstance(tokenizer_config, dict) and hasattr(tokenizer_config, 'tokenizer'):
+            tokenizer = tokenizer_config.tokenizer
+
+            if isinstance(tokenizer, SentencePieceProcessor):
+                self.model_config.hf_config.bos_token_id = tokenizer.bos_token_id
+                self.model_config.hf_config.eos_token_id = tokenizer.eos_token_id
+
+                tokenizer = SentencePieceTokenizer(tokenizer=tokenizer)
+                return NemoTokenizerGroup(tokenizer, add_bos_token=add_bos_token)
+
+            if isinstance(tokenizer, PreTrainedTokenizerBase):
+                return vLLMTokenizerGroup(tokenizer)
+
+        # Find the tokenizer file name in the Nemo checkpoint config
+        tokenizer_model = tokenizer_config.get('model', tokenizer_config.get('tokenizer_model', None))
+
+        # If there is no tokenizer file specified but there's a reference to an HF tokenizer, use that
+        if tokenizer_model is None and tokenizer_config.get('library') == 'huggingface':
+            tokenizer_type = tokenizer_config.get('type')
+            if tokenizer_type is not None:
+                tokenizer_group = TokenizerGroup(
+                    tokenizer_id=tokenizer_type,
+                    enable_lora=bool(self.lora_config),
+                    max_num_seqs=self.scheduler_config.max_num_seqs,
+                    max_input_length=None,
+                )
+
+                # Update the HF config fields that come from the tokenizer in NeMo
+                self.model_config.hf_config.vocab_size = len(
+                    tokenizer_group.tokenizer.vocab
+                )  # this may be greater than vocab_size
+                self.model_config.hf_config.bos_token_id = tokenizer_group.tokenizer.bos_token_id
+                self.model_config.hf_config.eos_token_id = tokenizer_group.tokenizer.eos_token_id
+                self.model_config.hf_config.pad_token_id = tokenizer_group.tokenizer.pad_token_id
+
+                return tokenizer_group
+
+        # Open the checkpoint archive
+        with TarPath(self.model_config.nemo_checkpoint) as archive:
+            tokenizer_model_file = None
+            if isinstance(tokenizer_model, str) and tokenizer_model.startswith('nemo:'):
+                tokenizer_model = tokenizer_model[len('nemo:') :]
+                tokenizer_model_file = archive / tokenizer_model
+                if not tokenizer_model_file.exists():
+                    LOGGER.warning(
+                        f'Tokenizer model file {tokenizer_model} specified in the model_config does not '
+                        + 'exist in the checkpoint.'
+                    )
+                    tokenizer_model_file = None
+
+            if tokenizer_model_file is None:
+                for path in archive.glob('*tokenizer*.model'):
+                    LOGGER.info(f'Found tokenizer model file {path}.')
+                    tokenizer_model_file = path
+                    break
+
+            if tokenizer_model_file is None:
+                raise RuntimeError('No tokenizer model file found, aborting.')
+
+            # Extract the tokenizer model file into the model directory,
+            # because sentencepiece cannot load it directly from TarPath.
+            extracted_tokenizer_model = Path(self.model_config.model) / 'tokenizer.model'
+            with tokenizer_model_file.open('rb') as infile:
+                with extracted_tokenizer_model.open('wb') as outfile:
+                    outfile.write(infile.read())
+
+            # Construct the tokenizer object and wrapper
+            tokenizer = SentencePieceTokenizer(str(extracted_tokenizer_model))
+
+            # Determine if the model needs a bos token (which is not stored in Nemo checkpoints)
+            add_bos_token = self.model_config.model_converter.requires_bos_token()
+
+            tokenizer_group = NemoTokenizerGroup(tokenizer, add_bos_token=add_bos_token)
+
+            # Update the HF config fields that come from the tokenizer in NeMo
+            self.model_config.hf_config.vocab_size = tokenizer.vocab_size
+            self.model_config.hf_config.bos_token_id = tokenizer.bos_token_id
+            self.model_config.hf_config.eos_token_id = tokenizer.eos_token_id
+            self.model_config.hf_config.pad_token_id = tokenizer.pad_id
+
+            return tokenizer_group
diff --git a/nemo/export/vllm/model_config.py b/nemo/export/vllm/model_config.py
new file mode 100644
index 000000000000..21151adbf658
--- /dev/null
+++ b/nemo/export/vllm/model_config.py
@@ -0,0 +1,245 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+from pathlib import Path
+from typing import Any, Dict, Optional, Union
+
+import torch
+import yaml
+from hydra.utils import instantiate
+from omegaconf import OmegaConf
+from transformers import AutoConfig
+from vllm.config import ModelConfig, ModelImpl, PoolerConfig, _get_and_verify_dtype, _get_and_verify_max_len
+from vllm.transformers_utils.config import get_hf_text_config
+
+from nemo.export.tarutils import TarPath
+from nemo.export.utils import is_nemo2_checkpoint
+from nemo.export.vllm.model_converters import get_model_converter
+
+
+class NemoModelConfig(ModelConfig):
+    """
+    This class pretends to be a vllm.config.ModelConfig (with extra fields) but skips
+    some of its initialization code, and initializes the configuration from a Nemo checkpoint instead.
+    """
+
+    def __init__(
+        self,
+        nemo_checkpoint: str,
+        model_dir: str,
+        model_type: str,
+        tokenizer_mode: str,
+        dtype: Union[str, torch.dtype],
+        seed: int,
+        revision: Optional[str] = None,
+        override_neuron_config: Optional[Dict[str, Any]] = None,
+        code_revision: Optional[str] = None,
+        rope_scaling: Optional[dict] = None,
+        rope_theta: Optional[float] = None,
+        tokenizer_revision: Optional[str] = None,
+        max_model_len: Optional[int] = None,
+        quantization: Optional[str] = None,
+        quantization_param_path: Optional[str] = None,
+        enforce_eager: bool = False,
+        max_seq_len_to_capture: Optional[int] = None,
+        max_logprobs: int = 5,
+        disable_sliding_window: bool = False,
+        use_async_output_proc: bool = False,
+        disable_mm_preprocessor_cache: bool = False,
+        logits_processor_pattern: Optional[str] = None,
+        override_pooler_config: Optional[PoolerConfig] = None,
+        enable_sleep_mode: bool = False,
+        model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
+    ) -> None:
+        # Don't call ModelConfig.__init__ because we don't want it to call
+        # transformers.AutoConfig.from_pretrained(...)
+
+        # TODO: Do something about vLLM's call to _load_generation_config_dict in LLMEngine.__init__
+        # because it calls transformers.GenerationConfig.from_pretrained(...), which tries to download things
+
+        self.nemo_checkpoint = nemo_checkpoint
+        self.model = model_dir
+        self.model_type = model_type
+        self.tokenizer = None
+        self.tokenizer_mode = tokenizer_mode
+        self.skip_tokenizer_init = False
+        self.trust_remote_code = False
+        self.seed = seed
+        self.revision = revision
+        self.code_revision = code_revision
+        self.override_neuron_config = override_neuron_config
+        self.rope_scaling = rope_scaling
+        self.rope_theta = rope_theta
+        self.tokenizer_revision = tokenizer_revision
+        self.model_impl = model_impl
+        self.quantization = quantization
+        self.quantization_param_path = quantization_param_path
+        self.enforce_eager = enforce_eager
+        self.max_seq_len_to_capture = max_seq_len_to_capture
+        self.max_logprobs = max_logprobs
+        self.disable_sliding_window = disable_sliding_window
+        self.served_model_name = nemo_checkpoint
+        self.multimodal_config = None
+        self.mm_processor_kwargs = {}
+        self.use_async_output_proc = use_async_output_proc
+        self.disable_mm_preprocessor_cache = disable_mm_preprocessor_cache
+        self.logits_processor_pattern = logits_processor_pattern
+        self.generation_config = None
+        self.task = "generate"  # Only the generate task is supported
+        self.is_hybrid = False  # No hybrid models are supported
+
+        if self.task in ("draft", "generate"):
+            self.truncation_side = "left"
+        else:
+            self.truncation_side = "right"
+
+        self.encoder_config = self._get_encoder_config()
+        self.pooler_config = self._init_pooler_config(override_pooler_config)
+        self.enable_sleep_mode = enable_sleep_mode
+
+        from vllm.platforms import current_platform  # vLLM uses local import for current_platform
+
+        if self.enable_sleep_mode and not current_platform.is_cuda():
+            raise ValueError("Sleep mode is only supported on CUDA devices.")
+
+        self.model_converter = get_model_converter(model_type)
+        if self.model_converter is None:
+            raise RuntimeError(f'Unknown model type "{model_type}"')
+
+        if is_nemo2_checkpoint(nemo_checkpoint):
+            nemo_checkpoint: Path = Path(nemo_checkpoint)
+            tokenizer_config = OmegaConf.load(nemo_checkpoint / "context/model.yaml").tokenizer
+            if ('additional_special_tokens' in tokenizer_config) and len(
+                tokenizer_config['additional_special_tokens']
+            ) == 0:
+                del tokenizer_config['additional_special_tokens']
+
+            tokenizer_config = self._change_paths_to_absolute_paths(tokenizer_config, nemo_checkpoint)
+            tokenizer = instantiate(tokenizer_config)
+
+            with (nemo_checkpoint / "context/model.yaml").open('r') as config_file:
+                self.nemo_model_config: dict = yaml.load(config_file, Loader=yaml.SafeLoader)
+            hf_args = self._load_hf_arguments(self.nemo_model_config['config'])
+
+            if hasattr(tokenizer, 'bos_id'):
+                tokenizer.tokenizer.bos_token_id = tokenizer.bos_id
+            if hasattr(tokenizer, 'eos_id'):
+                tokenizer.tokenizer.eos_token_id = tokenizer.eos_id
+
+            hf_args['vocab_size'] = tokenizer.original_vocab_size
+            self.model_converter.convert_config(self.nemo_model_config['config'], hf_args)
+            self.hf_config = AutoConfig.for_model(model_type, **hf_args)
+            self.nemo_model_config['tokenizer'] = tokenizer
+        else:
+            with TarPath(nemo_checkpoint) as archive:
+                with (archive / "model_config.yaml").open("r") as model_config_file:
+                    self.nemo_model_config = yaml.load(model_config_file, Loader=yaml.SafeLoader)
+                    hf_args = self._load_hf_arguments(self.nemo_model_config)
+                    self.model_converter.convert_config(self.nemo_model_config, hf_args)
+                self.hf_config = AutoConfig.for_model(model_type, **hf_args)
+
+        self.hf_config.architectures = [self.model_converter.get_architecture()]
+        if self.rope_scaling is not None:
+            self.hf_config.rope_scaling = rope_scaling  # HF config objects do not support item assignment
+
+        self.hf_text_config = get_hf_text_config(self.hf_config)
+        self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
+        self.max_model_len = _get_and_verify_max_len(
+            hf_config=self.hf_text_config,
+            max_model_len=max_model_len,
+            disable_sliding_window=self.disable_sliding_window,
+            sliding_window_len=self.get_hf_config_sliding_window(),
+        )
+        self.is_attention_free = self._init_attention_free()
+        self.has_inner_state = self._init_has_inner_state()
+        self.has_noops = self._init_has_noops()
+
+        self._verify_tokenizer_mode()
+        self._verify_quantization()
+        self._verify_cuda_graph()
+
+    @staticmethod
+    def _change_paths_to_absolute_paths(tokenizer_config: Dict[Any, Any], nemo_checkpoint: Path) -> Dict[Any, Any]:
+        """
+        Creates absolute path to the local tokenizers. Used for NeMo 2.0.
+
+        Args:
+            tokenizer_config (dict): Parameters for instantiating the tokenizer.
+            nemo_checkpoint (path): Path to the NeMo2 checkpoint.
+        Returns:
+            dict: Updated tokenizer config.
+        """
+        context_path = nemo_checkpoint / 'context'
+
+        # 'pretrained_model_name' -- huggingface tokenizer case
+        # 'model_path' -- sentencepiece tokenizer
+        path_keys = ['pretrained_model_name', 'model_path']
+
+        for path_key in path_keys:
+            if path := tokenizer_config.get(path_key, None):
+                tokenizer_path = context_path / path
+                if not tokenizer_path.exists():
+                    continue
+
+                tokenizer_config[path_key] = str(tokenizer_path.resolve())
+
+        return tokenizer_config
+
+    def _load_hf_arguments(self, nemo_config: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Maps argument names used in NeMo to their corresponding names in HF.
+        """
+
+        hf_to_nemo_dict = {
+            'hidden_size': 'hidden_size',
+            'intermediate_size': 'ffn_hidden_size',
+            'num_hidden_layers': 'num_layers',
+            'num_attention_heads': 'num_attention_heads',
+            'num_key_value_heads': 'num_query_groups',
+            # 'hidden_act': 'activation', ## <- vLLM has good defaults for the models, nemo values are wrong
+            'max_position_embeddings': ['max_position_embeddings', 'encoder_seq_length'],
+            'tie_word_embeddings': 'share_embeddings_and_output_weights',
+            'rms_norm_eps': 'layernorm_epsilon',
+            'attention_dropout': 'attention_dropout',
+            'initializer_range': 'init_method_std',
+            'norm_epsilon': 'layernorm_epsilon',
+            'rope_theta': 'rotary_base',
+            'use_bias': ['bias', 'add_bias_linear'],
+        }
+
+        hf_args = {}
+        for hf_arg, nemo_arg in hf_to_nemo_dict.items():
+            if not isinstance(nemo_arg, list):
+                nemo_arg = [nemo_arg]
+
+            for nemo_arg_option in nemo_arg:
+                value = nemo_config.get(nemo_arg_option)
+                if value is not None:
+                    hf_args[hf_arg] = value
+                    break
+
+        return hf_args
+
+    def try_get_generation_config(self, *args, **kwargs):
+        """
+        Return the generation config stored in the checkpoint artifacts, or an empty dict if there is none.
+        """
+        nemo_path = Path(self.nemo_checkpoint)
+        generation_config_path = nemo_path / "context" / "artifacts" / "generation_config.json"
+        if generation_config_path.exists():
+            with generation_config_path.open("r") as f:
+                return json.load(f)
+
+        return {}
diff --git a/nemo/export/vllm/model_converters.py b/nemo/export/vllm/model_converters.py
new file mode 100644
index 000000000000..5e4cf619d281
--- /dev/null
+++ b/nemo/export/vllm/model_converters.py
@@ -0,0 +1,421 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from typing import Generator, Optional, Tuple
+
+import torch
+
+
+class ModelConverter(ABC):
+    """
+    Abstract class that defines the interface for a converter that implements model-specific conversion functions
+    for deploying NeMo checkpoints on vLLM.
+    """
+
+    def __init__(self, model_type: str):
+        self.model_type = model_type
+
+    @abstractmethod
+    def get_architecture(self) -> Optional[str]:
+        """
+        Returns the HF architecture name for the current model, such as 'LlamaForCausalLM'.
+        """
+        pass
+
+    def convert_config(self, nemo_model_config: dict, hf_config: dict) -> None:
+        """
+        Implements any custom HF configuration adjustments in the 'hf_config' dict that are necessary
+        for this model after the common translation takes place in NemoModelConfig's constructor.
+        """
+        pass
+
+    @abstractmethod
+    def convert_weights(
+        self, nemo_model_config: dict, state_dict: dict
+    ) -> Generator[Tuple[str, torch.Tensor], None, None]:
+        """
+        Returns or yields a sequence of (name, tensor) tuples that contain model weights in the HF format.
+        """
+        pass
+
+    def requires_bos_token(self) -> bool:
+        """
+        Returns True if the model requires a 'bos' token to be used at the beginning of the input sequence.
+        NeMo checkpoints do not store this information.
+        """
+        return False
+
+
+class LlamaConverter(ModelConverter):
+
+    def get_architecture(self):
+        if self.model_type == 'llama':
+            return 'LlamaForCausalLM'
+        if self.model_type == 'mistral':
+            return 'MistralForCausalLM'
+        return None
+
+    def convert_weights(self, nemo_model_config, state_dict):
+        hidden_size = nemo_model_config["hidden_size"]
+        head_num = nemo_model_config["num_attention_heads"]
+        num_query_groups = nemo_model_config["num_query_groups"]
+        num_layers = nemo_model_config["num_layers"]
+        head_size = hidden_size // head_num
+        heads_per_group = head_num // num_query_groups
+        qkv_total_dim = head_num + 2 * num_query_groups  # fused QKV heads: all Q plus one K and one V per group
+
+        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
+        yield ('model.norm.weight', state_dict['model.decoder.final_layernorm.weight'])
+        if not nemo_model_config.get("share_embeddings_and_output_weights", False):
+            yield ('lm_head.weight', state_dict['model.output_layer.weight'])
+
+        for layer in range(int(num_layers)):
+            qkv_weights = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
+            qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])
+
+            q_slice = torch.cat(  # per query group the fused heads are: heads_per_group Q, then one K, then one V
+                [
+                    torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+                    for i in range(num_query_groups)
+                ]
+            )
+            k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+            v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+            for name, head_idx in [('q_proj', q_slice), ('k_proj', k_slice), ('v_proj', v_slice)]:
+                weight_name = f'model.layers.{layer}.self_attn.{name}.weight'
+                yield (weight_name, qkv_weights[head_idx].reshape(-1, hidden_size))
+
+            linear_proj_weight = state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer]
+            yield (f'model.layers.{layer}.self_attn.o_proj.weight', linear_proj_weight)
+
+            gate_proj_weight, up_proj_weight = torch.chunk(
+                state_dict['model.decoder.layers.mlp.linear_fc1.weight'][layer], 2, dim=0
+            )
+            yield (f'model.layers.{layer}.mlp.gate_proj.weight', gate_proj_weight)
+            yield (f'model.layers.{layer}.mlp.up_proj.weight', up_proj_weight)
+
+            down_proj_weight = state_dict['model.decoder.layers.mlp.linear_fc2.weight'][layer]
+            yield (f'model.layers.{layer}.mlp.down_proj.weight', down_proj_weight)
+
+            input_layernorm_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][
+                layer
+            ]
+            yield (f'model.layers.{layer}.input_layernorm.weight', input_layernorm_weight)
+
+            post_attn_layernorm_weight = state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_weight'][layer]
+            yield (f'model.layers.{layer}.post_attention_layernorm.weight', post_attn_layernorm_weight)
+
+    def requires_bos_token(self):
+        return True
+
+
+class MixtralConverter(ModelConverter):
+
+    def get_architecture(self):
+        if self.model_type == 'mixtral':
+            return 'MixtralForCausalLM'
+        return None
+
+    def convert_weights(self, nemo_model_config, state_dict):
+        hidden_size = nemo_model_config["hidden_size"]
+        head_num = nemo_model_config["num_attention_heads"]
+        num_query_groups = nemo_model_config["num_query_groups"]
+        num_layers = nemo_model_config["num_layers"]
+        num_moe_experts = nemo_model_config["num_moe_experts"]
+        head_size = hidden_size // head_num
+        heads_per_group = head_num // num_query_groups
+        qkv_total_dim = head_num + 2 * num_query_groups  # fused QKV heads: all Q plus one K and one V per group
+
+        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
+        yield ('model.norm.weight', state_dict['model.decoder.final_layernorm.weight'])
+        yield ('lm_head.weight', state_dict['model.output_layer.weight'])
+
+        for layer in range(int(num_layers)):
+            qkv_weights = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
+            qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])
+
+            q_slice = torch.cat(  # per query group the fused heads are: heads_per_group Q, then one K, then one V
+                [
+                    torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+                    for i in range(num_query_groups)
+                ]
+            )
+            k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+            v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+            for name, head_idx in [('q_proj', q_slice), ('k_proj', k_slice), ('v_proj', v_slice)]:
+                weight_name = f'model.layers.{layer}.self_attn.{name}.weight'
+                yield (weight_name, qkv_weights[head_idx].reshape(-1, hidden_size))
+
+            linear_proj_weight = state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer]
+            yield (f'model.layers.{layer}.self_attn.o_proj.weight', linear_proj_weight)
+
+            mlp_router_weight = state_dict['model.decoder.layers.mlp.router.weight'][layer]
+            yield (f'model.layers.{layer}.block_sparse_moe.gate.weight', mlp_router_weight)
+
+            for expert in range(num_moe_experts):
+                linear_fc1_weight = state_dict['model.decoder.layers.mlp.experts.experts.linear_fc1.weight'][layer][
+                    expert
+                ]
+                gate_proj_weight, up_proj_weight = torch.chunk(linear_fc1_weight, 2, dim=0)
+                yield (f'model.layers.{layer}.block_sparse_moe.experts.{expert}.w1.weight', gate_proj_weight)
+                yield (f'model.layers.{layer}.block_sparse_moe.experts.{expert}.w3.weight', up_proj_weight)
+
+                linear_fc2_weight = state_dict['model.decoder.layers.mlp.experts.experts.linear_fc2.weight'][layer][
+                    expert
+                ]
+                yield (f'model.layers.{layer}.block_sparse_moe.experts.{expert}.w2.weight', linear_fc2_weight)
+
+            input_layernorm_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][
+                layer
+            ]
+            yield (f'model.layers.{layer}.input_layernorm.weight', input_layernorm_weight)
+
+            post_attn_layernorm_weight = state_dict['model.decoder.layers.pre_mlp_layernorm.weight'][layer]
+            yield (f'model.layers.{layer}.post_attention_layernorm.weight', post_attn_layernorm_weight)
+
+    def requires_bos_token(self):
+        return True
+
+
+class GemmaConverter(ModelConverter):
+
+    def get_architecture(self):
+        if self.model_type == 'gemma':
+            return 'GemmaForCausalLM'
+        return None
+
+    def convert_weights(self, nemo_model_config, state_dict):
+        num_layers = nemo_model_config["num_layers"]
+        num_query_groups = nemo_model_config["num_query_groups"]
+        head_num = nemo_model_config["num_attention_heads"]
+        head_size = nemo_model_config["kv_channels"]
+        hidden_size = nemo_model_config["hidden_size"]
+        zero_centered_gamma = nemo_model_config.get("layernorm_zero_centered_gamma", False)
+        heads_per_group = head_num // num_query_groups
+
+        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
+
+        final_layernorm_weight = state_dict['model.decoder.final_layernorm.weight']
+        if not zero_centered_gamma:
+            final_layernorm_weight = final_layernorm_weight - 1.0  # out-of-place: do not mutate state_dict
+        yield ('model.norm.weight', final_layernorm_weight)
+
+        for layer in range(int(num_layers)):
+            input_layernorm_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][
+                layer
+            ]
+            if not zero_centered_gamma:
+                input_layernorm_weight = input_layernorm_weight - 1.0  # out-of-place: do not mutate state_dict
+            yield (f'model.layers.{layer}.input_layernorm.weight', input_layernorm_weight)
+
+            post_attention_layernorm_weight = state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_weight'][
+                layer
+            ]
+            if not zero_centered_gamma:
+                post_attention_layernorm_weight = post_attention_layernorm_weight - 1.0  # out-of-place, see above
+            yield (f'model.layers.{layer}.post_attention_layernorm.weight', post_attention_layernorm_weight)
+
+            gate_up_combined_weight = state_dict['model.decoder.layers.mlp.linear_fc1.weight'][layer]
+            gate_size = gate_up_combined_weight.shape[0] // 2
+            yield (f'model.layers.{layer}.mlp.gate_proj.weight', gate_up_combined_weight[:gate_size, :])
+            yield (f'model.layers.{layer}.mlp.up_proj.weight', gate_up_combined_weight[gate_size:, :])
+
+            down_proj_weight = state_dict['model.decoder.layers.mlp.linear_fc2.weight'][layer]
+            yield (f'model.layers.{layer}.mlp.down_proj.weight', down_proj_weight)
+
+            self_attn_o_proj_weight = state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer]
+            yield (f'model.layers.{layer}.self_attn.o_proj.weight', self_attn_o_proj_weight)
+
+            qkv_weight = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
+            qkv_intermediate_size = head_num + 2 * num_query_groups
+            qkv_weight = qkv_weight.reshape(qkv_intermediate_size, head_size, hidden_size)
+
+            q_weight = torch.empty((head_num, head_size, hidden_size), dtype=qkv_weight.dtype)
+            k_weight = torch.empty((num_query_groups, head_size, hidden_size), dtype=qkv_weight.dtype)
+            v_weight = torch.empty((num_query_groups, head_size, hidden_size), dtype=qkv_weight.dtype)
+
+            ptr = 0  # walks the fused heads: per query group, heads_per_group Q heads, then one K, then one V
+            for i in range(num_query_groups):
+                q_weight[i * heads_per_group : (i + 1) * heads_per_group, :, :] = qkv_weight[
+                    ptr : ptr + heads_per_group, ::
+                ]
+                ptr += heads_per_group
+                k_weight[i : i + 1, :, :] = qkv_weight[ptr : ptr + 1, :, :]
+                ptr += 1
+                v_weight[i : i + 1, :, :] = qkv_weight[ptr : ptr + 1, :, :]
+                ptr += 1
+            assert ptr == qkv_intermediate_size
+
+            q_weight = q_weight.reshape(head_num * head_size, hidden_size)
+            k_weight = k_weight.reshape(num_query_groups * head_size, hidden_size)
+            v_weight = v_weight.reshape(num_query_groups * head_size, hidden_size)
+
+            yield (f'model.layers.{layer}.self_attn.q_proj.weight', q_weight)
+            yield (f'model.layers.{layer}.self_attn.k_proj.weight', k_weight)
+            yield (f'model.layers.{layer}.self_attn.v_proj.weight', v_weight)
+
+    def requires_bos_token(self):
+        return True
+
+
+class Starcoder2Converter(ModelConverter):
+
+    def get_architecture(self):
+        if self.model_type == 'starcoder2':
+            return 'Starcoder2ForCausalLM'
+        return None
+
+    def convert_config(self, nemo_model_config, hf_config):
+        window_sizes = nemo_model_config.get('window_size')
+        if window_sizes is not None:
+            hf_config['sliding_window'] = window_sizes[0]
+
+        # 'tie_word_embeddings = False' means that there is a 'lm_head.weight' tensor.
+        # This converter assumes that it's always there.
+        # If there is a version of starcoder2 where it's not there, we'll need to copy
+        # 'model.embed_tokens.weight' into 'lm_head.weight' and still set 'tie_word_embeddings = False'
+        # because at this point we don't know if the weight is there or not, and this configuration
+        # is not stored in NeMo checkpoints.
+        hf_config['tie_word_embeddings'] = False
+
+    def convert_weights(self, nemo_model_config, state_dict):
+        """Yields (HF tensor name, tensor) pairs, splitting each layer's fused QKV tensor into q/k/v."""
+        num_layers = nemo_model_config["num_layers"]
+        num_query_groups = nemo_model_config["num_query_groups"]
+        head_num = nemo_model_config["num_attention_heads"]
+        hidden_size = nemo_model_config["hidden_size"]
+        head_size = hidden_size // head_num
+        heads_per_group = head_num // num_query_groups
+        qkv_total_dim = head_num + 2 * num_query_groups
+
+        if 'bias' in nemo_model_config:
+            has_bias = nemo_model_config["bias"]
+        else:
+            has_bias = nemo_model_config["add_bias_linear"]
+
+        yield ('model.embed_tokens.weight', state_dict['model.embedding.word_embeddings.weight'])
+        yield ('model.norm.weight', state_dict['model.decoder.final_layernorm.weight'])
+        if has_bias:
+            yield ('model.norm.bias', state_dict['model.decoder.final_layernorm.bias'])
+        yield ('lm_head.weight', state_dict['model.output_layer.weight'])
+
+        # Per group the fused layout is [q * heads_per_group, k, v]; these head indices are the same for every layer.
+        q_index = torch.cat(
+            [
+                torch.arange((heads_per_group + 2) * group, (heads_per_group + 2) * group + heads_per_group)
+                for group in range(num_query_groups)
+            ]
+        )
+        k_index = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+        v_index = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+        for layer in range(int(num_layers)):
+            # q,k,v
+            qkv_weights = state_dict['model.decoder.layers.self_attention.linear_qkv.weight'][layer]
+            qkv_weights = qkv_weights.reshape([qkv_total_dim, head_size, hidden_size])
+            if has_bias:
+                qkv_bias = state_dict['model.decoder.layers.self_attention.linear_qkv.bias'][layer]
+                qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size])
+
+            for name, head_index in [('q_proj', q_index), ('k_proj', k_index), ('v_proj', v_index)]:
+                qkv_weights_slice = qkv_weights[head_index].reshape(-1, hidden_size)
+                yield (f'model.layers.{layer}.self_attn.{name}.weight', qkv_weights_slice)
+                if has_bias:
+                    qkv_bias_slice = qkv_bias[head_index].reshape(-1)
+                    yield (f'model.layers.{layer}.self_attn.{name}.bias', qkv_bias_slice)
+
+            # Attention dense
+            yield (
+                f'model.layers.{layer}.self_attn.o_proj.weight',
+                state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer],
+            )
+            if has_bias:
+                yield (
+                    f'model.layers.{layer}.self_attn.o_proj.bias',
+                    state_dict['model.decoder.layers.self_attention.linear_proj.bias'][layer],
+                )
+
+            # MLP FC1
+            yield (
+                f'model.layers.{layer}.mlp.c_fc.weight',
+                state_dict['model.decoder.layers.mlp.linear_fc1.weight'][layer],
+            )
+            if has_bias:
+                yield (
+                    f'model.layers.{layer}.mlp.c_fc.bias',
+                    state_dict['model.decoder.layers.mlp.linear_fc1.bias'][layer],
+                )
+
+            # MLP FC2
+            yield (
+                f'model.layers.{layer}.mlp.c_proj.weight',
+                state_dict['model.decoder.layers.mlp.linear_fc2.weight'][layer],
+            )
+            if has_bias:
+                yield (
+                    f'model.layers.{layer}.mlp.c_proj.bias',
+                    state_dict['model.decoder.layers.mlp.linear_fc2.bias'][layer],
+                )
+
+            # Input LayerNorm
+            yield (
+                f'model.layers.{layer}.input_layernorm.weight',
+                state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_weight'][layer],
+            )
+            if has_bias:
+                yield (
+                    f'model.layers.{layer}.input_layernorm.bias',
+                    state_dict['model.decoder.layers.self_attention.linear_qkv.layer_norm_bias'][layer],
+                )
+
+            # Post-attention LayerNorm
+            yield (
+                f'model.layers.{layer}.post_attention_layernorm.weight',
+                state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_weight'][layer],
+            )
+            if has_bias:
+                yield (
+                    f'model.layers.{layer}.post_attention_layernorm.bias',
+                    state_dict['model.decoder.layers.mlp.linear_fc1.layer_norm_bias'][layer],
+                )
+
+
+_MODEL_CONVERTERS = {
+    'llama': LlamaConverter,
+    'mistral': LlamaConverter,
+    'mixtral': MixtralConverter,
+    'gemma': GemmaConverter,
+    'starcoder2': Starcoder2Converter,
+}
+
+
+def register_model_converter(model_type, cls):
+    """
+    Maps the short model type name `model_type` to `cls`, a class that converts the model from Nemo format
+    to a vLLM compatible format. Any class previously registered under the same name is replaced.
+    """
+    _MODEL_CONVERTERS[model_type] = cls
+
+
+def get_model_converter(model_type) -> Optional[ModelConverter]:
+    """
+    Returns a new instance of the converter class registered for the given model type, or None if there is none.
+    """
+    converter_cls = _MODEL_CONVERTERS.get(model_type)
+    if converter_cls is not None:
+        return converter_cls(model_type)
+    return None
diff --git a/nemo/export/vllm/model_loader.py b/nemo/export/vllm/model_loader.py
new file mode 100644
index 000000000000..c5d74fe883be
--- /dev/null
+++ b/nemo/export/vllm/model_loader.py
@@ -0,0 +1,101 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os.path
+from typing import Any, Dict
+
+import safetensors.torch
+import torch
+from vllm.config import ModelConfig
+from vllm.model_executor.model_loader.loader import BaseModelLoader, _initialize_model
+from vllm.model_executor.model_loader.utils import set_default_torch_dtype
+
+from nemo.export.utils import load_model_weights
+from nemo.export.vllm.model_config import NemoModelConfig
+
+LOGGER = logging.getLogger("NeMo")
+
+
+class NemoModelLoader(BaseModelLoader):
+    """
+    Implements a custom ModelLoader for vLLM that reads the weights from a Nemo checkpoint
+    and converts them to a vLLM compatible format at load time.
+
+    Also supports an ahead-of-time conversion that stores new weights in a Safetensors file,
+    see convert_and_store_nemo_weights(...)
+    """
+
+    @staticmethod
+    def _load_nemo_checkpoint_state(nemo_file: str) -> Dict[str, Any]:
+        LOGGER.info(f'Loading weights from {nemo_file}...')
+        return load_model_weights(nemo_file)
+
+    def download_model(self, model_config: ModelConfig) -> None:  # pylint: disable=missing-function-docstring
+        raise NotImplementedError
+
+    def load_model(
+        self,
+        *,
+        vllm_config: NemoModelConfig,
+    ) -> torch.nn.Module:
+        """
+        Overrides the load_model function from BaseModelLoader to convert Nemo weights at load time.
+        """
+        model_config = vllm_config.model_config
+        device_config = vllm_config.device_config
+
+        assert isinstance(model_config, NemoModelConfig)
+        state_dict = NemoModelLoader._load_nemo_checkpoint_state(model_config.nemo_checkpoint)
+
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(vllm_config)
+
+            config = model_config.nemo_model_config
+            if 'config' in config:
+                config = config['config']
+            state_dict = NemoModelLoader._standardize_nemo2_naming(state_dict)
+
+            weights_iterator = model_config.model_converter.convert_weights(config, state_dict)
+            model.load_weights(weights_iterator)
+
+        return model.eval()
+
+    @staticmethod
+    def convert_and_store_nemo_weights(model_config: NemoModelConfig, safetensors_file: str):
+        """
+        Runs the model converter over the Nemo checkpoint weights and writes the result to a Safetensors file.
+        """
+        assert isinstance(model_config, NemoModelConfig)
+        assert os.path.exists(model_config.model)
+
+        raw_weights = NemoModelLoader._load_nemo_checkpoint_state(model_config.nemo_checkpoint)
+
+        # NeMo2 checkpoint loads the whole TrainerContext where the config is stored under 'config' key
+        nemo_config = model_config.nemo_model_config
+        if 'config' in nemo_config:
+            nemo_config = nemo_config['config']
+        renamed_weights = NemoModelLoader._standardize_nemo2_naming(raw_weights)
+
+        # Collect the converter's (name, tensor) pairs into a single name -> tensor dict.
+        converted = model_config.model_converter.convert_weights(nemo_config, renamed_weights)
+        tensors = dict(converted)
+
+        LOGGER.info(f'Saving weights to {safetensors_file}...')
+        safetensors.torch.save_file(tensors, safetensors_file)
+
+    @staticmethod
+    def _standardize_nemo2_naming(state_dict: Dict[str, Any]) -> Dict[str, Any]:
+        return {k.replace('module', 'model'): v for k, v in state_dict.items()}
diff --git a/nemo/export/vllm/tokenizer_group.py b/nemo/export/vllm/tokenizer_group.py
new file mode 100644
index 000000000000..d99daebb417f
--- /dev/null
+++ b/nemo/export/vllm/tokenizer_group.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Optional
+
+from vllm.config import TokenizerPoolConfig
+from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizer_group import TokenizerGroup
+
+from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
+
+
+class NemoTokenizerGroup(TokenizerGroup):
+    """
+    Implements a custom tokenizer for vLLM, based on SentencePieceTokenizer.
+    """
+
+    def __init__(self, tokenizer: SentencePieceTokenizer, add_bos_token: bool = False):
+        self.tokenizer = tokenizer
+        self.add_bos_token = add_bos_token
+
+    @classmethod
+    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, **init_kwargs):
+        """Create a tokenizer group from a config."""
+        raise NotImplementedError
+
+    def ping(self) -> bool:
+        """Check if the tokenizer group is alive."""
+        return True
+
+    def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]:
+        """Get the maximum input length for the LoRA request."""
+        return None
+
+    def encode(
+        self,
+        prompt: str,
+        request_id: Optional[str] = None,
+        lora_request: Optional[LoRARequest] = None,
+        add_special_tokens: Optional[bool] = None,
+    ) -> List[int]:
+        """Returns the token ids of the prompt, preceded by the BOS token id when add_bos_token is set."""
+        token_ids = self.tokenizer.encode(prompt)
+        if not self.add_bos_token:
+            return token_ids
+        return [self.tokenizer.bos_token_id] + token_ids
+
+    async def encode_async(
+        self,
+        prompt: str,
+        request_id: Optional[str] = None,
+        lora_request: Optional[LoRARequest] = None,
+        add_special_tokens: Optional[bool] = None,
+    ) -> List[int]:
+        """Tokenizes the prompt exactly like encode(), including the BOS token when add_bos_token is set."""
+        return self.encode(prompt, request_id, lora_request, add_special_tokens)
+
+    def get_lora_tokenizer(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer:
+        """Get a tokenizer for a LoRA request."""
+        return self.tokenizer
+
+    async def get_lora_tokenizer_async(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer:
+        """Get a tokenizer for a LoRA request."""
+        return self.tokenizer
diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py
new file mode 100644
index 000000000000..fd67fdb2d6a3
--- /dev/null
+++ b/nemo/export/vllm_exporter.py
@@ -0,0 +1,538 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os.path
+from typing import Iterable, List, Optional, Union
+
+import numpy
+import vllm.envs as envs
+import wrapt
+from vllm import RequestOutput, SamplingParams
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    LoadConfig,
+    LoadFormat,
+    LoRAConfig,
+    ObservabilityConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    VllmConfig,
+)
+from vllm.executor.ray_utils import initialize_ray_cluster
+from vllm.lora.request import LoRARequest
+
+from nemo.deploy import ITritonDeployable
+from nemo.deploy.utils import cast_output
+from nemo.export.utils import convert_lora_nemo_to_canonical, prepare_directory_for_export
+from nemo.export.vllm.engine import NemoLLMEngine
+from nemo.export.vllm.model_config import NemoModelConfig
+from nemo.export.vllm.model_loader import NemoModelLoader
+
+LOGGER = logging.getLogger("NeMo")
+
+
+@wrapt.decorator
+def noop_decorator(wrapped, instance, args, kwargs):
+    """
+    Pass-through stand-in for pytriton's `batch` decorator, used when pytriton cannot be imported.
+
+    wrapt calls this with (wrapped, instance, args, kwargs) on every call of the decorated callable.
+    """
+    return wrapped(*args, **kwargs)
+
+
+batch = noop_decorator
+use_pytriton = True
+try:
+    from pytriton.decorators import batch
+    from pytriton.model_config import Tensor
+except Exception:
+    use_pytriton = False
+
+
+class vLLMExporter(ITritonDeployable):
+    """
+    The vLLMExporter class implements conversion from a Nemo checkpoint format to something compatible with vLLM,
+    loading the model in vLLM, and binding that model to a Triton server.
+
+    Example:
+        from nemo.export.vllm_exporter import vLLMExporter
+        from nemo.deploy import DeployPyTriton
+
+        exporter = vLLMExporter()
+
+        exporter.export(
+            nemo_checkpoint='/path/to/checkpoint.nemo',
+            model_dir='/path/to/temp_dir',
+            model_type='llama',
+        )
+
+        server = DeployPyTriton(
+            model=exporter,
+            triton_model_name='LLAMA',
+        )
+
+        server.deploy()
+        server.serve()
+    """
+
+    def __init__(self):
+        self.request_id = 0
+        # TODO: Support v1 vllm engine
+        if envs.VLLM_USE_V1:
+            envs.set_vllm_use_v1(False)
+
+    def export(
+        self,
+        nemo_checkpoint: str,
+        model_dir: str,
+        model_type: str,
+        device: str = 'auto',
+        tensor_parallel_size: int = 1,
+        pipeline_parallel_size: int = 1,
+        max_model_len: Optional[int] = None,
+        lora_checkpoints: Optional[List[str]] = None,
+        dtype: str = 'auto',
+        seed: int = 0,
+        log_stats: bool = True,
+        weight_storage: str = 'auto',
+        gpu_memory_utilization: float = 0.9,
+        quantization: Optional[str] = None,
+        delete_existing_files: bool = True,
+    ):
+        """
+        Exports the Nemo checkpoint to vLLM and initializes the engine.
+
+        Args:
+            nemo_checkpoint (str): path to the nemo checkpoint.
+            model_dir (str): path to a temporary directory to store weights and the tokenizer model.
+                The temp dir may persist between subsequent export operations, in which case
+                converted weights may be reused to speed up the export.
+            model_type (str): type of the model, such as "llama", "mistral", "mixtral".
+                Needs to be compatible with transformers.AutoConfig.
+            device (str): type of the device to use by the vLLM engine.
+                Supported values are "auto", "cuda", "cpu", "neuron".
+            tensor_parallel_size (int): tensor parallelism.
+            pipeline_parallel_size (int): pipeline parallelism.
+                Values over 1 are not currently supported by vLLM.
+            max_model_len (int): model context length.
+            lora_checkpoints List[str]: paths to LoRA checkpoints.
+            dtype (str): data type for model weights and activations.
+                Possible choices: auto, half, float16, bfloat16, float, float32
+                "auto" will use FP16 precision for FP32 and FP16 models,
+                and BF16 precision for BF16 models.
+            seed (int): random seed value.
+            log_stats (bool): enables logging inference performance statistics by vLLM.
+            weight_storage (str): controls how converted weights are stored:
+                "file" - always write weights into a file inside 'model_dir',
+                "memory" - always do an in-memory conversion,
+                "cache" - reuse existing files if they are newer than the nemo checkpoint,
+                "auto" - use "cache" for multi-GPU runs and "memory" for single-GPU runs.
+            gpu_memory_utilization (float): The fraction of GPU memory to be used for the model
+                executor, which can range from 0 to 1.
+            quantization (str): quantization method that is used to quantize the model weights.
+                Possible choices are None (weights not quantized, default) and "fp8".
+            delete_existing_files (bool): if True, deletes all the files in model_dir.
+        """
+        prepare_directory_for_export(model_dir, delete_existing_files=delete_existing_files)
+
+        # Populate the basic configuration structures
+        device_config = DeviceConfig(device)
+
+        assert quantization in {None, 'fp8'}
+
+        model_config = NemoModelConfig(
+            nemo_checkpoint,
+            model_dir,
+            model_type,
+            tokenizer_mode='auto',
+            dtype=dtype,
+            seed=seed,
+            revision=None,
+            code_revision=None,
+            tokenizer_revision=None,
+            max_model_len=max_model_len,
+            quantization=quantization,
+            quantization_param_path=None,
+            enforce_eager=False,
+            max_seq_len_to_capture=None,
+        )
+
+        if model_config.nemo_model_config.get("fp8", False):
+            LOGGER.warning(
+                "NeMo FP8 checkpoint detected, but exporting FP8 quantized engines is not supported for vLLM."
+            )
+
+        parallel_config = ParallelConfig(
+            pipeline_parallel_size=pipeline_parallel_size, tensor_parallel_size=tensor_parallel_size
+        )
+
+        # vllm/huggingface doesn't like the absence of a config file. Place config in load dir.
+        if model_config.model and not os.path.exists(os.path.join(model_config.model, 'config.json')):
+            with open(os.path.join(model_config.model, 'config.json'), "w") as f:
+                json.dump(model_config.hf_text_config.to_dict(), f, indent=2)
+
+        # Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
+        if quantization is not None and weight_storage in {'auto', 'memory'}:
+            LOGGER.warning('Setting weight_storage = "file" for FP8 quantization')
+            weight_storage = 'file'
+
+        # See if we have an up-to-date safetensors file
+        safetensors_file = os.path.join(model_config.model, 'model.safetensors')
+        safetensors_file_valid = os.path.exists(safetensors_file) and os.path.getmtime(
+            safetensors_file
+        ) > os.path.getmtime(nemo_checkpoint)
+
+        # Decide how we're going to convert the weights
+        if weight_storage == 'auto':
+            if parallel_config.distributed_executor_backend is not None:
+                save_weights = not safetensors_file_valid
+                inmemory_weight_conversion = False
+            else:
+                save_weights = False
+                inmemory_weight_conversion = True
+
+        elif weight_storage == 'cache':
+            save_weights = not safetensors_file_valid
+            inmemory_weight_conversion = False
+
+        elif weight_storage == 'file':
+            save_weights = True
+            inmemory_weight_conversion = False
+
+        elif weight_storage == 'memory':
+            save_weights = False
+            inmemory_weight_conversion = True
+
+        else:
+            raise ValueError(f'Unsupported value for weight_storage: "{weight_storage}"')
+
+        # Convert the weights ahead-of-time, if needed
+        if save_weights:
+            NemoModelLoader.convert_and_store_nemo_weights(model_config, safetensors_file)
+        elif not inmemory_weight_conversion:
+            LOGGER.info(f'Using cached weights in {safetensors_file}')
+
+        # TODO: these values are the defaults from vllm.EngineArgs.
+        cache_config = CacheConfig(
+            block_size=16,
+            gpu_memory_utilization=gpu_memory_utilization,
+            swap_space=4,
+            cache_dtype='auto',
+            sliding_window=model_config.get_sliding_window(),
+        )
+
+        # TODO: these values are the defaults from vllm.EngineArgs.
+        scheduler_config = SchedulerConfig(
+            max_num_batched_tokens=None,
+            max_num_seqs=256,
+            # Note: max_model_len can be derived by model_config if the input value is None
+            max_model_len=model_config.max_model_len,
+            num_lookahead_slots=0,
+            delay_factor=0.0,
+            enable_chunked_prefill=False,
+        )
+
+        load_config = LoadConfig(
+            load_format=NemoModelLoader if inmemory_weight_conversion else LoadFormat.SAFETENSORS,
+            download_dir=None,
+            model_loader_extra_config=None,
+        )
+
+        # Convert the LoRA checkpoints to vLLM compatible format and derive the configuration structure
+        lora_config = self._prepare_lora_checkpoints(
+            model_dir=model_dir, lora_checkpoints=lora_checkpoints, dtype=model_config.dtype
+        )
+
+        # Initialize the cluster and specify the executor class.
+        if parallel_config.distributed_executor_backend == "ray":
+            initialize_ray_cluster(parallel_config)
+            from vllm.executor.ray_distributed_executor import RayDistributedExecutor
+
+            executor_class = RayDistributedExecutor
+
+        elif parallel_config.distributed_executor_backend == "mp":
+            from vllm.executor.mp_distributed_executor import MultiprocessingDistributedExecutor
+
+            executor_class = MultiprocessingDistributedExecutor
+
+        else:
+            assert parallel_config.distributed_executor_backend == "uni" or parallel_config.world_size == 1
+
+            from vllm.executor.uniproc_executor import UniProcExecutor
+
+            executor_class = UniProcExecutor
+
+        # Initialize the engine
+        self.engine = NemoLLMEngine(
+            vllm_config=VllmConfig(
+                model_config=model_config,
+                cache_config=cache_config,
+                parallel_config=parallel_config,
+                scheduler_config=scheduler_config,
+                device_config=device_config,
+                load_config=load_config,
+                lora_config=lora_config,
+                observability_config=ObservabilityConfig(),
+            ),
+            executor_class=executor_class,
+            log_stats=log_stats,
+        )
+
+    def _prepare_lora_checkpoints(
+        self, model_dir: str, lora_checkpoints: Optional[List[str]], dtype: str
+    ) -> Optional[LoRAConfig]:
+        """
+        Converts each Nemo LoRA checkpoint into '<model_dir>/lora_<index>' and builds the matching LoRAConfig.
+
+        Returns None when no LoRA checkpoints are given. Raises FileNotFoundError for a missing checkpoint file.
+        """
+        self.lora_checkpoints = []
+
+        if not lora_checkpoints:
+            return None
+
+        max_lora_rank = 0
+        for index, nemo_file in enumerate(lora_checkpoints):
+            if not os.path.isfile(nemo_file):
+                raise FileNotFoundError(f"LoRA checkpoint file '{nemo_file}' does not exist")
+
+            hf_lora_dir = os.path.join(model_dir, f'lora_{index}')
+            LOGGER.info(f"Converting LoRA checkpoint '{nemo_file}' into '{hf_lora_dir}'...")
+
+            _, lora_config = convert_lora_nemo_to_canonical(nemo_file, hf_lora_dir, hf_format=True)
+            self.lora_checkpoints.append(hf_lora_dir)
+
+            # The LoRAConfig is built with the largest adapter rank found among all checkpoints.
+            max_lora_rank = max(max_lora_rank, lora_config['peft']['lora_tuning']['adapter_dim'])
+
+        return LoRAConfig(max_lora_rank=max_lora_rank, max_loras=len(self.lora_checkpoints), lora_dtype=dtype)
+
+    def _add_request_to_engine(
+        self,
+        prompt: str,
+        max_output_len: int,
+        temperature: float = 1.0,
+        top_k: int = 1,
+        top_p: float = 0.0,
+        lora_uid: Optional[int] = None,
+    ) -> str:
+        if top_p <= 0.0:
+            top_p = 1.0
+
+        sampling_params = SamplingParams(
+            max_tokens=max_output_len, temperature=temperature, top_k=int(top_k), top_p=top_p
+        )
+
+        if lora_uid is not None and lora_uid >= 0 and lora_uid < len(self.lora_checkpoints):
+            lora_request = LoRARequest(
+                lora_name=f'LoRA_{lora_uid}', lora_int_id=lora_uid + 1, lora_local_path=self.lora_checkpoints[lora_uid]
+            )
+        else:
+            lora_request = None
+
+        request_id = str(self.request_id)
+        self.request_id += 1
+
+        self.engine.add_request(request_id, prompt, sampling_params, lora_request=lora_request)
+
+        return request_id
+
+    def _forward_regular(self, request_ids: List[str]):
+        responses = [None] * len(request_ids)
+        finished = [False] * len(request_ids)
+
+        while not all(finished):
+            request_outputs: List[RequestOutput] = self.engine.step()
+
+            for request_output in request_outputs:
+                if not request_output.finished:
+                    continue
+
+                try:
+                    request_index = request_ids.index(request_output.request_id)
+                except ValueError:
+                    continue
+
+                finished[request_index] = request_output.finished
+                output_text = request_output.outputs[-1].text
+                responses[request_index] = output_text
+
+        return [[response] for response in responses]
+
+    def _forward_streaming(self, request_ids: List[str]):
+        responses = [None] * len(request_ids)
+        finished = [False] * len(request_ids)
+
+        while not all(finished):
+            request_outputs: List[RequestOutput] = self.engine.step()
+
+            for request_output in request_outputs:
+                try:
+                    request_index = request_ids.index(request_output.request_id)
+                except ValueError:
+                    continue
+
+                finished[request_index] = request_output.finished
+                output_text = request_output.outputs[-1].text
+                responses[request_index] = output_text
+
+            yield [[response] for response in responses]
+
+    def _add_triton_request_to_engine(self, inputs: numpy.ndarray, index: int) -> str:
+        if 'lora_uids' in inputs:
+            lora_uid = int(numpy.char.decode(inputs['lora_uids'][index][0], encoding="utf-8"))
+        else:
+            lora_uid = None
+
+        return self._add_request_to_engine(
+            prompt=inputs['prompts'][index][0].decode('UTF-8'),
+            max_output_len=inputs['max_output_len'][index][0],
+            temperature=inputs['temperature'][index][0],
+            top_k=inputs['top_k'][index][0],
+            top_p=inputs['top_p'][index][0],
+            lora_uid=lora_uid,
+        )
+
+    @property
+    def get_triton_input(self):
+        inputs = (
+            Tensor(name="prompts", shape=(-1,), dtype=bytes),
+            Tensor(name="max_output_len", shape=(-1,), dtype=numpy.int_, optional=True),
+            Tensor(name="top_k", shape=(-1,), dtype=numpy.int_, optional=True),
+            Tensor(name="top_p", shape=(-1,), dtype=numpy.single, optional=True),
+            Tensor(name="temperature", shape=(-1,), dtype=numpy.single, optional=True),
+            Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True),
+            Tensor(name="output_generation_logits", shape=(-1,), dtype=numpy.bool_, optional=True),
+            Tensor(name="output_context_logits", shape=(-1,), dtype=numpy.bool_, optional=True),
+        )
+        return inputs
+
+    @property
+    def get_triton_output(self):
+        outputs = (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
+        return outputs
+
+    @batch
+    def triton_infer_fn(self, **inputs: numpy.ndarray):
+        """
+        This function is used to perform inference on a batch of prompts.
+        """
+        request_ids = []
+        num_requests = len(inputs["prompts"])
+        for index in range(num_requests):
+            request_id = self._add_triton_request_to_engine(inputs, index)
+            request_ids.append(request_id)
+
+        responses = self._forward_regular(request_ids)
+        responses = [r[0] for r in responses]
+
+        output_tensor = cast_output(responses, numpy.bytes_)
+        return {'outputs': output_tensor}
+
+    @batch
+    def triton_infer_fn_streaming(self, **inputs: numpy.ndarray):
+        """
+        This function is used to perform streaming inference.
+        """
+        request_ids = []
+        num_requests = len(inputs["prompts"])
+        for index in range(num_requests):
+            request_id = self._add_triton_request_to_engine(inputs, index)
+            request_ids.append(request_id)
+
+        for responses in self._forward_streaming(request_ids):
+            responses = [r[0] for r in responses]
+            output_tensor = cast_output(responses, numpy.bytes_)
+            yield {'outputs': output_tensor}
+
+    # Mimic the TensorRTLLM exporter's forward function, even though we don't support many of its features.
+    def forward(
+        self,
+        input_texts: List[str],
+        max_output_len: int = 64,
+        top_k: int = 1,
+        top_p: float = 0.0,
+        temperature: float = 1.0,
+        stop_words_list: Optional[List[str]] = None,
+        bad_words_list: Optional[List[str]] = None,
+        no_repeat_ngram_size: Optional[int] = None,
+        task_ids: Optional[List[str]] = None,
+        lora_uids: Optional[List[str]] = None,
+        prompt_embeddings_table=None,
+        prompt_embeddings_checkpoint_path: Optional[str] = None,
+        streaming: bool = False,
+        output_log_probs: bool = False,
+        output_generation_logits: bool = False,
+        output_context_logits: bool = False,
+    ) -> Union[List[List[str]], Iterable[List[List[str]]]]:
+        """
+        The forward function performs LLM evaluation on the provided array of prompts with other parameters shared,
+        and returns the generated texts. If 'streaming' is True, the output texts are returned incrementally
+        with a generator: one token appended to each output at a time. If 'streaming' is false, the final output texts
+        are returned as a single list of responses.
+        Entries of 'lora_uids' are matched to prompts by position and converted to int before use.
+        """
+
+        if stop_words_list is not None and stop_words_list != []:
+            raise NotImplementedError("stop_words_list is not supported")
+
+        if bad_words_list is not None and bad_words_list != []:
+            raise NotImplementedError("bad_words_list is not supported")
+
+        if no_repeat_ngram_size is not None:
+            raise NotImplementedError("no_repeat_ngram_size is not supported")
+
+        if task_ids is not None and task_ids != []:
+            raise NotImplementedError("task_ids is not supported")
+
+        if prompt_embeddings_table is not None:
+            raise NotImplementedError("prompt_embeddings_table is not supported")
+
+        if prompt_embeddings_checkpoint_path is not None:
+            raise NotImplementedError("prompt_embeddings_checkpoint_path is not supported")
+
+        if output_log_probs:
+            raise NotImplementedError("output_log_probs is not supported")
+
+        if output_generation_logits:
+            raise NotImplementedError("output_generation_logits is not supported")
+
+        if output_context_logits:
+            raise NotImplementedError("output_context_logits is not supported")
+
+        request_ids = []
+        for index, prompt in enumerate(input_texts):
+            # lora_uids may be shorter than input_texts; prompts without an entry run without a LoRA adapter.
+            raw_lora_uid = lora_uids[index] if lora_uids is not None and index < len(lora_uids) else None
+            # The uids are annotated as strings, but _add_request_to_engine compares them numerically,
+            # so convert here instead of failing on a str/int comparison.
+            lora_uid = None if raw_lora_uid is None else int(raw_lora_uid)
+
+            request_id = self._add_request_to_engine(
+                prompt=prompt,
+                max_output_len=max_output_len,
+                temperature=temperature,
+                top_k=top_k,
+                top_p=top_p,
+                lora_uid=lora_uid,
+            )
+            request_ids.append(request_id)
+
+        if streaming:
+            return self._forward_streaming(request_ids)
+        else:
+            return self._forward_regular(request_ids)
diff --git a/nemo/export/vllm_hf_exporter.py b/nemo/export/vllm_hf_exporter.py
new file mode 100755
index 000000000000..4e90ab962ee8
--- /dev/null
+++ b/nemo/export/vllm_hf_exporter.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List
+
+import numpy as np
+from pytriton.decorators import batch, first_value
+from pytriton.model_config import Tensor
+from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+
+from nemo.deploy import ITritonDeployable
+from nemo.deploy.utils import cast_output, str_ndarray2list
+
+
+class vLLMHFExporter(ITritonDeployable):
+    """
+    The Exporter class uses vLLM APIs to convert a HF model to vLLM and makes the class
+    deployable with Triton server.
+
+    Example:
+        from nemo.export import vLLMHFExporter
+        from nemo.deploy import DeployPyTriton
+
+        exporter = vLLMHFExporter()
+        exporter.export(model="/path/to/model/")
+
+        server = DeployPyTriton(
+            model=exporter,
+            triton_model_name='model'
+        )
+
+        server.deploy()
+        server.serve()
+        server.stop()
+    """
+
+    def __init__(self):
+        self.model = None
+        self.lora_models = None
+
+    def export(self, model, enable_lora: bool = False):
+        """
+        Exports the HF checkpoint to vLLM and initializes the engine.
+        Args:
+            model (str): model name or the path
+            enable_lora (bool): passed to vLLM's ``LLM``; set it to True to use LoRA adapters in ``forward``.
+        """
+        self.model = LLM(model=model, enable_lora=enable_lora)
+
+    def add_lora_models(self, lora_model_name, lora_model):
+        """Registers the adapter ``lora_model`` under ``lora_model_name`` so that ``forward`` can select it."""
+        if self.lora_models is None:
+            self.lora_models = {}
+        self.lora_models[lora_model_name] = lora_model
+
+    @property
+    def get_triton_input(self):
+        """Triton input tensors: the prompts plus optional sampling settings."""
+        return (
+            Tensor(name="prompts", shape=(-1,), dtype=bytes),
+            Tensor(name="max_output_len", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
+            Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
+            Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
+        )
+
+    @property
+    def get_triton_output(self):
+        """Triton output tensors: the generated texts."""
+        return (Tensor(name="outputs", shape=(-1,), dtype=bytes),)
+
+    @batch
+    @first_value("max_output_len", "top_k", "top_p", "temperature")
+    def triton_infer_fn(self, **inputs: np.ndarray):
+        """Triton entry point: runs ``forward`` on the request and reports failures as the output text."""
+        try:
+            infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))}
+            for name in ("max_output_len", "top_k", "top_p", "temperature"):
+                if name in inputs:
+                    infer_input[name] = inputs.pop(name)
+            output_texts = self.forward(**infer_input)
+            output = cast_output(output_texts, np.bytes_)
+        except Exception as error:
+            err_msg = "An error occurred: {0}".format(str(error))
+            output = cast_output([err_msg], np.bytes_)
+
+        return {"outputs": output}
+
+    def forward(
+        self,
+        input_texts: List[str],
+        max_output_len: int = 64,
+        top_k: int = 1,
+        top_p: float = 0.1,
+        temperature: float = 1.0,
+        lora_model_name: str = None,
+    ):
+        """
+        Generates text for every prompt and returns the text of the first candidate of each request.
+        Args:
+            input_texts (List[str]): prompts to complete.
+            max_output_len (int): passed to vLLM as ``max_tokens``.
+            top_k, top_p, temperature: sampling settings passed to vLLM ``SamplingParams``.
+            lora_model_name (str): name given to ``add_lora_models``; None runs without an adapter.
+        """
+        assert self.model is not None, "Model is not initialized."
+
+        lora_request = None
+        if lora_model_name is not None:
+            if self.lora_models is None:
+                raise RuntimeError("No lora models are available.")
+            assert lora_model_name in self.lora_models, "Lora model was not added before"
+            # vLLM expects a distinct positive id per adapter; use the 1-based registration order, not a fixed 1.
+            lora_int_id = list(self.lora_models).index(lora_model_name) + 1
+            lora_request = LoRARequest(lora_model_name, lora_int_id, self.lora_models[lora_model_name])
+
+        params = SamplingParams(max_tokens=max_output_len, temperature=temperature, top_k=int(top_k), top_p=top_p)
+        request_output = self.model.generate(input_texts, params, lora_request=lora_request)
+        return [o.outputs[0].text for o in request_output]

From f1513057600f67dc305b470a07446d4891bbe8eb Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 23:38:28 -0800
Subject: [PATCH 10/15] lintfix

Signed-off-by: Pablo Garay 
---
 .github/workflows/code-linting.yml    | 3 ++-
 nemo/export/quantize/__init__.py      | 2 ++
 nemo/export/trt_llm/qnemo/__init__.py | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/code-linting.yml b/.github/workflows/code-linting.yml
index df51f051f0a7..0ed4bfd813ae 100644
--- a/.github/workflows/code-linting.yml
+++ b/.github/workflows/code-linting.yml
@@ -42,7 +42,8 @@ jobs:
               "!nemo/collections/audio/**/*.py",
               "!nemo/collections/multimodal/speech_llm/**/*.py",
               "!nemo/collections/speechlm/**/*.py",
-              "!nemo/collections/speechlm2/**/*.py"
+              "!nemo/collections/speechlm2/**/*.py",
+              "!nemo/export/**/*.py"
             ] | join(",")')
           fi
 
diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py
index 87812e621bb6..39696977a2ba 100644
--- a/nemo/export/quantize/__init__.py
+++ b/nemo/export/quantize/__init__.py
@@ -13,3 +13,5 @@
 # limitations under the License.
 
 from .quantizer import Quantizer
+
+__all__ = ["Quantizer"]
\ No newline at end of file
diff --git a/nemo/export/trt_llm/qnemo/__init__.py b/nemo/export/trt_llm/qnemo/__init__.py
index 59b9eb8ae6a6..5d3d945f065d 100644
--- a/nemo/export/trt_llm/qnemo/__init__.py
+++ b/nemo/export/trt_llm/qnemo/__init__.py
@@ -13,3 +13,5 @@
 # limitations under the License.
 
 from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm
+
+__all__ = ["qnemo_to_tensorrt_llm"]
\ No newline at end of file

From f866ac01d5372c6cb0649c4487f40a727d1d11bd Mon Sep 17 00:00:00 2001
From: pablo-garay 
Date: Sat, 8 Nov 2025 07:39:26 +0000
Subject: [PATCH 11/15] Apply isort and black reformatting

Signed-off-by: pablo-garay 
---
 nemo/export/quantize/__init__.py      | 2 +-
 nemo/export/trt_llm/qnemo/__init__.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py
index 39696977a2ba..49ec9da3c64a 100644
--- a/nemo/export/quantize/__init__.py
+++ b/nemo/export/quantize/__init__.py
@@ -14,4 +14,4 @@
 
 from .quantizer import Quantizer
 
-__all__ = ["Quantizer"]
\ No newline at end of file
+__all__ = ["Quantizer"]
diff --git a/nemo/export/trt_llm/qnemo/__init__.py b/nemo/export/trt_llm/qnemo/__init__.py
index 5d3d945f065d..dbbfd23bac12 100644
--- a/nemo/export/trt_llm/qnemo/__init__.py
+++ b/nemo/export/trt_llm/qnemo/__init__.py
@@ -14,4 +14,4 @@
 
 from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm
 
-__all__ = ["qnemo_to_tensorrt_llm"]
\ No newline at end of file
+__all__ = ["qnemo_to_tensorrt_llm"]

From dfcd924301859a897b8ccd2070ab26e5ce117760 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Fri, 7 Nov 2025 23:54:41 -0800
Subject: [PATCH 12/15] back

Signed-off-by: Pablo Garay 
---
 nemo/collections/llm/gpt/model/llama.py          | 2 +-
 nemo/collections/vlm/llama4/model/llama4_omni.py | 2 +-
 nemo/collections/vlm/mllama/model/mllama.py      | 2 +-
 nemo/collections/vlm/qwen2vl/model/qwen2vl.py    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py
index 9665f92fb3f3..46abf425043e 100644
--- a/nemo/collections/llm/gpt/model/llama.py
+++ b/nemo/collections/llm/gpt/model/llama.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/collections/vlm/llama4/model/llama4_omni.py b/nemo/collections/vlm/llama4/model/llama4_omni.py
index 5e1edd0a091a..6482b1833302 100644
--- a/nemo/collections/vlm/llama4/model/llama4_omni.py
+++ b/nemo/collections/vlm/llama4/model/llama4_omni.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/collections/vlm/mllama/model/mllama.py b/nemo/collections/vlm/mllama/model/mllama.py
index 3a9e29c75d45..794fae9a5af9 100644
--- a/nemo/collections/vlm/mllama/model/mllama.py
+++ b/nemo/collections/vlm/mllama/model/mllama.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
index 77ca41b96f99..ef99ce7ed1c3 100644
--- a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
+++ b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From 1d9a4e5ef09a9a46df53a65c8beeb9f809086fc4 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Sat, 8 Nov 2025 00:03:05 -0800
Subject: [PATCH 13/15] revert back

Signed-off-by: Pablo Garay 
---
 nemo/collections/llm/gpt/model/deepseek.py    | 113 ++-
 nemo/collections/llm/gpt/model/llama.py       |  18 +-
 .../vlm/llama4/model/llama4_omni.py           |   2 +-
 nemo/collections/vlm/mllama/model/mllama.py   | 625 ++++++++++++---
 nemo/collections/vlm/qwen2vl/model/qwen2vl.py | 753 ++++++++++++++++--
 5 files changed, 1359 insertions(+), 152 deletions(-)
 mode change 100644 => 100755 nemo/collections/vlm/qwen2vl/model/qwen2vl.py

diff --git a/nemo/collections/llm/gpt/model/deepseek.py b/nemo/collections/llm/gpt/model/deepseek.py
index e46e2a389434..5723ee7d73bc 100644
--- a/nemo/collections/llm/gpt/model/deepseek.py
+++ b/nemo/collections/llm/gpt/model/deepseek.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import json
 import re
-from dataclasses import dataclass, field
+from dataclasses import asdict, dataclass, field
 from functools import cached_property, partial
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
@@ -26,6 +26,7 @@
 from megatron.core.transformer.transformer_config import MLATransformerConfig
 from safetensors.torch import load_file
 from torch import nn
+from transformers import AutoConfig
 
 from nemo.collections.llm.gpt.model.base import (
     HAVE_TE,
@@ -44,10 +45,14 @@
 if TYPE_CHECKING:
     from megatron.core.transformer import ModuleSpec
     from transformers import AutoModelForCausalLM
+    from transformers import DeepseekV3Config as HFDeepseekV3Config
 
     from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
     from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 
+if HAVE_TE:
+    from megatron.core.utils import is_te_min_version
+
 
 @dataclass
 class DeepSeekConfig(MLATransformerConfig, GPTConfig):
@@ -86,6 +91,7 @@ class DeepSeekConfig(MLATransformerConfig, GPTConfig):
     moe_token_dispatcher_type: str = "alltoall"
     moe_router_load_balancing_type: str = 'seq_aux_loss'
     moe_shared_expert_overlap: bool = True
+    moe_router_dtype: Optional[str] = 'fp32'
 
     # MLA
     q_lora_rank: int = 1536
@@ -116,6 +122,9 @@ class DeepSeekConfig(MLATransformerConfig, GPTConfig):
     bias_dropout_fusion: bool = True
     masked_softmax_fusion: bool = True
     gradient_accumulation_fusion: bool = True
+    cross_entropy_loss_fusion: bool = True
+    cross_entropy_fusion_impl: str = "te"
+    moe_permute_fusion: bool = is_te_min_version("2.1.0") if HAVE_TE else False
 
     def __post_init__(self):
         super().__post_init__()
@@ -220,6 +229,7 @@ def apply(self, output_path: Path, convert_mtp: bool = False) -> Path:
         from transformers import AutoModelForCausalLM
 
         self.convert_mtp = convert_mtp
+        self._verify_source()
         source = AutoModelForCausalLM.from_pretrained(str(self), trust_remote_code=True, torch_dtype='auto')
         target = self.init()
         trainer = self.nemo_setup(target)
@@ -233,6 +243,15 @@ def apply(self, output_path: Path, convert_mtp: bool = False) -> Path:
 
         return output_path
 
+    def _verify_source(self):
+        """Assert that the HF config loaded for this checkpoint has no ``quantization_config`` entry."""
+        hf_config = AutoConfig.from_pretrained(str(self), trust_remote_code=True)
+        assert 'quantization_config' not in hf_config, (
+            "HuggingFace cannot load DeepSeek V3's FP8 checkpoint directly. You must convert the checkpoint to BF16. "
+            "See NeMo documentation for more details: https://nemo-framework-tme.gitlab-master-pages.nvidia.com/"
+            "documentation/user-guide/latest/llms/deepseek_v3.html#nemo-2-0-finetuning-recipes "
+        )
+
     def _modify_source_state(self, source: nn.Module) -> _ModelState:
         """
         In deepseek, HF weight `model.layers.*.post_attention_layernorm.weight` is mapped to mcore weight
@@ -419,7 +438,7 @@ def config(self) -> DeepSeekConfig:
             moe_router_num_groups=source.n_group,
             moe_router_group_topk=source.topk_group,
             moe_router_topk_scaling_factor=source.routed_scaling_factor,
-            moe_aux_loss_coeff=source.aux_loss_alpha,
+            moe_aux_loss_coeff=getattr(source, "aux_loss_alpha", 0.001),
             kv_lora_rank=source.kv_lora_rank,
             qk_head_dim=source.qk_nope_head_dim,
             qk_pos_emb_head_dim=source.qk_rope_head_dim,
@@ -454,6 +473,28 @@ def init(self, dtype=torch.bfloat16, model_name="deepseek-ai/DeepSeek-V3") -> "A
             type(hf_model).register_for_auto_class("AutoModelForCausalLM")
             return hf_model
 
+    def _detect_hf_deepseek_version(self, source_config: Dict[str, Any]) -> str:
+        """
+        Choose the Hugging Face Hub model name that corresponds to a NeMo DeepSeek config.
+
+        Args:
+            source_config (Dict[str, Any]): The source NeMo model config.
+
+        Returns:
+            str: "deepseek-ai/DeepSeek-V3" when expert bias is enabled, otherwise "deepseek-ai/DeepSeek-V2"
+                when q_lora_rank is set and "deepseek-ai/DeepSeek-V2-Lite" when it is None.
+        """
+        if source_config['moe_router_enable_expert_bias']:
+            hf_model_name = "deepseek-ai/DeepSeek-V3"
+        else:
+            is_v2 = source_config['q_lora_rank'] is not None
+            hf_model_name = "deepseek-ai/DeepSeek-V2" if is_v2 else "deepseek-ai/DeepSeek-V2-Lite"
+        logging.info(
+            f"Your model is determined to be {hf_model_name} based on the config. If this is not correct, "
+            "please pass in a local HF checkpoint."
+        )
+        return hf_model_name
+
     def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
         """
         This function loads the state dict directly from a distributed checkpoint, and modify the state dict
@@ -493,21 +534,12 @@ def apply(self, output_path: Path, target_model_name=None) -> Path:
         logging.info("DeepSeek NeMo checkpoint loaded.")
         if target_model_name is None:
             # Before DeepSeek is fully supported by HF, it is necessary to pass in a local HF checkpoint that
-            # is used to initialize the HF model. The following
+            # is used to initialize the HF model.
             logging.warning(
                 "Before DeepSeek is officially supported in HF, you should pass in a local HF "
                 "checkpoint using llm.export_ckpt(..., target_model_name=)"
             )
-            if source_config['moe_router_enable_expert_bias']:
-                target_model_name = "deepseek-ai/DeepSeek-V3"
-            elif source_config['q_lora_rank'] is not None:
-                target_model_name = "deepseek-ai/DeepSeek-V2"
-            else:
-                target_model_name = "deepseek-ai/DeepSeek-V2-Lite"
-            logging.info(
-                f"Your model is determined to be {target_model_name} based on the config. If this is not correct, "
-                f"please pass in a local HF checkpoint."
-            )
+            target_model_name = self._detect_hf_deepseek_version(source_config)
 
         target = self.init(torch_dtype_from_dict_config(source_config), model_name=target_model_name)
         target = self.convert_state(source, target, source_config)
@@ -600,6 +632,7 @@ def convert_state(self, source, target, source_config):
             target,
             mapping=mapping,
             transforms=transforms,
+            cast_dtype=torch.bfloat16,
         )
 
     def _modify_source_state(self, source: Dict[str, Any], source_config: Dict[str, Any]) -> _ModelState:
@@ -621,6 +654,60 @@ def _modify_source_state(self, source: Dict[str, Any], source_config: Dict[str,
     def tokenizer(self) -> 'AutoTokenizer':
         return io.load_context(self, subpath="model").tokenizer
 
+    @property
+    def config(self) -> "HFDeepseekV3Config":
+        """Create a HF DeepseekV3Config from the NeMo model config.
+
+        Translates the NeMo configuration parameters to the equivalent HF
+        configuration. Only DeepSeek-V3 is handled, because only
+        DeepseekV3Config is imported from the Transformers library here.
+
+        Returns:
+            HFDeepseekV3Config: HF configuration for DeepSeekV3 models
+        Raises:
+            ValueError: if the NeMo config is not detected as DeepSeek-V3.
+        """
+        # TODO: Get config for all DeepSeek model variants once available in transformers
+
+        from transformers import DeepseekV3Config as HFDeepseekV3Config
+
+        source: DeepSeekV3Config = io.load_context(str(self)).model.config
+
+        target_model_name = self._detect_hf_deepseek_version(asdict(source))
+        if target_model_name != "deepseek-ai/DeepSeek-V3":
+            raise ValueError(f"Only deepseek-ai/DeepSeek-V3 configs are supported, got {target_model_name}.")
+
+        # Count the leading zeros of the moe_layer_freq array to get the HF
+        # first_k_dense_replace parameter, then validate that the remainder is all ones:
+        k = 0
+        while k < len(source.moe_layer_freq) and source.moe_layer_freq[k] == 0:
+            k += 1
+        assert all(x == 1 for x in source.moe_layer_freq[k:])
+
+        return HFDeepseekV3Config(
+            architectures=["DeepseekV3ForCausalLM"],
+            num_hidden_layers=source.num_layers,
+            hidden_size=source.hidden_size,
+            intermediate_size=source.ffn_hidden_size,
+            num_attention_heads=source.num_attention_heads,
+            q_lora_rank=source.q_lora_rank,
+            qk_nope_head_dim=source.qk_head_dim,
+            qk_rope_head_dim=source.qk_pos_emb_head_dim,
+            v_head_dim=source.v_head_dim,
+            kv_lora_rank=source.kv_lora_rank,
+            num_key_value_heads=source.kv_channels,  # NOTE(review): kv_channels used as the KV head count - confirm
+            n_routed_experts=source.num_moe_experts,
+            moe_intermediate_size=source.moe_ffn_hidden_size,
+            first_k_dense_replace=k,
+            num_experts_per_tok=source.moe_router_topk,
+            n_group=source.moe_router_num_groups,
+            topk_group=source.moe_router_group_topk,
+            routed_scaling_factor=source.moe_router_topk_scaling_factor,
+            aux_loss_alpha=source.moe_aux_loss_coeff,
+            max_position_embeddings=source.max_position_embeddings,
+            vocab_size=self.tokenizer.vocab_size,
+        )
+
 
 __all__ = [
     "DeepSeekConfig",
diff --git a/nemo/collections/llm/gpt/model/llama.py b/nemo/collections/llm/gpt/model/llama.py
index 46abf425043e..1c1a4654a6d2 100644
--- a/nemo/collections/llm/gpt/model/llama.py
+++ b/nemo/collections/llm/gpt/model/llama.py
@@ -79,6 +79,7 @@ class LlamaConfig(GPTConfig):
     persist_layer_norm: bool = True
     bias_dropout_fusion: bool = True
     apply_rope_fusion: bool = True
+    use_transformer_engine_op_fuser: Optional[bool] = None
 
 
 @dataclass
@@ -169,7 +170,7 @@ class Llama31Config(Llama3Config):
     old_context_len: int = 8192
     init_method_std: float = 0.02
 
-    def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MCoreGPTModel":
+    def configure_model(self, tokenizer, pre_process=None, post_process=None, vp_stage=None) -> "MCoreGPTModel":
         """Configure and instantiate a Megatron Core Llama 3.1 model.
 
         Extends the base configuration with Llama 3.1 specific RoPE scaling.
@@ -182,7 +183,7 @@ def configure_model(self, tokenizer, pre_process=None, post_process=None) -> "MC
         Returns:
             MCoreGPTModel: Configured Megatron Core GPT model instance
         """
-        model = super().configure_model(tokenizer, pre_process, post_process)
+        model = super().configure_model(tokenizer, pre_process, post_process, vp_stage)
         # Apply rope scaling for Llama3.1 model
         model.rotary_pos_emb.inv_freq = apply_rope_scaling(
             model.rotary_pos_emb.inv_freq,
@@ -292,6 +293,7 @@ class Llama32Config1B(Llama31Config):
     scale_factor: float = 32.0
     share_embeddings_and_output_weights: bool = True
     rotary_base: int = 500_000
+    seq_length: int = 131072
     num_layers: int = 16
     hidden_size: int = 2048
     ffn_hidden_size: int = 8192
@@ -311,6 +313,7 @@ class Llama32Config3B(Llama31Config):
     scale_factor: int = 32
     share_embeddings_and_output_weights: bool = True
     rotary_base: int = 500_000
+    seq_length: int = 131072
     num_layers: int = 28
     hidden_size: int = 3072
     ffn_hidden_size: int = 8192
@@ -765,7 +768,7 @@ def make_vocab_size_divisible_by(vocab_size):
             params_dtype=dtype_from_hf(source),
             generation_config=generation_config,
             vocab_size=source.vocab_size,
-            kv_channels=getattr(source, "head_dim"),
+            kv_channels=getattr(source, "head_dim", None),
             **args,
         )
 
@@ -900,6 +903,13 @@ def convert_state(self, source, target, source_config=None):
                     "decoder.layers.*.mlp.experts.linear_fc1.weight": "model.layers.*.feed_forward.experts.gate_up_proj",
                 }
             )
+
+            # Remove the transform with source_key "decoder.layers.*.mlp.linear_fc1.weight" from transforms
+            # Llama4's HF model has a different mapping for the MLP weights (map to feed_forward instead of mlp)
+            transforms = [
+                t for t in transforms if getattr(t, "source_key", None) != "decoder.layers.*.mlp.linear_fc1.weight"
+            ]
+
             transforms.extend(
                 [
                     io.state_transform(
@@ -1160,7 +1170,7 @@ def apply(self, output_path: Path) -> Path:
         """
         from nemo.collections.llm.peft import CanonicalLoRA, DoRA, LoRA
 
-        self.peft_obj: Union[LoRA, DoRA, CanonicalLoRA] = io.load_context(str(self)).model.model_transform
+        self.peft_obj: Union[LoRA, DoRA, CanonicalLoRA] = io.load_context(str(self), subpath="model.model_transform")
 
         source, _ = self.nemo_load(str(self))
         target = self.init(torch_dtype_from_mcore_config(source.config))
diff --git a/nemo/collections/vlm/llama4/model/llama4_omni.py b/nemo/collections/vlm/llama4/model/llama4_omni.py
index 6482b1833302..5e1edd0a091a 100644
--- a/nemo/collections/vlm/llama4/model/llama4_omni.py
+++ b/nemo/collections/vlm/llama4/model/llama4_omni.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/collections/vlm/mllama/model/mllama.py b/nemo/collections/vlm/mllama/model/mllama.py
index 794fae9a5af9..1dfecd0e5a07 100644
--- a/nemo/collections/vlm/mllama/model/mllama.py
+++ b/nemo/collections/vlm/mllama/model/mllama.py
@@ -15,23 +15,29 @@
 import re
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Dict, Optional
+from typing import Dict, Optional, Tuple
 
 import torch
 import torch.distributed
 from megatron.core.transformer import TransformerConfig
 from torch import Tensor
+from transformers import MllamaConfig as HFMllamaConfig
+from transformers import MllamaForConditionalGeneration
+from transformers.models.mllama.configuration_mllama import MllamaTextConfig, MllamaVisionConfig
 
 from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
+from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
 from nemo.collections.vlm.mllama.model.base import (
     CrossAttentionTextConfig,
     CrossAttentionVisionConfig,
     MLlamaModel,
     MLlamaModelConfig,
 )
+from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
 from nemo.lightning.io.state import _ModelState
 from nemo.lightning.pytorch.utils import dtype_from_hf
+from nemo.utils import logging
 
 # pylint: disable=C0115,C0116,C0301
 
@@ -84,9 +90,7 @@ def local_path(self, base_path: Optional[Path] = None) -> Path:
         return output_path
 
     def apply(self, output_path: Path) -> Path:
-        from transformers import MllamaForConditionalGeneration
-
-        source = MllamaForConditionalGeneration.from_pretrained(str(self), torch_dtype='auto')
+        source = MllamaForConditionalGeneration.from_pretrained(str(self), torch_dtype="auto")
 
         state_dict = _rename_xattn_layer_nums_hf(source.state_dict())
         source = _ModelState(state_dict)
@@ -107,69 +111,69 @@ def convert_state(self, source, target):
         transforms = []
         mapping.update(
             {
-                "language_model.model.layers.*.self_attn.o_proj.weight": "language_model.decoder.layers.*.self_attention.linear_proj.weight",
-                "language_model.model.xattn_layers.*.cross_attn.o_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_proj.weight",
-                "language_model.model.xattn_layers.*.cross_attn.q_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.weight",
-                "language_model.model.norm.weight": "language_model.decoder.final_layernorm.weight",
-                "language_model.lm_head.weight": "language_model.output_layer.weight",
-                "language_model.model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
-                "language_model.model.layers.*.mlp.down_proj.weight": "language_model.decoder.layers.*.mlp.linear_fc2.weight",
-                "language_model.model.layers.*.input_layernorm.weight": "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
-                "language_model.model.xattn_layers.*.cross_attn.k_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.k_layernorm.weight",
-                "language_model.model.xattn_layers.*.input_layernorm.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.layer_norm_weight",
-                "language_model.model.xattn_layers.*.cross_attn.q_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.q_layernorm.weight",
-                "language_model.model.xattn_layers.*.post_attention_layernorm.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc1.layer_norm_weight",
-                "language_model.model.xattn_layers.*.mlp.down_proj.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc2.weight",
+                "model.language_model.layers.*.self_attn.o_proj.weight": "language_model.decoder.layers.*.self_attention.linear_proj.weight",
+                "model.language_model.xattn_layers.*.cross_attn.o_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_proj.weight",
+                "model.language_model.xattn_layers.*.cross_attn.q_proj.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.weight",
+                "model.language_model.norm.weight": "language_model.decoder.final_layernorm.weight",
+                "lm_head.weight": "language_model.output_layer.weight",
+                "model.language_model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
+                "model.language_model.layers.*.mlp.down_proj.weight": "language_model.decoder.layers.*.mlp.linear_fc2.weight",
+                "model.language_model.layers.*.input_layernorm.weight": "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
+                "model.language_model.xattn_layers.*.cross_attn.k_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.k_layernorm.weight",
+                "model.language_model.xattn_layers.*.input_layernorm.weight": "language_model.decoder.xattn_layers.*.cross_attention.linear_q.layer_norm_weight",
+                "model.language_model.xattn_layers.*.cross_attn.q_norm.weight": "language_model.decoder.xattn_layers.*.cross_attention.q_layernorm.weight",
+                "model.language_model.xattn_layers.*.post_attention_layernorm.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc1.layer_norm_weight",
+                "model.language_model.xattn_layers.*.mlp.down_proj.weight": "language_model.decoder.xattn_layers.*.mlp.linear_fc2.weight",
             }
         )
 
         transforms.extend(
             [
                 io.state_transform(
-                    source_key="language_model.model.xattn_layers.*.cross_attn_attn_gate",
+                    source_key="model.language_model.xattn_layers.*.cross_attn_attn_gate",
                     target_key="language_model.decoder.xattn_layers.*.gate_attn",
                     fn=_import_gate,
                 ),
                 io.state_transform(
-                    source_key="language_model.model.xattn_layers.*.cross_attn_mlp_gate",
+                    source_key="model.language_model.xattn_layers.*.cross_attn_mlp_gate",
                     target_key="language_model.decoder.xattn_layers.*.gate_ffn",
                     fn=_import_gate,
                 ),
                 io.state_transform(
                     source_key=(
-                        "language_model.model.layers.*.self_attn.q_proj.weight",
-                        "language_model.model.layers.*.self_attn.k_proj.weight",
-                        "language_model.model.layers.*.self_attn.v_proj.weight",
+                        "model.language_model.layers.*.self_attn.q_proj.weight",
+                        "model.language_model.layers.*.self_attn.k_proj.weight",
+                        "model.language_model.layers.*.self_attn.v_proj.weight",
                     ),
                     target_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight",
                     fn=_import_text_qkv,
                 ),
                 io.state_transform(
                     source_key=(
-                        "language_model.model.layers.*.mlp.gate_proj.weight",
-                        "language_model.model.layers.*.mlp.up_proj.weight",
+                        "model.language_model.layers.*.mlp.gate_proj.weight",
+                        "model.language_model.layers.*.mlp.up_proj.weight",
                     ),
                     target_key="language_model.decoder.layers.*.mlp.linear_fc1.weight",
                     fn=_import_simple_concat,
                 ),
                 io.state_transform(
                     source_key=(
-                        "language_model.model.xattn_layers.*.cross_attn.k_proj.weight",
-                        "language_model.model.xattn_layers.*.cross_attn.v_proj.weight",
+                        "model.language_model.xattn_layers.*.cross_attn.k_proj.weight",
+                        "model.language_model.xattn_layers.*.cross_attn.v_proj.weight",
                     ),
                     target_key="language_model.decoder.xattn_layers.*.cross_attention.linear_kv.weight",
                     fn=_import_text_kv,
                 ),
                 io.state_transform(
                     source_key=(
-                        "language_model.model.xattn_layers.*.mlp.gate_proj.weight",
-                        "language_model.model.xattn_layers.*.mlp.up_proj.weight",
+                        "model.language_model.xattn_layers.*.mlp.gate_proj.weight",
+                        "model.language_model.xattn_layers.*.mlp.up_proj.weight",
                     ),
                     target_key="language_model.decoder.xattn_layers.*.mlp.linear_fc1.weight",
                     fn=_import_simple_concat,
                 ),
                 io.state_transform(
-                    source_key="language_model.model.embed_tokens.weight",
+                    source_key="model.language_model.embed_tokens.weight",
                     target_key=(
                         "language_model.embedding.word_embeddings.weight",
                         "language_model.learnable_embedding.weight",
@@ -182,64 +186,64 @@ def convert_state(self, source, target):
         v = "vision_model.vision_encoder"
         mapping.update(
             {
-                "vision_model.global_transformer.layers.*.self_attn.o_proj.weight": f"{v}.global_transformer.layers.*.self_attention.linear_proj.weight",
-                "vision_model.global_transformer.layers.*.gate_attn": f"{v}.global_transformer.layers.*.gate_attn",
-                "vision_model.global_transformer.layers.*.gate_ffn": f"{v}.global_transformer.layers.*.gate_ffn",
-                "vision_model.global_transformer.layers.*.input_layernorm.bias": f"{v}.global_transformer.layers.*.input_layernorm.bias",
-                "vision_model.global_transformer.layers.*.input_layernorm.weight": f"{v}.global_transformer.layers.*.input_layernorm.weight",
-                "vision_model.global_transformer.layers.*.post_attention_layernorm.bias": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.bias",
-                "vision_model.global_transformer.layers.*.post_attention_layernorm.weight": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.weight",
-                "vision_model.global_transformer.layers.*.mlp.fc1.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc1.bias",
-                "vision_model.global_transformer.layers.*.mlp.fc1.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc1.weight",
-                "vision_model.global_transformer.layers.*.mlp.fc2.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc2.bias",
-                "vision_model.global_transformer.layers.*.mlp.fc2.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc2.weight",
-                "vision_model.transformer.layers.*.self_attn.o_proj.weight": f"{v}.transformer.layers.*.self_attention.linear_proj.weight",
-                "vision_model.transformer.layers.*.input_layernorm.bias": f"{v}.transformer.layers.*.input_layernorm.bias",
-                "vision_model.transformer.layers.*.input_layernorm.weight": f"{v}.transformer.layers.*.input_layernorm.weight",
-                "vision_model.transformer.layers.*.post_attention_layernorm.bias": f"{v}.transformer.layers.*.pre_mlp_layernorm.bias",
-                "vision_model.transformer.layers.*.post_attention_layernorm.weight": f"{v}.transformer.layers.*.pre_mlp_layernorm.weight",
-                "vision_model.transformer.layers.*.mlp.fc1.bias": f"{v}.transformer.layers.*.mlp.linear_fc1.bias",
-                "vision_model.transformer.layers.*.mlp.fc1.weight": f"{v}.transformer.layers.*.mlp.linear_fc1.weight",
-                "vision_model.transformer.layers.*.mlp.fc2.bias": f"{v}.transformer.layers.*.mlp.linear_fc2.bias",
-                "vision_model.transformer.layers.*.mlp.fc2.weight": f"{v}.transformer.layers.*.mlp.linear_fc2.weight",
-                "vision_model.class_embedding": f"{v}.class_embedding",
-                "vision_model.gated_positional_embedding.embedding": f"{v}.positional_embedding",
-                "vision_model.gated_positional_embedding.tile_embedding.weight": f"{v}.gated_tile_positional_embedding.weight",
-                "vision_model.gated_positional_embedding.gate": f"{v}.gated_positional_embedding_gate",
-                "vision_model.layernorm_post.bias": f"{v}.ln_post.bias",
-                "vision_model.layernorm_post.weight": f"{v}.ln_post.weight",
-                "vision_model.layernorm_pre.bias": f"{v}.ln_pre.bias",
-                "vision_model.layernorm_pre.weight": f"{v}.ln_pre.weight",
-                "vision_model.post_tile_positional_embedding.embedding.weight": f"{v}.post_tile_pos_embed.embedding.weight",
-                "vision_model.post_tile_positional_embedding.gate": f"{v}.post_tile_pos_embed.gate",
-                "vision_model.pre_tile_positional_embedding.embedding.weight": f"{v}.pre_tile_pos_embed.embedding.weight",
-                "vision_model.pre_tile_positional_embedding.gate": f"{v}.pre_tile_pos_embed.gate",
-                "multi_modal_projector.bias": "vision_model.vision_projection.encoder.bias",
-                "multi_modal_projector.weight": "vision_model.vision_projection.encoder.weight",
+                "model.vision_model.global_transformer.layers.*.self_attn.o_proj.weight": f"{v}.global_transformer.layers.*.self_attention.linear_proj.weight",
+                "model.vision_model.global_transformer.layers.*.gate_attn": f"{v}.global_transformer.layers.*.gate_attn",
+                "model.vision_model.global_transformer.layers.*.gate_ffn": f"{v}.global_transformer.layers.*.gate_ffn",
+                "model.vision_model.global_transformer.layers.*.input_layernorm.bias": f"{v}.global_transformer.layers.*.input_layernorm.bias",
+                "model.vision_model.global_transformer.layers.*.input_layernorm.weight": f"{v}.global_transformer.layers.*.input_layernorm.weight",
+                "model.vision_model.global_transformer.layers.*.post_attention_layernorm.bias": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.bias",
+                "model.vision_model.global_transformer.layers.*.post_attention_layernorm.weight": f"{v}.global_transformer.layers.*.pre_mlp_layernorm.weight",
+                "model.vision_model.global_transformer.layers.*.mlp.fc1.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc1.bias",
+                "model.vision_model.global_transformer.layers.*.mlp.fc1.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc1.weight",
+                "model.vision_model.global_transformer.layers.*.mlp.fc2.bias": f"{v}.global_transformer.layers.*.mlp.linear_fc2.bias",
+                "model.vision_model.global_transformer.layers.*.mlp.fc2.weight": f"{v}.global_transformer.layers.*.mlp.linear_fc2.weight",
+                "model.vision_model.transformer.layers.*.self_attn.o_proj.weight": f"{v}.transformer.layers.*.self_attention.linear_proj.weight",
+                "model.vision_model.transformer.layers.*.input_layernorm.bias": f"{v}.transformer.layers.*.input_layernorm.bias",
+                "model.vision_model.transformer.layers.*.input_layernorm.weight": f"{v}.transformer.layers.*.input_layernorm.weight",
+                "model.vision_model.transformer.layers.*.post_attention_layernorm.bias": f"{v}.transformer.layers.*.pre_mlp_layernorm.bias",
+                "model.vision_model.transformer.layers.*.post_attention_layernorm.weight": f"{v}.transformer.layers.*.pre_mlp_layernorm.weight",
+                "model.vision_model.transformer.layers.*.mlp.fc1.bias": f"{v}.transformer.layers.*.mlp.linear_fc1.bias",
+                "model.vision_model.transformer.layers.*.mlp.fc1.weight": f"{v}.transformer.layers.*.mlp.linear_fc1.weight",
+                "model.vision_model.transformer.layers.*.mlp.fc2.bias": f"{v}.transformer.layers.*.mlp.linear_fc2.bias",
+                "model.vision_model.transformer.layers.*.mlp.fc2.weight": f"{v}.transformer.layers.*.mlp.linear_fc2.weight",
+                "model.vision_model.class_embedding": f"{v}.class_embedding",
+                "model.vision_model.gated_positional_embedding.embedding": f"{v}.positional_embedding",
+                "model.vision_model.gated_positional_embedding.tile_embedding.weight": f"{v}.gated_tile_positional_embedding.weight",
+                "model.vision_model.gated_positional_embedding.gate": f"{v}.gated_positional_embedding_gate",
+                "model.vision_model.layernorm_post.bias": f"{v}.ln_post.bias",
+                "model.vision_model.layernorm_post.weight": f"{v}.ln_post.weight",
+                "model.vision_model.layernorm_pre.bias": f"{v}.ln_pre.bias",
+                "model.vision_model.layernorm_pre.weight": f"{v}.ln_pre.weight",
+                "model.vision_model.post_tile_positional_embedding.embedding.weight": f"{v}.post_tile_pos_embed.embedding.weight",
+                "model.vision_model.post_tile_positional_embedding.gate": f"{v}.post_tile_pos_embed.gate",
+                "model.vision_model.pre_tile_positional_embedding.embedding.weight": f"{v}.pre_tile_pos_embed.embedding.weight",
+                "model.vision_model.pre_tile_positional_embedding.gate": f"{v}.pre_tile_pos_embed.gate",
+                "model.multi_modal_projector.bias": "vision_model.vision_projection.encoder.bias",
+                "model.multi_modal_projector.weight": "vision_model.vision_projection.encoder.weight",
             }
         )
         transforms.extend(
             [
                 io.state_transform(
                     source_key=(
-                        "vision_model.global_transformer.layers.*.self_attn.q_proj.weight",
-                        "vision_model.global_transformer.layers.*.self_attn.k_proj.weight",
-                        "vision_model.global_transformer.layers.*.self_attn.v_proj.weight",
+                        "model.vision_model.global_transformer.layers.*.self_attn.q_proj.weight",
+                        "model.vision_model.global_transformer.layers.*.self_attn.k_proj.weight",
+                        "model.vision_model.global_transformer.layers.*.self_attn.v_proj.weight",
                     ),
                     target_key=(f"{v}.global_transformer.layers.*.self_attention.linear_qkv.weight"),
                     fn=_import_vision_qkv,
                 ),
                 io.state_transform(
                     source_key=(
-                        "vision_model.transformer.layers.*.self_attn.q_proj.weight",
-                        "vision_model.transformer.layers.*.self_attn.k_proj.weight",
-                        "vision_model.transformer.layers.*.self_attn.v_proj.weight",
+                        "model.vision_model.transformer.layers.*.self_attn.q_proj.weight",
+                        "model.vision_model.transformer.layers.*.self_attn.k_proj.weight",
+                        "model.vision_model.transformer.layers.*.self_attn.v_proj.weight",
                     ),
                     target_key=(f"{v}.transformer.layers.*.self_attention.linear_qkv.weight"),
                     fn=_import_vision_qkv,
                 ),
                 io.state_transform(
-                    source_key="vision_model.patch_embedding.weight",
+                    source_key="model.vision_model.patch_embedding.weight",
                     target_key=f"{v}.conv1._linear.weight",
                     fn=_import_patch_embedding_hf,
                 ),
@@ -271,7 +275,8 @@ def _calculate_num_layers(num_hidden_layers, cross_attention_layers):
             rotary_base=source.text_config.rope_theta,
             seq_length=8192,
             num_layers=_calculate_num_layers(
-                source.text_config.num_hidden_layers, source.text_config.cross_attention_layers
+                source.text_config.num_hidden_layers,
+                source.text_config.cross_attention_layers,
             ),
             num_cross_attention_layers=len(source.text_config.cross_attention_layers),
             hidden_size=source.text_config.hidden_size,
@@ -298,16 +303,390 @@ def _vision_model_config(self, source) -> Optional[CrossAttentionVisionConfig]:
         )
 
 
+@io.model_exporter(MLlamaModel, "hf")
+class HFMLlamaExporter(io.ModelConnector[MLlamaModel, "MllamaForConditionalGeneration"]):
+    """
+    Exporter class for converting NeMo MLlama model to HuggingFace format.
+
+    Inherits:
+        io.ModelConnector: Connector interface to handle setup, save, and load using the Lightning framework.
+
+    Methods:
+        init: Initializes a new HuggingFace MLlama model instance.
+        apply: Converts the NeMo model to HuggingFace format and saves it.
+        convert_state: Maps and transforms the state dictionary from NeMo to HuggingFace format.
+        config: Generates and returns the HuggingFace MLlama config for the model.
+    """
+
+    def init(self, dtype=torch.bfloat16) -> "MllamaForConditionalGeneration":
+        """
+        Build the HuggingFace MLlama model that will receive the converted weights.
+
+        Args:
+            dtype: Torch dtype passed to the HF constructor (default: torch.bfloat16).
+        Returns:
+            MllamaForConditionalGeneration: Model created from ``self.config`` without weight initialization.
+        """
+        from transformers.modeling_utils import no_init_weights
+
+        with no_init_weights():
+            hf_model = MllamaForConditionalGeneration._from_config(self.config, torch_dtype=dtype)
+        return hf_model
+
+    def apply(self, output_path: Path) -> Path:
+        """
+        Converts the NeMo MLlama model to HuggingFace format and saves it to the specified path.
+
+        Args:
+            output_path (Path): The path where the converted HuggingFace model will be saved.
+
+        Returns:
+            Path: The output path where the HuggingFace model was saved.
+        """
+        logging.info("Loading MLlama NeMo checkpoint. This may take a while...")
+        source, source_config = self.ckpt_load(self)
+        logging.info("MLlama NeMo checkpoint loaded.")
+        logging.info("Initializing the HF model..")
+        target = self.init()
+        logging.info("Start Converting the model..")
+        target = self.convert_state(source, target, source_config)
+        target = target.cpu()
+        target.save_pretrained(output_path)
+        # Saving the tokenizer is best effort: the weights are already written, so only report the cause.
+        try:
+            self.tokenizer.tokenizer.save_pretrained(output_path)
+        except Exception as err:
+            logging.warning(f"Failed to save tokenizer: {err}")
+
+        print(f"Converted MLlama model saved to {output_path}")
+
+        return output_path
+
+    def convert_state(self, source, target, source_config):
+        # pylint: disable=C0115,C0116,line-too-long
+        """
+        Maps and transforms the state dictionary from NeMo to HuggingFace format.
+
+        Args:
+            source: NeMo state dict (key -> tensor); it is re-indexed before the mapping is applied.
+            target: The target HuggingFace model.
+            source_config: NeMo model config, forwarded to ``_modify_mllama_source_state``.
+        Returns:
+            The target HuggingFace model with the converted state.
+        """
+        source = self._modify_mllama_source_state(source, source_config)  # re-index layers to the HF layout
+        mapping = {}
+        transforms = []
+        # Direct one-to-one renames (NeMo key -> HF key); weights that need reshaping go through transforms.
+        mapping.update(
+            {
+                "language_model.decoder.layers.*.self_attention.linear_proj.weight": "model.language_model.layers.*.self_attn.o_proj.weight",
+                "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.language_model.layers.*.input_layernorm.weight",
+                "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.language_model.layers.*.post_attention_layernorm.weight",
+                "language_model.decoder.layers.*.mlp.linear_fc2.weight": "model.language_model.layers.*.mlp.down_proj.weight",
+                "language_model.decoder.xattn_layers.*.cross_attention.q_layernorm.weight": "model.language_model.layers.*.cross_attn.q_norm.weight",
+                "language_model.decoder.xattn_layers.*.cross_attention.linear_q.weight": "model.language_model.layers.*.cross_attn.q_proj.weight",
+                "language_model.decoder.xattn_layers.*.cross_attention.k_layernorm.weight": "model.language_model.layers.*.cross_attn.k_norm.weight",
+                "language_model.decoder.xattn_layers.*.cross_attention.linear_proj.weight": "model.language_model.layers.*.cross_attn.o_proj.weight",
+                "language_model.decoder.final_layernorm.weight": "model.language_model.norm.weight",
+                "language_model.output_layer.weight": "lm_head.weight",
+            }
+        )
+        transforms.extend(
+            [
+                io.state_transform(
+                    source_key="language_model.decoder.xattn_layers.*.gate_attn",
+                    target_key="model.language_model.layers.*.cross_attn_attn_gate",
+                    fn=_export_gate,
+                ),
+                io.state_transform(
+                    source_key="language_model.decoder.xattn_layers.*.gate_ffn",
+                    target_key="model.language_model.layers.*.cross_attn_mlp_gate",
+                    fn=_export_gate,
+                ),
+                io.state_transform(
+                    source_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight",
+                    target_key=(
+                        "model.language_model.layers.*.self_attn.q_proj.weight",
+                        "model.language_model.layers.*.self_attn.k_proj.weight",
+                        "model.language_model.layers.*.self_attn.v_proj.weight",
+                    ),
+                    fn=_export_text_qkv,
+                ),
+                io.state_transform(
+                    source_key="language_model.decoder.layers.*.mlp.linear_fc1.weight",
+                    target_key=(
+                        "model.language_model.layers.*.mlp.gate_proj.weight",
+                        "model.language_model.layers.*.mlp.up_proj.weight",
+                    ),
+                    fn=_export_simple_split,
+                ),
+                io.state_transform(
+                    source_key="language_model.decoder.xattn_layers.*.cross_attention.linear_kv.weight",
+                    target_key=(
+                        "model.language_model.layers.*.cross_attn.k_proj.weight",
+                        "model.language_model.layers.*.cross_attn.v_proj.weight",
+                    ),
+                    fn=_export_text_kv,
+                ),
+                io.state_transform(
+                    source_key=(
+                        "language_model.embedding.word_embeddings.weight",
+                        "language_model.learnable_embedding.weight",
+                    ),
+                    target_key="model.language_model.embed_tokens.weight",
+                    fn=_export_embedding_hf,
+                ),
+            ]
+        )
+        v = "vision_model.vision_encoder"  # NeMo-side key prefix shared by the vision encoder entries below
+        mapping.update(
+            {
+                f"{v}.global_transformer.layers.*.self_attention.linear_proj.weight": "model.vision_model.global_transformer.layers.*.self_attn.o_proj.weight",
+                f"{v}.global_transformer.layers.*.gate_attn": "model.vision_model.global_transformer.layers.*.gate_attn",
+                f"{v}.global_transformer.layers.*.gate_ffn": "model.vision_model.global_transformer.layers.*.gate_ffn",
+                f"{v}.global_transformer.layers.*.input_layernorm.bias": "model.vision_model.global_transformer.layers.*.input_layernorm.bias",
+                f"{v}.global_transformer.layers.*.input_layernorm.weight": "model.vision_model.global_transformer.layers.*.input_layernorm.weight",
+                f"{v}.global_transformer.layers.*.pre_mlp_layernorm.bias": "model.vision_model.global_transformer.layers.*.post_attention_layernorm.bias",
+                f"{v}.global_transformer.layers.*.pre_mlp_layernorm.weight": "model.vision_model.global_transformer.layers.*.post_attention_layernorm.weight",
+                f"{v}.global_transformer.layers.*.mlp.linear_fc1.bias": "model.vision_model.global_transformer.layers.*.mlp.fc1.bias",
+                f"{v}.global_transformer.layers.*.mlp.linear_fc1.weight": "model.vision_model.global_transformer.layers.*.mlp.fc1.weight",
+                f"{v}.global_transformer.layers.*.mlp.linear_fc2.bias": "model.vision_model.global_transformer.layers.*.mlp.fc2.bias",
+                f"{v}.global_transformer.layers.*.mlp.linear_fc2.weight": "model.vision_model.global_transformer.layers.*.mlp.fc2.weight",
+                f"{v}.transformer.layers.*.self_attention.linear_proj.weight": "model.vision_model.transformer.layers.*.self_attn.o_proj.weight",
+                f"{v}.transformer.layers.*.input_layernorm.bias": "model.vision_model.transformer.layers.*.input_layernorm.bias",
+                f"{v}.transformer.layers.*.input_layernorm.weight": "model.vision_model.transformer.layers.*.input_layernorm.weight",
+                f"{v}.transformer.layers.*.pre_mlp_layernorm.bias": "model.vision_model.transformer.layers.*.post_attention_layernorm.bias",
+                f"{v}.transformer.layers.*.pre_mlp_layernorm.weight": "model.vision_model.transformer.layers.*.post_attention_layernorm.weight",
+                f"{v}.transformer.layers.*.mlp.linear_fc1.bias": "model.vision_model.transformer.layers.*.mlp.fc1.bias",
+                f"{v}.transformer.layers.*.mlp.linear_fc1.weight": "model.vision_model.transformer.layers.*.mlp.fc1.weight",
+                f"{v}.transformer.layers.*.mlp.linear_fc2.bias": "model.vision_model.transformer.layers.*.mlp.fc2.bias",
+                f"{v}.transformer.layers.*.mlp.linear_fc2.weight": "model.vision_model.transformer.layers.*.mlp.fc2.weight",
+                f"{v}.class_embedding": "model.vision_model.class_embedding",
+                f"{v}.positional_embedding": "model.vision_model.gated_positional_embedding.embedding",
+                f"{v}.gated_tile_positional_embedding.weight": "model.vision_model.gated_positional_embedding.tile_embedding.weight",
+                f"{v}.gated_positional_embedding_gate": "model.vision_model.gated_positional_embedding.gate",
+                f"{v}.ln_post.bias": "model.vision_model.layernorm_post.bias",
+                f"{v}.ln_post.weight": "model.vision_model.layernorm_post.weight",
+                f"{v}.ln_pre.bias": "model.vision_model.layernorm_pre.bias",
+                f"{v}.ln_pre.weight": "model.vision_model.layernorm_pre.weight",
+                f"{v}.post_tile_pos_embed.embedding.weight": "model.vision_model.post_tile_positional_embedding.embedding.weight",
+                f"{v}.post_tile_pos_embed.gate": "model.vision_model.post_tile_positional_embedding.gate",
+                f"{v}.pre_tile_pos_embed.embedding.weight": "model.vision_model.pre_tile_positional_embedding.embedding.weight",
+                f"{v}.pre_tile_pos_embed.gate": "model.vision_model.pre_tile_positional_embedding.gate",
+                "vision_model.vision_projection.encoder.bias": "model.multi_modal_projector.bias",
+                "vision_model.vision_projection.encoder.weight": "model.multi_modal_projector.weight",
+            }
+        )
+        transforms.extend(
+            [
+                io.state_transform(
+                    source_key=(f"{v}.global_transformer.layers.*.self_attention.linear_qkv.weight"),
+                    target_key=(
+                        "model.vision_model.global_transformer.layers.*.self_attn.q_proj.weight",
+                        "model.vision_model.global_transformer.layers.*.self_attn.k_proj.weight",
+                        "model.vision_model.global_transformer.layers.*.self_attn.v_proj.weight",
+                    ),
+                    fn=_export_vision_qkv,
+                ),
+                io.state_transform(
+                    source_key=(f"{v}.transformer.layers.*.self_attention.linear_qkv.weight"),
+                    target_key=(
+                        "model.vision_model.transformer.layers.*.self_attn.q_proj.weight",
+                        "model.vision_model.transformer.layers.*.self_attn.k_proj.weight",
+                        "model.vision_model.transformer.layers.*.self_attn.v_proj.weight",
+                    ),
+                    fn=_export_vision_qkv,
+                ),
+                io.state_transform(
+                    source_key=f"{v}.conv1._linear.weight",
+                    target_key="model.vision_model.patch_embedding.weight",
+                    fn=_export_patch_embedding_hf,
+                ),
+            ]
+        )
+        return io.apply_transforms(source, target, mapping=mapping, transforms=transforms)
+
+    @property
+    def tokenizer(self) -> "TokenizerSpec":
+        """
+        Tokenizer restored from the ``model`` sub-path of this checkpoint's saved context.
+
+        Returns: the tokenizer specification.
+        """
+        model_context = io.load_context(str(self), subpath="model")
+        return model_context.tokenizer
+
+    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
+        """
+        Read the distributed checkpoint weights into a flat state dict without building the NeMo model.
+
+        Stacked per-layer tensors are additionally exposed under ``layers.<i>`` keys so that the names
+        look like those of an instantiated model; the original stacked entry is kept as well.
+
+        Args:
+            path (Path): Checkpoint directory; the weights are read from its ``weights`` sub-folder.
+
+        Returns:
+            Tuple[Dict, Dict]: The state dict and the model config loaded from the connector's context.
+        """
+        config = io.load_context(str(self), subpath="model.config")
+        text_layers = config.language_model_config.num_layers
+        vision_layers = config.vision_model_config.num_layers
+        weights = load_distributed_model_weights(path / "weights", True)
+
+        state_dict = {}
+        for raw_key, tensor in weights.items():
+            if "_extra_state" in raw_key:
+                continue
+            key = raw_key.replace("module.", "")
+            # A tensor is treated as layer-stacked when it lives under global_transformer.layers, or under
+            # any "layers" key whose leading dimension equals one of the configured layer counts.
+            is_stacked = "global_transformer.layers" in key or (
+                "layers" in key and tensor.size(0) in (text_layers, vision_layers)
+            )
+            if is_stacked:
+                for idx in range(tensor.size(0)):
+                    state_dict[key.replace("layers", f"layers.{idx}")] = tensor[idx]
+            state_dict[key] = tensor
+        return state_dict, config
+
+    def _modify_mllama_source_state(self, state_dict, source_config):
+        """
+        Re-index the NeMo decoder weights so they follow the HuggingFace layer layout.
+
+        - Layer-norm and MLP weights of self-attention (``layers``) and cross-attention (``xattn_layers``)
+          blocks are merged into one ``layers`` sequence, e.g. 11B: 32 + 8 -> 40, 90B: 80 + 20 -> 100 layers.
+        - Remaining ``xattn_layers`` keys get their HF index, e.g. 11B: [3, 7, ..., 31] -> [3, 8, ..., 38].
+
+        Args:
+            state_dict: Source state dict; entries are popped and re-inserted, so it is modified in place.
+            source_config: Model config; only ``language_model_config`` is read.
+        Returns:
+            _ModelState: Built from the re-indexed state dict.
+        """
+
+        def convert_layer_num(match):
+            layer_num = int(match.group(1))
+            x_num = (layer_num - 3) // (cross_attention_frequency)  # cross-attention layers before this one
+            if (layer_num - 3) % (cross_attention_frequency) == 0:
+                new_layer_num = x_num + layer_num  # HF index = NeMo index + preceding cross-attention layers
+                return f".{new_layer_num}."
+            raise ValueError(
+                f"Unexpected layer_num: {layer_num} (does not align with cross_attention_frequency={cross_attention_frequency})"
+            )
+
+        text_config = source_config.language_model_config
+        cross_attention_frequency = text_config.num_layers // text_config.num_cross_attention_layers
+        total_num_layer = text_config.num_layers + text_config.num_cross_attention_layers
+        prefix = "language_model.decoder"
+
+        new_state_dict = {}
+        # Build the merged ``layers.<i>`` entries: position i takes a cross-attention or a self-attention block
+        for i in range(total_num_layer):
+            cross_num = (i - 3) // (cross_attention_frequency + 1)
+            if (i - 3) % (cross_attention_frequency + 1) == 0:  # position i is a cross-attention layer
+                xattn_index = cross_num * cross_attention_frequency + 3  # its index in NeMo's xattn_layers
+                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.layer_norm_weight"] = state_dict.pop(
+                    f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.layer_norm_weight"
+                )
+                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc2.weight"] = state_dict.pop(
+                    f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc2.weight"
+                )
+                new_state_dict[f"{prefix}.layers.{i}.self_attention.linear_qkv.layer_norm_weight"] = state_dict.pop(
+                    f"{prefix}.xattn_layers.{xattn_index}.cross_attention.linear_q.layer_norm_weight"
+                )
+                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.weight"] = state_dict.pop(
+                    f"{prefix}.xattn_layers.{xattn_index}.mlp.linear_fc1.weight"
+                )
+            else:
+                attn_index = i - cross_num - 1  # skip the cross_num + 1 cross-attention layers before position i
+                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.layer_norm_weight"] = state_dict.pop(
+                    f"{prefix}.layers.{attn_index}.mlp.linear_fc1.layer_norm_weight"
+                )
+                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc2.weight"] = state_dict.pop(
+                    f"{prefix}.layers.{attn_index}.mlp.linear_fc2.weight"
+                )
+                new_state_dict[f"{prefix}.layers.{i}.self_attention.linear_qkv.layer_norm_weight"] = state_dict.pop(
+                    f"{prefix}.layers.{attn_index}.self_attention.linear_qkv.layer_norm_weight"
+                )
+                new_state_dict[f"{prefix}.layers.{i}.mlp.linear_fc1.weight"] = state_dict.pop(
+                    f"{prefix}.layers.{attn_index}.mlp.linear_fc1.weight"
+                )
+
+        for k, v in new_state_dict.items():  # write the merged entries back into the source dict
+            state_dict[k] = v
+
+        new_state_dict = {}
+        # Rewrite every ".<digits>." segment of the remaining xattn_layers keys to the HF layer index
+        for k, v in state_dict.items():
+            if "xattn_layers" in k:
+                new_state_dict[re.sub(r"\.(\d+)\.", convert_layer_num, k)] = v
+            else:
+                new_state_dict[k] = v
+
+        source = _ModelState(new_state_dict)
+        return source
+
+    @property
+    def config(self) -> "HFMllamaConfig":
+        """
+        Build the HuggingFace MLlama configuration that mirrors the NeMo checkpoint's model config.
+
+        Returns:
+            HFMllamaConfig: Combined vision and text configuration for the HuggingFace model.
+        """
+        nemo_config = io.load_context(str(self), subpath="model.config")
+        nemo_vision = nemo_config.vision_model_config
+        nemo_text = nemo_config.language_model_config
+
+        hf_vision = MllamaVisionConfig(
+            num_hidden_layers=nemo_vision.num_layers,
+            hidden_size=nemo_vision.hidden_size,
+            attention_heads=nemo_vision.num_attention_heads,
+            image_size=nemo_vision.vision_chunk_size,
+            max_num_tiles=nemo_vision.vision_max_num_chunks,
+            torch_dtype="bfloat16",
+        )
+
+        # The k-th entry of the NeMo fusion schedule is shifted by k to obtain its index in the HF layer list.
+        fusion_schedule = nemo_text._init_fusion_schedule(nemo_text.num_cross_attention_layers)
+        hf_xattn_layers = [nemo_idx + shift for shift, nemo_idx in enumerate(fusion_schedule)]
+        # The rope-scaling values and EOS ids below are fixed constants, not read from the NeMo config.
+        llama3_rope_scaling = {
+            "factor": 8.0,
+            "high_freq_factor": 4.0,
+            "low_freq_factor": 1.0,
+            "original_max_position_embeddings": 8192,
+            "rope_type": "llama3",
+        }
+        hf_text = MllamaTextConfig(
+            rope_theta=nemo_text.rotary_base,
+            num_hidden_layers=nemo_text.num_layers + nemo_text.num_cross_attention_layers,
+            tie_word_embeddings=nemo_text.share_embeddings_and_output_weights,
+            cross_attention_layers=hf_xattn_layers,
+            hidden_size=nemo_text.hidden_size,
+            intermediate_size=nemo_text.ffn_hidden_size,
+            num_attention_heads=nemo_text.num_attention_heads,
+            num_key_value_heads=nemo_text.num_query_groups,
+            vocab_size=nemo_text.vocab_size,
+            rope_scaling=llama3_rope_scaling,
+            eos_token_id=[128001, 128008, 128009],
+            torch_dtype="bfloat16",
+        )
+        return HFMllamaConfig(vision_config=hf_vision, text_config=hf_text, torch_dtype="bfloat16")
+
+
 def _rename_xattn_layer_nums_hf(source: Dict):
     def convert_layer_num(match):
         layer_num = int(match.group(1))
         cross_num = (layer_num - 3) // (cross_attention_frequency + 1)
         if (layer_num - 3) % (cross_attention_frequency + 1) == 0:
             new_layer_num = cross_num * cross_attention_frequency + 3
-            return f'xattn_layers.{new_layer_num}.'
+            return f"xattn_layers.{new_layer_num}."
 
         new_layer_num = layer_num - cross_num - 1
-        return f'layers.{new_layer_num}.'
+        return f"layers.{new_layer_num}."
 
     cross_attention_frequency = 4
 
@@ -362,7 +741,19 @@ def _import_text_kv(ctx: io.TransformCTX, k, v):
     return _merge_kv(k, v, head_num, num_query_groups, head_size, hidden_size)
 
 
-def _merge_kv(k: Tensor, v: Tensor, head_num: int, num_query_groups: int, head_size: int, hidden_size: int):
+def _import_simple_concat(a, b):
+    """Concatenate ``a`` then ``b`` along dim 0; used for (w1, w3) -> fc1 and (wk, wv) -> wkv."""
+    return torch.cat([a, b], dim=0)
+
+
+def _merge_kv(
+    k: Tensor,
+    v: Tensor,
+    head_num: int,
+    num_query_groups: int,
+    head_size: int,
+    hidden_size: int,
+):
     old_tensor_shape = k.size()
     new_kv_tensor_shape = (num_query_groups, head_size) + old_tensor_shape[1:]
 
@@ -381,7 +772,13 @@ def _merge_kv(k: Tensor, v: Tensor, head_num: int, num_query_groups: int, head_s
 
 
 def _merge_qkv(
-    q: Tensor, k: Tensor, v: Tensor, head_num: int, num_query_groups: int, head_size: int, hidden_size: int
+    q: Tensor,
+    k: Tensor,
+    v: Tensor,
+    head_num: int,
+    num_query_groups: int,
+    head_size: int,
+    hidden_size: int,
 ):
     heads_per_group = head_num // num_query_groups
     old_tensor_shape = q.size()
@@ -408,6 +805,26 @@ def _merge_qkv(
     return qkv_weights
 
 
+def _split_kv(
+    kv: Tensor,
+    head_num: int,
+    num_query_groups: int,
+    head_size: int,
+    hidden_size: int,
+):
+    """Split a fused KV weight into separate K and V projection weights.
+
+    The fused tensor is viewed as ``(2 * num_query_groups, head_size, hidden_size)``;
+    even rows along the first axis are taken as K and odd rows as V.
+    ``head_num`` is accepted but not used by this function.
+    """
+    interleaved = kv.reshape([2 * num_query_groups, head_size, hidden_size])
+    # Strided slices pick the same rows as arange(0, n, 2) / arange(1, n, 2).
+    k_proj = interleaved[0::2].reshape(-1, hidden_size).cpu()
+    v_proj = interleaved[1::2].reshape(-1, hidden_size).cpu()
+    return k_proj, v_proj
+
+
 def _split_qkv(qkv, head_num: int, num_query_groups: int, head_size: int, hidden_size: int):
     heads_per_group = head_num // num_query_groups
     qkv_total_dim = head_num + 2 * num_query_groups
@@ -429,20 +846,50 @@ def _split_qkv(qkv, head_num: int, num_query_groups: int, head_size: int, hidden
     return q_proj, k_proj, v_proj
 
 
-def _import_simple_concat(a, b):
-    # for both (w1, w3) -> fc1, and (wk, wv) -> wkv
-    return torch.cat((a, b), dim=0)
+def _export_gate(gate):
+    return gate[0:1]
 
 
-def _rename_xattn_layer_nums(source: Dict):
-    def convert_layer_num(match):
-        new_layer_num = int(match.group(1)) * 4 + 3
-        return f'.{new_layer_num}.'
+def _export_patch_embedding_hf(a):
+    return a.reshape(a.shape[0], 3, 14, 14)
 
-    output_dict = {}
-    for k, v in source.items():
-        if "cross_attention_layers" in k:
-            output_dict[re.sub(r"\.(\d+)\.", convert_layer_num, k)] = v
-        else:
-            output_dict[k] = v
-    return output_dict
+
+def _export_vision_qkv(ctx: io.TransformCTX, qkv):
+    """Split a fused vision QKV weight using sizes read from the target vision config."""
+    cfg = ctx.target.config.vision_config
+    hidden = cfg.hidden_size
+    heads = cfg.attention_heads
+    # The attention head count is passed for both the head and query-group arguments.
+    groups = cfg.attention_heads
+    return _split_qkv(qkv, heads, groups, hidden // heads, hidden)
+
+
+def _export_text_kv(ctx: io.TransformCTX, kv):
+    """Split a fused text KV weight using sizes read from the target text config."""
+    cfg = ctx.target.config.text_config
+    hidden = cfg.hidden_size
+    heads = cfg.num_attention_heads
+    kv_groups = cfg.num_key_value_heads
+    # Per-head width is derived from the attention head count.
+    return _split_kv(kv, heads, kv_groups, hidden // heads, hidden)
+
+
+def _export_text_qkv(ctx: io.TransformCTX, qkv):
+    """Split a fused text QKV weight using sizes read from the target text config."""
+    cfg = ctx.target.config.text_config
+    hidden = cfg.hidden_size
+    heads = cfg.num_attention_heads
+    kv_groups = cfg.num_key_value_heads
+    # Per-head width is derived from the attention head count.
+    return _split_qkv(qkv, heads, kv_groups, hidden // heads, hidden)
+
+
+def _export_simple_split(linear_fc1):
+    """Halve the fused MLP ``linear_fc1`` weight along dim 0, returning (gate_proj, up_proj) for HuggingFace."""
+    first_half, second_half = torch.chunk(linear_fc1, chunks=2, dim=0)
+    return first_half, second_half
+
+
+def _export_embedding_hf(word_embeddings, learnable_embedding):
+    """Append the learnable embedding rows after the word embedding rows (concatenation along dim 0)."""
+    return torch.cat([word_embeddings, learnable_embedding], dim=0)
diff --git a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
old mode 100644
new mode 100755
index ef99ce7ed1c3..e9cbe154fcd1
--- a/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
+++ b/nemo/collections/vlm/qwen2vl/model/qwen2vl.py
@@ -14,15 +14,43 @@
 
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Dict, Tuple, Union
 
 import torch
+import transformers
 from megatron.core.transformer.transformer_config import TransformerConfig
+from transformers import AutoConfig as HFAutoConfig
+from transformers import AutoModelForImageTextToText
+from transformers import Qwen2_5_VLConfig as HFQwen25VLConfig
+from transformers import Qwen2VLConfig as HFQwen2VLConfig
+from transformers import Qwen2VLForConditionalGeneration
+from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig as HFQwen25VLVisionConfig
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLVisionConfig as HFQwen2VLVisionConfig
 
-from nemo.collections.llm import Qwen2Config, Qwen2Config1P5B, Qwen2Config7B, Qwen2Config72B
-from nemo.collections.vlm.qwen2vl.model.base import Qwen2VLConfig, Qwen2VLModel, Qwen2VLVisionConfig
+from nemo.collections.common.tokenizers.tokenizer_spec import TokenizerSpec
+from nemo.collections.llm import (
+    Qwen2Config,
+    Qwen2Config1P5B,
+    Qwen2Config7B,
+    Qwen2Config72B,
+    Qwen25Config3B,
+    Qwen25Config7B,
+    Qwen25Config32B,
+    Qwen25Config72B,
+)
+from nemo.collections.vlm.neva.model.llava import export_qkv, export_qkv_bias
+from nemo.collections.vlm.qwen2vl.model.base import (
+    Qwen2VLConfig,
+    Qwen2VLModel,
+    Qwen2VLVisionConfig,
+    Qwen25VLVisionConfig,
+)
 from nemo.collections.vlm.vision import MultimodalProjectorConfig
+from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights
 from nemo.lightning import io, teardown
+from nemo.lightning.io.state import _ModelState
+from nemo.lightning.pytorch.utils import dtype_from_hf
+from nemo.utils import logging
 
 if TYPE_CHECKING:
     from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
@@ -36,7 +64,9 @@ class Qwen2VLConfig2B(Qwen2VLConfig):
 
     from transformers import PretrainedConfig
 
-    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen2Config1P5B())
+    language_transformer_config: TransformerConfig = field(
+        default_factory=lambda: Qwen2Config1P5B(share_embeddings_and_output_weights=True)
+    )
     vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
         default_factory=lambda: Qwen2VLVisionConfig(num_layers=32, num_attention_heads=16)
     )
@@ -75,27 +105,95 @@ class Qwen2VLConfig72B(Qwen2VLConfig):
     )
 
 
+@dataclass
+class Qwen25VLConfig3B(Qwen2VLConfig):
+    """Qwen2.5-VL 3B: Qwen25Config3B LM, 32-layer vision config, mcore_mlp projector (hidden 2048)."""
+
+    from transformers import PretrainedConfig
+
+    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config3B())
+    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
+        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16)
+    )
+    vision_projection_config: TransformerConfig = field(
+        default_factory=lambda: MultimodalProjectorConfig(
+            projector_type="mcore_mlp", input_size=5120, hidden_size=2048, ffn_hidden_size=5120
+        )
+    )
+
+
+@dataclass
+class Qwen25VLConfig7B(Qwen2VLConfig):
+    """Qwen2.5-VL 7B: Qwen25Config7B LM, 32-layer vision config, mcore_mlp projector (hidden 3584)."""
+
+    from transformers import PretrainedConfig
+
+    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config7B())
+    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
+        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16)
+    )
+    vision_projection_config: TransformerConfig = field(
+        default_factory=lambda: MultimodalProjectorConfig(
+            projector_type="mcore_mlp", input_size=5120, hidden_size=3584, ffn_hidden_size=5120
+        )
+    )
+
+
+@dataclass
+class Qwen25VLConfig32B(Qwen2VLConfig):
+    """Qwen2.5-VL 32B: Qwen25Config32B LM, 32-layer vision config (ffn 3456), mcore_mlp projector (hidden 5120)."""
+
+    from transformers import PretrainedConfig
+
+    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config32B())
+    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
+        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16, ffn_hidden_size=3456)
+    )
+    vision_projection_config: TransformerConfig = field(
+        default_factory=lambda: MultimodalProjectorConfig(
+            projector_type="mcore_mlp", input_size=5120, hidden_size=5120, ffn_hidden_size=5120
+        )
+    )
+
+
+@dataclass
+class Qwen25VLConfig72B(Qwen2VLConfig):
+    """Qwen2.5-VL 72B: Qwen25Config72B LM, 32-layer vision config (ffn 3456), mcore_mlp projector (hidden 8192)."""
+
+    from transformers import PretrainedConfig
+
+    language_transformer_config: TransformerConfig = field(default_factory=lambda: Qwen25Config72B())
+    vision_transformer_config: Union[TransformerConfig, PretrainedConfig] = field(
+        default_factory=lambda: Qwen25VLVisionConfig(num_layers=32, num_attention_heads=16, ffn_hidden_size=3456)
+    )
+    vision_projection_config: TransformerConfig = field(
+        default_factory=lambda: MultimodalProjectorConfig(
+            projector_type="mcore_mlp", input_size=5120, hidden_size=8192, ffn_hidden_size=5120
+        )
+    )
+
+
 @io.model_importer(Qwen2VLModel, "hf")
 class HFQwen2VLImporter(io.ModelConnector["Qwen2VLForConditionalGeneration", Qwen2VLModel]):
     """Qwen2VL Model HF Importer"""
 
     def init(self) -> Qwen2VLModel:
         # pylint: disable=C0115,C0116
-        return Qwen2VLModel(self.config, tokenizer=self.tokenizer)
+        return Qwen2VLModel(self.config, model_version="qwen2-vl", tokenizer=self.tokenizer)
 
     def apply(self, output_path: Path) -> Path:
         # pylint: disable=C0115,C0116
-        from transformers import Qwen2VLForConditionalGeneration
+        source = AutoModelForImageTextToText.from_pretrained(str(self), trust_remote_code=True)
+        hf_config = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True)
+        self.is_v2_5 = hf_config.model_type == "qwen2_5_vl"
 
-        source = Qwen2VLForConditionalGeneration.from_pretrained(str(self))
         target = self.init()
         trainer = self.nemo_setup(target)
+        source = source.to(dtype_from_hf(hf_config))
+        target = target.to(dtype_from_hf(hf_config))
         self.convert_state(source, target)
         print(f"Converted Qwen2VL model to Nemo, saving to {output_path}")
-        # for name, param in target.named_parameters():
-        #     print(name, param.shape)
         self.nemo_save(output_path, trainer)
-
         print(f"Converted Qwen2VL model saved to {output_path}")
 
         teardown(trainer, target)
@@ -113,21 +211,36 @@ def convert_state(self, source, target):
             "visual.blocks.*.norm2.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias",
             "visual.blocks.*.attn.proj.weight": "vision_model.decoder.layers.*.self_attention.linear_proj.weight",
             "visual.blocks.*.attn.proj.bias": "vision_model.decoder.layers.*.self_attention.linear_proj.bias",
-            "visual.blocks.*.mlp.fc1.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.weight",
-            "visual.blocks.*.mlp.fc1.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.bias",
-            "visual.blocks.*.mlp.fc2.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight",
-            "visual.blocks.*.mlp.fc2.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias",
-            "visual.merger.ln_q.weight": "vision_model.decoder.final_layernorm.weight",
-            "visual.merger.ln_q.bias": "vision_model.decoder.final_layernorm.bias",
             "model.embed_tokens.weight": "language_model.embedding.word_embeddings.weight",
             "model.layers.*.self_attn.o_proj.weight": "language_model.decoder.layers.*.self_attention.linear_proj.weight",
             "model.layers.*.mlp.down_proj.weight": "language_model.decoder.layers.*.mlp.linear_fc2.weight",
             "model.layers.*.input_layernorm.weight": "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight",
             "model.layers.*.post_attention_layernorm.weight": "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight",
             "model.norm.weight": "language_model.decoder.final_layernorm.weight",
-            "lm_head.weight": "language_model.output_layer.weight",
+            # "lm_head.weight": "language_model.output_layer.weight",
         }
+        if not target.config.language_transformer_config.share_embeddings_and_output_weights:
+            mapping.update({"lm_head.weight": "language_model.output_layer.weight"})
 
+        if self.is_v2_5:
+            mapping.update(
+                {
+                    "visual.blocks.*.mlp.down_proj.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight",
+                    "visual.blocks.*.mlp.down_proj.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias",
+                    "visual.merger.ln_q.weight": "vision_model.decoder.final_layernorm.weight",
+                }
+            )
+        else:
+            mapping.update(
+                {
+                    "visual.blocks.*.mlp.fc1.weight": "vision_model.decoder.layers.*.mlp.linear_fc1.weight",
+                    "visual.blocks.*.mlp.fc1.bias": "vision_model.decoder.layers.*.mlp.linear_fc1.bias",
+                    "visual.blocks.*.mlp.fc2.weight": "vision_model.decoder.layers.*.mlp.linear_fc2.weight",
+                    "visual.blocks.*.mlp.fc2.bias": "vision_model.decoder.layers.*.mlp.linear_fc2.bias",
+                    "visual.merger.ln_q.weight": "vision_model.decoder.final_layernorm.weight",
+                    "visual.merger.ln_q.bias": "vision_model.decoder.final_layernorm.bias",
+                }
+            )
         if "vision_projection.encoder.linear_fc1.weight" in target.module.state_dict().keys():
             mapping.update(
                 {
@@ -149,17 +262,23 @@ def convert_state(self, source, target):
         else:
             raise KeyError("Unable to map vision projection keys.")
 
+        transforms = [
+            _import_language_qkv,
+            _import_language_qkv_bias,
+            _import_vision_qkv,
+            _import_vision_qkv_bias,
+            _import_linear_fc1,
+        ]
+        if self.is_v2_5:
+            transforms += [
+                _import_vision_linear_fc1_weight,
+                _import_vision_linear_fc1_bias,
+            ]
         return io.apply_transforms(
             source,
             target,
             mapping=mapping,
-            transforms=[
-                _import_language_qkv,
-                _import_language_qkv_bias,
-                _import_vision_qkv,
-                _import_vision_qkv_bias,
-                _import_linear_fc1,
-            ],
+            transforms=transforms,
         )
 
     @property
@@ -172,9 +291,17 @@ def tokenizer(self) -> "AutoTokenizer":
     @property
     def config(self) -> Qwen2VLConfig:
         # pylint: disable=C0115,C0116
-        from transformers import Qwen2VLConfig as HFQwen2VLConfig
+        from packaging.version import Version
+
+        if Version(transformers.__version__) > Version('4.51.3'):
+            # TODO: needs a fix to work with newer transformers releases; until then fail early.
+            raise ValueError(
+                f"Current version of transformers is {transformers.__version__}. "
+                "Please lower the version to be <= 4.51.3."
+            )
 
-        hf_config = HFQwen2VLConfig.from_pretrained(str(self))
+        hf_config = HFAutoConfig.from_pretrained(str(self), trust_remote_code=True)
+        is_v2_5 = hf_config.model_type == "qwen2_5_vl"
 
         def make_vocab_size_divisible_by(vocab_size):
             # pylint: disable=C0115,C0116
@@ -183,41 +310,382 @@ def make_vocab_size_divisible_by(vocab_size):
                 base //= 2
             return base
 
+        text_config = hf_config
         language_transformer_config = Qwen2Config(
-            num_layers=hf_config.num_hidden_layers,
-            hidden_size=hf_config.hidden_size,
-            ffn_hidden_size=hf_config.intermediate_size,
-            num_attention_heads=hf_config.num_attention_heads,
-            init_method_std=hf_config.initializer_range,
-            layernorm_epsilon=hf_config.rms_norm_eps,
-            num_query_groups=hf_config.num_key_value_heads,
-            rotary_base=hf_config.rope_theta,
+            num_layers=text_config.num_hidden_layers,
+            hidden_size=text_config.hidden_size,
+            ffn_hidden_size=text_config.intermediate_size,
+            num_attention_heads=text_config.num_attention_heads,
+            init_method_std=text_config.initializer_range,
+            layernorm_epsilon=text_config.rms_norm_eps,
+            num_query_groups=text_config.num_key_value_heads,
+            rotary_base=text_config.rope_theta,
             gated_linear_unit=True,
-            make_vocab_size_divisible_by=make_vocab_size_divisible_by(hf_config.vocab_size),
-            share_embeddings_and_output_weights=False,
-            vocab_size=hf_config.vocab_size,
+            make_vocab_size_divisible_by=make_vocab_size_divisible_by(text_config.vocab_size),
+            share_embeddings_and_output_weights=text_config.tie_word_embeddings,
+            vocab_size=text_config.vocab_size,
+            fp16=(dtype_from_hf(text_config) == torch.float16),
+            bf16=(dtype_from_hf(text_config) == torch.bfloat16),
+            params_dtype=dtype_from_hf(text_config),
         )
 
         # Use MCore instead of Pytorch
-        vision_transformer_config = Qwen2VLVisionConfig()
-        merge_hidden_size = hf_config.vision_config.embed_dim * (hf_config.vision_config.spatial_merge_size**2)
-        vision_projection_config = MultimodalProjectorConfig(
-            input_size=merge_hidden_size,
-            hidden_size=hf_config.vision_config.hidden_size,
-            ffn_hidden_size=merge_hidden_size,
-            projector_type="mcore_mlp",
-        )
+        vision_config = hf_config.vision_config
+        if is_v2_5:
+            vision_transformer_config = Qwen25VLVisionConfig(
+                ffn_hidden_size=vision_config.intermediate_size,
+                fp16=(dtype_from_hf(hf_config) == torch.float16),
+                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
+                params_dtype=dtype_from_hf(hf_config),
+            )
+            merge_hidden_size = vision_config.hidden_size * (vision_config.spatial_merge_size**2)
+            vision_projection_config = MultimodalProjectorConfig(
+                input_size=merge_hidden_size,
+                hidden_size=vision_config.out_hidden_size,
+                ffn_hidden_size=merge_hidden_size,
+                projector_type="mcore_mlp",
+                fp16=(dtype_from_hf(hf_config) == torch.float16),
+                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
+                params_dtype=dtype_from_hf(hf_config),
+            )
+        else:
+            vision_transformer_config = Qwen2VLVisionConfig(
+                fp16=(dtype_from_hf(hf_config) == torch.float16),
+                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
+                params_dtype=dtype_from_hf(hf_config),
+            )
+            merge_hidden_size = vision_config.embed_dim * (vision_config.spatial_merge_size**2)
+            vision_projection_config = MultimodalProjectorConfig(
+                input_size=merge_hidden_size,
+                hidden_size=vision_config.hidden_size,
+                ffn_hidden_size=merge_hidden_size,
+                projector_type="mcore_mlp",
+                fp16=(dtype_from_hf(hf_config) == torch.float16),
+                bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
+                params_dtype=dtype_from_hf(hf_config),
+            )
 
         output = Qwen2VLConfig(
             language_transformer_config=language_transformer_config,
             vision_transformer_config=vision_transformer_config,
             vision_projection_config=vision_projection_config,
             vision_feature_layer=-1,
+            fp16=(dtype_from_hf(hf_config) == torch.float16),
+            bf16=(dtype_from_hf(hf_config) == torch.bfloat16),
+            params_dtype=dtype_from_hf(hf_config),
         )
 
         return output
 
 
+@io.model_exporter(Qwen2VLModel, "hf")
+class HFQwen2VLExporter(io.ModelConnector[Qwen2VLModel, "Qwen2VLForConditionalGeneration"]):
+    """
+    Exporter class for converting NeMo Qwen2VL model to HuggingFace format.
+
+    Inherits:
+        io.ModelConnector: Connector interface to handle setup, save, and load using the Lightning framework.
+
+    Methods:
+        init: Initializes a new HuggingFace Qwen2VL model instance.
+        apply: Converts the NeMo model to HuggingFace format and saves it.
+        convert_state: Maps and transforms the state dictionary from NeMo to HuggingFace format.
+        config: Generates and returns the HuggingFace Qwen2VL config for the model.
+    """
+
+    def init(self, dtype=torch.bfloat16) -> "Qwen2VLForConditionalGeneration":
+        """
+        Builds the HuggingFace target model from ``self.config`` via ``AutoModelForImageTextToText``.
+
+        Args:
+            dtype: Value passed as ``torch_dtype`` to ``from_config`` (default: torch.bfloat16).
+
+        Returns:
+            The model created by ``from_config`` inside transformers' ``no_init_weights()`` context.
+        """
+        from transformers.modeling_utils import no_init_weights
+
+        with no_init_weights():
+            return AutoModelForImageTextToText.from_config(self.config, torch_dtype=dtype)
+
+    def apply(self, output_path: Path) -> Path:
+        """
+        Converts the NeMo Qwen2VL model to HuggingFace format and saves it to the specified path.
+        The tokenizer is saved on a best-effort basis: a failure is logged and the export continues.
+
+        Args:
+            output_path (Path): The path where the converted HuggingFace model will be saved.
+
+        Returns:
+            Path: The output path where the HuggingFace model was saved.
+        """
+        logging.info("Loading Qwen2VL NeMo checkpoint. This may take a while...")
+        source, source_config = self.ckpt_load(self)
+        logging.info("Qwen2VL NeMo checkpoint loaded.")
+        logging.info("Initializing the HF model..")
+        target = self.init()
+        logging.info("Start Converting the model..")
+        target = self.convert_state(source, target, source_config)
+        target = target.cpu()
+        target.save_pretrained(output_path)
+
+        try:
+            self.tokenizer.tokenizer.save_pretrained(output_path)
+        except Exception as err:
+            logging.warning(f"Failed to save tokenizer: {err}")
+
+        print(f"Converted Qwen2VL model saved to {output_path}")
+        return output_path
+
+    def convert_state(self, source, target, source_config):
+        # pylint: disable=C0115,C0116,line-too-long
+        """
+        Maps and transforms the state dictionary from NeMo to HuggingFace format.
+
+        Args:
+            source: The source NeMo model state (must expose ``state_dict()``).
+            target: The target HuggingFace model.
+            source_config: NeMo config; its tied-embedding flag picks the source key for ``lm_head.weight``.
+
+        Returns: The value returned by ``io.apply_transforms`` (``apply`` uses it as the converted target model).
+        """
+
+        mapping = {
+            "vision_model.conv1.weight": "visual.patch_embed.proj.weight",
+            "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "visual.blocks.*.norm1.weight",
+            "vision_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_bias": "visual.blocks.*.norm1.bias",
+            "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "visual.blocks.*.norm2.weight",
+            "vision_model.decoder.layers.*.mlp.linear_fc1.layer_norm_bias": "visual.blocks.*.norm2.bias",
+            "vision_model.decoder.layers.*.self_attention.linear_proj.weight": "visual.blocks.*.attn.proj.weight",
+            "vision_model.decoder.layers.*.self_attention.linear_proj.bias": "visual.blocks.*.attn.proj.bias",
+            "language_model.embedding.word_embeddings.weight": "model.embed_tokens.weight",
+            "language_model.decoder.layers.*.self_attention.linear_proj.weight": "model.layers.*.self_attn.o_proj.weight",
+            "language_model.decoder.layers.*.mlp.linear_fc2.weight": "model.layers.*.mlp.down_proj.weight",
+            "language_model.decoder.layers.*.self_attention.linear_qkv.layer_norm_weight": "model.layers.*.input_layernorm.weight",
+            "language_model.decoder.layers.*.mlp.linear_fc1.layer_norm_weight": "model.layers.*.post_attention_layernorm.weight",
+            "language_model.decoder.final_layernorm.weight": "model.norm.weight",
+            # NOTE(review): the tied-embedding update below reuses the word_embeddings key, replacing its embed_tokens target; confirm.
+        }
+        if source_config.language_transformer_config.share_embeddings_and_output_weights:
+            mapping.update({"language_model.embedding.word_embeddings.weight": "lm_head.weight"})
+        else:
+            mapping.update({"language_model.output_layer.weight": "lm_head.weight"})
+
+        if self.is_v2_5:
+            mapping.update(
+                {
+                    "vision_model.decoder.layers.*.mlp.linear_fc2.weight": "visual.blocks.*.mlp.down_proj.weight",
+                    "vision_model.decoder.layers.*.mlp.linear_fc2.bias": "visual.blocks.*.mlp.down_proj.bias",
+                    "vision_model.decoder.final_layernorm.weight": "visual.merger.ln_q.weight",
+                }
+            )
+
+        else:
+            mapping.update(
+                {
+                    "vision_model.decoder.layers.*.mlp.linear_fc1.weight": "visual.blocks.*.mlp.fc1.weight",
+                    "vision_model.decoder.layers.*.mlp.linear_fc1.bias": "visual.blocks.*.mlp.fc1.bias",
+                    "vision_model.decoder.layers.*.mlp.linear_fc2.weight": "visual.blocks.*.mlp.fc2.weight",
+                    "vision_model.decoder.layers.*.mlp.linear_fc2.bias": "visual.blocks.*.mlp.fc2.bias",
+                    "vision_model.decoder.final_layernorm.weight": "visual.merger.ln_q.weight",
+                    "vision_model.decoder.final_layernorm.bias": "visual.merger.ln_q.bias",
+                }
+            )
+        if "vision_projection.encoder.linear_fc1.weight" in source.state_dict().keys():
+            mapping.update(
+                {
+                    "vision_projection.encoder.linear_fc1.weight": "visual.merger.mlp.0.weight",
+                    "vision_projection.encoder.linear_fc1.bias": "visual.merger.mlp.0.bias",
+                    "vision_projection.encoder.linear_fc2.weight": "visual.merger.mlp.2.weight",
+                    "vision_projection.encoder.linear_fc2.bias": "visual.merger.mlp.2.bias",
+                }
+            )
+        elif "vision_projection.0.weight" in source.state_dict().keys():
+            mapping.update(
+                {
+                    "vision_projection.0.weight": "visual.merger.mlp.0.weight",
+                    "vision_projection.0.bias": "visual.merger.mlp.0.bias",
+                    "vision_projection.2.weight": "visual.merger.mlp.2.weight",
+                    "vision_projection.2.bias": "visual.merger.mlp.2.bias",
+                }
+            )
+        else:
+            raise KeyError("Unable to map vision projection keys.")
+
+        transforms = [
+            _export_language_qkv,
+            _export_language_qkv_bias,
+            _export_vision_qkv,
+            _export_vision_qkv_bias,
+            _export_linear_fc1,
+        ]
+        if self.is_v2_5:
+            transforms += [
+                _export_vision_linear_fc1_weight,
+                _export_vision_linear_fc1_bias,
+            ]
+
+        return io.apply_transforms(
+            source,
+            target,
+            mapping=mapping,
+            transforms=transforms,
+        )
+
+    @property
+    def tokenizer(self) -> "TokenizerSpec":
+        """
+        Tokenizer stored in the NeMo checkpoint this connector points at.
+
+        Returns:
+            The ``tokenizer`` attribute of the context loaded from the ``model`` subpath.
+        """
+        return io.load_context(str(self), subpath="model").tokenizer
+
+    def ckpt_load(self, path: Path) -> Tuple[Dict, Dict]:
+        """
+        Reads the weights under ``path / "weights"`` with ``load_distributed_model_weights`` and renames
+        the keys: ``_extra_state`` entries are dropped and every ``module.`` substring is removed.
+        The model itself is never instantiated; only the config context and the raw tensors are loaded.
+
+        Tensors whose key contains ``layers`` and whose first dimension equals the language or vision
+        layer count are additionally stored per layer under ``layers.<i>``, next to the unsplit entry.
+
+        Args:
+            path (Path): Checkpoint directory that contains the ``weights`` folder.
+
+        Returns:
+            Tuple[Dict, Dict]: A ``_ModelState`` wrapping the renamed tensors, and the loaded model config.
+        """
+        config = io.load_context(str(self), subpath="model.config")
+        layer_counts = (
+            config.language_transformer_config.num_layers,
+            config.vision_transformer_config.num_layers,
+        )
+
+        renamed = {}
+        for raw_key, tensor in load_distributed_model_weights(path / "weights", True).items():
+            if "_extra_state" in raw_key:
+                continue
+            key = raw_key.replace("module.", "")
+            if "layers" in key and tensor.size(0) in layer_counts:
+                for idx in range(tensor.size(0)):
+                    renamed[key.replace("layers", f"layers.{idx}")] = tensor[idx]
+            renamed[key] = tensor
+
+        return _ModelState(renamed), config
+
+    @property
+    def config(self) -> "HFQwen2VLConfig":
+        """
+        Generates the configuration for the HuggingFace Qwen2VL model based on the NeMo model.
+
+        Returns:
+            HFQwen2VLConfig: A configuration object for the HuggingFace Qwen2VL model.
+        """
+        from packaging.version import Version
+
+        if Version(transformers.__version__) > Version('4.51.3'):
+            # TODO: needs a fix to work with newer transformers releases; until then fail early.
+            raise ValueError(
+                f"Current version of transformers is {transformers.__version__}. "
+                "Please lower the version to be <= 4.51.3."
+            )
+        source = io.load_context(str(self), subpath="model.config")
+
+        language_config = source.language_transformer_config
+        vision_model_config = source.vision_transformer_config
+        vision_projection_config = source.vision_projection_config
+
+        self.is_v2_5 = hasattr(vision_model_config, "fullatt_block_indexes") and (
+            vision_model_config.fullatt_block_indexes is not None
+        )
+
+        if self.is_v2_5:
+            vision_config = HFQwen25VLVisionConfig(
+                depth=vision_model_config.num_layers,
+                embed_dim=vision_model_config.embed_dim,
+                hidden_size=vision_model_config.hidden_size,
+                out_hidden_size=language_config.hidden_size,
+                hidden_act="silu",
+                mlp_ratio=int(vision_projection_config.ffn_hidden_size // vision_model_config.hidden_size),
+                num_heads=vision_model_config.num_attention_heads,
+                in_channels=3,
+                patch_size=vision_model_config.patch_dim,
+                spatial_merge_size=vision_model_config.spatial_merge_size,
+                spatial_patch_size=vision_model_config.spatial_patch_size,
+                temporal_patch_size=vision_model_config.temporal_patch_size,
+                initializer_range=vision_model_config.init_method_std,
+                fullatt_block_indexes=[7, 15, 23, 31],
+                tokens_per_second=2,
+                model_type="qwen2_5_vl",
+                torch_dtype="bfloat16",
+            ).to_dict()
+
+            # Create the LlavaConfig for HuggingFace
+            hf_config = HFQwen25VLConfig(
+                vision_config=vision_config,
+                num_hidden_layers=language_config.num_layers,
+                hidden_size=language_config.hidden_size,
+                intermediate_size=language_config.ffn_hidden_size,
+                num_attention_heads=language_config.num_attention_heads,
+                max_window_layers=70,
+                max_position_embeddings=language_config.seq_length,
+                initializer_range=language_config.init_method_std,
+                rms_norm_eps=language_config.layernorm_epsilon,
+                num_key_value_heads=language_config.num_query_groups,
+                rope_theta=language_config.rotary_base,
+                vocab_size=language_config.vocab_size,
+                rope_scaling={"type": "mrope", "mrope_section": [16, 24, 24]},
+                tie_word_embeddings=language_config.share_embeddings_and_output_weights,
+                torch_dtype="bfloat16",
+                # vocab_size=self.tokenizer.vocab_size,
+                bos_token_id=151643,
+                eos_token_id=151645,
+                vision_start_token_id=151652,
+                vision_end_token_id=151653,
+                vision_token_id=151654,
+                image_token_id=151655,
+                video_token_id=51656,
+            )
+            return hf_config
+        else:
+            vision_config = HFQwen2VLVisionConfig(
+                depth=vision_model_config.num_layers,
+                embed_dim=vision_model_config.embed_dim,
+                hidden_size=vision_projection_config.hidden_size,
+                hidden_act="quick_gelu",
+                mlp_ratio=int(vision_projection_config.ffn_hidden_size // vision_model_config.hidden_size),
+                num_heads=vision_model_config.num_attention_heads,
+                in_channels=3,
+                patch_size=vision_model_config.patch_dim,
+                spatial_merge_size=vision_model_config.spatial_merge_size,
+                spatial_patch_size=vision_model_config.spatial_patch_size,
+                temporal_patch_size=vision_model_config.temporal_patch_size,
+                initializer_range=vision_model_config.init_method_std,
+                model_type="qwen2_vl",
+                torch_dtype="bfloat16",
+            ).to_dict()
+
+            # Create the Qwen2VLConfig for HuggingFace
+            # if transformers > 4.51.3, use Qwen2VLTextConfig as text_config
+            # https://github.com/huggingface/transformers/pull/37268
+            return HFQwen2VLConfig(
+                num_hidden_layers=language_config.num_layers,
+                hidden_size=language_config.hidden_size,
+                intermediate_size=language_config.ffn_hidden_size,
+                num_attention_heads=language_config.num_attention_heads,
+                initializer_range=language_config.init_method_std,
+                rms_norm_eps=language_config.layernorm_epsilon,
+                num_key_value_heads=language_config.num_query_groups,
+                rope_theta=language_config.rotary_base,
+                tie_word_embeddings=language_config.share_embeddings_and_output_weights,
+                vocab_size=language_config.vocab_size,
+                vision_config=vision_config,
+                torch_dtype="bfloat16",
+            )
+
+
 def import_qkv(q, k, v, head_num, num_query_groups, heads_per_group, hidden_size, head_size):
     # pylint: disable=C0115,C0116
     old_tensor_shape = q.size()
@@ -362,3 +830,198 @@ def _import_cls_token(ctx: io.TransformCTX, cls_token):
 def _import_linear_fc1(down, gate):
     # pylint: disable=C0115,C0116
     return torch.cat((down, gate), axis=0)
+
+
+@io.state_transform(
+    source_key=("visual.blocks.*.mlp.gate_proj.weight", "visual.blocks.*.mlp.up_proj.weight"),
+    target_key="vision_model.decoder.layers.*.mlp.linear_fc1.weight",
+)
+def _import_vision_linear_fc1_weight(down, gate):
+    # pylint: disable=C0115,C0116
+    return torch.cat((down, gate), axis=0)
+
+
+@io.state_transform(
+    source_key=("visual.blocks.*.mlp.gate_proj.bias", "visual.blocks.*.mlp.up_proj.bias"),
+    target_key="vision_model.decoder.layers.*.mlp.linear_fc1.bias",
+)
+def _import_vision_linear_fc1_bias(down, gate):
+    # pylint: disable=C0115,C0116
+    return torch.cat((down, gate), axis=0)
+
+
+def export_qkv(linear_qkv, head_num, num_query_groups, heads_per_group, hidden_size, head_size):
+    # pylint: disable=C0115,C0116
+    qkv_total_dim = head_num + 2 * num_query_groups
+
+    linear_qkv = linear_qkv.reshape([qkv_total_dim, head_size, -1])
+    hidden_size = linear_qkv.size(-1)
+    q_slice = torch.cat(
+        [
+            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+            for i in range(num_query_groups)
+        ]
+    )
+    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+    q_proj = linear_qkv[q_slice].reshape(-1, hidden_size).cpu()
+    k_proj = linear_qkv[k_slice].reshape(-1, hidden_size).cpu()
+    v_proj = linear_qkv[v_slice].reshape(-1, hidden_size).cpu()
+
+    return q_proj, k_proj, v_proj
+
+
+def export_qkv_bias(qkv_bias: torch.Tensor, head_num, num_query_groups, heads_per_group, head_size):
+    """
+    Split interleave-concatenated qkv bias to separate q, k, v bias
+
+    Example: export layer linear_qkv bias to HF {q|k|v}_proj bias
+    """
+    qkv_total_dim = head_num + 2 * num_query_groups
+
+    qkv_bias = qkv_bias.reshape([qkv_total_dim, head_size])
+    q_slice = torch.cat(
+        [
+            torch.arange((heads_per_group + 2) * i, (heads_per_group + 2) * i + heads_per_group)
+            for i in range(num_query_groups)
+        ]
+    )
+    k_slice = torch.arange(heads_per_group, qkv_total_dim, (heads_per_group + 2))
+    v_slice = torch.arange(heads_per_group + 1, qkv_total_dim, (heads_per_group + 2))
+
+    q_bias = qkv_bias[q_slice].reshape(-1).cpu()
+    k_bias = qkv_bias[k_slice].reshape(-1).cpu()
+    v_bias = qkv_bias[v_slice].reshape(-1).cpu()
+
+    return q_bias, k_bias, v_bias
+
+
+@io.state_transform(
+    source_key="vision_model.decoder.layers.*.self_attention.linear_qkv.weight",
+    target_key="visual.blocks.*.attn.qkv.weight",
+)
+def _export_vision_qkv(ctx: io.TransformCTX, qkv):
+    # pylint: disable=C0115,C0116
+    hf_config = ctx.target.config.vision_config
+    hidden_size = hf_config.embed_dim if hf_config.model_type == "qwen2_vl" else hf_config.hidden_size
+    return torch.cat(
+        export_qkv(
+            qkv,
+            head_num=hf_config.num_heads,
+            num_query_groups=hf_config.num_heads,
+            heads_per_group=hf_config.num_heads // hf_config.num_heads,
+            hidden_size=hidden_size,
+            head_size=hidden_size // hf_config.num_heads,
+        ),
+        axis=0,
+    )
+
+
+@io.state_transform(
+    source_key="vision_model.decoder.layers.*.self_attention.linear_qkv.bias",
+    target_key="visual.blocks.*.attn.qkv.bias",
+)
+def _export_vision_qkv_bias(ctx: io.TransformCTX, qkv_bias):
+    # pylint: disable=C0115,C0116
+    hf_config = ctx.target.config.vision_config
+    hidden_size = hf_config.embed_dim if hf_config.model_type == "qwen2_vl" else hf_config.hidden_size
+    return torch.cat(
+        export_qkv_bias(
+            qkv_bias,
+            head_num=hf_config.num_heads,
+            num_query_groups=hf_config.num_heads,
+            heads_per_group=hf_config.num_heads // hf_config.num_heads,
+            head_size=hidden_size // hf_config.num_heads,
+        ),
+        axis=0,
+    )
+
+
+@io.state_transform(
+    source_key="language_model.decoder.layers.*.self_attention.linear_qkv.weight",
+    target_key=(
+        "model.layers.*.self_attn.q_proj.weight",
+        "model.layers.*.self_attn.k_proj.weight",
+        "model.layers.*.self_attn.v_proj.weight",
+    ),
+)
+def _export_language_qkv(ctx: io.TransformCTX, qkv):
+    # pylint: disable=C0115,C0116
+    hf_config = ctx.target.config
+    return export_qkv(
+        qkv,
+        head_num=hf_config.num_attention_heads,
+        num_query_groups=hf_config.num_key_value_heads,
+        heads_per_group=hf_config.num_attention_heads // hf_config.num_key_value_heads,
+        hidden_size=hf_config.hidden_size,
+        head_size=hf_config.hidden_size // hf_config.num_attention_heads,
+    )
+
+
+@io.state_transform(
+    source_key="language_model.decoder.layers.*.self_attention.linear_qkv.bias",
+    target_key=(
+        "model.layers.*.self_attn.q_proj.bias",
+        "model.layers.*.self_attn.k_proj.bias",
+        "model.layers.*.self_attn.v_proj.bias",
+    ),
+)
+def _export_language_qkv_bias(ctx: io.TransformCTX, qkv_bias):
+    # pylint: disable=C0115,C0116
+    hf_config = ctx.target.config
+    return export_qkv_bias(
+        qkv_bias,
+        head_num=hf_config.num_attention_heads,
+        num_query_groups=hf_config.num_key_value_heads,
+        heads_per_group=hf_config.num_attention_heads // hf_config.num_key_value_heads,
+        head_size=hf_config.hidden_size // hf_config.num_attention_heads,
+    )
+
+
+@io.state_transform(
+    source_key="vision_model.class_token",
+    target_key="vision_model.embeddings.class_embedding",
+)
+def _export_cls_token(ctx: io.TransformCTX, cls_token):
+    # pylint: disable=C0115,C0116
+    return cls_token.squeeze()
+
+
+@io.state_transform(
+    source_key="language_model.decoder.layers.*.mlp.linear_fc1.weight",
+    target_key=(
+        "model.layers.*.mlp.gate_proj.weight",
+        "model.layers.*.mlp.up_proj.weight",
+    ),
+)
+def _export_linear_fc1(linear_fc1):
+    # pylint: disable=C0115,C0116
+    gate_proj, up_proj = torch.chunk(linear_fc1, 2, dim=0)
+    return gate_proj, up_proj
+
+
+@io.state_transform(
+    source_key="vision_model.decoder.layers.*.mlp.linear_fc1.weight",
+    target_key=(
+        "visual.blocks.*.mlp.gate_proj.weight",
+        "visual.blocks.*.mlp.up_proj.weight",
+    ),
+)
+def _export_vision_linear_fc1_weight(vision_fc1_weight):
+    # pylint: disable=C0115,C0116
+    gate_proj, up_proj = torch.chunk(vision_fc1_weight, 2, dim=0)
+    return gate_proj, up_proj
+
+
+@io.state_transform(
+    source_key="vision_model.decoder.layers.*.mlp.linear_fc1.bias",
+    target_key=(
+        "visual.blocks.*.mlp.gate_proj.bias",
+        "visual.blocks.*.mlp.up_proj.bias",
+    ),
+)
+def _export_vision_linear_fc1_bias(vision_fc1_bias):
+    # pylint: disable=C0115,C0116
+    gate_proj, up_proj = torch.chunk(vision_fc1_bias, 2, dim=0)
+    return gate_proj, up_proj

From f8cba4dfe439707f9625a147d8bac6e493356590 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Sat, 8 Nov 2025 00:13:31 -0800
Subject: [PATCH 14/15] revert back

Signed-off-by: Pablo Garay 
---
 nemo/export/__init__.py                       |  11 +-
 nemo/export/multimodal/__init__.py            |   2 +-
 nemo/export/multimodal/build.py               |   2 +-
 nemo/export/multimodal/run.py                 |   2 +-
 nemo/export/onnx_llm_exporter.py              |  22 ++-
 nemo/export/quantize/__init__.py              |   4 +-
 nemo/export/quantize/quantizer.py             |  28 +++-
 nemo/export/sentencepiece_tokenizer.py        |   2 +-
 nemo/export/tarutils.py                       |  14 +-
 nemo/export/tensorrt_llm.py                   |   7 +-
 nemo/export/tensorrt_mm_exporter.py           |   6 +-
 nemo/export/tiktoken_tokenizer.py             |   2 +-
 .../trt_llm/converter/model_converter.py      |   2 +-
 .../converter/model_to_trt_llm_ckpt.py        |   8 +-
 nemo/export/trt_llm/converter/utils.py        |   2 +-
 .../trt_llm/nemo_ckpt_loader/__init__.py      |   2 +-
 .../trt_llm/nemo_ckpt_loader/nemo_file.py     |   2 +-
 nemo/export/trt_llm/qnemo/__init__.py         |   4 +-
 .../trt_llm/qnemo/qnemo_to_tensorrt_llm.py    |  51 ++++---
 nemo/export/trt_llm/qnemo/tokenizer_utils.py  |   2 +-
 nemo/export/trt_llm/qnemo/utils.py            |   2 +-
 nemo/export/trt_llm/tensorrt_llm_build.py     |   2 +-
 nemo/export/trt_llm/tensorrt_llm_run.py       |   2 +-
 nemo/export/trt_llm/utils.py                  |   2 +-
 nemo/export/utils/__init__.py                 |   2 +-
 nemo/export/utils/lora_converter.py           |   2 +-
 nemo/export/utils/model_loader.py             |   2 +-
 nemo/export/utils/utils.py                    |   2 +-
 nemo/export/vllm/__init__.py                  |   2 +-
 nemo/export/vllm/engine.py                    | 140 ------------------
 nemo/export/vllm/model_config.py              |  27 ++--
 nemo/export/vllm/model_converters.py          |   2 +-
 nemo/export/vllm/model_loader.py              |   2 +-
 nemo/export/vllm/tokenizer_group.py           |  75 ----------
 nemo/export/vllm_exporter.py                  |  21 ++-
 nemo/export/vllm_hf_exporter.py               |   2 +-
 36 files changed, 137 insertions(+), 325 deletions(-)
 delete mode 100644 nemo/export/vllm/engine.py
 delete mode 100644 nemo/export/vllm/tokenizer_group.py

diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
index 3685c9a5cb07..1a5e5f6afd5c 100644
--- a/nemo/export/__init__.py
+++ b/nemo/export/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -19,3 +19,12 @@
     __all__ = ["io"]
 except (ImportError, ModuleNotFoundError):
     pass
+
+import warnings
+
+warnings.warn(
+    "The 'nemo.export' is deprecated and will be removed in NeMo FW 25.09 container release. "
+    "Please use the Export-Deploy repository instead: https://github.com/NVIDIA-NeMo/Export-Deploy",
+    DeprecationWarning,
+    stacklevel=2,
+)
diff --git a/nemo/export/multimodal/__init__.py b/nemo/export/multimodal/__init__.py
index d9155f923f18..341a77c5bc66 100644
--- a/nemo/export/multimodal/__init__.py
+++ b/nemo/export/multimodal/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py
index 6d5c792d02d9..f3a133cd65fe 100644
--- a/nemo/export/multimodal/build.py
+++ b/nemo/export/multimodal/build.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/multimodal/run.py b/nemo/export/multimodal/run.py
index be2e74dc685d..d113f877b3c3 100644
--- a/nemo/export/multimodal/run.py
+++ b/nemo/export/multimodal/run.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/onnx_llm_exporter.py b/nemo/export/onnx_llm_exporter.py
index e7ce4aeb49bc..3204a3c75eeb 100755
--- a/nemo/export/onnx_llm_exporter.py
+++ b/nemo/export/onnx_llm_exporter.py
@@ -15,10 +15,9 @@
 
 import warnings
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
 
 import numpy as np
-import tensorrt as trt
 import torch
 import wrapt
 from transformers import AutoModel, AutoTokenizer
@@ -27,6 +26,9 @@
 from nemo.export.utils import get_example_inputs, get_model_device_type, is_nemo2_checkpoint, validate_fp8_network
 from nemo.utils import logging
 
+if TYPE_CHECKING:
+    import tensorrt as trt
+
 
 @wrapt.decorator
 def noop_decorator(func):
@@ -55,6 +57,14 @@ def wrapper(*args, **kwargs):
     use_onnxruntime = False
 
 
+use_trt = True
+try:
+    import tensorrt as trt
+except ImportError:
+    logging.warning("tensorrt is not available")
+    use_trt = False
+
+
 # pylint: disable=line-too-long
 class OnnxLLMExporter(ITritonDeployable):
     """
@@ -226,7 +236,7 @@ def export_onnx_to_trt(
         override_layers_to_fp32: List = None,
         trt_dtype: str = "fp16",
         profiling_verbosity: str = "layer_names_only",
-        trt_builder_flags: List[trt.BuilderFlag] = None,
+        trt_builder_flags: List["trt.BuilderFlag"] = None,
     ) -> None:
         """Performs TensorRT conversion from an ONNX model.
 
@@ -313,11 +323,11 @@ def export_onnx_to_trt(
         trt_model_path.write_bytes(engine_string)
         logging.info(f"Successfully exported ONNX model ({self.onnx_model_path}) " f"to TRT engine ({trt_model_path})")
 
-    def _override_layer_precision_to_fp32(self, layer: trt.ILayer) -> None:
+    def _override_layer_precision_to_fp32(self, layer: "trt.ILayer") -> None:
         layer.precision = trt.float32
         layer.set_output_type(0, trt.float32)
 
-    def _override_layers_to_fp32(self, network: trt.INetworkDefinition, fp32_layer_patterns: list[str]) -> None:
+    def _override_layers_to_fp32(self, network: "trt.INetworkDefinition", fp32_layer_patterns: list[str]) -> None:
         for i in range(network.num_layers):
             layer = network.get_layer(i)
             layer_name = layer.name
@@ -341,7 +351,7 @@ def _override_layers_to_fp32(self, network: trt.INetworkDefinition, fp32_layer_p
                         layer.set_output_type(j, trt.float32)
                         logging.info(f"Setting layer {i} {layer_name} (type: {layer.type}) output type {j} to FP32")
 
-    def _override_layernorm_precision_to_fp32(self, network: trt.INetworkDefinition) -> None:
+    def _override_layernorm_precision_to_fp32(self, network: "trt.INetworkDefinition") -> None:
         """Set the precision of LayerNorm subgraphs to FP32 to preserve accuracy.
 
         - https://nvbugs/4478448 (Mistral)
diff --git a/nemo/export/quantize/__init__.py b/nemo/export/quantize/__init__.py
index 49ec9da3c64a..e32b12643387 100644
--- a/nemo/export/quantize/__init__.py
+++ b/nemo/export/quantize/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,5 +13,3 @@
 # limitations under the License.
 
 from .quantizer import Quantizer
-
-__all__ = ["Quantizer"]
diff --git a/nemo/export/quantize/quantizer.py b/nemo/export/quantize/quantizer.py
index f9f2f2bcbf61..98f24cd4e4a9 100644
--- a/nemo/export/quantize/quantizer.py
+++ b/nemo/export/quantize/quantizer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,7 +15,7 @@
 import os
 import tarfile
 from contextlib import nullcontext
-from typing import Callable, Optional
+from typing import Callable, Optional, Union
 
 import torch
 import torch.distributed as dist
@@ -23,8 +23,13 @@
 from megatron.core.transformer.module import Float16Module
 from omegaconf.omegaconf import DictConfig, open_dict
 
-from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
-from nemo.collections.nlp.parts.utils_funcs import torch_dtype_from_precision
+try:
+    from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
+except (ImportError, ModuleNotFoundError):
+    from abc import ABC
+
+    MegatronGPTModel = ABC
+
 from nemo.utils import logging
 from nemo.utils.distributed import temporary_directory
 from nemo.utils.model_utils import save_artifacts, unwrap_model
@@ -53,6 +58,21 @@
 SUPPORTED_DTYPE = [16, "16", "bf16"]  # Default precision for non-quantized layers
 
 
+def torch_dtype_from_precision(precision: Union[int, str], megatron_amp_O2: Optional[bool] = None) -> torch.dtype:
+    """Map a PTL precision value to a torch dtype; float32 is forced when megatron_amp_O2 is explicitly False."""
+    if megatron_amp_O2 is False:
+        return torch.float32
+
+    if precision in ('bf16', 'bf16-mixed'):
+        return torch.bfloat16
+    if precision in (16, '16', '16-mixed'):
+        return torch.float16
+    if precision in (32, '32', '32-true'):
+        return torch.float32
+
+    raise ValueError(f"Could not parse the precision of `{precision}` to a valid torch.dtype")
+
+
 class Quantizer:
     """Post-training quantization (PTQ) and TRT-LLM export of Nemo checkpoints.
 
diff --git a/nemo/export/sentencepiece_tokenizer.py b/nemo/export/sentencepiece_tokenizer.py
index 190400ed6215..e6e09aa8b6d7 100644
--- a/nemo/export/sentencepiece_tokenizer.py
+++ b/nemo/export/sentencepiece_tokenizer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/tarutils.py b/nemo/export/tarutils.py
index ac608dc935ff..40add3162db6 100644
--- a/nemo/export/tarutils.py
+++ b/nemo/export/tarutils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -263,15 +263,3 @@ def keys(self):
         Returns an iterator over the keys in the store.
         """
         return self._path.iterdir()
-
-
-def unpack_tarball(archive: str, dest_dir: str):
-    """
-    Unpacks a tarball into a destination directory.
-
-    Args:
-        archive (str): The path to the tarball.
-        dest_dir (str): The path to the destination directory.
-    """
-    with tarfile.open(archive, mode="r") as tar:
-        tar.extractall(path=dest_dir)
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index de71aea86b23..a19d342713b7 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -78,7 +78,7 @@
 from transformers import PreTrainedTokenizerBase
 
 from nemo.deploy import ITritonDeployable
-from nemo.export.tarutils import TarPath, unpack_tarball
+from nemo.export.tarutils import TarPath
 from nemo.export.trt_llm.converter.model_converter import determine_quantization_settings, model_to_trtllm_ckpt
 from nemo.export.trt_llm.converter.model_to_trt_llm_ckpt import dist_model_to_trt_llm_ckpt, get_layer_prefix
 from nemo.export.trt_llm.converter.utils import init_model_parallel_from_nemo
@@ -326,8 +326,7 @@ def export(
                 if os.path.isdir(nemo_checkpoint_path):
                     nemo_export_dir = nemo_checkpoint_path
                 else:
-                    unpack_tarball(nemo_checkpoint_path, tmp_dir.name)
-                    nemo_checkpoint_path = tmp_dir.name
+                    raise ValueError("Checkpoint path must be a directory")
 
                 if os.path.exists(os.path.join(nemo_checkpoint_path, TOKENIZER_CONFIG_FILE)):
                     # Instantiate tokenizer for a legacy "Nemo 1" quantized checkpoint from a tokenizer config.
diff --git a/nemo/export/tensorrt_mm_exporter.py b/nemo/export/tensorrt_mm_exporter.py
index 7eeb0ca2721f..54914846fa79 100644
--- a/nemo/export/tensorrt_mm_exporter.py
+++ b/nemo/export/tensorrt_mm_exporter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -32,7 +32,6 @@
     extract_lora_ckpt,
 )
 from nemo.export.multimodal.run import MultimodalModelRunner, SpeechllmModelRunner
-from nemo.export.tarutils import unpack_tarball
 
 use_deploy = True
 try:
@@ -152,8 +151,7 @@ def export(
                 if os.path.isdir(lora_checkpoint_path):
                     lora_dir = lora_checkpoint_path
                 else:
-                    lora_dir = os.path.join(tmp_dir.name, "unpacked_lora")
-                    unpack_tarball(lora_checkpoint_path, lora_dir)
+                    raise ValueError("lora_checkpoint_path in nemo1 is not supported. It must be a directory")
 
                 llm_lora_path = [extract_lora_ckpt(lora_dir, tmp_dir.name)]
             else:
diff --git a/nemo/export/tiktoken_tokenizer.py b/nemo/export/tiktoken_tokenizer.py
index d599620256fa..2dbfd736f450 100644
--- a/nemo/export/tiktoken_tokenizer.py
+++ b/nemo/export/tiktoken_tokenizer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py
index aef3c44e6cac..e31ab9aed4b4 100755
--- a/nemo/export/trt_llm/converter/model_converter.py
+++ b/nemo/export/trt_llm/converter/model_converter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
index cb505f634490..043c8bc48dd9 100644
--- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
+++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
+# pylint: disable=missing-function-docstring
 
 
 import logging
@@ -306,8 +308,8 @@ def dist_model_to_trt_llm_ckpt(
     pp_last_rank = parallel_state.get_pipeline_model_parallel_last_rank()
     pp_size = parallel_state.get_pipeline_model_parallel_world_size()
     pp_group = parallel_state.get_pipeline_model_parallel_group()
-    pp_is_last = parallel_state.is_pipeline_last_stage(ignore_virtual=True)
-    pp_is_first = parallel_state.is_pipeline_first_stage(ignore_virtual=True)
+    pp_is_last = parallel_state.is_pipeline_last_stage()
+    pp_is_first = parallel_state.is_pipeline_first_stage()
     vp_size = parallel_state.get_virtual_pipeline_model_parallel_world_size()
     if not vp_size:
         vp_size = 1
diff --git a/nemo/export/trt_llm/converter/utils.py b/nemo/export/trt_llm/converter/utils.py
index aaa1b2b5cbfe..a3a2e21dab02 100755
--- a/nemo/export/trt_llm/converter/utils.py
+++ b/nemo/export/trt_llm/converter/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
index d9155f923f18..341a77c5bc66 100644
--- a/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
index 34cb8f1eca19..cd547db25664 100644
--- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/qnemo/__init__.py b/nemo/export/trt_llm/qnemo/__init__.py
index dbbfd23bac12..c8d1fa8f690a 100644
--- a/nemo/export/trt_llm/qnemo/__init__.py
+++ b/nemo/export/trt_llm/qnemo/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,5 +13,3 @@
 # limitations under the License.
 
 from .qnemo_to_tensorrt_llm import qnemo_to_tensorrt_llm
-
-__all__ = ["qnemo_to_tensorrt_llm"]
diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
index 7fd554a66d14..003d1aba2a2c 100644
--- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
+++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
 # limitations under the License.
 
 import glob
+import itertools
 import os
 import subprocess
 import warnings
@@ -78,42 +79,40 @@ def qnemo_to_tensorrt_llm(
 
     speculative_decoding_mode = "medusa" if "Medusa" in config.architecture else None
 
-    build_cmd = "trtllm-build "
-    build_cmd += f"--checkpoint_dir {nemo_checkpoint_path} "
-    build_cmd += f"--log_level {log_level} "
-    build_cmd += f"--output_dir {engine_dir} "
-    build_cmd += f"--workers {num_build_workers} "
-    build_cmd += f"--max_batch_size {max_batch_size} "
-    build_cmd += f"--max_input_len {max_input_len} "
-    build_cmd += f"--max_beam_width {max_beam_width} "
-    build_cmd += f"--max_prompt_embedding_table_size {max_prompt_embedding_table_size} "
-    build_cmd += f"--paged_kv_cache {'enable' if paged_kv_cache else 'disable'} "
-    build_cmd += f"--use_paged_context_fmha {'enable' if paged_context_fmha else 'disable'} "
-    build_cmd += f"--remove_input_padding {'enable' if remove_input_padding else 'disable'} "
-    build_cmd += f"--multiple_profiles {'enable' if multiple_profiles else 'disable'} "
-    build_cmd += f"--reduce_fusion {'enable' if reduce_fusion else 'disable'} "
-    build_cmd += f"--use_fused_mlp {'enable' if use_fused_mlp else 'disable'} "
+    build_cmd = ["trtllm-build"]
+    build_cmd.extend(["--checkpoint_dir", nemo_checkpoint_path])
+    build_cmd.extend(["--log_level", log_level])
+    build_cmd.extend(["--output_dir", engine_dir])
+    build_cmd.extend(["--workers", str(num_build_workers)])
+    build_cmd.extend(["--max_batch_size", str(max_batch_size)])
+    build_cmd.extend(["--max_input_len", str(max_input_len)])
+    build_cmd.extend(["--max_beam_width", str(max_beam_width)])
+    build_cmd.extend(["--max_prompt_embedding_table_size", str(max_prompt_embedding_table_size)])
+    build_cmd.extend(["--paged_kv_cache", "enable" if paged_kv_cache else "disable"])
+    build_cmd.extend(["--use_paged_context_fmha", "enable" if paged_context_fmha else "disable"])
+    build_cmd.extend(["--remove_input_padding", "enable" if remove_input_padding else "disable"])
+    build_cmd.extend(["--multiple_profiles", "enable" if multiple_profiles else "disable"])
+    build_cmd.extend(["--reduce_fusion", "enable" if reduce_fusion else "disable"])
+    build_cmd.extend(["--use_fused_mlp", "enable" if use_fused_mlp else "disable"])
 
     if not use_qdq:
-        build_cmd += "--gemm_plugin auto "
+        build_cmd.extend(["--gemm_plugin", "auto"])
 
     if max_seq_len is not None:
-        build_cmd += f"--max_seq_len {max_seq_len} "
+        build_cmd.extend(["--max_seq_len", str(max_seq_len)])
 
     if max_num_tokens is not None:
-        build_cmd += f"--max_num_tokens {max_num_tokens} "
+        build_cmd.extend(["--max_num_tokens", str(max_num_tokens)])
     else:
-        build_cmd += f"--max_num_tokens {max_batch_size * max_input_len} "
+        build_cmd.extend(["--max_num_tokens", str(max_batch_size * max_input_len)])
 
     if opt_num_tokens is not None:
-        build_cmd += f"--opt_num_tokens {opt_num_tokens} "
+        build_cmd.extend(["--opt_num_tokens", str(opt_num_tokens)])
 
     if speculative_decoding_mode:
-        build_cmd += f"--speculative_decoding_mode {speculative_decoding_mode} "
-
-    build_cmd = build_cmd.replace("--", "\\\n  --")  # Separate parameters line by line
+        build_cmd.extend(["--speculative_decoding_mode", speculative_decoding_mode])
 
     print("trtllm-build command:")
-    print(build_cmd)
+    print("".join(itertools.chain.from_iterable(zip(build_cmd, itertools.cycle(["\n ", " "])))).strip())
 
-    subprocess.run(build_cmd, shell=True, check=True)
+    subprocess.run(build_cmd, shell=False, check=True)
diff --git a/nemo/export/trt_llm/qnemo/tokenizer_utils.py b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
index 37b45521dcca..b3cc88de7caf 100644
--- a/nemo/export/trt_llm/qnemo/tokenizer_utils.py
+++ b/nemo/export/trt_llm/qnemo/tokenizer_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/qnemo/utils.py b/nemo/export/trt_llm/qnemo/utils.py
index a2bd74d3ff4c..b64b9d07431e 100644
--- a/nemo/export/trt_llm/qnemo/utils.py
+++ b/nemo/export/trt_llm/qnemo/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
index a0c8d52b9895..2b7b0cff9965 100755
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index b3d504cd86ea..3a61d781193c 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/trt_llm/utils.py b/nemo/export/trt_llm/utils.py
index bb30048b96c7..d24183923281 100644
--- a/nemo/export/trt_llm/utils.py
+++ b/nemo/export/trt_llm/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/utils/__init__.py b/nemo/export/utils/__init__.py
index 12442fca30a4..ed7ee448bc6e 100644
--- a/nemo/export/utils/__init__.py
+++ b/nemo/export/utils/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/utils/lora_converter.py b/nemo/export/utils/lora_converter.py
index cd229317bf23..020a87ac9f70 100644
--- a/nemo/export/utils/lora_converter.py
+++ b/nemo/export/utils/lora_converter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/utils/model_loader.py b/nemo/export/utils/model_loader.py
index 39fbc26505d7..64173a8e5cb3 100644
--- a/nemo/export/utils/model_loader.py
+++ b/nemo/export/utils/model_loader.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/utils/utils.py b/nemo/export/utils/utils.py
index cdbc5658c052..fa2034ed70ac 100755
--- a/nemo/export/utils/utils.py
+++ b/nemo/export/utils/utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/vllm/__init__.py b/nemo/export/vllm/__init__.py
index d9155f923f18..341a77c5bc66 100644
--- a/nemo/export/vllm/__init__.py
+++ b/nemo/export/vllm/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/vllm/engine.py b/nemo/export/vllm/engine.py
deleted file mode 100644
index c3776b842b83..000000000000
--- a/nemo/export/vllm/engine.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-from pathlib import Path
-
-from sentencepiece import SentencePieceProcessor
-from transformers import PreTrainedTokenizerBase
-from vllm import LLMEngine
-from vllm.transformers_utils.tokenizer_group import TokenizerGroup
-
-from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
-from nemo.export.tarutils import TarPath
-from nemo.export.vllm.tokenizer_group import NemoTokenizerGroup
-
-LOGGER = logging.getLogger("NeMo")
-
-
-class vLLMTokenizerGroup(TokenizerGroup):
-    """
-    Implements a custom tokenizer for vLLM, based on a huggingface tokenizer
-    """
-
-    def __init__(self, tokenizer):
-        self.tokenizer = tokenizer
-
-    def get_lora_tokenizer(self, lora_request):
-        return self.tokenizer
-
-    async def get_lora_tokenizer_async(self, lora_request):
-        return self.tokenizer
-
-    @property
-    def max_input_length(self, lora_request=None):
-        return None
-
-
-class NemoLLMEngine(LLMEngine):
-    """
-    Overrides some functionality from vllm.LLMEngine to use our custom tokenizer
-    instead of one from Transformers.
-    """
-
-    def _init_tokenizer(self, **tokenizer_init_kwargs):
-        # Determine if the model needs a bos token (which is not stored in Nemo checkpoints)
-        add_bos_token = self.model_config.model_converter.requires_bos_token()
-        tokenizer_config = self.model_config.nemo_model_config.get('tokenizer', {})
-
-        if not isinstance(tokenizer_config, dict) and hasattr(tokenizer_config, 'tokenizer'):
-            tokenizer = tokenizer_config.tokenizer
-
-            if isinstance(tokenizer, SentencePieceProcessor):
-                self.model_config.hf_config.bos_token_id = tokenizer.bos_token_id
-                self.model_config.hf_config.eos_token_id = tokenizer.eos_token_id
-
-                tokenizer = SentencePieceTokenizer(tokenizer=tokenizer)
-                return NemoTokenizerGroup(tokenizer, add_bos_token=add_bos_token)
-
-            if isinstance(tokenizer, PreTrainedTokenizerBase):
-                return vLLMTokenizerGroup(tokenizer)
-
-        # Find the tokenizer file name in the Nemo checkpoint config
-        tokenizer_model = tokenizer_config.get('model', tokenizer_config.get('tokenizer_model', None))
-
-        # If there is no tokenizer file specified but there's a reference to an HF tokenizer, use that
-        if tokenizer_model is None and tokenizer_config.get('library') == 'huggingface':
-            tokenizer_type = tokenizer_config.get('type')
-            if tokenizer_type is not None:
-                tokenizer_group = TokenizerGroup(
-                    tokenizer_id=tokenizer_type,
-                    enable_lora=bool(self.lora_config),
-                    max_num_seqs=self.scheduler_config.max_num_seqs,
-                    max_input_length=None,
-                )
-
-                # Update the HF config fields that come from the tokenizer in NeMo
-                self.model_config.hf_config.vocab_size = len(
-                    tokenizer_group.tokenizer.vocab
-                )  # this may be greater than vocab_size
-                self.model_config.hf_config.bos_token_id = tokenizer_group.tokenizer.bos_token_id
-                self.model_config.hf_config.eos_token_id = tokenizer_group.tokenizer.eos_token_id
-                self.model_config.hf_config.pad_token_id = tokenizer_group.tokenizer.pad_token_id
-
-                return tokenizer_group
-
-        # Open the checkpoint archive
-        with TarPath(self.model_config.nemo_checkpoint) as archive:
-            tokenizer_model_file = None
-            if isinstance(tokenizer_model, str) and tokenizer_model.startswith('nemo:'):
-                tokenizer_model = tokenizer_model[len('nemo:') :]
-                tokenizer_model_file = archive / tokenizer_model
-                if not tokenizer_model_file.exists():
-                    LOGGER.warn(
-                        f'Tokenizer model file {tokenizer_model} specified in the model_config does not '
-                        + 'exist in the checkpoint.'
-                    )
-                    tokenizer_model_file = None
-
-            if tokenizer_model_file is None:
-                for path in archive.glob('*tokenizer*.model'):
-                    LOGGER.info(f'Found tokenizer model file {path}.')
-                    tokenizer_model_file = path
-                    break
-
-            if tokenizer_model_file is None:
-                raise RuntimeError('No tokenizer model file found, aborting.')
-
-            # Extract the tokenizer model file into the model directory,
-            # because sentencepiece cannot load it directly from TarPath.
-            extracted_tokenizer_model = Path(self.model_config.model) / 'tokenizer.model'
-            with tokenizer_model_file.open('rb') as infile:
-                with extracted_tokenizer_model.open('wb') as outfile:
-                    outfile.write(infile.read())
-
-            # Construct the tokenizer object and wrapper
-            tokenizer = SentencePieceTokenizer(str(extracted_tokenizer_model))
-
-            # Determine if the model needs a bos token (which is not stored in Nemo checkpoints)
-            add_bos_token = self.model_config.model_converter.requires_bos_token()
-
-            tokenizer_group = NemoTokenizerGroup(tokenizer, add_bos_token=add_bos_token)
-
-            # Update the HF config fields that come from the tokenizer in NeMo
-            self.model_config.hf_config.vocab_size = tokenizer.vocab_size
-            self.model_config.hf_config.bos_token_id = tokenizer.bos_token_id
-            self.model_config.hf_config.eos_token_id = tokenizer.eos_token_id
-            self.model_config.hf_config.pad_token_id = tokenizer.pad_id
-
-            return tokenizer_group
diff --git a/nemo/export/vllm/model_config.py b/nemo/export/vllm/model_config.py
index 21151adbf658..8550f8bcbbc1 100644
--- a/nemo/export/vllm/model_config.py
+++ b/nemo/export/vllm/model_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -53,13 +53,15 @@ def __init__(
         quantization: Optional[str] = None,
         quantization_param_path: Optional[str] = None,
         enforce_eager: bool = False,
-        max_seq_len_to_capture: Optional[int] = None,
+        max_seq_len_to_capture: Optional[int] = 8192,
         max_logprobs: int = 5,
         disable_sliding_window: bool = False,
+        disable_cascade_attn: bool = False,
         use_async_output_proc: bool = False,
         disable_mm_preprocessor_cache: bool = False,
         logits_processor_pattern: Optional[str] = None,
         override_pooler_config: Optional[PoolerConfig] = None,
+        override_generation_config: Optional[Dict[str, Any]] = None,
         enable_sleep_mode: bool = False,
         model_impl: Union[str, ModelImpl] = ModelImpl.AUTO,
     ) -> None:
@@ -90,6 +92,7 @@ def __init__(
         self.max_seq_len_to_capture = max_seq_len_to_capture
         self.max_logprobs = max_logprobs
         self.disable_sliding_window = disable_sliding_window
+        self.disable_cascade_attn = disable_cascade_attn
         self.served_model_name = nemo_checkpoint
         self.multimodal_config = None
         self.mm_processor_kwargs = {}
@@ -99,6 +102,8 @@ def __init__(
         self.generation_config = None
         self.task = "generate"  # Only the generate task is supported
         self.is_hybrid = False  # No hybrid models are supported
+        self.attention_chunk_size = None  # Llama4-specific parameter
+        self.override_generation_config = override_generation_config
 
         if self.task in ("draft", "generate"):
             self.truncation_side = "left"
@@ -127,21 +132,19 @@ def __init__(
                 del tokenizer_config['additional_special_tokens']
 
             tokenizer_config = self._change_paths_to_absolute_paths(tokenizer_config, nemo_checkpoint)
-            tokenizer = instantiate(tokenizer_config)
-
             with (nemo_checkpoint / "context/model.yaml").open('r') as config_file:
                 self.nemo_model_config: dict = yaml.load(config_file, Loader=yaml.SafeLoader)
             hf_args = self._load_hf_arguments(self.nemo_model_config['config'])
 
-            if hasattr(tokenizer, 'bos_id'):
-                tokenizer.tokenizer.bos_token_id = tokenizer.bos_id
-            if hasattr(tokenizer, 'eos_id'):
-                tokenizer.tokenizer.eos_token_id = tokenizer.eos_id
-
+            tokenizer = instantiate(tokenizer_config)
             hf_args['vocab_size'] = tokenizer.original_vocab_size
             self.model_converter.convert_config(self.nemo_model_config['config'], hf_args)
+            # In transformers ~= 4.52.0, the config for model_type="mixtral" loads with head_dim=None,
+            # which causes issues downstream in vLLM's MixtralAttention class. One possible fix is
+            # to delete head_dim from the config if it is None.
             self.hf_config = AutoConfig.for_model(model_type, **hf_args)
-            self.nemo_model_config['tokenizer'] = tokenizer
+            assert "huggingface" in tokenizer_config["_target_"]
+            tokenizer_id = tokenizer_config["pretrained_model_name"]
         else:
             with TarPath(nemo_checkpoint) as archive:
                 with (archive / "model_config.yaml").open("r") as model_config_file:
@@ -149,6 +152,9 @@ def __init__(
                     hf_args = self._load_hf_arguments(self.nemo_model_config)
                     self.model_converter.convert_config(self.nemo_model_config, hf_args)
                 self.hf_config = AutoConfig.for_model(model_type, **hf_args)
+            assert self.nemo_model_config["tokenizer"]["library"] == "huggingface"
+            tokenizer_id = self.nemo_model_config["tokenizer"]["type"]
+        self.tokenizer = tokenizer_id
 
         self.hf_config.architectures = [self.model_converter.get_architecture()]
         if self.rope_scaling is not None:
@@ -209,6 +215,7 @@ def _load_hf_arguments(self, nemo_config: Dict[str, Any]) -> Dict[str, Any]:
             'num_attention_heads': 'num_attention_heads',
             'num_key_value_heads': 'num_query_groups',
             # 'hidden_act': 'activation', ## <- vLLM has good defaults for the models, nemo values are wrong
+            'num_local_experts': 'num_moe_experts',
             'max_position_embeddings': ['max_position_embeddings', 'encoder_seq_length'],
             'tie_word_embeddings': 'share_embeddings_and_output_weights',
             'rms_norm_eps': 'layernorm_epsilon',
diff --git a/nemo/export/vllm/model_converters.py b/nemo/export/vllm/model_converters.py
index 5e4cf619d281..87b670560c9c 100644
--- a/nemo/export/vllm/model_converters.py
+++ b/nemo/export/vllm/model_converters.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/vllm/model_loader.py b/nemo/export/vllm/model_loader.py
index c5d74fe883be..a4c5cef39db6 100644
--- a/nemo/export/vllm/model_loader.py
+++ b/nemo/export/vllm/model_loader.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/nemo/export/vllm/tokenizer_group.py b/nemo/export/vllm/tokenizer_group.py
deleted file mode 100644
index d99daebb417f..000000000000
--- a/nemo/export/vllm/tokenizer_group.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Optional
-
-from vllm.config import TokenizerPoolConfig
-from vllm.lora.request import LoRARequest
-from vllm.transformers_utils.tokenizer_group import TokenizerGroup
-
-from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
-
-
-class NemoTokenizerGroup(TokenizerGroup):
-    """
-    Implements a custom tokenizer for vLLM, based on SentencePieceTokenizer.
-    """
-
-    def __init__(self, tokenizer: SentencePieceTokenizer, add_bos_token: bool = False):
-        self.tokenizer = tokenizer
-        self.add_bos_token = add_bos_token
-
-    @classmethod
-    def from_config(cls, tokenizer_pool_config: Optional[TokenizerPoolConfig] = None, **init_kwargs):
-        """Create a tokenizer group from a config."""
-        raise NotImplementedError
-
-    def ping(self) -> bool:
-        """Check if the tokenizer group is alive."""
-        return True
-
-    def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]:
-        """Get the maximum input length for the LoRA request."""
-        return None
-
-    def encode(
-        self,
-        prompt: str,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        add_special_tokens: Optional[bool] = None,
-    ) -> List[int]:
-        """Tokenizes the prompt."""
-        ids = self.tokenizer.encode(prompt)
-        if self.add_bos_token:
-            ids = [self.tokenizer.bos_token_id] + ids
-        return ids
-
-    async def encode_async(
-        self,
-        prompt: str,
-        request_id: Optional[str] = None,
-        lora_request: Optional[LoRARequest] = None,
-        add_special_tokens: Optional[bool] = None,
-    ) -> List[int]:
-        """Encode a prompt using the tokenizer group."""
-        return self.tokenizer.encode(prompt)  # TODO: not sure how this is supposed to work
-
-    def get_lora_tokenizer(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer:
-        """Get a tokenizer for a LoRA request."""
-        return self.tokenizer
-
-    async def get_lora_tokenizer_async(self, lora_request: Optional[LoRARequest] = None) -> SentencePieceTokenizer:
-        """Get a tokenizer for a LoRA request."""
-        return self.tokenizer
diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py
index fd67fdb2d6a3..6aeaa4877bd8 100644
--- a/nemo/export/vllm_exporter.py
+++ b/nemo/export/vllm_exporter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -34,11 +34,12 @@
 )
 from vllm.executor.ray_utils import initialize_ray_cluster
 from vllm.lora.request import LoRARequest
+from vllm.v1.core.sched.scheduler import Scheduler as V1Scheduler
+from vllm.v1.engine.llm_engine import LLMEngine
 
 from nemo.deploy import ITritonDeployable
 from nemo.deploy.utils import cast_output
 from nemo.export.utils import convert_lora_nemo_to_canonical, prepare_directory_for_export
-from nemo.export.vllm.engine import NemoLLMEngine
 from nemo.export.vllm.model_config import NemoModelConfig
 from nemo.export.vllm.model_loader import NemoModelLoader
 
@@ -92,9 +93,7 @@ class vLLMExporter(ITritonDeployable):
 
     def __init__(self):
         self.request_id = 0
-        # TODO: Support v1 vllm engine
-        if envs.VLLM_USE_V1:
-            envs.set_vllm_use_v1(False)
+        assert envs.VLLM_USE_V1, "Only vLLM V1 is supported"
 
     def export(
         self,
@@ -169,7 +168,6 @@ def export(
             quantization=quantization,
             quantization_param_path=None,
             enforce_eager=False,
-            max_seq_len_to_capture=None,
         )
 
         if model_config.nemo_model_config.get("fp8", False):
@@ -245,6 +243,7 @@ def export(
             num_lookahead_slots=0,
             delay_factor=0.0,
             enable_chunked_prefill=False,
+            scheduler_cls=V1Scheduler,
         )
 
         load_config = LoadConfig(
@@ -261,24 +260,24 @@ def export(
         # Initialize the cluster and specify the executor class.
         if parallel_config.distributed_executor_backend == "ray":
             initialize_ray_cluster(parallel_config)
-            from vllm.executor.ray_distributed_executor import RayDistributedExecutor
+            from vllm.v1.executor.ray_distributed_executor import RayDistributedExecutor
 
             executor_class = RayDistributedExecutor
 
         elif parallel_config.distributed_executor_backend == "mp":
-            from vllm.executor.mp_distributed_executor import MultiprocessingDistributedExecutor
+            from vllm.v1.executor.multiproc_executor import MultiprocExecutor
 
-            executor_class = MultiprocessingDistributedExecutor
+            executor_class = MultiprocExecutor
 
         else:
             assert parallel_config.distributed_executor_backend == "uni" or parallel_config.world_size == 1
 
-            from vllm.executor.uniproc_executor import UniProcExecutor
+            from vllm.v1.executor.abstract import UniProcExecutor
 
             executor_class = UniProcExecutor
 
         # Initialize the engine
-        self.engine = NemoLLMEngine(
+        self.engine = LLMEngine(
             vllm_config=VllmConfig(
                 model_config=model_config,
                 cache_config=cache_config,
diff --git a/nemo/export/vllm_hf_exporter.py b/nemo/export/vllm_hf_exporter.py
index 4e90ab962ee8..9e15208a39bf 100755
--- a/nemo/export/vllm_hf_exporter.py
+++ b/nemo/export/vllm_hf_exporter.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

From cb9b902ba77092ed123ebad75a024991271280c3 Mon Sep 17 00:00:00 2001
From: Pablo Garay 
Date: Sat, 8 Nov 2025 09:15:25 -0800
Subject: [PATCH 15/15] remove ExportDeploy

Signed-off-by: Pablo Garay 
---
 .github/workflows/cicd-main-nemo2.yml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.github/workflows/cicd-main-nemo2.yml b/.github/workflows/cicd-main-nemo2.yml
index c44930a9f207..85cb4733c57a 100644
--- a/.github/workflows/cicd-main-nemo2.yml
+++ b/.github/workflows/cicd-main-nemo2.yml
@@ -162,8 +162,6 @@ jobs:
             runner: self-hosted-azure
           - script: L2_NEMO_2_LoRA_MERGE
             runner: self-hosted-azure
-          - script: L2_NEMO_2_LoRA_Export
-            runner: self-hosted-azure-gpus-1
           - script: L2_NEMO_2_LoRA_Inference
             runner: self-hosted-azure-gpus-1
           - script: L2_NeMo_2_NeMo_Mcore_Mixtral_bitexact
@@ -177,8 +175,6 @@ jobs:
             runner: self-hosted-azure
           - script: L2_NeMo_2_PTQ_Llama2_FP8_nemo
             runner: self-hosted-azure
-          - script: L2_NeMo_2_PTQ_Unified_Export
-            runner: self-hosted-azure
           - script: L2_NeMo_2_Distill_Llama3_TP1PP2
             runner: self-hosted-azure
           - script: L2_NeMo_2_Prune_Llama_TP1PP2