VLLMModel propagates token IDs (#11)

bxyu-nvidia · abhibha-nvidia · commit ee4d76764802 · 2025-09-29T11:47:54.000-07:00
Signed-off-by: Brian Yu <bxyu@nvidia.com>
Signed-off-by: Abhibha Gupta <abhibhag@nvidia.com>
diff --git a/README.md b/README.md
@@ -23,6 +23,7 @@
 - [FAQ: Why NeMo Gym?](#faq-why-nemo-gym)
 - [FAQ: Error: Found files with missing copyright](#faq-error-found-files-with-missing-copyright)
 - [FAQ: build-docs / Build docs CI failures](#faq-build-docs--build-docs-ci-failures)
+- [FAQ: NeMo Gym, training frameworks, and token IDs](#faq-nemo-gym-training-frameworks-and-token-ids)
 
 
 # NeMo-Gym
@@ -874,3 +875,15 @@ pickling environment... done
 checking consistency... done
 ```
 You may need to reformat some of your docstrings to Napoleon format docstrings https://sphinxcontrib-napoleon.readthedocs.io/en/latest/
+
+
+# FAQ: NeMo Gym, training frameworks, and token IDs
+One of the goals of NeMo Gym is to act as a rollout tool for LLM post-training, either as synthetic data generation for SFT or as training environments for RL.
+
+RL training frameworks don't typically operate in OpenAI schema; they operate in token IDs. It is especially critical to always have the correct token IDs during training so that we stay on-policy and to make sure that what we think the model sees is what the model actually sees. However, when providing this OpenAI schema compatible interface to training environment developers, we lose track of the token IDs in Gym.
+
+For example, say we are training a Qwen 3 family model. During rollouts, the model may sample from the entire token distribution. The token IDs are then decoded into text and subsequently converted to OpenAI schema and returned to the training environment developer. At some point for multi-step and multi-turn scenarios, the training environment developer will call the model again with the previously output OpenAI schema. This re-tokenization causes problems since a single string may map to multiple possible sequences of token IDs. So if the model generates token ID sequence 1 and the re-tokenization outputs token ID sequence 2, suddenly things may become off-policy when the Gym result is consumed by the RL training framework.
+
+So, the OpenAI compatible model server in a training framework needs to be able to handle this discrepancy. In order to do that, Gym needs a handle on the ground truth token IDs and it needs to provide that information back to the training frameworks' OpenAI compatible server.
+
+TODO @bxyu-nvidia: expand on this later.
diff --git a/nemo_gym/openai_utils.py b/nemo_gym/openai_utils.py
@@ -197,6 +197,15 @@ class NeMoGymResponseReasoningItemForTraining(NeMoGymResponseReasoningItem, Toke
     pass
 
 
+RESPONSES_TO_TRAIN = {
+    NeMoGymEasyInputMessage: NeMoGymEasyInputMessageForTraining,
+    NeMoGymMessage: NeMoGymMessageForTraining,
+    NeMoGymResponseOutputMessage: NeMoGymResponseOutputMessageForTraining,
+    NeMoGymResponseFunctionToolCall: NeMoGymResponseFunctionToolCallForTraining,
+    NeMoGymResponseReasoningItem: NeMoGymResponseReasoningItemForTraining,
+}
+
+
 NeMoGymResponseInputItem = Union[
     NeMoGymEasyInputMessage,
     NeMoGymMessage,
diff --git a/pyproject.toml b/pyproject.toml
@@ -226,7 +226,7 @@ ng_dump_config = "nemo_gym.cli:dump_config"
 
 [tool.setuptools.packages.find]
 where = ["."]
-include = ["resources_servers", "responses_api_agents", "responses_api_models", "nemo_gym"]
+include = ["resources_servers", "responses_api_agents", "responses_api_models", "nemo_gym", "penguin"]
 
 ################################################
 # Testing
diff --git a/responses_api_models/vllm_model/app.py b/responses_api_models/vllm_model/app.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 import re
 from time import time
-from typing import List, Tuple
+from typing import ClassVar, List, Optional, Tuple
 from uuid import uuid4
 
 from openai import BaseModel as OpenAIBaseModel
@@ -25,8 +25,10 @@
     SimpleResponsesAPIModel,
 )
 from nemo_gym.openai_utils import (
+    RESPONSES_TO_TRAIN,
     NeMoGymAsyncOpenAI,
     NeMoGymChatCompletion,
+    NeMoGymChatCompletionAssistantMessageForTrainingParam,
     NeMoGymChatCompletionAssistantMessageParam,
     NeMoGymChatCompletionCreateParamsNonStreaming,
     NeMoGymChatCompletionDeveloperMessageParam,
@@ -47,13 +49,15 @@
     NeMoGymResponseOutputText,
     NeMoGymResponseReasoningItem,
     NeMoGymSummary,
+    TokenIDLogProbMixin,
 )
 
 
 class VLLMModelConfig(BaseResponsesAPIModelConfig):
     base_url: str
     api_key: str
     model: str
+    return_token_id_information: bool
 
 
 # This needs to be an OpenAI BaseModel since the OpenAI client casts to it below.
@@ -69,7 +73,7 @@ def model_post_init(self, context):
             base_url=self.config.base_url,
             api_key=self.config.api_key,
         )
-        self._converter = VLLMConverter()
+        self._converter = VLLMConverter(return_token_id_information=self.config.return_token_id_information)
         return super().model_post_init(context)
 
     async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()) -> NeMoGymResponse:
@@ -82,21 +86,10 @@ async def responses(self, body: NeMoGymResponseCreateParamsNonStreaming = Body()
         chat_completion_response = await self.chat_completions(chat_completion_create_params)
 
         choice = chat_completion_response.choices[0]
-        message = choice.message
 
         response_output = self._converter.postprocess_chat_response(choice)
         response_output_dicts = [item.model_dump() for item in response_output]
 
-        last_response_output_item = response_output_dicts[-1]
-        if hasattr(message, "prompt_token_ids"):
-            last_response_output_item.update(
-                dict(
-                    prompt_token_ids=message.prompt_token_ids,
-                    generation_token_ids=message.generation_token_ids,
-                    generation_log_probs=message.generation_log_probs,
-                )
-            )
-
         # Chat Completion -> Response
         return NeMoGymResponse(
             id=f"resp_{uuid4().hex}",
@@ -130,72 +123,97 @@ async def chat_completions(
         body_dict = body.model_dump(exclude_unset=True)
         body_dict.setdefault("model", self.config.model)
 
-        openai_response = await self._client.chat.completions.create(
-            **body_dict,
-            logprobs=True,
-            # The extra body below is VLLM specific to get the generation log probs associated with generation token IDs.
-            extra_body={
-                "return_tokens_as_token_ids": True,
-            },
-        )
+        create_params = body_dict
+        if self.config.return_token_id_information:
+            create_params |= dict(
+                logprobs=True,
+                # The extra body below is VLLM specific to get the generation log probs associated with generation token IDs.
+                extra_body={
+                    "return_tokens_as_token_ids": True,
+                },
+            )
+
+        openai_response = await self._client.chat.completions.create(**create_params)
         assert not getattr(openai_response.choices[0].message, "reasoning_content", None), (
             "Please do not use a reasoning parser in vLLM! There is one source of truth for handling data (including reasoning), which is NeMo Gym!"
         )
         openai_response: NeMoGymChatCompletion
 
-        log_probs = openai_response.choices[0].logprobs.content
-        generation_token_ids = []
-        generation_log_probs = []
-        for log_prob in log_probs:
-            # Looks like `"token_id:151667"`
-            generation_token_ids.append(int(log_prob.token.removeprefix("token_id:")))
-            generation_log_probs.append(log_prob.logprob)
-
-        # The base url has /v1 at the end but vLLM's tokenize endpoint does not have v1, hence the ..
-        # I can't believe the path is resolved correctly LOL
-        tokenize_response = await self._client.post(
-            "../tokenize",
-            cast_to=VLLMTokenizeResponse,
-            body=body_dict,
-        )
-
         chat_completion_dict = openai_response.model_dump()
-        message_dict = chat_completion_dict["choices"][0]["message"]
-        message_dict.update(
-            dict(
-                prompt_token_ids=tokenize_response.tokens,
-                generation_token_ids=generation_token_ids,
-                generation_log_probs=generation_log_probs,
+
+        if self.config.return_token_id_information:
+            log_probs = openai_response.choices[0].logprobs.content
+            generation_token_ids = []
+            generation_log_probs = []
+            for log_prob in log_probs:
+                # Looks like `"token_id:151667"`
+                generation_token_ids.append(int(log_prob.token.removeprefix("token_id:")))
+                generation_log_probs.append(log_prob.logprob)
+
+            # The base url has /v1 at the end but vLLM's tokenize endpoint does not have v1, hence the ..
+            # I can't believe the path is resolved correctly LOL
+            tokenize_response = await self._client.post(
+                "../tokenize",
+                cast_to=VLLMTokenizeResponse,
+                body=body_dict,
             )
-        )
+
+            message_dict = chat_completion_dict["choices"][0]["message"]
+            message_dict.update(
+                dict(
+                    prompt_token_ids=tokenize_response.tokens,
+                    generation_token_ids=generation_token_ids,
+                    generation_log_probs=generation_log_probs,
+                )
+            )
+
         return NeMoGymChatCompletion(**chat_completion_dict)
 
 
 class VLLMConverterResponsesToChatCompletionsState(BaseModel):
+    return_token_id_information: bool
+
     messages: List[NeMoGymChatCompletionMessageParam] = Field(default_factory=list)
 
     # We are mapping from Response input items to chat completions messages, which is many to one.
     # Our state will accumulate the reasoning, chat, and tool calls for assistant messages.
     content_buffer: str = ""  # Buffer for reasoning and chat
     tool_calls_buffer: List[NeMoGymChatCompletionMessageToolCallParam] = Field(default_factory=list)
 
+    # Will only be populated if return_token_id_information is True.
+    token_information: Optional[TokenIDLogProbMixin] = None
+
     def flush_assistant(self) -> None:
         if not (self.content_buffer or self.tool_calls_buffer):
             return
 
-        self.messages.append(
-            NeMoGymChatCompletionAssistantMessageParam(
-                content=self.content_buffer or None,
-                role="assistant",
-                tool_calls=self.tool_calls_buffer,
-            )
+        shared_params = dict(
+            content=self.content_buffer or None,
+            role="assistant",
+            tool_calls=self.tool_calls_buffer,
         )
+        if self.return_token_id_information:
+            message = NeMoGymChatCompletionAssistantMessageForTrainingParam(
+                **shared_params,
+                **self.token_information.model_dump(),
+            )
+        else:
+            message = NeMoGymChatCompletionAssistantMessageParam(**shared_params)
+
+        self.messages.append(message)
+
         self.content_buffer = ""
         self.tool_calls_buffer = []
 
 
-class VLLMConverter:
-    THINK_TAG_PATTERN = re.compile(r"<think>(.*?)</think>", re.DOTALL)
+class VLLMConverter(BaseModel):
+    return_token_id_information: bool
+
+    # =======================================================
+    # Reasoning handling. This may change across models and model families
+    # =======================================================
+
+    THINK_TAG_PATTERN: ClassVar = re.compile(r"<think>(.*?)</think>", re.DOTALL)
 
     @staticmethod
     def _wrap_reasoning_in_think_tags(texts: List[str]) -> str:
@@ -220,7 +238,9 @@ def responses_to_chat_completion_create_params(
         responses_create_params = responses_create_params.model_dump(exclude_unset=True)
 
         # Tracks messages including reasoning for each respective message type helper function
-        state = VLLMConverterResponsesToChatCompletionsState()
+        state = VLLMConverterResponsesToChatCompletionsState(
+            return_token_id_information=self.return_token_id_information
+        )
 
         # Input can be a string. Wrap in a ResponseInput-like
         response_input = responses_create_params["input"]
@@ -255,6 +275,13 @@ def responses_to_chat_completion_create_params(
                 case _:  # pragma: no cover
                     raise NotImplementedError(f"Unsupported message type: {m}")
 
+            if self.return_token_id_information and m.get("prompt_token_ids"):
+                state.token_information = TokenIDLogProbMixin(
+                    prompt_token_ids=m["prompt_token_ids"],
+                    generation_token_ids=m["generation_token_ids"],
+                    generation_log_probs=m["generation_log_probs"],
+                )
+
         state.flush_assistant()
 
         model = responses_create_params.pop("model", None)
@@ -439,6 +466,16 @@ def postprocess_chat_response(self, choice: NeMoGymChoice) -> List[NeMoGymRespon
                 )
             )
 
+        if self.return_token_id_information:
+            last_response_output_item = response_output[-1]
+            train_cls = RESPONSES_TO_TRAIN[last_response_output_item.__class__]
+            response_output[-1] = train_cls(
+                **last_response_output_item.model_dump(),
+                prompt_token_ids=raw_message["prompt_token_ids"],
+                generation_token_ids=raw_message["generation_token_ids"],
+                generation_log_probs=raw_message["generation_log_probs"],
+            )
+
         return response_output
 
     def _extract_reasoning_from_content(self, content: str) -> Tuple[List[str], str]:
diff --git a/responses_api_models/vllm_model/configs/vllm_model.yaml b/responses_api_models/vllm_model/configs/vllm_model.yaml
@@ -5,3 +5,4 @@ openai_model:
       base_url: ${policy_base_url}
       api_key: ${policy_api_key}
       model: ${policy_model_name}
+      return_token_id_information: false
diff --git a/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml b/responses_api_models/vllm_model/configs/vllm_model_for_training.yaml
@@ -0,0 +1,8 @@
+openai_model:
+  responses_api_models:
+    vllm_model:
+      entrypoint: app.py
+      base_url: ${policy_base_url}
+      api_key: ${policy_api_key}
+      model: ${policy_model_name}
+      return_token_id_information: true
diff --git a/responses_api_models/vllm_model/tests/round_trip_test_data.json b/responses_api_models/vllm_model/tests/round_trip_test_data.json
diff --git a/responses_api_models/vllm_model/tests/test_app.py b/responses_api_models/vllm_model/tests/test_app.py