Skip to content

Commit f393b43

Browse files
committed
merged preallocated_output with allocated_output
1 parent 5cad292 commit f393b43

3 files changed

Lines changed: 7 additions & 9 deletions

File tree

core/runtime/TRTEngine.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,6 @@ struct TRTEngine : torch::CustomClassHolder {
181181
std::string shape_key = "None";
182182
bool use_pre_allocated_outputs = false;
183183
std::vector<at::Tensor> pre_allocated_outputs;
184-
std::vector<at::Tensor> allocated_outputs;
185184

186185
// Output Allocator-Related Functionality
187186
bool requires_output_allocator = false; // engine requires output allocator

core/runtime/execute_engine.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -247,11 +247,11 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
247247
if (can_use_pre_allocated_outputs) {
248248
outputs = compiled_engine->pre_allocated_outputs;
249249
} else {
250-
if (compiled_engine->allocated_outputs.size() == 0 || compiled_engine->output_tensors_are_unowned or
250+
if (compiled_engine->pre_allocated_outputs.size() == 0 || compiled_engine->output_tensors_are_unowned or
251251
shape_changed) {
252-
compiled_engine->allocated_outputs = create_output_tensors(compiled_engine);
252+
compiled_engine->pre_allocated_outputs = create_output_tensors(compiled_engine);
253253
}
254-
outputs = compiled_engine->allocated_outputs;
254+
outputs = compiled_engine->pre_allocated_outputs;
255255
}
256256

257257
for (auto output_indices : compiled_engine->out_binding_map) {

py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,6 @@ def __init__(
174174
self.cudagraph: Optional[torch.cuda.CUDAGraph] = None
175175
self._caller_stream: Optional[torch.cuda.Stream] = None
176176
self._engine_stream: Optional[torch.cuda.Stream] = None
177-
self.output_tensors: Optional[List[torch.Tensor]] = None
178177

179178
# TODO: Make the below a Dictionary {shape: cudagraph}
180179
self.shape_key: Optional[str] = None
@@ -505,7 +504,7 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
505504
if can_use_pre_allocated_outputs:
506505
outputs = self.pre_allocated_outputs
507506
else:
508-
if shape_changed or self.output_tensors is None:
507+
if shape_changed or not self.pre_allocated_outputs:
509508
self.output_shapes = [
510509
tuple(self.context.get_tensor_shape(output_name))
511510
for output_name in self.output_names
@@ -515,12 +514,12 @@ def run_standard_execution() -> torch.Tensor | Tuple[torch.Tensor, ...]:
515514
"Encountered dynamic output shapes during runtime. This could mean the network has data-dependent output shapes which is not currently supported."
516515
)
517516
if (
518-
self.output_tensors is None
517+
not self.pre_allocated_outputs
519518
or self.output_tensors_are_unowned
520519
or shape_changed
521520
):
522-
self.output_tensors = self.create_output_tensors()
523-
outputs = self.output_tensors
521+
self.pre_allocated_outputs = self.create_output_tensors()
522+
outputs = self.pre_allocated_outputs
524523

525524
for o, output_name in enumerate(self.output_names):
526525
if need_cudagraphs_record:

0 commit comments

Comments (0)