Commit bd30b29

Address comments
Signed-off-by: Hui Gao <huig@nvidia.com>
1 parent 9f712e1 commit bd30b29

4 files changed: +101 additions, -118 deletions

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 21 additions & 22 deletions
@@ -621,7 +621,7 @@ def _post_init_with_buffers(self, buffers) -> None:
         capture_graph = torch.cuda.is_current_stream_capturing()

         def get_empty(tensor_shape: list[int], dtype: torch.dtype,
-                      cache_name: str, pin_memory: bool) -> torch.Tensor:
+                      cache_name: str) -> torch.Tensor:
             """
             Finds a compatible, reusable buffer from a cache or creates a new one.

@@ -637,35 +637,33 @@ def get_empty(tensor_shape: list[int], dtype: torch.dtype,
                 tensor_shape: The required shape.
                 dtype: The required dtype.
                 cache_name: The key for the specific list of buffers to search in.
-                pin_memory: This buffer block shall be kept in buffer pool if provided
             Returns:
                 An existing compatible buffer or a newly created one.
             """
             if buffers is None:
                 return torch.zeros(tensor_shape, device='cuda', dtype=dtype)

             return buffers.get_buffer(tensor_shape, dtype, cache_name,
-                                      pin_memory)
+                                      capture_graph)

-        def get_empty_like(like_tensor: torch.Tensor, cache_name: str,
-                           pin_memory: bool) -> torch.Tensor:
+        def get_empty_like(like_tensor: torch.Tensor,
+                           cache_name: str) -> torch.Tensor:
             return get_empty(like_tensor.shape,
                              cache_name=cache_name,
-                             dtype=like_tensor.dtype,
-                             pin_memory=pin_memory)
+                             dtype=like_tensor.dtype)

-        self.prompt_lens_cuda = get_empty((self.max_num_sequences, ),
-                                          cache_name="prompt_lens_cuda",
-                                          dtype=torch.int,
-                                          pin_memory=capture_graph)
+        self.prompt_lens_cuda = get_empty(
+            (self.max_num_sequences, ),
+            cache_name="prompt_lens_cuda",
+            dtype=torch.int,
+        )
         self.prompt_lens_cpu = torch.empty_like(
             self.prompt_lens_cuda,
             device='cpu',
             pin_memory=True,
         )
         self.kv_lens_cuda = get_empty_like(self.prompt_lens_cuda,
-                                           cache_name="kv_lens_cuda",
-                                           pin_memory=capture_graph)
+                                           cache_name="kv_lens_cuda")
         self.kv_lens = torch.empty_like(self.kv_lens_cuda,
                                         device='cpu',
                                         pin_memory=True)
@@ -687,7 +685,7 @@ def get_empty_like(like_tensor: torch.Tensor, cache_name: str,
             ],
             cache_name="kv_cache_block_offsets",
             dtype=torch.int32,
-            pin_memory=capture_graph)
+        )
         self.host_kv_cache_block_offsets = torch.empty_like(
             self.kv_cache_block_offsets,
             device='cpu',
@@ -703,22 +701,22 @@ def get_empty_like(like_tensor: torch.Tensor, cache_name: str,
             ],
             cache_name="block_ids_per_seq",
             dtype=torch.int32,
-            pin_memory=capture_graph)
+        )
         self.kv_block_ids_per_seq = get_empty(
             [
                 self.kv_cache_manager.max_batch_size,
                 self.kv_cache_manager.max_blocks_per_seq
             ],
             cache_name="kv_block_ids_per_seq",
             dtype=torch.int32,
-            pin_memory=capture_graph)
+        )
         if self.enable_context_mla_with_cached_kv:
             # for kv cache reuse/chunked context in MLA
             self.ctx_cached_token_indptr = get_empty(
                 (self.max_num_requests + 1, ),
                 cache_name="ctx_cached_token_indptr",
                 dtype=torch.int64,
-                pin_memory=capture_graph)
+            )
             self.host_ctx_cached_token_indptr = torch.zeros_like(
                 self.ctx_cached_token_indptr,
                 device='cpu',
@@ -728,17 +726,18 @@ def get_empty_like(like_tensor: torch.Tensor, cache_name: str,
                 (self.max_num_requests + 1, ),
                 cache_name="ctx_uncached_token_indptr",
                 dtype=torch.int64,
-                pin_memory=capture_graph)
+            )
             self.host_ctx_uncached_token_indptr = torch.zeros_like(
                 self.ctx_uncached_token_indptr,
                 device='cpu',
                 pin_memory=True,
             )
             # context full seqlens include cached tokens and uncached tokens
-            self.ctx_kv_indptr = get_empty((self.max_num_requests + 1, ),
-                                           cache_name="ctx_kv_indptr",
-                                           dtype=torch.int64,
-                                           pin_memory=capture_graph)
+            self.ctx_kv_indptr = get_empty(
+                (self.max_num_requests + 1, ),
+                cache_name="ctx_kv_indptr",
+                dtype=torch.int64,
+            )
             self.host_ctx_kv_indptr = torch.zeros_like(
                 self.ctx_kv_indptr,
                 device='cpu',
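
After this change the closure-based helpers pick up `capture_graph` from the enclosing scope instead of threading a `pin_memory` flag through every call site. A minimal sketch of the pattern outside the class (the free function and default size are illustrative only; it assumes a CUDA device and a pool object exposing the `get_buffer` signature shown in memory_buffer_utils.py below):

```python
import torch


def post_init_with_buffers(buffers, max_num_sequences: int = 8):
    # Whether a CUDA graph is currently being captured decides if reused
    # buffers should be reserved in the pool.
    capture_graph = torch.cuda.is_current_stream_capturing()

    def get_empty(tensor_shape, dtype, cache_name):
        # No pool provided: fall back to a plain allocation.
        if buffers is None:
            return torch.zeros(tensor_shape, device='cuda', dtype=dtype)
        # The pool decides reuse vs. new allocation; capture_graph is passed
        # through internally rather than as a per-call pin_memory argument.
        return buffers.get_buffer(tensor_shape, dtype, cache_name, capture_graph)

    def get_empty_like(like_tensor, cache_name):
        return get_empty(like_tensor.shape,
                         cache_name=cache_name,
                         dtype=like_tensor.dtype)

    prompt_lens_cuda = get_empty((max_num_sequences, ),
                                 cache_name="prompt_lens_cuda",
                                 dtype=torch.int)
    kv_lens_cuda = get_empty_like(prompt_lens_cuda, cache_name="kv_lens_cuda")
    return prompt_lens_cuda, kv_lens_cuda
```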

tensorrt_llm/_torch/memory_buffer_utils.py

Lines changed: 73 additions & 89 deletions
@@ -7,109 +7,93 @@

 @dataclass
 class BufferBlock:
+    """A container for a buffer tensor and its state."""
     buffer: torch.Tensor = None
-    pin_memory: bool = False
-
-
-# Intention to have this buffer is to reuse buffer tensors across graph and non-graph
-# situation (across layer/round).
-# When forward is under graph capturing, one stream is created and all tensors' memory
-# is associated with this stream and be kept in a graph pool. Then, all buffer memory
-# allocated during graph capture won't be released back to allocator/system.
-# Then, in non-graph mode, additional buffers are allocated which give bigger pressure
-# on memory consumption at runtime.
-# Timeline example:
-# [t0] start cudagraph capture
-# [t1] A = torch.zeros(....) -> allocate buffer A and put into graph pool
-# [t2] end cudagraph capture
-# [t3] in non-graph forward
-# [t4] A = torch.zeros(....) -> allocate buffer A in allocator but not use memory in cudagraph pool
-#      OOM may happen
-# TODO:
-# The final resolution to this problem shall be supported in pytorch that to allocate memory
-# from a give pool, it's the graph pool here.
-# It will be like
-#    try:
-#        with torch.cuda.use_mem_pool(graphpool):
-#            allocate_memory_here
-#    except exception as ex:
-#        allocate_memory_outside of graphpool
-# Need some archeteture change:
-# 1. a. set a thread local graphpool context object when cudagraphRunner start a fn
-#    b. check and get the thread local graphpool
-#    b. allocate memory
-# 2. aggregate workspaces in the same OP to be a big one in graph pool
-#    allocate memory for the big workspace and slice them into small ones.
-#    However, in non-graph mode, allocate workspace one by one
+    is_reserved: bool = False
+
+
 class Buffers:
+    """
+    Manages and reuses CUDA memory buffers to reduce allocation overhead,
+    especially when interacting with CUDA graphs.
+
+    This class maintains a pool of named buffers. When a buffer is requested,
+    it tries to find an existing, available buffer that is large enough.
+    If none is found, a new one is allocated and added to the pool. This helps
+    avoid repeated allocations, which can be slow and cause memory fragmentation,
+    particularly when the same operations are run inside and outside of a
+    CUDA graph context.
+    """

     def __init__(self):
         self.buffers: dict[str, list[BufferBlock]] = {}

-    def get_buffer(self, tensor_shape: list[int], dtype: torch.dtype,
-                   buffer_name: str, pin_memory: bool):
+    @staticmethod
+    def _view_as(buffer: torch.Tensor, target_shape: list[int],
+                 target_dtype: torch.dtype) -> torch.Tensor:
+        """Safely creates a view of a raw byte buffer with the desired shape and dtype."""
+        # The buffer is stored as uint8, so its numel is its size in bytes.
+        required_size_in_bytes = math.prod(target_shape) * target_dtype.itemsize
+        if buffer.numel() < required_size_in_bytes:
+            raise ValueError(
+                "Buffer is too small for the requested shape and dtype.")

-        def select_buffer_with_more_elements(
-                pinned_buffer: Optional[torch.Tensor],
-                runtime_buffer: Optional[torch.Tensor]
-        ) -> tuple[Optional[torch.Tensor]]:
-            if pinned_buffer is None:
-                return runtime_buffer
-            if runtime_buffer is None:
-                return pinned_buffer
+        # Slice the buffer to the exact required size, then view it with the correct type and shape.
+        return buffer[:required_size_in_bytes].view(target_dtype).view(
+            target_shape)

-            return runtime_buffer if runtime_buffer.buffer.numel(
-            ) > pinned_buffer.buffer.numel() else pinned_buffer
-
-        def view_to(buffer: torch.Tensor, dtype: torch.dtype,
-                    tensor_shape: list[int]) -> torch.Tensor:
-            return buffer[0:math.prod(tensor_shape) *
-                          dtype.itemsize].view(dtype).view(tensor_shape)
+    def get_buffer(self, tensor_shape: list[int], dtype: torch.dtype,
+                   buffer_name: str, reserve_buffer: bool):

         # all buffers are allocated with 1 byte element size
-        element_size = dtype.itemsize
-        required_memory_size = math.prod(tensor_shape) * element_size
-        candidate_buffers = self.buffers.get(buffer_name, [])
-        pinned_buffer = None
-        free_buffer = None
-        for buffer in candidate_buffers:
-            buffer_size = buffer.buffer.numel()
-            if buffer_size >= required_memory_size:
-                if buffer.pin_memory:
-                    pinned_buffer = buffer
-                else:
-                    free_buffer = buffer
-
-                if free_buffer is not None and pinned_buffer is not None:
-                    break
-
-        if pin_memory:
-            if pinned_buffer is not None:
-                return view_to(pinned_buffer.buffer, dtype, tensor_shape)
-            elif free_buffer is not None:
-                free_buffer.pin_memory = True
-                return view_to(free_buffer.buffer, dtype, tensor_shape)
-
-        if buffer_name in self.buffers:
-            candidate_buffers = self.buffers.get(buffer_name, [])
-            for buffer in list(candidate_buffers):
-                if not buffer.pin_memory:
-                    # Need to call del BufferBlock.buffer, otherwise memory isn't
-                    # released and OOM may happen.
-                    del buffer.buffer
-                    candidate_buffers.remove(buffer)
-
-        new_buffer = torch.zeros((required_memory_size, ),
-                                 device='cuda',
-                                 dtype=torch.uint8)
-        self.buffers.setdefault(buffer_name, []).append(
-            BufferBlock(buffer=new_buffer, pin_memory=pin_memory))
-        return view_to(new_buffer, dtype, tensor_shape)
+        required_memory_size = math.prod(tensor_shape) * dtype.itemsize
+        candidate_blocks = self.buffers.get(buffer_name, [])
+
+        # Find the best-fit available buffer.
+        best_fit_block: Optional[BufferBlock] = None
+        smallest_sufficient_size = float('inf')
+        for block in candidate_blocks:
+            # Skip buffers that are too small.
+            if block.buffer.numel() < required_memory_size:
+                continue
+
+            # Find the smallest buffer that is still large enough (best-fit).
+            if block.buffer.numel() < smallest_sufficient_size:
+                # Use reserved block if find one.
+                if best_fit_block is not None and best_fit_block.is_reserved and not block.is_reserved:
+                    continue
+
+                best_fit_block = block
+                smallest_sufficient_size = block.buffer.numel()
+
+        if reserve_buffer and best_fit_block is not None:
+            # A suitable buffer was found, so reuse it.
+            best_fit_block.is_reserved = True
+            return self._view_as(best_fit_block.buffer, tensor_shape, dtype)
+
+        for block in list(candidate_blocks):
+            if not block.is_reserved:
+                # Need to call del BufferBlock.buffer, otherwise memory isn't
+                # released and OOM may happen.
+                del block.buffer
+                candidate_blocks.remove(block)
+
+        # No suitable buffer was found, so allocate a new one.
+        # The new buffer is created with uint8 to represent raw bytes.
+        new_buffer_tensor = torch.zeros((required_memory_size, ),
+                                        device='cuda',
+                                        dtype=torch.uint8)
+        new_block = BufferBlock(buffer=new_buffer_tensor,
+                                is_reserved=reserve_buffer)
+
+        # Add the new buffer to the pool for this name.
+        self.buffers.setdefault(buffer_name, []).append(new_block)
+        return self._view_as(new_block.buffer, tensor_shape, dtype)


 _buffer = Buffers()


-def get_memory_buffer():
+def get_memory_buffers():
     global _buffer
     return _buffer
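
The reworked pool replaces the `pin_memory` flag with an explicit `reserve_buffer` argument and a best-fit search over raw uint8 blocks: when `reserve_buffer` is True a large-enough block is reused and marked reserved; when it is False, unreserved blocks under that name are released and a fresh block is allocated, while reserved (graph-owned) blocks stay in the pool. A short usage sketch against the renamed accessor (the shape and buffer name are illustrative; it assumes a CUDA device):

```python
import torch
from tensorrt_llm._torch.memory_buffer_utils import get_memory_buffers

pool = get_memory_buffers()

# Request a typed view of a pooled raw-byte block. Passing the capture
# state as reserve_buffer keeps graph-captured workspaces reserved so
# their addresses remain valid across graph replays.
capturing = torch.cuda.is_current_stream_capturing()
workspace = pool.get_buffer((1024, 512),
                            dtype=torch.bfloat16,
                            buffer_name='workspace_0',
                            reserve_buffer=capturing)

# The result is a bf16 view into a uint8 block of at least
# 1024 * 512 * 2 bytes.
assert workspace.shape == (1024, 512) and workspace.dtype == torch.bfloat16
```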

tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py

Lines changed: 5 additions & 5 deletions
@@ -9,7 +9,7 @@
 from tensorrt_llm._utils import nvtx_range

 from ...distributed import allgather
-from ...memory_buffer_utils import get_memory_buffer
+from ...memory_buffer_utils import get_memory_buffers
 from ...model_config import ModelConfig
 from ...utils import AuxStreamType, EventType, Fp4QuantizedTensor
 from .fused_moe_cutlass import CutlassFusedMoE
@@ -367,7 +367,7 @@ class DeepGemmFusedMoE(CutlassFusedMoE):
     """

     # To reuse pytorch memory segments allocated during graph capture.
-    buffers = get_memory_buffer()
+    buffers = get_memory_buffers()

     def __init__(
         self,
@@ -425,12 +425,12 @@ def get_workspace(self, m_max: int, group_size: int):
             (num_experts * m_max * fp8_dim, ),
             dtype=torch.float8_e4m3fn,
             buffer_name='workspace_0',
-            pin_memory=capture_graph)
+            reserve_buffer=capture_graph)
         workspace_1 = DeepGemmFusedMoE.buffers.get_buffer(
             (num_experts * m_max * max(intermediate_size * 2, hidden_size), ),
             dtype=torch.bfloat16,
             buffer_name='workspace_1',
-            pin_memory=capture_graph)
+            reserve_buffer=capture_graph)

         # create workspace for scaling factors
         m_padded = fp8_utils.align(m_max, 4)
@@ -441,7 +441,7 @@ def get_workspace(self, m_max: int, group_size: int):
             (num_experts * (scale_k_padded // 4) * m_padded, ),
             dtype=torch.int32,
             buffer_name='workspace_sf',
-            pin_memory=capture_graph)
+            reserve_buffer=capture_graph)

         workspace = {
             "workspace_0": workspace_0,

tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py

Lines changed: 2 additions & 2 deletions
@@ -7,7 +7,7 @@

 from ...inputs.multimodal import MultimodalParams
 from ..expert_statistic import ExpertStatistic
-from ..memory_buffer_utils import get_memory_buffer
+from ..memory_buffer_utils import get_memory_buffers
 from ..modules.multi_stream_utils import with_multi_stream
 from ..speculative.eagle3 import Eagle3ResourceManager
 from ..utils import make_weak_ref, piecewise_cuda_graph
@@ -54,7 +54,7 @@ def __init__(self, engine: "PyTorchModelEngine"):
         self.shared_static_tensors: Dict[str, torch.Tensor] = {}
         if self.enabled:
             self._create_shared_static_tensors()
-        self.cuda_graph_meta_buffers = get_memory_buffer()
+        self.cuda_graph_meta_buffers = get_memory_buffers()

     def _create_shared_static_tensors(self):
         """Allocates static tensors sized for the largest possible batch."""
