Skip to content

Commit 6b32b40

Browse files
committed
refresh 0.5.0 release branch with the latest revision
1 parent dcd773e commit 6b32b40

File tree

83 files changed

+1035
-442
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

83 files changed

+1035
-442
lines changed

README.md

Lines changed: 209 additions & 231 deletions
Large diffs are not rendered by default.

benchmarks/cpp/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ multiple GPUs or multiple nodes with multiple GPUs.
99

1010
Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.
1111

12+
Windows users: Follow the
13+
[`Windows installation document`](../../../windows/README.md)
14+
instead, and be sure to set DLL paths as specified in
15+
[Extra Steps for C++ Runtime Usage](../../../windows/README.md#extra-steps-for-c-runtime-usage).
16+
1217
After that, you can build benchmarking source code for C++ runtime
1318
```
1419
cd cpp/build

benchmarks/python/gpt_benchmark.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,9 +394,6 @@ def build(self):
394394
network.plugin_config.set_smooth_quant_gemm_plugin(dtype=self.dtype)
395395
network.plugin_config.set_layernorm_quantization_plugin(
396396
dtype=self.dtype)
397-
# FIXME(nkorobov)
398-
# See https://nvbugs/4164762
399-
# See https://nvbugs/4174113
400397
network.plugin_config.set_quantize_tensor_plugin()
401398
network.plugin_config.set_quantize_per_token_plugin()
402399
elif self.use_weight_only:

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,6 @@ class LlmRequest
126126
{
127127
// TODO: For beamWidth > 1, we would need to support swapping to avoid
128128
// recomputing from the start
129-
// See https://jirasw.nvidia.com/browse/TRT-21715
130129
// As a temporary solution, we currently reset the tokens to the prompt
131130
if (mSamplingConfig.beamWidth > 1)
132131
{

cpp/include/tensorrt_llm/runtime/ipcUtils.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class IpcMemory
3434

3535
size_t static constexpr FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t);
3636

37-
IpcMemory(WorldConfig worldConfig, SizeType bufferSize);
37+
IpcMemory(WorldConfig worldConfig, std::size_t bufferSize);
3838
~IpcMemory();
3939

4040
[[nodiscard]] const std::vector<void*>& getCommPtrsTensor() const
@@ -48,7 +48,7 @@ class IpcMemory
4848

4949
WorldConfig mWorldConfig;
5050
std::vector<void*> mCommPtrs;
51-
SizeType mBufferSize;
51+
std::size_t mBufferSize;
5252
void* mBufferPtr;
5353
};
5454

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:74a8f58055a5772fa5ef65cbe96895a98e9d33431a618ec2eefab8d417b1d3b1
3-
size 1489782
2+
oid sha256:179091b7779a2a9ae047be54b65da9f45c66073a9ee34e889a00332ef13d7f39
3+
size 1489814
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:e41d1e4eb3c7c8ae14ca81f76a857a2adc56923df3206927ef9453457f5a8915
3-
size 1259888
2+
oid sha256:4c00eb2685b3224ea2df468c533a443c9a2a71d99ffa9125a792f735d9166544
3+
size 1257346

cpp/tensorrt_llm/common/tensor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ class Tensor
215215
MemoryType where;
216216
DataType type;
217217
std::vector<size_t> shape;
218-
void const* data; // TODO(bhseuh) modify from const void* to void* const
218+
void const* data; // TODO modify from const void* to void* const
219219

220220
Tensor();
221221
Tensor(MemoryType _where, DataType _type, std::vector<size_t> const& _shape, void const* _data);

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ namespace kernels
2727
enum Data_type
2828
{
2929
DATA_TYPE_BOOL,
30-
DATA_TYPE_E8M10,
31-
DATA_TYPE_E8M7,
3230
DATA_TYPE_FP16,
3331
DATA_TYPE_FP32,
3432
DATA_TYPE_INT4,

cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ namespace cutlass_kernels
3434
This runner supports:
3535
int8_t inputs (A and B)
3636
float alpha scalings (either per-col, or per-col x per-row)
37-
T output (D) where T = {float, half, __nv_bfloat16} // TODO(mseznec)
37+
T output (D) where T = {float, half, __nv_bfloat16} // TODO
3838
3939
Activations, biases, scales and outputs are all assumed to be row-major.
4040
Weights are assumed to be column-major.

0 commit comments

Comments (0)