Skip to content

Commit 6b32b40

Browse files
committed
refresh 0.5.0 release branch with the latest revision
1 parent dcd773e commit 6b32b40

File tree

83 files changed

+1035
-442
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

83 files changed

+1035
-442
lines changed

README.md

Lines changed: 209 additions & 231 deletions
Large diffs are not rendered by default.

benchmarks/cpp/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@ multiple GPUs or multiple nodes with multiple GPUs.
99

1010
Please follow the [`installation document`](../../../README.md) to build TensorRT-LLM.
1111

12+
Windows users: Follow the
13+
[`Windows installation document`](../../../windows/README.md)
14+
instead, and be sure to set DLL paths as specified in
15+
[Extra Steps for C++ Runtime Usage](../../../windows/README.md#extra-steps-for-c-runtime-usage).
16+
1217
After that, you can build benchmarking source code for C++ runtime
1318
```
1419
cd cpp/build

benchmarks/python/gpt_benchmark.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -394,9 +394,6 @@ def build(self):
394394
network.plugin_config.set_smooth_quant_gemm_plugin(dtype=self.dtype)
395395
network.plugin_config.set_layernorm_quantization_plugin(
396396
dtype=self.dtype)
397-
# FIXME(nkorobov)
398-
# See https://nvbugs/4164762
399-
# See https://nvbugs/4174113
400397
network.plugin_config.set_quantize_tensor_plugin()
401398
network.plugin_config.set_quantize_per_token_plugin()
402399
elif self.use_weight_only:

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,6 @@ class LlmRequest
126126
{
127127
// TODO: For beamWidth > 1, we would need to support swapping to avoid
128128
// recomputing from the start
129-
// See https://jirasw.nvidia.com/browse/TRT-21715
130129
// As a temporary solution, we currently reset the tokens to the prompt
131130
if (mSamplingConfig.beamWidth > 1)
132131
{

cpp/include/tensorrt_llm/runtime/ipcUtils.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ class IpcMemory
3434

3535
size_t static constexpr FLAGS_SIZE = kernels::MAX_ALL_REDUCE_BLOCKS * sizeof(uint32_t);
3636

37-
IpcMemory(WorldConfig worldConfig, SizeType bufferSize);
37+
IpcMemory(WorldConfig worldConfig, std::size_t bufferSize);
3838
~IpcMemory();
3939

4040
[[nodiscard]] const std::vector<void*>& getCommPtrsTensor() const
@@ -48,7 +48,7 @@ class IpcMemory
4848

4949
WorldConfig mWorldConfig;
5050
std::vector<void*> mCommPtrs;
51-
SizeType mBufferSize;
51+
std::size_t mBufferSize;
5252
void* mBufferPtr;
5353
};
5454

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:74a8f58055a5772fa5ef65cbe96895a98e9d33431a618ec2eefab8d417b1d3b1
3-
size 1489782
2+
oid sha256:179091b7779a2a9ae047be54b65da9f45c66073a9ee34e889a00332ef13d7f39
3+
size 1489814
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
version https://git-lfs.github.com/spec/v1
2-
oid sha256:e41d1e4eb3c7c8ae14ca81f76a857a2adc56923df3206927ef9453457f5a8915
3-
size 1259888
2+
oid sha256:4c00eb2685b3224ea2df468c533a443c9a2a71d99ffa9125a792f735d9166544
3+
size 1257346

cpp/tensorrt_llm/common/tensor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ class Tensor
215215
MemoryType where;
216216
DataType type;
217217
std::vector<size_t> shape;
218-
void const* data; // TODO(bhseuh) modify from const void* to void* const
218+
void const* data; // TODO modify from const void* to void* const
219219

220220
Tensor();
221221
Tensor(MemoryType _where, DataType _type, std::vector<size_t> const& _shape, void const* _data);

cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@ namespace kernels
2727
enum Data_type
2828
{
2929
DATA_TYPE_BOOL,
30-
DATA_TYPE_E8M10,
31-
DATA_TYPE_E8M7,
3230
DATA_TYPE_FP16,
3331
DATA_TYPE_FP32,
3432
DATA_TYPE_INT4,

cpp/tensorrt_llm/kernels/cutlass_kernels/int8_gemm/int8_gemm.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ namespace cutlass_kernels
3434
This runner supports:
3535
int8_t inputs (A and B)
3636
float alpha scalings (either per-col, or per-col x per-row)
37-
T output (D) where T = {float, half, __nv_bfloat16} // TODO(mseznec)
37+
T output (D) where T = {float, half, __nv_bfloat16} // TODO
3838
3939
Activations, biases, scales and outputs are all assumed to be row-major.
4040
Weights are assumed to be column-major.

0 commit comments

Comments (0)