Commit e821eca

bbeckca and gemini-code-assist[bot] authored and committed

Migrate MiniCPMOAudioInputs to TensorSchema (vllm-project#21847)

Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 6952c15 commit e821eca

File tree: 1 file changed, +35 / -17 lines changed


vllm/model_executor/models/minicpmo.py

Lines changed: 35 additions & 17 deletions
@@ -24,7 +24,7 @@
 # limitations under the License.
 """Inference-only MiniCPM-O model compatible with HuggingFace weights."""
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Callable, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Callable, Literal, Optional, Union
 
 import torch
 from torch import nn
@@ -49,6 +49,7 @@
                                     MultiModalDataParser)
 from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
                                         PromptUpdateDetails)
+from vllm.utils.tensor_schema import TensorSchema, TensorShape
 
 from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
                        MiniCPMVDummyInputsBuilder,
@@ -61,35 +62,52 @@
 CPU_DEVICE = torch.device("cpu")
 
 
-class MiniCPMOAudioFeatureInputs(TypedDict):
-    type: Literal["audio_features"]
-    audio_features: Union[torch.Tensor, list[torch.Tensor]]
+class MiniCPMOAudioFeatureInputs(TensorSchema):
     """
-    Shape: `(batch_size * num_audios * num_slices, num_channels, length)`
-    Slice here means chunk. Audio that is too long will be split into slices,
-    which is the same as image.
-    Padding is used therefore `audio_features` is `torch.Tensor`.
+    Dimensions:
+        - bns: Batch size * number of audios * number of slices
+        - bn: Batch size * number of audios
+        - c: Number of channels
+        - l: Length
+        - s: Number of slices
     """
+    type: Literal["audio_features"] = "audio_features"
 
-    audio_feature_lens: Union[torch.Tensor, list[torch.Tensor]]
+    audio_features: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("bns", "c", "l", dynamic_dims={"l"}),
+    ]
+    """
+    Slice here means chunk. Audio that is too long will be split into slices,
+    which is the same as image. Padding is used therefore `audio_features` is
+    `torch.Tensor`.
     """
-    Shape: `(batch_size * num_audios, num_slices)`
 
+    audio_feature_lens: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("bn", "s"),
+    ]
+    """
     This should be feature length of each audio slice,
     which equals to `audio_features.shape[-1]`
     """
 
 
-class MiniCPMOAudioEmbeddingInputs(TypedDict):
-    type: Literal["audio_embeds"]
-    audio_embeds: Union[torch.Tensor, list[torch.Tensor]]
+class MiniCPMOAudioEmbeddingInputs(TensorSchema):
     """
-    Shape: `(batch_size * num_audios, num_slices, hidden_size)`
-
-    `hidden_size` must match the hidden size of language model backbone.
-    instead of a batched tensor.
+    Dimensions:
+        - bn: Batch size * number of audios
+        - s: Number of slices
+        - h: Hidden size (must match language model backbone)
+
     Length of each slice may vary, so pass it as a list.
     """
+    type: Literal["audio_embeds"] = "audio_embeds"
+
+    audio_embeds: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("bn", "s", "h", dynamic_dims={"s"}),
+    ]
 
 
 MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs,
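
For readers unfamiliar with the schema classes, the sketch below shows roughly how the migrated MiniCPMOAudioFeatureInputs could be constructed. It assumes that TensorSchema subclasses take their fields as keyword arguments and check the annotated TensorShape dimensions at construction time (that behavior is not shown in this diff), and every tensor size in it is made up purely for illustration.

    # Hedged usage sketch: assumes TensorSchema subclasses accept their fields
    # as keyword arguments and validate the TensorShape annotations when
    # instantiated. All sizes below are illustrative, not from a real config.
    import torch

    from vllm.model_executor.models.minicpmo import MiniCPMOAudioFeatureInputs

    audio_inputs = MiniCPMOAudioFeatureInputs(
        # ("bns", "c", "l"): 2 slices in total, 128 channels, padded length 3000.
        audio_features=torch.randn(2, 128, 3000),
        # ("bn", "s"): 1 audio split into 2 slices; unpadded length per slice.
        audio_feature_lens=torch.tensor([[3000, 1500]]),
    )

Compared with the old TypedDict, which carried the expected shapes only in docstrings, the TensorShape metadata lets a mis-shaped input be reported at this boundary rather than surfacing later inside the audio tower.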
