@@ -24,7 +24,7 @@
 # limitations under the License.
 """Inference-only MiniCPM-O model compatible with HuggingFace weights."""
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Any, Callable, Literal, Optional, TypedDict, Union
+from typing import Annotated, Any, Callable, Literal, Optional, Union

 import torch
 from torch import nn
@@ -49,6 +49,7 @@
                                    MultiModalDataParser)
 from vllm.multimodal.processing import (PromptReplacement, PromptUpdate,
                                         PromptUpdateDetails)
+from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .minicpmv import (_MAX_FRAMES_PER_VIDEO, MiniCPMV2_6,
                        MiniCPMVDummyInputsBuilder,
@@ -61,35 +62,52 @@
 CPU_DEVICE = torch.device("cpu")


-class MiniCPMOAudioFeatureInputs(TypedDict):
-    type: Literal["audio_features"]
-    audio_features: Union[torch.Tensor, list[torch.Tensor]]
+class MiniCPMOAudioFeatureInputs(TensorSchema):
     """
-    Shape: `(batch_size * num_audios * num_slices, num_channels, length)`
-    Slice here means chunk. Audio that is too long will be split into slices,
-    which is the same as image.
-    Padding is used therefore `audio_features` is `torch.Tensor`.
+    Dimensions:
+        - bns: Batch size * number of audios * number of slices
+        - bn: Batch size * number of audios
+        - c: Number of channels
+        - l: Length
+        - s: Number of slices
     """
+    type: Literal["audio_features"] = "audio_features"

-    audio_feature_lens: Union[torch.Tensor, list[torch.Tensor]]
+    audio_features: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("bns", "c", "l", dynamic_dims={"l"}),
+    ]
+    """
+    Slice here means chunk. Audio that is too long will be split into slices,
+    which is the same as image. Padding is used therefore `audio_features` is
+    `torch.Tensor`.
     """
-    Shape: `(batch_size * num_audios, num_slices)`

+    audio_feature_lens: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("bn", "s"),
+    ]
+    """
     This should be feature length of each audio slice,
     which equals to `audio_features.shape[-1]`
     """


-class MiniCPMOAudioEmbeddingInputs(TypedDict):
-    type: Literal["audio_embeds"]
-    audio_embeds: Union[torch.Tensor, list[torch.Tensor]]
+class MiniCPMOAudioEmbeddingInputs(TensorSchema):
     """
-    Shape: `(batch_size * num_audios, num_slices, hidden_size)`
-
-    `hidden_size` must match the hidden size of language model backbone.
-    instead of a batched tensor.
+    Dimensions:
+        - bn: Batch size * number of audios
+        - s: Number of slices
+        - h: Hidden size (must match language model backbone)
+
     Length of each slice may vary, so pass it as a list.
     """
+    type: Literal["audio_embeds"] = "audio_embeds"
+
+    audio_embeds: Annotated[
+        Union[torch.Tensor, list[torch.Tensor]],
+        TensorShape("bn", "s", "h", dynamic_dims={"s"}),
+    ]


 MiniCPMOAudioInputs = Union[MiniCPMOAudioFeatureInputs,
|
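For reviewers less familiar with `TensorSchema`, here is a minimal sketch of how the new `MiniCPMOAudioFeatureInputs` might be constructed. It assumes the schema accepts its declared fields as keyword arguments and validates each value against its `TensorShape` annotation at construction time, with `dynamic_dims={"l"}` letting the padded length dimension vary; the slice count and the 80-channel / 3000-frame feature sizes are illustrative values, not taken from this PR.

```python
import torch

# Illustrative sizes only: one audio split into 4 slices,
# 80 feature channels, padded to 3000 frames per slice.
bn, s, c, max_len = 1, 4, 80, 3000

audio_inputs = MiniCPMOAudioFeatureInputs(
    type="audio_features",
    # ("bns", "c", "l"): one padded feature tensor covering every slice
    audio_features=torch.zeros(bn * s, c, max_len),
    # ("bn", "s"): unpadded feature length of each slice, per audio
    audio_feature_lens=torch.tensor([[2870, 3000, 1512, 640]]),
)
```

When the slices are not padded to a common length, a `list[torch.Tensor]` with one entry per slice should also satisfy the annotation, which appears to be why the `Union` with `list[torch.Tensor]` is kept.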