from typing import TYPE_CHECKING, Any, Dict, Optional, Type, TypeVar, Union

import numpy as np
from pydantic import Field

from docarray.base_doc import BaseDoc
from docarray.typing import AnyEmbedding, AudioUrl
from docarray.typing.bytes.audio_bytes import AudioBytes
from docarray.typing.tensor.abstract_tensor import AbstractTensor
from docarray.typing.tensor.audio.audio_tensor import AudioTensor
from docarray.utils._internal.misc import import_library
from docarray.utils._internal.pydantic import is_pydantic_v2

if is_pydantic_v2:
    from pydantic import model_validator
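
# `torch` and `tensorflow` are optional dependencies: under `TYPE_CHECKING` they
# are imported only for static type hints, while at runtime `import_library`
# returns the module when it is installed and `None` otherwise
# (`raise_error=False`).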
if TYPE_CHECKING:
    import tensorflow as tf  # type: ignore
    import torch
else:
    torch = import_library('torch', raise_error=False)
    tf = import_library('tensorflow', raise_error=False)

T = TypeVar('T', bound='AudioDoc')


class AudioDoc(BaseDoc):
    """
    Document for handling audio.

    The Audio Document can contain:

    - an [`AudioUrl`][docarray.typing.url.AudioUrl] (`AudioDoc.url`)
    - an [`AudioTensor`](../../../api_references/typing/tensor/audio) (`AudioDoc.tensor`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`AudioDoc.embedding`)
    - an [`AudioBytes`][docarray.typing.bytes.AudioBytes] (`AudioDoc.bytes_`) object
    - an integer representing the frame rate (`AudioDoc.frame_rate`)

    You can use this Document directly:

    ```python
    from docarray.documents import AudioDoc

    # use it directly
    audio = AudioDoc(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can extend this Document:

    ```python
    from typing import Optional

    from docarray.documents import AudioDoc, TextDoc


    # extend it
    class MyAudio(AudioDoc):
        name: Optional[TextDoc] = None


    audio = MyAudio(
        url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
    )
    audio.name = TextDoc(text='my first audio')
    audio.tensor, audio.frame_rate = audio.url.load()
    # model = MyEmbeddingModel()
    # audio.embedding = model(audio.tensor)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import AudioDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        audio: AudioDoc
        text: TextDoc


    mmdoc = MultiModalDoc(
        audio=AudioDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
        ),
        text=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.url.load()

    # equivalent to
    mmdoc.audio.bytes_ = mmdoc.audio.url.load_bytes()
    mmdoc.audio.tensor, mmdoc.audio.frame_rate = mmdoc.audio.bytes_.load()
    ```
    """
    url: Optional[AudioUrl] = Field(
        description='The URL of a (potentially remote) audio file that can be loaded',
        example='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.mp3?raw=true',
        default=None,
    )
    tensor: Optional[AudioTensor] = Field(
        description='Tensor object of the audio, which can be one of `AudioNdArray`, `AudioTorchTensor`, or `AudioTensorFlowTensor`.',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the audio.',
        example=[0, 1, 0],
        default=None,
    )
    bytes_: Optional[AudioBytes] = Field(
        description='Bytes representation of the audio.',
        default=None,
    )
    frame_rate: Optional[int] = Field(
        description='An integer representing the frame rate of the audio.',
        example=24,
        default=None,
    )

    @classmethod
    def _validate(cls, value) -> Dict[str, Any]:
        # Coerce shorthand inputs: a plain string becomes the `url` field, and a
        # raw tensor (ndarray / torch / tensorflow) becomes the `tensor` field.
        if isinstance(value, str):
            value = dict(url=value)
        elif isinstance(value, (AbstractTensor, np.ndarray)) or (
            (torch is not None and isinstance(value, torch.Tensor))
            or (tf is not None and isinstance(value, tf.Tensor))
        ):
            value = dict(tensor=value)
        return value
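
    # Pydantic v2 hooks the coercion in via `model_validator(mode='before')`;
    # under v1 the classic `validate` classmethod is overridden instead. Both
    # paths delegate to the shared `_validate` above.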
    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, value):
            return cls._validate(value)

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, AbstractTensor, Any],
        ) -> T:
            return super().validate(cls._validate(value))
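

# A minimal usage sketch (not part of the module above): thanks to the
# validators, an `AudioDoc`-typed field can be populated straight from a URL
# string or a raw tensor. `Clip` is just an illustrative name, and the URL is
# the toy file from the class docstring. Kept commented out so importing this
# file stays side-effect free.
#
#     import numpy as np
#     from docarray import BaseDoc
#     from docarray.documents import AudioDoc
#
#     class Clip(BaseDoc):
#         audio: AudioDoc
#
#     # a str is coerced to `AudioDoc(url=...)`
#     clip = Clip(
#         audio='https://github.com/docarray/docarray/blob/main/tests/toydata/hello.wav?raw=true'
#     )
#     assert clip.audio.url is not None
#
#     # a raw ndarray is coerced to `AudioDoc(tensor=...)`
#     clip = Clip(audio=np.zeros(1000, dtype=np.float32))
#     assert clip.audio.tensor is not None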