-
Notifications
You must be signed in to change notification settings - Fork 237
Expand file tree
/
Copy pathtext.py
More file actions
185 lines (141 loc) · 5.26 KB
/
text.py
File metadata and controls
185 lines (141 loc) · 5.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
from typing import Any, Optional, Type, TypeVar, Union
from pydantic import Field
from docarray.base_doc import BaseDoc
from docarray.typing import TextUrl
from docarray.typing.tensor.embedding import AnyEmbedding
from docarray.utils._internal.pydantic import is_pydantic_v2
# `model_validator` is imported only when `is_pydantic_v2` is truthy; it is
# used by the pydantic-v2 branch of `TextDoc` below.
if is_pydantic_v2:
    from pydantic import model_validator

# Type variable bound to `TextDoc`, used to annotate `TextDoc.validate`.
T = TypeVar('T', bound='TextDoc')
class TextDoc(BaseDoc):
    """
    Document for handling text.

    It can contain:

    - a [`TextUrl`][docarray.typing.url.TextUrl] (`TextDoc.url`)
    - a `str` (`TextDoc.text`)
    - an [`AnyEmbedding`](../../../api_references/typing/tensor/embedding) (`TextDoc.embedding`)
    - a `bytes` object (`TextDoc.bytes_`)

    You can use this Document directly:

    ```python
    from docarray.documents import TextDoc

    # use it directly
    txt_doc = TextDoc(url='https://www.gutenberg.org/files/1065/1065-0.txt')
    txt_doc.text = txt_doc.url.load()
    # model = MyEmbeddingModel()
    # txt_doc.embedding = model(txt_doc.text)
    ```

    You can initialize directly from a string:

    ```python
    from docarray.documents import TextDoc

    txt_doc = TextDoc('hello world')
    ```

    You can extend this Document:

    ```python
    from docarray.documents import TextDoc
    from docarray.typing import AnyEmbedding
    from typing import Optional


    # extend it
    class MyText(TextDoc):
        second_embedding: Optional[AnyEmbedding] = None


    txt_doc = MyText(url='https://www.gutenberg.org/files/1065/1065-0.txt')
    txt_doc.text = txt_doc.url.load()
    # model = MyEmbeddingModel()
    # txt_doc.embedding = model(txt_doc.text)
    # txt_doc.second_embedding = model(txt_doc.text)
    ```

    You can use this Document for composition:

    ```python
    from docarray import BaseDoc
    from docarray.documents import ImageDoc, TextDoc


    # compose it
    class MultiModalDoc(BaseDoc):
        image_doc: ImageDoc
        text_doc: TextDoc


    mmdoc = MultiModalDoc(
        image_doc=ImageDoc(
            url='https://github.com/docarray/docarray/blob/main/tests/toydata/image-data/apple.png?raw=true'
        ),
        text_doc=TextDoc(text='hello world, how are you doing?'),
    )
    mmdoc.image_doc.tensor = mmdoc.image_doc.url.load()

    # or
    mmdoc.image_doc.bytes_ = mmdoc.image_doc.url.load_bytes()
    mmdoc.image_doc.tensor = mmdoc.image_doc.bytes_.load()
    ```

    This Document can be compared against another Document of the same type or a string.
    When compared against another object of the same type, the pydantic BaseModel
    equality check will apply which checks the equality of every attribute,
    excluding `id`. When compared against a str, it will check the equality
    of the `text` attribute against the given string.

    ```python
    from docarray.documents import TextDoc

    doc = TextDoc(text='This is the main text', url='exampleurl.com/file')
    doc2 = TextDoc(text='This is the main text', url='exampleurl.com/file')

    doc == 'This is the main text'  # True
    doc == doc2  # True
    ```
    """

    text: Optional[str] = Field(
        description='The text content stored in the document',
        example='This is an example text content of the document',
        default=None,
    )
    url: Optional[TextUrl] = Field(
        description='URL to a (potentially remote) text file that can be loaded',
        example='https://www.w3.org/History/19921103-hypertext/hypertext/README.html',
        default=None,
    )
    embedding: Optional[AnyEmbedding] = Field(
        description='Store an embedding: a vector representation of the text',
        example=[1, 0, 1],
        default=None,
    )
    bytes_: Optional[bytes] = Field(
        description='Bytes representation of the text',
        default=None,
    )

    def __init__(self, text: Optional[str] = None, **kwargs: Any) -> None:
        """
        Create a `TextDoc`, allowing `text` to be given positionally.

        :param text: The text content; may be passed as the first positional
            argument (`TextDoc('hello')`) or by keyword (`TextDoc(text='hello')`).
        :param kwargs: Remaining field values, forwarded to the parent initializer.
        """
        # `text` is a named parameter, so it can never also appear in `kwargs`;
        # it is always forwarded explicitly (even when it is `None`).
        super().__init__(text=text, **kwargs)

    if is_pydantic_v2:

        @model_validator(mode='before')
        @classmethod
        def validate_model_before(cls, values: Any) -> Any:
            """
            Turn a bare string into field data before model validation.

            :param values: The raw input handed to the model validator.
            :return: `{'text': values}` if `values` is a `str`, otherwise
                `values` unchanged.
            """
            if isinstance(values, str):
                return {'text': values}
            return values

    else:

        @classmethod
        def validate(
            cls: Type[T],
            value: Union[str, Any],
        ) -> T:
            """
            Validate `value`, first wrapping a bare string into a document.

            :param value: A `str` (used as the `text` of a new instance) or any
                other value, which is passed on as-is.
            :return: The result of the parent class' `validate`.
            """
            if isinstance(value, str):
                value = cls(text=value)
            return super().validate(value)

    def __eq__(self, other: Any) -> bool:
        """
        Compare this document with a string or with another object.

        :param other: A `str`, compared against the `text` attribute, or any
            other object, compared using the parent class' equality.
        :return: Whether the two are considered equal.
        """
        if isinstance(other, str):
            return self.text == other
        # BaseModel has a default equality
        return super().__eq__(other)

    def __contains__(self, item: str) -> bool:
        """
        This method makes `TextDoc` behave the same as an `str`.

        :param item: A string to be checked if is a substring of `text` attribute
        :return: A boolean determining the presence of `item` as a substring in `text`

        ```python
        from docarray.documents import TextDoc

        t = TextDoc(text='this is my text document')
        assert 'text' in t
        assert 'docarray' not in t
        ```
        """
        # A document without text contains nothing.
        if self.text is None:
            return False
        return item in self.text

    def _get_string_for_regex_filter(self) -> Optional[str]:
        """
        Return the `text` attribute.

        :return: The value of `text`, which may be `None`.
        """
        return self.text