-
Notifications
You must be signed in to change notification settings - Fork 244
Expand file tree
/
Copy pathtext.py
More file actions
148 lines (102 loc) · 4.07 KB
/
Copy pathtext.py
File metadata and controls
148 lines (102 loc) · 4.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from typing import Any, Optional, Type, TypeVar, Union
from docarray.base_doc import BaseDoc
from docarray.typing import TextUrl
from docarray.typing.tensor.embedding import AnyEmbedding
T = TypeVar('T', bound='TextDoc')
class TextDoc(BaseDoc):
"""
Document for handling text.
It can contain a TextUrl (`TextDoc.url`), a str (`TextDoc.text`),
and an AnyEmbedding (`TextDoc.embedding`).
EXAMPLE USAGE:
You can use this Document directly:
.. code-block:: python
from docarray.documents import TextDoc
# use it directly
txt_doc = Text(url='http://www.jina.ai/')
txt_doc.text = txt_doc.url.load()
model = MyEmbeddingModel()
txt_doc.embedding = model(txt_doc.text)
You can initialize directly from a string:
.. code-block:: python
from docarray.documents import TextDoc
txt_doc = Text('hello world')
You can extend this Document:
.. code-block:: python
from docarray.documents import TextDoc
from docarray.typing import AnyEmbedding
from typing import Optional
# extend it
class MyText(Text):
second_embedding: Optional[AnyEmbedding]
txt_doc = MyText(url='http://www.jina.ai/')
txt_doc.text = txt_doc.url.load()
model = MyEmbeddingModel()
txt_doc.embedding = model(txt_doc.text)
txt_doc.second_embedding = model(txt_doc.text)
You can use this Document for composition:
.. code-block:: python
from docarray import BaseDoc
from docarray.documents import ImageDoc, TextDoc
# compose it
class MultiModalDoc(BaseDoc):
image_doc: Image
text_doc: Text
mmdoc = MultiModalDoc(
image_doc=Image(url="http://www.jina.ai/image.jpg"),
text_doc=Text(text="hello world, how are you doing?"),
)
mmdoc.text_doc.text = mmdoc.text_doc.url.load()
# or
mmdoc.text_doc.bytes_ = mmdoc.text_doc.url.load_bytes()
This Document can be compared against another Document of the same type or a string.
When compared against another object of the same type, the pydantic BaseModel
equality check will apply which checks the equality of every attribute,
including `id`. When compared against a str, it will check the equality
of the `text` attribute against the given string.
.. code-block:: python
from docarray.documents import TextDoc
doc = Text(text='This is the main text', url='exampleurl.com')
doc2 = Text(text='This is the main text', url='exampleurl.com')
doc == 'This is the main text' # True
doc == doc2 # False, their ids are not equivalent
"""
text: Optional[str]
url: Optional[TextUrl]
embedding: Optional[AnyEmbedding]
bytes_: Optional[bytes]
def __init__(self, text: Optional[str] = None, **kwargs):
if 'text' not in kwargs:
kwargs['text'] = text
super().__init__(**kwargs)
@classmethod
def validate(
cls: Type[T],
value: Union[str, Any],
) -> T:
if isinstance(value, str):
value = cls(text=value)
return super().validate(value)
def __eq__(self, other: Any) -> bool:
if isinstance(other, str):
return self.text == other
else:
# BaseModel has a default equality
return super().__eq__(other)
def __contains__(self, item: str) -> bool:
"""
This method makes `Text` behave the same as an `str`.
.. code-block:: python
from docarray.documents import Text
t = Text(text='this is my text document')
assert 'text' in t
assert 'docarray' not in t
:param item: A string to be checked if is a substring of `text` attribute
:return: A boolean determining the presence of `item` as a substring in `text`
"""
if self.text is not None:
return self.text.__contains__(item)
else:
return False
def _get_string_for_regex_filter(self):
return self.text