forked from docarray/docarray
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata.py
More file actions
152 lines (136 loc) · 5.4 KB
/
data.py
File metadata and controls
152 lines (136 loc) · 5.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import mimetypes
import uuid
from collections import defaultdict
from dataclasses import dataclass, field, fields
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
if TYPE_CHECKING:
from ..score import NamedScore
from .. import DocumentArray, Document
from ..types import ArrayType, StructValueType, DocumentContentType
default_values = dict(
granularity=0,
adjacency=0,
parent_id='',
buffer=b'',
text='',
weight=0.0,
uri='',
mime_type='',
tags=dict,
offset=0.0,
location=list,
modality='',
evaluations='Dict[str, NamedScore]',
scores='Dict[str, NamedScore]',
chunks='ChunkArray',
matches='MatchArray',
timestamps=dict,
)
_all_mime_types = set(mimetypes.types_map.values())
@dataclass(unsafe_hash=True)
class DocumentData:
    """Field storage for a ``Document``.

    ``_reference_doc`` must be supplied by the caller and ``id`` defaults to
    a fresh ``uuid1`` hex string; every other field starts as ``None``.
    Assignments go through :meth:`__setattr__`, which keeps the content
    fields mutually exclusive, infers ``mime_type`` and wraps ``chunks`` /
    ``matches`` in their dedicated array types.
    """

    _reference_doc: 'Document' = field(hash=False, compare=False)
    id: str = field(default_factory=lambda: uuid.uuid1().hex)
    parent_id: Optional[str] = None
    granularity: Optional[int] = None
    adjacency: Optional[int] = None
    buffer: Optional[bytes] = None
    blob: Optional['ArrayType'] = field(default=None, hash=False, compare=False)
    # must be put in front of `text` `content`: the generated __init__ assigns
    # fields in declaration order, so a `mime_type` inferred while assigning
    # `text`/`content` must not be followed by the `mime_type` assignment.
    mime_type: Optional[str] = None
    text: Optional[str] = None
    content: Optional['DocumentContentType'] = None
    weight: Optional[float] = None
    uri: Optional[str] = None
    tags: Optional[Dict[str, 'StructValueType']] = None
    offset: Optional[float] = None
    location: Optional[List[float]] = None
    embedding: Optional['ArrayType'] = field(default=None, hash=False, compare=False)
    modality: Optional[str] = None
    evaluations: Optional[Dict[str, 'NamedScore']] = None
    scores: Optional[Dict[str, 'NamedScore']] = None
    chunks: Optional['DocumentArray'] = None
    matches: Optional['DocumentArray'] = None

    def __setattr__(self, key, value):
        """Store ``value`` under ``key``, applying per-field side effects.

        ``None`` is stored directly with no side effect. For other values:

        * ``text`` / ``blob`` / ``buffer``: when the value differs from the
          field's default, all three content fields are reset first so that
          only one of them is set; assigning ``text`` also sets ``mime_type``
          to ``'text/plain'``.
        * ``uri``: ``mime_type`` is set when ``mimetypes.guess_type``
          recognises the value.
        * ``mime_type``: a value missing from ``_all_mime_types`` is treated
          as a file extension and replaced by the guessed type, if any.
        * ``content``: the value is routed to ``buffer`` (bytes), ``text``
          (str) or ``blob`` (anything else); ``content`` itself stays ``None``.
        * ``chunks`` / ``matches``: the value is wrapped in ``ChunkArray`` /
          ``MatchArray`` bound to ``_reference_doc`` unless it already is one.

        :param key: name of the attribute being assigned
        :param value: value to assign
        """
        if value is not None:
            if key in ('text', 'blob', 'buffer'):
                # enable mutual exclusivity for content field
                dv = default_values.get(key)
                # The type check short-circuits before `!=` whenever the types
                # differ (always true for `blob`, which has no default entry).
                if type(value) is not type(dv) or value != dv:
                    self.text = None
                    self.blob = None
                    self.buffer = None
                    if key == 'text':
                        self.mime_type = 'text/plain'
            elif key == 'uri':
                mime_type = mimetypes.guess_type(value)[0]
                if mime_type:
                    self.mime_type = mime_type
            elif key == 'mime_type':
                if value not in _all_mime_types:
                    # given but not recognizable, do best guess
                    r = mimetypes.guess_type(f'*.{value}')[0]
                    value = r or value
            elif key == 'content':
                if isinstance(value, bytes):
                    self.buffer = value
                elif isinstance(value, str):
                    self.text = value
                else:
                    self.blob = value
                # `content` is only a router; the payload lives in the
                # concrete field assigned above.
                value = None
            elif key == 'chunks':
                from ..array.chunk import ChunkArray

                if not isinstance(value, ChunkArray):
                    value = ChunkArray(value, reference_doc=self._reference_doc)
            elif key == 'matches':
                from ..array.match import MatchArray

                if not isinstance(value, MatchArray):
                    value = MatchArray(value, reference_doc=self._reference_doc)
        # Write straight into the instance dict to avoid re-entering this hook.
        self.__dict__[key] = value

    @property
    def _non_empty_fields(self) -> Tuple[str, ...]:
        """Return the names of the public fields that hold a meaningful value.

        A field is reported when it is not ``None`` and, if it has an entry in
        ``default_values``, when it is non-empty (container-like defaults) or
        differs from that default (plain defaults). Fields whose name starts
        with ``_`` are skipped.

        :return: field names in declaration order
        """
        # Defaults that denote a container; such fields are judged by
        # truthiness instead of equality with the default.
        container_defaults = (
            'ChunkArray',
            'MatchArray',
            'DocumentArray',
            list,
            dict,
            'Dict[str, NamedScore]',
        )
        r = []
        for f in fields(self):
            f_name = f.name
            if f_name.startswith('_'):
                continue
            v = getattr(self, f_name)
            if v is None:
                continue
            if f_name not in default_values:
                r.append(f_name)
                continue
            dv = default_values[f_name]
            if dv in container_defaults:
                if v:
                    r.append(f_name)
            elif v != dv:
                r.append(f_name)
        return tuple(r)

    def _set_default_value_if_none(self, key):
        """Fill field ``key`` with its default when it is currently ``None``.

        Nothing happens when the field already has a value or when
        ``default_values`` has no entry for it. String tags are resolved to
        the matching container type (imported here, at call time); callables
        are invoked to build a fresh value; anything else is assigned as-is.

        :param key: name of the field to initialise
        """
        if getattr(self, key) is not None:
            return
        v = default_values.get(key, None)
        if v is None:
            return

        if v == 'DocumentArray':
            from .. import DocumentArray

            setattr(self, key, DocumentArray())
        elif v == 'ChunkArray':
            from ..array.chunk import ChunkArray

            setattr(self, key, ChunkArray(None, reference_doc=self._reference_doc))
        elif v == 'MatchArray':
            from ..array.match import MatchArray

            setattr(self, key, MatchArray(None, reference_doc=self._reference_doc))
        elif v == 'Dict[str, NamedScore]':
            from ..score import NamedScore

            setattr(self, key, defaultdict(NamedScore))
        else:
            setattr(self, key, v() if callable(v) else v)