Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 64 additions & 36 deletions docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,18 @@ def __getitem__(
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)

return _docs._get_attributes(*_attrs)
if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self._id2offset:
return DocumentArray([self[index[0]], self[index[1]]])
else:
return getattr(self[index[0]], index[1])
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)
return _docs._get_attributes(*_attrs)
elif isinstance(index[0], bool):
return DocumentArray(itertools.compress(self._data, index))
elif isinstance(index[0], int):
Expand Down Expand Up @@ -231,31 +237,45 @@ def __setitem__(
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
):
_docs = self[index[0]]
_attrs = index[1]

if isinstance(_attrs, str):
# a -> [a]
# [a, a] -> [a, a]
_attrs = (index[1],)
if isinstance(value, (list, tuple)) and not any(
isinstance(el, (tuple, list)) for el in value
):
# [x] -> [[x]]
# [[x], [y]] -> [[x], [y]]
value = (value,)
if not isinstance(value, (list, tuple)):
# x -> [x]
value = (value,)

for _a, _v in zip(_attrs, value):
if _a == 'blob':
_docs.blobs = _v
elif _a == 'embedding':
_docs.embeddings = _v
if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self._id2offset:
for _d, _v in zip((self[index[0]], self[index[1]]), value):
_d._data = _v._data
self._rebuild_id2offset()
elif hasattr(self[index[0]], index[1]):
setattr(self[index[0]], index[1], value)
else:
for _d, _vv in zip(_docs, _v):
setattr(_d, _a, _vv)
# to avoid accidentally add new unsupport attribute
raise ValueError(
f'`{index[1]}` is neither a valid id nor attribute name'
)
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]

if isinstance(_attrs, str):
# a -> [a]
# [a, a] -> [a, a]
_attrs = (index[1],)
if isinstance(value, (list, tuple)) and not any(
isinstance(el, (tuple, list)) for el in value
):
# [x] -> [[x]]
# [[x], [y]] -> [[x], [y]]
value = (value,)
if not isinstance(value, (list, tuple)):
# x -> [x]
value = (value,)

for _a, _v in zip(_attrs, value):
if _a == 'blob':
_docs.blobs = _v
elif _a == 'embedding':
_docs.embeddings = _v
else:
for _d, _vv in zip(_docs, _v):
setattr(_d, _a, _vv)
elif isinstance(index[0], bool):
if len(index) != len(self._data):
raise IndexError(
Expand Down Expand Up @@ -313,12 +333,20 @@ def __delitem__(self, index: 'DocumentArrayIndexType'):
and len(index) == 2
and isinstance(index[0], (slice, Sequence))
):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)
for _d in _docs:
_d.pop(*_attrs)
if isinstance(index[0], str) and isinstance(index[1], str):
# ambiguity only comes from the second string
if index[1] in self._id2offset:
del self[index[0]]
del self[index[1]]
else:
self[index[0]].pop(index[1])
elif isinstance(index[0], (slice, Sequence)):
_docs = self[index[0]]
_attrs = index[1]
if isinstance(_attrs, str):
_attrs = (index[1],)
for _d in _docs:
_d.pop(*_attrs)
elif isinstance(index[0], bool):
self._data = list(
itertools.compress(self._data, (not _i for _i in index))
Expand Down
16 changes: 13 additions & 3 deletions docs/fundamentals/document/construct.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,19 @@ d = Document()
<Document ('id',) at 5dd542406d3f11eca3241e008a366d49>
```

Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`. You can override this `id` or assign your own `id` during construction, as demonstrated below.
Every Document will have a unique random `id` that helps you identify this Document. It can be used to {ref}`access this Document inside a DocumentArray<access-elements>`.

````{tip}
The random `id` is the hex value of [UUID1](https://docs.python.org/3/library/uuid.html#uuid.uuid1). To convert it into the string of UUID:

```python
import uuid
str(uuid.UUID(d.id))
```
````

Though possible, it is not recommended modifying `.id` of a Document frequently, as this will lead to unexpected behavior.


## Construct with attributes

Expand All @@ -25,7 +37,6 @@ This is the most common usage of the constructor: initializing a Document object
from docarray import Document
import numpy

d0 = Document(id='my_id')
d1 = Document(text='hello')
d2 = Document(buffer=b'\f1')
d3 = Document(blob=numpy.array([1, 2, 3]))
Expand All @@ -43,7 +54,6 @@ Don't forget to leverage autocomplete in your IDE.
```

```text
<Document ('id',) at my_id>
<Document ('id', 'mime_type', 'text') at a14effee6d3e11ec8bde1e008a366d49>
<Document ('id', 'buffer') at a14f00986d3e11ec8bde1e008a366d49>
<Document ('id', 'blob') at a14f01a66d3e11ec8bde1e008a366d49>
Expand Down
2 changes: 1 addition & 1 deletion docs/get-started/what-is.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ As one can see, you can convert a DocumentArray into AwkwardArray via `.to_list(

[Zarr](https://zarr.readthedocs.io/en/stable/) is a format for the storage of chunked, compressed, N-dimensional arrays. I know Zarr quite long time ago, to me it is the package when a `numpy.ndarray` is so big to fit into memory. Zarr provides a comprehensive set of functions that allows one to chunk, compress, stream large NdArray. Hence, from that perspective, Zarr like `numpy.ndarray` focuses on numerical representation and computation.

In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, it is often an intermediate representation of Document for computing, then being thrown away. Therefore, having a consistent data structure that can live *long enough* to cover creating, storing, computing, transferring, returning and rendering is a motivation behind DocArray.
In DocArray, the basic element one would work with is a Document, not `ndarray`. The support of `ndarray` is important, but not the full story: in the context of deep learning engineering, `ndarray` is often an intermediate representation of Document for computing, then throw away. Therefore, having a consistent data structure that lives *long enough* to cover creating, storing, computing, transferring, returning and rendering is one of the major motivations of DocArray.

## To Jina Users

Expand Down
40 changes: 38 additions & 2 deletions tests/unit/array/test_advance_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,8 +222,6 @@ def test_advance_selector_mixed():


def test_single_boolean_and_padding():
from docarray import DocumentArray

da = DocumentArray.empty(3)

with pytest.raises(IndexError):
Expand All @@ -237,3 +235,41 @@ def test_single_boolean_and_padding():

assert len(da[True, False]) == 1
assert len(da[False, False]) == 0


def test_edge_case_two_strings():
# getitem
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
assert da['1', 'id'] == '1'
assert len(da['1', '2']) == 2
assert isinstance(da['1', '2'], DocumentArray)
with pytest.raises(KeyError):
da['hello', '2']
with pytest.raises(AttributeError):
da['1', 'hello']
assert len(da['1', '2', '3']) == 3
assert isinstance(da['1', '2', '3'], DocumentArray)

# delitem
del da['1', '2']
assert len(da) == 1

da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
del da['1', 'id']
assert len(da) == 3
assert not da[0].id

del da['2', 'hello']

# setitem
da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
da['1', '2'] = DocumentArray.empty(2)
assert da[0].id != '1'
assert da[1].id != '2'

da = DocumentArray([Document(id='1'), Document(id='2'), Document(id='3')])
da['1', 'text'] = 'hello'
assert da['1'].text == 'hello'

with pytest.raises(ValueError):
da['1', 'hellohello'] = 'hello'