Skip to content
169 changes: 168 additions & 1 deletion docarray/base_document/document.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import os
from typing import Type
from typing import List, Type

import orjson
from pydantic import BaseModel, Field, parse_obj_as
from rich.console import Console
from typing_inspect import get_origin

from docarray.base_document.abstract_document import AbstractDocument
from docarray.base_document.base_node import BaseNode
Expand Down Expand Up @@ -46,3 +47,169 @@ def __str__(self):

def _get_string_for_regex_filter(self):
    """
    Return the string form of this Document (``str(self)``).

    NOTE(review): presumably this is the text that regex-based filters are
    matched against -- confirm with the callers.
    """
    return str(self)

def update(self, other: 'BaseDocument'):
    """
    Updates self with the content of other. Changes are applied to self.
    Updating one Document with another consists in the following:
     - setting data properties of the second Document to the first Document
     if they are not None
     - Concatenating lists and updating sets
     - Updating recursively Documents and DocumentArrays
     - Updating Dictionaries of the left with the right

    It behaves as an update operation for Dictionaries, except that since
    it is applied to a static schema type, the presence of the field is
    given by the field not having a None value and that DocumentArrays,
    lists and sets are concatenated. It is worth mentioning that Tuples
    are not merged together since they are meant to be immutable,
    so they behave as regular types and the value of `self` is updated
    with the value of `other`

    EXAMPLE USAGE

    .. code-block:: python

        from typing import List, Optional

        from docarray import BaseDocument


        class MyDocument(BaseDocument):
            content: str
            title: Optional[str] = None
            tags_: List


        doc1 = MyDocument(
            content='Core content of the document',
            title='Title',
            tags_=['python', 'AI']
        )
        doc2 = MyDocument(content='Core content updated', tags_=['docarray'])

        doc1.update(doc2)
        assert doc1.content == 'Core content updated'
        assert doc1.title == 'Title'
        assert doc1.tags_ == ['python', 'AI', 'docarray']

    :param other: The Document with which to update the contents of this
    :raises Exception: if ``other`` is not of exactly the same type as ``self``
    """
    if type(self) != type(other):
        raise Exception(
            f'Update operation can only be applied to '
            f'Documents of the same type. '
            f'Trying to update Document of type '
            f'{type(self)} with Document of type '
            f'{type(other)}'
        )
    # Imported lazily, after the cheap type check, as in the original code.
    from collections import namedtuple

    from docarray import DocumentArray
    from docarray.utils.reduce import reduce

    # One list of field names per merge strategy.
    _FieldGroups = namedtuple(
        '_FieldGroups',
        [
            'simple_non_empty_fields',
            'list_fields',
            'set_fields',
            'dict_fields',
            'nested_docarray_fields',
            'nested_docs_fields',
        ],
    )

    # NOTE(review): the comparison below is case-sensitive and the identifier
    # field appears to be accessed as `id` (lowercase) elsewhere, so 'ID'
    # probably never matches and `id` is overwritten like any other simple
    # field. Left unchanged to preserve current behavior -- confirm intent.
    FORBIDDEN_FIELDS_TO_UPDATE = ['ID']

    def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
        """Sort the field names of ``doc`` by the merge strategy they need."""
        groups = _FieldGroups(
            simple_non_empty_fields=[],
            list_fields=[],
            set_fields=[],
            dict_fields=[],
            nested_docarray_fields=[],
            nested_docs_fields=[],
        )
        for field_name in doc.__fields__:
            if field_name in FORBIDDEN_FIELDS_TO_UPDATE:
                continue
            field_type = doc._get_field_type(field_name)

            if isinstance(field_type, type) and issubclass(
                field_type, DocumentArray
            ):
                groups.nested_docarray_fields.append(field_name)
                continue

            # Containers are grouped by declared type, regardless of value.
            origin = get_origin(field_type)
            if origin is list:
                groups.list_fields.append(field_name)
            elif origin is set:
                groups.set_fields.append(field_name)
            elif origin is dict:
                groups.dict_fields.append(field_name)
            else:
                value = getattr(doc, field_name)
                # Presence means "not None" (see docstring). A truthiness
                # test would silently drop 0, '' and False, and cannot be
                # evaluated for values whose truth value is ambiguous.
                if value is None:
                    continue
                if isinstance(value, BaseDocument):
                    groups.nested_docs_fields.append(field_name)
                else:
                    groups.simple_non_empty_fields.append(field_name)
        return groups

    def _extended(current, incoming):
        """Concatenate ``incoming`` onto the list ``current`` in place."""
        current.extend(incoming)
        return current

    def _updated(current, incoming):
        """In-place ``update``; used for sets, dicts and nested Documents."""
        current.update(incoming)
        return current

    self_groups = _group_fields(self)
    other_groups = _group_fields(other)

    # Plain values: whatever is set on `other` wins.
    for name in other_groups.simple_non_empty_fields:
        setattr(self, name, getattr(other, name))

    # Mergeable values, processed in the same order as before.
    merge_plan = (
        ('nested_docs_fields', _updated),
        ('list_fields', _extended),
        ('set_fields', _updated),
        ('nested_docarray_fields', reduce),
        ('dict_fields', _updated),
    )
    for group_name, merge in merge_plan:
        names = set(
            getattr(self_groups, group_name) + getattr(other_groups, group_name)
        )
        for name in names:
            current = getattr(self, name)
            incoming = getattr(other, name)
            if incoming is None:
                # Nothing to merge in; keep whatever self already has.
                continue
            if current is None:
                # Only `other` has a value: adopt it as is.
                setattr(self, name, incoming)
            else:
                # Re-assign after the in-place merge, as the original did.
                setattr(self, name, merge(current, incoming))
68 changes: 68 additions & 0 deletions docarray/utils/reduce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from docarray import DocumentArray
from typing import List, Optional, Dict


def reduce(
    left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None
) -> 'DocumentArray':
    """
    Reduces left and right DocumentArray into one DocumentArray in-place.
    Changes are applied to the left DocumentArray.
    Reducing 2 DocumentArrays consists in adding Documents in the second
    DocumentArray to the first DocumentArray if they do not exist.
    If a Document exists in both DocumentArrays (identified by ID),
    the left Document is updated in place with the right one through
    its ``update`` method.

    :param left: First DocumentArray to be reduced. Changes will be applied to it
        in-place
    :param right: Second DocumentArray to be reduced
    :param left_id_map: Optional parameter to be passed in repeated calls
        for optimizations, keeping a map of the Document ID to its offset
        in the DocumentArray. A non-empty map is updated in place whenever a
        Document is appended to ``left``, so it stays valid across calls.
    :return: Reduced DocumentArray
    """
    # A missing *or empty* map is rebuilt from `left` (unchanged behavior).
    if not left_id_map:
        left_id_map = {doc.id: i for i, doc in enumerate(left)}

    for doc in right:
        offset = left_id_map.get(doc.id)
        if offset is None:
            left.append(doc)
            # Record the new Document so that later occurrences of the same
            # ID (in `right` or in a following call sharing this map) are
            # merged instead of being appended as duplicates.
            left_id_map[doc.id] = len(left) - 1
        else:
            left[offset].update(doc)

    return left


def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray:
    """
    Fold a list of DocumentArrays into the first one, in place.

    Every following DocumentArray is merged into the first one with
    ``reduce``, sharing a single ID-to-offset map between the calls.
    Documents whose ID is already present are merged by ``reduce``;
    the others are added to the first DocumentArray.

    .. note::
        - The final result depends on the order of the DocumentArrays.
        - Nested DocumentArrays follow no specific order; re-sort them
          afterwards if needed.

    :param docarrays: List of DocumentArrays to be reduced
    :return: the first DocumentArray, holding the reduced result
    :raises Exception: if fewer than two DocumentArrays are given
    """
    if len(docarrays) < 2:
        raise Exception(
            'In order to reduce DocumentArrays'
            ' we should have more than one DocumentArray'
        )
    accumulator, *remaining = docarrays
    id_to_offset = {doc.id: offset for offset, doc in enumerate(accumulator)}
    for docarray in remaining:
        reduce(accumulator, docarray, id_to_offset)
    return accumulator
19 changes: 18 additions & 1 deletion tests/units/document/test_base_document.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,25 @@
from typing import Optional, List
from docarray.base_document.document import BaseDocument


def test_base_document_init():
    """A BaseDocument built without arguments has a non-None ``id``."""
    document = BaseDocument()
    assert document.id is not None


def test_update():
    """``update`` overrides set values, keeps ``title`` and concatenates lists."""

    class MyDocument(BaseDocument):
        content: str
        title: Optional[str] = None
        tags_: List

    target = MyDocument(
        content='Core content of the document',
        title='Title',
        tags_=['python', 'AI'],
    )
    source = MyDocument(content='Core content updated', tags_=['docarray'])

    target.update(source)

    assert target.content == 'Core content updated'
    # `source.title` is None, so the original title survives.
    assert target.title == 'Title'
    assert target.tags_ == ['python', 'AI', 'docarray']
102 changes: 102 additions & 0 deletions tests/units/document/test_update.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import pytest
from typing import Optional, List, Dict, Set
from docarray import BaseDocument, DocumentArray
from docarray.documents import Image


class InnerDoc(BaseDocument):
    """Small Document with an int and a list field, nested inside ``MMDoc``."""

    integer: int
    l: List


class MMDoc(BaseDocument):
    """Test schema mixing scalar, list, set, dict, Document and DocumentArray fields."""

    # Scalars with non-None defaults.
    text: str = ''
    price: int = 0
    # Optional fields, all defaulting to None.
    categories: Optional[List[str]] = None
    image: Optional[Image] = None
    matches: Optional[DocumentArray] = None
    matches_with_same_id: Optional[DocumentArray] = None
    opt_int: Optional[int] = None
    test_set: Optional[Set] = None
    inner_doc: Optional[InnerDoc] = None
    test_dict: Optional[Dict] = None


@pytest.fixture
def doc1():
    """Build the MMDoc that the update tests modify in place."""
    plain_matches = DocumentArray[MMDoc]([MMDoc()])
    # A match with a fixed id, itself carrying one nested match.
    fixed_id_match = MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))
    nested = InnerDoc(integer=2, l=['c', 'd'])
    return MMDoc(
        text='hey here',
        categories=['a', 'b', 'c'],
        price=10,
        matches=plain_matches,
        matches_with_same_id=DocumentArray[MMDoc]([fixed_id_match]),
        test_set={'a', 'a'},
        inner_doc=nested,
        test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3},
    )


@pytest.fixture
def doc2(doc1):
    """Build an MMDoc with the same id as ``doc1`` and different content."""
    plain_matches = DocumentArray[MMDoc]([MMDoc()])
    # Same fixed id ('a') as the corresponding match in `doc1`.
    fixed_id_match = MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))
    nested = InnerDoc(integer=3, l=['a', 'b'])
    return MMDoc(
        id=doc1.id,
        text='hey here 2',
        categories=['d', 'e', 'f'],
        price=5,
        opt_int=5,
        matches=plain_matches,
        matches_with_same_id=DocumentArray[MMDoc]([fixed_id_match]),
        test_set={'a', 'b'},
        inner_doc=nested,
        test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None},
    )


def test_update_complex(doc1, doc2):
    """Check every merge strategy of ``update`` on a document using all field kinds."""
    doc1.update(doc2)
    # doc1 is changed in place (no extra memory)

    # Scalars and containers.
    assert doc1.text == 'hey here 2'
    assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f']
    assert len(doc1.matches) == 2
    assert doc1.opt_int == 5
    assert doc1.price == 5
    assert doc1.test_set == {'a', 'b'}

    # Matches sharing an id are merged, and so are their nested matches.
    same_id_matches = doc1.matches_with_same_id
    assert len(same_id_matches) == 1
    assert len(same_id_matches[0].matches) == 2

    # Nested document and dictionary.
    inner = doc1.inner_doc
    assert inner.integer == 3
    assert inner.l == ['c', 'd', 'a', 'b']
    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}


def test_update_simple():
    """Scalar override, untouched ``title`` and list concatenation on a small schema."""

    class MyDocument(BaseDocument):
        content: str
        title: Optional[str] = None
        tags_: List

    updated_doc = MyDocument(
        content='Core content of the document',
        title='Title',
        tags_=['python', 'AI'],
    )
    incoming_doc = MyDocument(content='Core content updated', tags_=['docarray'])

    updated_doc.update(incoming_doc)

    observed = (updated_doc.content, updated_doc.title, updated_doc.tags_)
    assert observed[0] == 'Core content updated'
    assert observed[1] == 'Title'
    assert observed[2] == ['python', 'AI', 'docarray']


def test_update_different_schema_fails():
    """Updating a Document with one of a different class must raise."""

    class DocA(BaseDocument):
        content: str

    class DocB(BaseDocument):
        image: Optional[Image] = None

    text_doc, image_doc = DocA(content='haha'), DocB()
    with pytest.raises(Exception):
        text_doc.update(image_doc)
Loading