docarray · JoanFM · Feb 6, 2023 · Feb 1, 2023 · Feb 2, 2023 · Feb 2, 2023
diff --git a/docarray/base_document/document.py b/docarray/base_document/document.py
@@ -1,9 +1,10 @@
 import os
-from typing import Type
+from typing import List, Type
 
 import orjson
 from pydantic import BaseModel, Field, parse_obj_as
 from rich.console import Console
+from typing_inspect import get_origin
 
 from docarray.base_document.abstract_document import AbstractDocument
 from docarray.base_document.base_node import BaseNode
@@ -46,3 +47,169 @@ def __str__(self):
 
     def _get_string_for_regex_filter(self):
         return str(self)
+
+    def update(self, other: 'BaseDocument'):
+        """
+        Updates self with the content of other. Changes are applied to self.
+        Updating one Document with another consists in the following:
+         - setting data properties of the second Document to the first Document
+         if they are not empty (a falsy value in `other`, such as `None`, `0`
+         or `''`, never overwrites the value of `self`)
+         - Concatenating lists and updating sets
+         - Updating recursively Documents and DocumentArrays
+         - Updating Dictionaries of the left with the right
+
+        It behaves as an update operation for Dictionaries, except that since
+        it is applied to a static schema type, the presence of the field is
+        given by the field not having an empty value and that DocumentArrays,
+        lists and sets are concatenated. It is worth mentioning that Tuples
+        are not merged together since they are meant to be immutable,
+        so they behave as regular types and the value of `self` is updated
+        with the value of `other`.
+
+        The `id` of `self` is never modified. When a container or nested
+        Document field is `None` on `self` and set on `other`, `self` receives
+        the very same object held by `other` (no copy is made).
+
+            EXAMPLE USAGE
+
+            .. code-block:: python
+
+                from typing import List, Optional
+
+                from docarray import BaseDocument
+
+
+                class MyDocument(BaseDocument):
+                    content: str
+                    title: Optional[str] = None
+                    tags_: List
+
+
+                doc1 = MyDocument(
+                    content='Core content of the document',
+                    title='Title',
+                    tags_=['python', 'AI']
+                )
+                doc2 = MyDocument(content='Core content updated', tags_=['docarray'])
+
+                doc1.update(doc2)
+                assert doc1.content == 'Core content updated'
+                assert doc1.title == 'Title'
+                assert doc1.tags_ == ['python', 'AI', 'docarray']
+
+        :param other: The Document with which to update the contents of this
+        :raises Exception: if `other` is not of exactly the same type as `self`
+        """
+        # Exact type match on purpose: both sides must share the same schema.
+        if type(self) is not type(other):
+            raise Exception(
+                f'Update operation can only be applied to '
+                f'Documents of the same type. '
+                f'Trying to update Document of type '
+                f'{type(self)} with Document of type '
+                f'{type(other)}'
+            )
+        from collections import namedtuple
+
+        # Imported lazily: `docarray.utils.reduce` itself imports from `docarray`.
+        from docarray import DocumentArray
+        from docarray.utils.reduce import reduce
+
+        # One bucket of field names per merge strategy.
+        _FieldGroups = namedtuple(
+            '_FieldGroups',
+            [
+                'simple_non_empty_fields',
+                'list_fields',
+                'set_fields',
+                'dict_fields',
+                'nested_docarray_fields',
+                'nested_docs_fields',
+            ],
+        )
+
+        # Field names (not type names): the identifier field is called `id`,
+        # and it must survive an update untouched.
+        FORBIDDEN_FIELDS_TO_UPDATE = ['id']
+
+        def _group_fields(doc: 'BaseDocument') -> _FieldGroups:
+            """Sort the field names of `doc` into the buckets of `_FieldGroups`."""
+            simple_non_empty_fields: List[str] = []
+            list_fields: List[str] = []
+            set_fields: List[str] = []
+            dict_fields: List[str] = []
+            nested_docs_fields: List[str] = []
+            nested_docarray_fields: List[str] = []
+
+            for field_name in doc.__fields__:
+                if field_name in FORBIDDEN_FIELDS_TO_UPDATE:
+                    continue
+
+                # NOTE(review): `_get_field_type` is defined outside this method;
+                # presumably it returns the declared annotation - confirm.
+                field_type = doc._get_field_type(field_name)
+
+                if isinstance(field_type, type) and issubclass(
+                    field_type, DocumentArray
+                ):
+                    nested_docarray_fields.append(field_name)
+                    continue
+
+                # Containers are grouped by declared type, whatever their value.
+                origin = get_origin(field_type)
+                if origin is list:
+                    list_fields.append(field_name)
+                elif origin is set:
+                    set_fields.append(field_name)
+                elif origin is dict:
+                    dict_fields.append(field_name)
+                else:
+                    # Everything else is grouped by its current value, and only
+                    # when that value is truthy.
+                    v = getattr(doc, field_name)
+                    if not v:
+                        continue
+                    if isinstance(v, BaseDocument):
+                        nested_docs_fields.append(field_name)
+                    else:
+                        simple_non_empty_fields.append(field_name)
+
+            return _FieldGroups(
+                simple_non_empty_fields=simple_non_empty_fields,
+                list_fields=list_fields,
+                set_fields=set_fields,
+                dict_fields=dict_fields,
+                nested_docarray_fields=nested_docarray_fields,
+                nested_docs_fields=nested_docs_fields,
+            )
+
+        def _extend_in_place(mine, theirs):
+            """Append the items of `theirs` to the list `mine` and return `mine`."""
+            mine.extend(theirs)
+            return mine
+
+        def _update_in_place(mine, theirs):
+            """Call `mine.update(theirs)` (set, dict or Document) and return `mine`."""
+            mine.update(theirs)
+            return mine
+
+        def _merge_fields(field_names, combine) -> None:
+            """Merge every named field of `other` into `self` using `combine`.
+
+            A field that is `None` on `other` leaves `self` untouched; a field
+            that is `None` on `self` simply takes the value of `other`.
+            """
+            for name in set(field_names):
+                mine = getattr(self, name)
+                theirs = getattr(other, name)
+                if theirs is None:
+                    continue
+                if mine is None:
+                    setattr(self, name, theirs)
+                else:
+                    setattr(self, name, combine(mine, theirs))
+
+        doc1_fields = _group_fields(self)
+        doc2_fields = _group_fields(other)
+
+        # Simple values: whatever is non-empty in `other` wins.
+        for field in doc2_fields.simple_non_empty_fields:
+            setattr(self, field, getattr(other, field))
+
+        # Nested Documents are updated recursively. Going through
+        # `_merge_fields` guards against the field being None on either side,
+        # which would otherwise raise.
+        _merge_fields(
+            doc1_fields.nested_docs_fields + doc2_fields.nested_docs_fields,
+            _update_in_place,
+        )
+        _merge_fields(
+            doc1_fields.list_fields + doc2_fields.list_fields, _extend_in_place
+        )
+        _merge_fields(
+            doc1_fields.set_fields + doc2_fields.set_fields, _update_in_place
+        )
+        # `reduce` merges by Document id and returns its first argument.
+        _merge_fields(
+            doc1_fields.nested_docarray_fields
+            + doc2_fields.nested_docarray_fields,
+            reduce,
+        )
+        _merge_fields(
+            doc1_fields.dict_fields + doc2_fields.dict_fields, _update_in_place
+        )
diff --git a/docarray/utils/reduce.py b/docarray/utils/reduce.py
@@ -0,0 +1,68 @@
+from docarray import DocumentArray
+from typing import List, Optional, Dict
+
+
+def reduce(
+    left: DocumentArray, right: DocumentArray, left_id_map: Optional[Dict] = None
+) -> 'DocumentArray':
+    """
+    Reduces left and right DocumentArray into one DocumentArray in-place.
+    Changes are applied to the left DocumentArray.
+    Reducing 2 DocumentArrays consists in adding Documents in the second DocumentArray
+    to the first DocumentArray if they do not exist.
+    If a Document exists in both DocumentArrays (identified by ID),
+    the left Document is updated in-place with the right one by calling
+    its `update` method.
+
+    Nested DocumentArrays are also reduced in the same way.
+    :param left: First DocumentArray to be reduced. Changes will be applied to it
+    in-place
+    :param right: Second DocumentArray to be reduced
+    :param left_id_map: Optional parameter to be passed in repeated calls
+    for optimizations, keeping a map of the Document ID to its offset
+    in the DocumentArray. The given dict is modified in-place: it is filled
+    from `left` when empty and receives an entry for every Document appended
+    to `left`, so it stays valid across repeated calls
+    :return: Reduced DocumentArray (the same object as `left`)
+    """
+    if left_id_map is None:
+        left_id_map = {}
+    if not left_id_map:
+        # Fill the caller's dict rather than rebinding the name, so the map
+        # built here is still shared with the caller on the next call.
+        left_id_map.update({doc.id: i for i, doc in enumerate(left)})
+
+    for doc in right:
+        offset = left_id_map.get(doc.id)
+        if offset is None:
+            # Record the offset before appending so that a later Document
+            # with the same id is merged instead of appended a second time.
+            left_id_map[doc.id] = len(left)
+            left.append(doc)
+        else:
+            left[offset].update(doc)
+
+    return left
+
+
+def reduce_all(docarrays: List[DocumentArray]) -> DocumentArray:
+    """
+    Folds a list of DocumentArrays into its first element.
+
+    Every remaining DocumentArray is merged, in list order, into the first
+    one with `reduce`; the first DocumentArray is therefore modified in-place
+    and is also the object returned.
+
+    A single ID-to-offset map is computed from the first DocumentArray and
+    handed to each `reduce` call.
+
+    .. note::
+        - How Documents sharing an ID are merged is decided by `reduce`.
+        - The final result depends on the order of DocumentArrays
+        when applying reduction.
+
+    :param docarrays: List of DocumentArrays to be reduced, at least two
+    :return: the resulting DocumentArray
+    :raises Exception: if fewer than two DocumentArrays are given
+    """
+    if len(docarrays) < 2:
+        raise Exception(
+            'In order to reduce DocumentArrays'
+            ' we should have more than one DocumentArray'
+        )
+
+    target, *remaining = docarrays
+    offsets_by_id = {doc.id: position for position, doc in enumerate(target)}
+    for docarray in remaining:
+        reduce(target, docarray, offsets_by_id)
+    return target
diff --git a/tests/units/document/test_base_document.py b/tests/units/document/test_base_document.py
@@ -1,8 +1,25 @@
+from typing import Optional, List
 from docarray.base_document.document import BaseDocument
 
 
 def test_base_document_init():
-
+    # A BaseDocument built without arguments must already carry an id.
     doc = BaseDocument()
 
     assert doc.id is not None
+
+
+def test_update():
+    """Updating a flat Document overwrites, keeps or concatenates per field."""
+
+    class MyDocument(BaseDocument):
+        content: str
+        title: Optional[str] = None
+        tags_: List
+
+    target = MyDocument(
+        content='Core content of the document',
+        title='Title',
+        tags_=['python', 'AI'],
+    )
+    incoming = MyDocument(content='Core content updated', tags_=['docarray'])
+
+    target.update(incoming)
+
+    # `title` is unset on `incoming`, so the original value must survive.
+    assert target.title == 'Title'
+    # A non-empty string on `incoming` replaces the one on `target`.
+    assert target.content == 'Core content updated'
+    # Lists are concatenated, `target` items first.
+    assert target.tags_ == ['python', 'AI', 'docarray']
diff --git a/tests/units/document/test_update.py b/tests/units/document/test_update.py
@@ -0,0 +1,102 @@
+import pytest
+from typing import Optional, List, Dict, Set
+from docarray import BaseDocument, DocumentArray
+from docarray.documents import Image
+
+
+# Small Document nested inside MMDoc (as its `inner_doc` field) to exercise
+# recursive updates.
+class InnerDoc(BaseDocument):
+    integer: int  # scalar field
+    l: List  # un-parameterised list field
+
+
+# Document schema used by the update tests; it mixes scalar, container,
+# nested-Document and nested-DocumentArray fields.
+class MMDoc(BaseDocument):
+    # scalar fields with non-None defaults
+    text: str = ''
+    price: int = 0
+    # optional list
+    categories: Optional[List[str]] = None
+    # optional nested Document, left unset by both fixtures
+    image: Optional[Image] = None
+    # nested DocumentArrays; the fixtures give `matches_with_same_id` entries
+    # that share the id 'a'
+    matches: Optional[DocumentArray] = None
+    matches_with_same_id: Optional[DocumentArray] = None
+    # optional scalar
+    opt_int: Optional[int] = None
+    # optional set
+    test_set: Optional[Set] = None
+    # optional nested Document, set by both fixtures
+    inner_doc: Optional[InnerDoc] = None
+    # optional dict
+    test_dict: Optional[Dict] = None
+
+
+@pytest.fixture
+def doc1():
+    """Document that receives the update in `test_update_complex`."""
+    # Nested match with the fixed id 'a'; `doc2` holds a match with the same id.
+    shared_id_match = MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))
+    fresh_matches = DocumentArray[MMDoc]([MMDoc()])
+
+    return MMDoc(
+        text='hey here',
+        price=10,
+        categories=['a', 'b', 'c'],
+        matches=fresh_matches,
+        matches_with_same_id=DocumentArray[MMDoc]([shared_id_match]),
+        test_set={'a', 'a'},
+        test_dict={'a': 0, 'b': 2, 'd': 4, 'z': 3},
+        inner_doc=InnerDoc(integer=2, l=['c', 'd']),
+    )
+
+
+@pytest.fixture
+def doc2(doc1):
+    """Document merged into `doc1`; it reuses the id of `doc1`."""
+    # Nested match with the fixed id 'a', matching the one held by `doc1`.
+    shared_id_match = MMDoc(id='a', matches=DocumentArray[MMDoc]([MMDoc()]))
+    fresh_matches = DocumentArray[MMDoc]([MMDoc()])
+
+    return MMDoc(
+        id=doc1.id,
+        text='hey here 2',
+        price=5,
+        opt_int=5,
+        categories=['d', 'e', 'f'],
+        matches=fresh_matches,
+        matches_with_same_id=DocumentArray[MMDoc]([shared_id_match]),
+        test_set={'a', 'b'},
+        test_dict={'a': 10, 'b': 10, 'c': 3, 'z': None},
+        inner_doc=InnerDoc(integer=3, l=['a', 'b']),
+    )
+
+
+def test_update_complex(doc1, doc2):
+    """Every field kind of MMDoc is merged from `doc2` into `doc1`."""
+    doc1.update(doc2)
+    # doc1 is changed in place (no extra memory)
+
+    # scalar fields take the value of doc2
+    assert doc1.text == 'hey here 2'
+    assert doc1.price == 5
+    assert doc1.opt_int == 5
+
+    # lists are concatenated, sets and dicts are updated
+    assert doc1.categories == ['a', 'b', 'c', 'd', 'e', 'f']
+    assert doc1.test_set == {'a', 'b'}
+    assert doc1.test_dict == {'a': 10, 'b': 10, 'c': 3, 'd': 4, 'z': None}
+
+    # DocumentArrays: different ids are appended, equal ids are merged
+    assert len(doc1.matches) == 2
+    merged_matches = doc1.matches_with_same_id
+    assert len(merged_matches) == 1
+    assert len(merged_matches[0].matches) == 2
+
+    # the nested Document is updated recursively
+    nested = doc1.inner_doc
+    assert nested.integer == 3
+    assert nested.l == ['c', 'd', 'a', 'b']
+
+
+def test_update_simple():
+    """Flat Document: content replaced, title kept, tags concatenated."""
+
+    class MyDocument(BaseDocument):
+        content: str
+        title: Optional[str] = None
+        tags_: List
+
+    base = MyDocument(
+        content='Core content of the document',
+        title='Title',
+        tags_=['python', 'AI'],
+    )
+    newer = MyDocument(content='Core content updated', tags_=['docarray'])
+
+    base.update(newer)
+
+    # `newer` carries no title, so the one on `base` must be kept.
+    assert base.title == 'Title'
+    assert base.content == 'Core content updated'
+    assert base.tags_ == ['python', 'AI', 'docarray']
+
+
+def test_update_different_schema_fails():
+    """Updating a Document with one of another class must raise."""
+
+    class DocA(BaseDocument):
+        content: str
+
+    class DocB(BaseDocument):
+        image: Optional[Image] = None
+
+    receiver = DocA(content='haha')
+    mismatched = DocB()
+
+    with pytest.raises(Exception):
+        receiver.update(mismatched)