docarray · hanxiao · Jan 4, 2022 · Jan 4, 2022
diff --git a/docarray/array/chunk.py b/docarray/array/chunk.py
@@ -1,4 +1,10 @@
-from typing import TYPE_CHECKING
+import itertools
+from typing import (
+    TYPE_CHECKING,
+    Generator,
+    Iterator,
+    Sequence,
+)
 
 from .document import DocumentArray
 
@@ -24,6 +30,15 @@ def __init__(self, docs, reference_doc: 'Document'):
         """
         self._ref_doc = reference_doc
         super().__init__(docs)
+        if (
+            isinstance(
+                docs, (DocumentArray, Sequence, Generator, Iterator, itertools.chain)
+            )
+            and self._ref_doc is not None
+        ):
+            for d in self:  # `docs` may be an exhausted generator by now
+                d.parent_id = self._ref_doc.id
+                d.granularity = self._ref_doc.granularity + 1
 
     def append(self, document: 'Document'):
         """Add a sub-document (i.e chunk) to the current Document.

diff --git a/docarray/array/document.py b/docarray/array/document.py
@@ -242,7 +242,7 @@ def __bool__(self):
         return len(self) > 0
 
     def __repr__(self):
-        return f'<{typename(self)} (length={len(self)}) at {id(self)}>'
+        return f'<{self.__class__.__name__} (length={len(self)}) at {id(self)}>'
 
     def __add__(self, other: 'Document'):
         v = type(self)()

diff --git a/docarray/array/match.py b/docarray/array/match.py
@@ -1,4 +1,10 @@
-from typing import TYPE_CHECKING
+import itertools
+from typing import (
+    TYPE_CHECKING,
+    Generator,
+    Iterator,
+    Sequence,
+)
 
 from .. import DocumentArray
 
@@ -18,13 +24,20 @@ class MatchArray(DocumentArray):
     def __init__(self, docs, reference_doc: 'Document'):
         self._ref_doc = reference_doc
         super().__init__(docs)
+        if (
+            isinstance(
+                docs, (DocumentArray, Sequence, Generator, Iterator, itertools.chain)
+            )
+            and self._ref_doc is not None
+        ):
+            for d in self:  # `docs` may be an exhausted generator by now
+                d.adjacency = self._ref_doc.adjacency + 1
 
     def append(self, document: 'Document'):
         """Add a matched document to the current Document.
 
         :param document: Sub-document to be added
         """
-        document.granularity = self._ref_doc.granularity
         document.adjacency = self._ref_doc.adjacency + 1
         super().append(document)
 

diff --git a/docarray/array/mixins/io/dataframe.py b/docarray/array/mixins/io/dataframe.py
@@ -21,7 +21,7 @@ def to_dataframe(self, **kwargs) -> 'DataFrame':
         """
         from pandas import DataFrame
 
-        return DataFrame.from_dict(self.to_list_safe(), **kwargs)
+        return DataFrame.from_dict(self.to_list(), **kwargs)
 
     @classmethod
     def from_dataframe(cls: Type['T'], df: 'DataFrame') -> 'T':

diff --git a/docarray/array/mixins/io/json.py b/docarray/array/mixins/io/json.py
@@ -55,24 +55,24 @@ def from_json(cls: Type['T'], file: Union[str, TextIO]) -> 'T':
         return cls.load_json(file)
 
     @classmethod
-    def from_list_safe(cls: Type['T'], values: List) -> 'T':
+    def from_list(cls: Type['T'], values: List) -> 'T':
         from .... import Document
 
         return cls(Document.from_dict(v) for v in values)
 
-    def to_list_safe(self) -> List:
+    def to_list(self, strict: bool = True) -> List:
         """Convert the object into a Python list.
 
         .. note::
             Array like object such as :class:`numpy.ndarray` will be converted to Python list.
 
         :return: a Python list
         """
-        return [d.to_dict() for d in self]
+        return [d.to_dict(strict=strict) for d in self]
 
     def to_json(self) -> str:
         """Convert the object into a JSON string. Can be loaded via :meth:`.load_json`.
 
         :return: a Python list
         """
-        return json.dumps(self.to_list_safe())
+        return json.dumps(self.to_list())
diff --git a/docarray/base.py b/docarray/base.py
@@ -16,6 +16,7 @@ def __init__(
         _obj: Optional['T'] = None,
         copy: bool = False,
         field_resolver: Optional[Dict[str, str]] = None,
+        unknown_fields_handler: str = 'catch',
         **kwargs,
     ):
         self._data = None
@@ -32,23 +33,26 @@ def __init__(
                 kwargs = {field_resolver.get(k, k): v for k, v in kwargs.items()}
 
             _unknown_kwargs = None
-            if hasattr(self, '_unresolved_fields_dest'):
-                _unresolved = set(kwargs.keys()).difference(
-                    {f.name for f in fields(self._data_class)}
-                )
-                if _unresolved:
-                    _unknown_kwargs = {k: kwargs[k] for k in _unresolved}
-                    for k in _unresolved:
-                        kwargs.pop(k)
+            _unresolved = set(kwargs.keys()).difference(
+                {f.name for f in fields(self._data_class)}
+            )
+
+            if _unresolved:
+                if unknown_fields_handler == 'raise':
+                    raise AttributeError(f'unknown attributes: {_unresolved}')
+
+                _unknown_kwargs = {k: kwargs[k] for k in _unresolved}
+                for k in _unresolved:
+                    kwargs.pop(k)
 
             self._data = self._data_class(self)
             for k, v in kwargs.items():
                 setattr(self._data, k, v)
 
-            if _unknown_kwargs:
+            if _unknown_kwargs and unknown_fields_handler == 'catch':
                 getattr(self, self._unresolved_fields_dest).update(_unknown_kwargs)
 
-        if _obj is None and not kwargs:
+        if _obj is None and not kwargs and self._data is None:
             self._data = self._data_class(self)
 
         if self._data is None:
@@ -100,8 +104,8 @@ def __hash__(self):
 
     def __repr__(self):
         content = str(self.non_empty_fields)
-        content += f' at {id(self)}'
-        return f'<{typename(self)} {content.strip()}>'
+        content += f' at {getattr(self, "id", id(self))}'
+        return f'<{self.__class__.__name__} {content.strip()}>'
 
     def __bytes__(self):
         return self.to_bytes()

diff --git a/docarray/document/data.py b/docarray/document/data.py
@@ -84,6 +84,7 @@ def __setattr__(self, key, value):
                     self.text = value
                 else:
                     self.blob = value
+                value = None
             elif key == 'chunks':
                 from ..array.chunk import ChunkArray
 

diff --git a/docarray/document/mixins/plot.py b/docarray/document/mixins/plot.py
@@ -7,66 +7,6 @@
 class PlotMixin:
     """Provide helper functions for :class:`Document` to plot and visualize itself. """
 
-    @property
-    def _mermaid_id(self):
-        if not hasattr(self, '__mermaid_id'):
-            self.__mermaid_id = random_identity()
-        return self.__mermaid_id
-
-    def __mermaid_str__(self):
-        results = []
-        _id = f'{self._mermaid_id[:3]}~Document~'
-
-        for idx, c in enumerate(self.chunks):
-            results.append(
-                f'{_id} --> "{idx + 1}/{len(self.chunks)}" {c._mermaid_id[:3]}~Document~: chunks'
-            )
-            results.append(c.__mermaid_str__())
-
-        for idx, c in enumerate(self.matches):
-            results.append(
-                f'{_id} ..> "{idx + 1}/{len(self.matches)}" {c._mermaid_id[:3]}~Document~: matches'
-            )
-            results.append(c.__mermaid_str__())
-
-        content = self.to_dict()
-        if 'chunks' in content:
-            content.pop('chunks')
-        if 'matches' in content:
-            content.pop('matches')
-        if content:
-            results.append(f'class {_id}{{')
-            for k, v in content.items():
-                if isinstance(v, (str, int, float, bytes)):
-                    results.append(f'+{k} {str(v)[:10]}')
-                else:
-                    results.append(f'+{k}({type(getattr(self, k, v))})')
-            results.append('}')
-
-        return '\n'.join(results)
-
-    def _mermaid_to_url(self, img_type: str) -> str:
-        """
-        Rendering the current flow as a url points to a SVG, it needs internet connection
-
-        :param img_type: the type of image to be generated
-        :return: the url pointing to a SVG
-        """
-        mermaid_str = (
-            """
-                                                                                %%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#FFC666'}}}%%
-                                                                                classDiagram
-
-                                                                                        """
-            + self.__mermaid_str__()
-        )
-
-        encoded_str = base64.b64encode(bytes(mermaid_str.strip(), 'utf-8')).decode(
-            'utf-8'
-        )
-
-        return f'https://mermaid.ink/{img_type}/{encoded_str}'
-
     def _ipython_display_(self):
         """Displays the object in IPython as a side effect"""
         self.summary()
@@ -92,7 +32,7 @@ def _plot_recursion(self, _str_list, indent, box_char='├─'):
                     _str_list, indent=len(prefix) + 4, box_char='└─'
                 )
 
-    def plot_image(self):
+    def plot(self):
         """ Plot image data from :attr:`.blob` or :attr:`.uri`. """
         from IPython.display import Image, display
 
@@ -104,38 +44,3 @@ def plot_image(self):
             display(Image(self.uri))
         else:
             raise ValueError('`uri` and `blob` is empty')
-
-    def plot(self, output: Optional[str] = None, inline_display: bool = False) -> None:
-        """
-        Visualize the Document recursively.
-
-        :param output: a filename specifying the name of the image to be created,
-                    the suffix svg/jpg determines the file type of the output image
-        :param inline_display: show image directly inside the Jupyter Notebook
-        """
-        image_type = 'svg'
-        if (
-            not output.endswith('.svg')
-            and not output.endswith('.jpg')
-            and not output.endswith('.jpeg')
-        ):
-            raise ValueError('`output` can be only SVG/JPG format')
-        elif output.endswith('.jpg') or output.endswith('.jpeg'):
-            image_type = 'img'
-
-        url = self._mermaid_to_url(image_type)
-        showed = False
-        if inline_display:
-            try:
-                from IPython.display import Image, display
-
-                display(Image(url=url))
-                showed = True
-            except:
-                # no need to panic users
-                pass
-
-        if output:
-            download_mermaid_url(url, output)
-        elif not showed:
-            print(f'Document visualization: {url}')
diff --git a/docarray/document/mixins/porting.py b/docarray/document/mixins/porting.py
@@ -1,5 +1,6 @@
+import dataclasses
 import pickle
-from typing import Optional, TYPE_CHECKING, Type, Dict
+from typing import Optional, TYPE_CHECKING, Type, Dict, Any
 
 from ...helper import compress_bytes, decompress_bytes
 
@@ -26,13 +27,16 @@ def from_json(cls: Type['T'], obj: str) -> 'T':
         json_format.Parse(obj, pb_msg)
         return cls.from_protobuf(pb_msg)
 
-    def to_dict(self):
-        from google.protobuf.json_format import MessageToDict
+    def to_dict(self, strict: bool = True) -> Dict[str, Any]:
+        if strict:
+            from google.protobuf.json_format import MessageToDict
 
-        return MessageToDict(
-            self.to_protobuf(),
-            preserving_proto_field_name=True,
-        )
+            return MessageToDict(
+                self.to_protobuf(),
+                preserving_proto_field_name=True,
+            )
+        else:
+            return dataclasses.asdict(self._data)
 
     def to_bytes(
         self, protocol: str = 'pickle', compress: Optional[str] = None
@@ -54,6 +58,13 @@ def from_bytes(
         protocol: str = 'pickle',
         compress: Optional[str] = None,
     ) -> 'T':
+        """Build Document object from binary bytes
+
+        :param data: binary bytes
+        :param protocol: protocol to use
+        :param compress: compress method to use
+        :return: a Document object
+        """
         bstr = decompress_bytes(data, algorithm=compress)
         if protocol == 'pickle':
             d = pickle.loads(bstr)

diff --git a/docarray/document/mixins/property.py b/docarray/document/mixins/property.py
@@ -11,6 +11,7 @@
 
 class PropertyMixin(_PropertyMixin):
     def _clear_content(self):
+        self._data.content = None
         self._data.text = None
         self._data.blob = None
         self._data.buffer = None

diff --git a/docarray/proto/io/__init__.py b/docarray/proto/io/__init__.py
@@ -13,6 +13,7 @@
 def parse_proto(pb_msg: 'DocumentProto') -> 'Document':
     from ... import Document
     from ...score import NamedScore
+
     fields = {}
     for (field, value) in pb_msg.ListFields():
         f_name = field.name
@@ -27,7 +28,9 @@ def parse_proto(pb_msg: 'DocumentProto') -> 'Document':
         elif f_name == 'scores' or f_name == 'evaluations':
             fields[f_name] = {}
             for k, v in value.items():
-                fields[f_name][k] = NamedScore({ff.name: vv for (ff, vv) in v.ListFields()})
+                fields[f_name][k] = NamedScore(
+                    {ff.name: vv for (ff, vv) in v.ListFields()}
+                )
         else:
             fields[f_name] = value
     return Document(**fields)
@@ -53,13 +56,17 @@ def flush_proto(doc: 'Document') -> 'DocumentProto':
                         setattr(getattr(pb_msg, key)[kk], ff, getattr(vv, ff))
             elif key == 'location':
                 pb_msg.location.extend(value)
+            elif key == 'content':
+                pass  # intentionally ignore `content` field as it is just a proxy
             else:
                 # other simple fields
                 setattr(pb_msg, key, value)
         except RecursionError as ex:
             if len(ex.args) >= 1:
-                ex.args = (f'Field `{key}` contains cyclic reference in memory. '
-                           f'Could it be your Document is referring to itself?',)
+                ex.args = (
+                    f'Field `{key}` contains cyclic reference in memory. '
+                    f'Could it be your Document is referring to itself?',
+                )
             raise
         except Exception as ex:
             if len(ex.args) >= 1: