Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion docarray/array/chunk.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from typing import TYPE_CHECKING
import itertools
from typing import (
TYPE_CHECKING,
Generator,
Iterator,
Sequence,
)

from .document import DocumentArray

Expand All @@ -24,6 +30,15 @@ def __init__(self, docs, reference_doc: 'Document'):
"""
self._ref_doc = reference_doc
super().__init__(docs)
if (
isinstance(
docs, (DocumentArray, Sequence, Generator, Iterator, itertools.chain)
)
and self._ref_doc is not None
):
for d in docs:
d.parent_id = self._ref_doc.id
d.granularity = self._ref_doc.granularity + 1

def append(self, document: 'Document'):
"""Add a sub-document (i.e. chunk) to the current Document.
Expand Down
2 changes: 1 addition & 1 deletion docarray/array/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def __bool__(self):
return len(self) > 0

def __repr__(self):
return f'<{typename(self)} (length={len(self)}) at {id(self)}>'
return f'<{self.__class__.__name__} (length={len(self)}) at {id(self)}>'

def __add__(self, other: 'Document'):
v = type(self)()
Expand Down
17 changes: 15 additions & 2 deletions docarray/array/match.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@
from typing import TYPE_CHECKING
import itertools
from typing import (
TYPE_CHECKING,
Generator,
Iterator,
Sequence,
)

from .. import DocumentArray

Expand All @@ -18,13 +24,20 @@ class MatchArray(DocumentArray):
def __init__(self, docs, reference_doc: 'Document'):
    """Build the array of matches that belongs to ``reference_doc``.

    Every Document in ``docs`` gets its ``adjacency`` set to one more than
    the adjacency of ``reference_doc``.

    :param docs: the matched Documents; a :class:`DocumentArray`, a sequence,
        or a one-shot iterable (generator, iterator, ``itertools.chain``)
    :param reference_doc: the Document these matches are attached to; when
        it is ``None`` the Documents are stored without touching ``adjacency``
    """
    self._ref_doc = reference_doc
    annotate = self._ref_doc is not None and isinstance(
        docs, (DocumentArray, Sequence, Generator, Iterator, itertools.chain)
    )
    if annotate and not isinstance(docs, (DocumentArray, Sequence)):
        # Generators / iterators / chains can be walked only once. The parent
        # constructor consumes them, which used to leave the loop below with
        # nothing to iterate, so `adjacency` was silently never set.
        # Materialise them so both passes see the same Document objects.
        docs = list(docs)
    super().__init__(docs)
    if annotate:
        for d in docs:
            d.adjacency = self._ref_doc.adjacency + 1

def append(self, document: 'Document'):
"""Add a matched document to the current Document.

:param document: matched Document to be added
"""
document.granularity = self._ref_doc.granularity
document.adjacency = self._ref_doc.adjacency + 1
super().append(document)

Expand Down
2 changes: 1 addition & 1 deletion docarray/array/mixins/io/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def to_dataframe(self, **kwargs) -> 'DataFrame':
"""
from pandas import DataFrame

return DataFrame.from_dict(self.to_list_safe(), **kwargs)
return DataFrame.from_dict(self.to_list(), **kwargs)

@classmethod
def from_dataframe(cls: Type['T'], df: 'DataFrame') -> 'T':
Expand Down
8 changes: 4 additions & 4 deletions docarray/array/mixins/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,24 +55,24 @@ def from_json(cls: Type['T'], file: Union[str, TextIO]) -> 'T':
return cls.load_json(file)

@classmethod
def from_list_safe(cls: Type['T'], values: List) -> 'T':
def from_list(cls: Type['T'], values: List) -> 'T':
from .... import Document

return cls(Document.from_dict(v) for v in values)

def to_list_safe(self) -> List:
def to_list(self, strict: bool = True) -> List:
"""Convert the object into a Python list.

.. note::
Array like object such as :class:`numpy.ndarray` will be converted to Python list.

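:param strict: passed through to each Document's ``to_dict``; when true the Document is converted via its protobuf message, otherwise via ``dataclasses.asdict``
:return: a Python list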
"""
return [d.to_dict() for d in self]
return [d.to_dict(strict=strict) for d in self]

def to_json(self) -> str:
"""Convert the object into a JSON string. Can be loaded via :meth:`.load_json`.

:return: a JSON string
"""
return json.dumps(self.to_list_safe())
return json.dumps(self.to_list())
28 changes: 16 additions & 12 deletions docarray/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def __init__(
_obj: Optional['T'] = None,
copy: bool = False,
field_resolver: Optional[Dict[str, str]] = None,
unknown_fields_handler: str = 'catch',
**kwargs,
):
self._data = None
Expand All @@ -32,23 +33,26 @@ def __init__(
kwargs = {field_resolver.get(k, k): v for k, v in kwargs.items()}

_unknown_kwargs = None
if hasattr(self, '_unresolved_fields_dest'):
_unresolved = set(kwargs.keys()).difference(
{f.name for f in fields(self._data_class)}
)
if _unresolved:
_unknown_kwargs = {k: kwargs[k] for k in _unresolved}
for k in _unresolved:
kwargs.pop(k)
_unresolved = set(kwargs.keys()).difference(
{f.name for f in fields(self._data_class)}
)

if _unresolved:
if unknown_fields_handler == 'raise':
raise AttributeError(f'unknown attributes: {_unresolved}')

_unknown_kwargs = {k: kwargs[k] for k in _unresolved}
for k in _unresolved:
kwargs.pop(k)

self._data = self._data_class(self)
for k, v in kwargs.items():
setattr(self._data, k, v)

if _unknown_kwargs:
if _unknown_kwargs and unknown_fields_handler == 'catch':
getattr(self, self._unresolved_fields_dest).update(_unknown_kwargs)

if _obj is None and not kwargs:
if _obj is None and not kwargs and self._data is None:
self._data = self._data_class(self)

if self._data is None:
Expand Down Expand Up @@ -100,8 +104,8 @@ def __hash__(self):

def __repr__(self):
content = str(self.non_empty_fields)
content += f' at {id(self)}'
return f'<{typename(self)} {content.strip()}>'
content += f' at {getattr(self, "id", id(self))}'
return f'<{self.__class__.__name__} {content.strip()}>'

def __bytes__(self):
return self.to_bytes()
Expand Down
1 change: 1 addition & 0 deletions docarray/document/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def __setattr__(self, key, value):
self.text = value
else:
self.blob = value
value = None
elif key == 'chunks':
from ..array.chunk import ChunkArray

Expand Down
97 changes: 1 addition & 96 deletions docarray/document/mixins/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,66 +7,6 @@
class PlotMixin:
"""Provide helper functions for :class:`Document` to plot and visualize itself. """

@property
def _mermaid_id(self):
if not hasattr(self, '__mermaid_id'):
self.__mermaid_id = random_identity()
return self.__mermaid_id

def __mermaid_str__(self):
results = []
_id = f'{self._mermaid_id[:3]}~Document~'

for idx, c in enumerate(self.chunks):
results.append(
f'{_id} --> "{idx + 1}/{len(self.chunks)}" {c._mermaid_id[:3]}~Document~: chunks'
)
results.append(c.__mermaid_str__())

for idx, c in enumerate(self.matches):
results.append(
f'{_id} ..> "{idx + 1}/{len(self.matches)}" {c._mermaid_id[:3]}~Document~: matches'
)
results.append(c.__mermaid_str__())

content = self.to_dict()
if 'chunks' in content:
content.pop('chunks')
if 'matches' in content:
content.pop('matches')
if content:
results.append(f'class {_id}{{')
for k, v in content.items():
if isinstance(v, (str, int, float, bytes)):
results.append(f'+{k} {str(v)[:10]}')
else:
results.append(f'+{k}({type(getattr(self, k, v))})')
results.append('}')

return '\n'.join(results)

def _mermaid_to_url(self, img_type: str) -> str:
"""
Rendering the current flow as a url points to a SVG, it needs internet connection

:param img_type: the type of image to be generated
:return: the url pointing to a SVG
"""
mermaid_str = (
"""
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#FFC666'}}}%%
classDiagram

"""
+ self.__mermaid_str__()
)

encoded_str = base64.b64encode(bytes(mermaid_str.strip(), 'utf-8')).decode(
'utf-8'
)

return f'https://mermaid.ink/{img_type}/{encoded_str}'

def _ipython_display_(self):
"""Displays the object in IPython as a side effect"""
self.summary()
Expand All @@ -92,7 +32,7 @@ def _plot_recursion(self, _str_list, indent, box_char='├─'):
_str_list, indent=len(prefix) + 4, box_char='└─'
)

def plot_image(self):
def plot(self):
""" Plot image data from :attr:`.blob` or :attr:`.uri`. """
from IPython.display import Image, display

Expand All @@ -104,38 +44,3 @@ def plot_image(self):
display(Image(self.uri))
else:
raise ValueError('`uri` and `blob` is empty')

def plot(self, output: Optional[str] = None, inline_display: bool = False) -> None:
"""
Visualize the Document recursively.

:param output: a filename specifying the name of the image to be created,
the suffix svg/jpg determines the file type of the output image
:param inline_display: show image directly inside the Jupyter Notebook
"""
image_type = 'svg'
if (
not output.endswith('.svg')
and not output.endswith('.jpg')
and not output.endswith('.jpeg')
):
raise ValueError('`output` can be only SVG/JPG format')
elif output.endswith('.jpg') or output.endswith('.jpeg'):
image_type = 'img'

url = self._mermaid_to_url(image_type)
showed = False
if inline_display:
try:
from IPython.display import Image, display

display(Image(url=url))
showed = True
except:
# no need to panic users
pass

if output:
download_mermaid_url(url, output)
elif not showed:
print(f'Document visualization: {url}')
25 changes: 18 additions & 7 deletions docarray/document/mixins/porting.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import dataclasses
import pickle
from typing import Optional, TYPE_CHECKING, Type, Dict
from typing import Optional, TYPE_CHECKING, Type, Dict, Any

from ...helper import compress_bytes, decompress_bytes

Expand All @@ -26,13 +27,16 @@ def from_json(cls: Type['T'], obj: str) -> 'T':
json_format.Parse(obj, pb_msg)
return cls.from_protobuf(pb_msg)

def to_dict(self):
from google.protobuf.json_format import MessageToDict
def to_dict(self, strict: bool = True) -> Dict[str, Any]:
if strict:
from google.protobuf.json_format import MessageToDict

return MessageToDict(
self.to_protobuf(),
preserving_proto_field_name=True,
)
return MessageToDict(
self.to_protobuf(),
preserving_proto_field_name=True,
)
else:
return dataclasses.asdict(self._data)

def to_bytes(
self, protocol: str = 'pickle', compress: Optional[str] = None
Expand All @@ -54,6 +58,13 @@ def from_bytes(
protocol: str = 'pickle',
compress: Optional[str] = None,
) -> 'T':
"""Build Document object from binary bytes

:param data: binary bytes
:param protocol: protocol to use
:param compress: compress method to use
:return: a Document object
"""
bstr = decompress_bytes(data, algorithm=compress)
if protocol == 'pickle':
d = pickle.loads(bstr)
Expand Down
1 change: 1 addition & 0 deletions docarray/document/mixins/property.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

class PropertyMixin(_PropertyMixin):
def _clear_content(self):
self._data.content = None
self._data.text = None
self._data.blob = None
self._data.buffer = None
Expand Down
13 changes: 10 additions & 3 deletions docarray/proto/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
def parse_proto(pb_msg: 'DocumentProto') -> 'Document':
from ... import Document
from ...score import NamedScore

fields = {}
for (field, value) in pb_msg.ListFields():
f_name = field.name
Expand All @@ -27,7 +28,9 @@ def parse_proto(pb_msg: 'DocumentProto') -> 'Document':
elif f_name == 'scores' or f_name == 'evaluations':
fields[f_name] = {}
for k, v in value.items():
fields[f_name][k] = NamedScore({ff.name: vv for (ff, vv) in v.ListFields()})
fields[f_name][k] = NamedScore(
{ff.name: vv for (ff, vv) in v.ListFields()}
)
else:
fields[f_name] = value
return Document(**fields)
Expand All @@ -53,13 +56,17 @@ def flush_proto(doc: 'Document') -> 'DocumentProto':
setattr(getattr(pb_msg, key)[kk], ff, getattr(vv, ff))
elif key == 'location':
pb_msg.location.extend(value)
elif key == 'content':
pass # intentionally ignore `content` field as it is just a proxy
else:
# other simple fields
setattr(pb_msg, key, value)
except RecursionError as ex:
if len(ex.args) >= 1:
ex.args = (f'Field `{key}` contains cyclic reference in memory. '
f'Could it be your Document is referring to itself?',)
ex.args = (
f'Field `{key}` contains cyclic reference in memory. '
f'Could it be your Document is referring to itself?',
)
raise
except Exception as ex:
if len(ex.args) >= 1:
Expand Down
Loading