Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/requirements-cicd.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ matplotlib
rich
Pillow
lz4
fastapi
fastapi
jupyterlab
31 changes: 29 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@

<!-- start elevator-pitch -->

DocArray is a library for nested, unstructured data such as text, image, audio, video, 3D mesh. Its Pythonic interface allows deep learning engineers to easily preprocess, embed, search, recommend and transfer the data.
DocArray is a library for nested, unstructured data such as text, image, audio, video, 3D mesh. It allows deep learning engineers to easily preprocess, embed, search, recommend and transfer the data.

🌌 **All data types**: super-expressive data structure for representing complicated/mixed/nested text, image, video, audio, 3D mesh data.

🧑‍🔬 **Data science powerhouse**: easy-to-use functions for facilitating data scientists work on embedding, matching, visualizing, evaluating via Torch/Tensorflow/ONNX/PaddlePaddle.
🐍 **Pythonic API**: easy-to-use idioms and interfaces just as the native Python List. If you know how to Python, you know how to DocArray.

🧑‍🔬 **Data science powerhouse**: greatly facilitates data scientists' work on embedding, matching, visualizing, evaluating via Torch/Tensorflow/ONNX/PaddlePaddle.

🚡 **Portable**: ready to wire at any time with efficient and compact serialization from/to Protobuf, binary, JSON, CSV, dataframe.

Expand Down Expand Up @@ -196,6 +198,31 @@ recall@5 0.0573470744680851

More metrics can be used such as `precision_at_k`, `ndcg_at_k`, `hit_at_k`.



### Save results

You can save a DocumentArray to binary, JSON, dict, dataframe, CSV or Protobuf message. In its simplest form,

```python
left_da.save('left_da.bin')
```

To reuse it, do `left_da = DocumentArray.load('left_da.bin')`.

If you want to transfer a DocumentArray from one machine to another or share it with your colleagues, you can do:

```python
left_da.push(token='my_shared_da')
```

```python
left_da = DocumentArray.pull(token='my_shared_da')
```

Anyone who knows the token `my_shared_da` can pull and work on it.


Intrigued? That's only scratching the surface of what DocArray is capable of. [Read our docs to learn more](https://docarray.jina.ai).

<!-- start support-pitch -->
Expand Down
4 changes: 2 additions & 2 deletions docarray/array/mixins/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class CommonIOMixin:
"""The common IO helper function for arrays. """

def save(
self, file: Union[str, TextIO, BinaryIO], file_format: str = 'json'
self, file: Union[str, TextIO, BinaryIO], file_format: str = 'binary'
) -> None:
"""Save array elements into a JSON, a binary file or a CSV file.

Expand All @@ -28,7 +28,7 @@ def save(

@classmethod
def load(
cls: Type['T'], file: Union[str, TextIO, BinaryIO], file_format: str = 'json'
cls: Type['T'], file: Union[str, TextIO, BinaryIO], file_format: str = 'binary'
) -> 'T':
"""Load array elements from a JSON or a binary file, or a CSV file.

Expand Down
19 changes: 14 additions & 5 deletions docarray/array/mixins/io/pushpull.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import io
from contextlib import nullcontext
from typing import Type, TYPE_CHECKING
from typing import Type, TYPE_CHECKING, Optional

from ....helper import get_request_header

Expand All @@ -13,7 +13,9 @@ class PushPullMixin:

_service_url = 'https://apihubble.jina.ai/v2/rpc/da.'

def push(self, token: str, show_progress: bool = False) -> None:
def push(
self, token: str, show_progress: bool = False, compress: Optional[str] = None
) -> None:
"""Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.pull`

.. note::
Expand Down Expand Up @@ -53,7 +55,7 @@ def read(self, n=-1):
dict_data = {
'file': (
'DocumentArray',
self.to_bytes(protocol='protobuf', compress='gzip'),
self.to_bytes(protocol='protobuf', compress=compress),
),
'token': token,
}
Expand All @@ -69,7 +71,12 @@ def read(self, n=-1):
requests.post(self._service_url + 'push', data=body, headers=headers)

@classmethod
def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T':
def pull(
cls: Type['T'],
token: str,
show_progress: bool = False,
compress: Optional[str] = None,
) -> 'T':
"""Pulling a :class:`DocumentArray` from Jina Cloud Service to local.

:param token: the upload token set during :meth:`.push`
Expand Down Expand Up @@ -103,7 +110,9 @@ def pull(cls: Type['T'], token: str, show_progress: bool = False) -> 'T':
if show_progress:
progress.update(task_id, advance=len(chunk))

return cls.from_bytes(f.getvalue(), protocol='protobuf', compress='lz4')
return cls.from_bytes(
f.getvalue(), protocol='protobuf', compress=compress
)


def _get_progressbar(show_progress):
Expand Down
44 changes: 39 additions & 5 deletions docarray/document/mixins/plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,10 @@ def _mermaid_to_url(self, img_type: str) -> str:
"""
mermaid_str = (
"""
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#FFC666'}}}%%
classDiagram

"""
%%{init: {'theme': 'base', 'themeVariables': { 'primaryColor': '#FFC666'}}}%%
classDiagram
"""
+ self.__mermaid_str__()
)

Expand All @@ -69,7 +69,41 @@ def _mermaid_to_url(self, img_type: str) -> str:

def _ipython_display_(self):
"""Displays the object in IPython as a side effect"""
self.plot(inline_display=True)
self.summary()

def summary(self) -> None:
    """Print the non-empty fields and nested structure of this Document.

    The output lines are gathered by :meth:`_plot_recursion`, starting at
    indentation ``0``, then joined with newlines and printed in one call.
    """
    lines = []
    self._plot_recursion(lines, indent=0)
    rendered = '\n'.join(lines)
    print(rendered)

def _plot_recursion(self, _str_list, indent, box_char='├─'):
    """Append a text-tree rendering of this Document to ``_str_list``.

    :param _str_list: list of output lines; it is extended in place
    :param indent: number of leading spaces for this Document's own line;
        a falsy value (``0``) renders the line without any box character
    :param box_char: branch glyph placed in front of this Document's line
    """
    own_prefix = f"{' ' * indent}{box_char}" if indent else ''
    _str_list.append(f'{own_prefix} {self}')

    for attr_name in ('matches', 'chunks'):
        children = getattr(self, attr_name)
        if not children:
            continue

        # Header line naming the relation, 4 spaces deeper than this Document.
        padding = ' ' * (indent + 4)
        label_prefix = padding + '└─'
        _str_list.append(f'{label_prefix} {attr_name}')

        # Children sit 4 columns past the end of the header prefix.
        child_indent = len(label_prefix) + 4
        for child in children[:-1]:
            child._plot_recursion(_str_list, indent=child_indent)
        # The last child closes the branch with a corner glyph.
        children[-1]._plot_recursion(
            _str_list, indent=child_indent, box_char='└─'
        )

def plot_image(self):
    """Display the image held in :attr:`.blob`, else the one at :attr:`.uri`.

    :attr:`.blob` takes precedence whenever it is not ``None``.

    :raises ValueError: if :attr:`.blob` is ``None`` and :attr:`.uri` is empty
    """
    from IPython.display import Image, display

    if self.blob is None:
        if not self.uri:
            raise ValueError('`uri` and `blob` is empty')
        display(Image(self.uri))
    else:
        # PIL is only needed on the blob path, so import it lazily here.
        import PIL.Image

        display(PIL.Image.fromarray(self.blob))

def plot(self, output: Optional[str] = None, inline_display: bool = False) -> None:
"""
Expand Down
15 changes: 15 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@

## Install

```{tip}
Jina 3.x users do not need to install `docarray` separately, as it is shipped with Jina. To check your Jina version, type `jina -vf` in the console.
```

Make sure you have Python 3.7+ and `numpy` installed on Linux/Mac/Windows:

````{tab} Basic install
Expand Down Expand Up @@ -41,6 +45,17 @@ The following dependencies will be installed to enable additional features:
Alternatively, you can first do basic installation and then install missing dependencies on-demand.
````

```pycon
>>> import docarray
>>> docarray.__version__
'0.1.0'
```

```{attention}
If the printed version is lower than `0.1.0`, say `0.0.x`, then `docarray` is
not installed correctly. You are probably still using an old `docarray` shipped with Jina 2.x.
```




Expand Down
29 changes: 29 additions & 0 deletions tests/unit/document/test_summary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import os

from docarray import Document

cur_dir = os.path.dirname(os.path.abspath(__file__))


def test_single_doc_summary():
    """Smoke-test ``summary()`` on an empty Document and on a nested one."""
    # empty doc
    empty_doc = Document()
    empty_doc.summary()

    # nested doc: three chunks (the middle one has its own chunk), two matches
    chunk_docs = [
        Document(),
        Document(chunks=[Document()]),
        Document(),
    ]
    match_docs = [Document(), Document()]
    nested_doc = Document(chunks=chunk_docs, matches=match_docs)
    nested_doc.summary()


def test_plot_image():
    """Smoke-test ``plot_image()`` via the URI path, then via the blob path."""
    image_path = os.path.join(cur_dir, 'toydata/test.png')
    d = Document(uri=image_path)
    # First call: only `uri` is set.
    d.plot_image()

    # Load the image into `blob` and clear `uri` so the blob branch is used.
    d.load_uri_to_image_blob()
    d.uri = None

    d.plot_image()