Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 32 additions & 5 deletions docarray/array/mixins/io/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,14 @@ def load_binary(
file: Union[str, BinaryIO, bytes],
protocol: str = 'pickle-array',
compress: Optional[str] = None,
_show_progress: bool = False,
) -> 'T':
"""Load array elements from a binary file, optionally compressed with the algorithm given by ``compress``.

:param file: File or filename or serialized bytes where the data is stored.
:param protocol: protocol to use
:param compress: compress algorithm to use
:param _show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`

:return: a DocumentArray object
"""
Expand Down Expand Up @@ -57,9 +61,15 @@ def load_binary(
else:
_len = len(random_uuid().bytes)
_binary_delimiter = d[:_len] # first get delimiter
if _show_progress:
from rich.progress import track as _track

track = lambda x: _track(x, description='Deserializing')
else:
track = lambda x: x
return cls(
Document.from_bytes(od, protocol=protocol, compress=compress)
for od in d[_len:].split(_binary_delimiter)
for od in track(d[_len:].split(_binary_delimiter))
)

@classmethod
Expand All @@ -68,22 +78,27 @@ def from_bytes(
data: bytes,
protocol: str = 'pickle-array',
compress: Optional[str] = None,
_show_progress: bool = False,
) -> 'T':
return cls.load_binary(data, protocol=protocol, compress=compress)
return cls.load_binary(
data, protocol=protocol, compress=compress, _show_progress=_show_progress
)

def save_binary(
self,
file: Union[str, BinaryIO],
protocol: str = 'pickle-array',
compress: Optional[str] = None,
) -> None:
"""Save array elements into a LZ4 compressed binary file.
"""Save array elements into a binary file.

Comparing to :meth:`save_json`, it is faster and the file is smaller, but not human-readable.

.. note::
To get a binary presentation in memory, use ``bytes(...)``.

:param protocol: protocol to use
:param compress: compress algorithm to use
:param file: File or filename to which the data is saved.
"""
if isinstance(file, io.BufferedWriter):
Expand All @@ -101,11 +116,16 @@ def to_bytes(
protocol: str = 'pickle-array',
compress: Optional[str] = None,
_file_ctx: Optional[BinaryIO] = None,
_show_progress: bool = False,
) -> bytes:
"""Serialize itself into bytes with LZ4 compression.
"""Serialize itself into bytes.

For more Pythonic code, please use ``bytes(...)``.

:param _file_ctx: File or filename or serialized bytes where the data is stored.
:param protocol: protocol to use
:param compress: compress algorithm to use
:param _show_progress: show progress bar, only works when protocol is `pickle` or `protobuf`
:return: the binary serialization in bytes
"""

Expand All @@ -126,7 +146,14 @@ def to_bytes(
elif protocol == 'pickle-array':
f.write(pickle.dumps(self))
else:
for d in self:
if _show_progress:
from rich.progress import track as _track

track = lambda x: _track(x, description='Serializing')
else:
track = lambda x: x

for d in track(self):
f.write(_binary_delimiter)
f.write(d.to_bytes(protocol=protocol, compress=compress))
if not _file_ctx:
Expand Down
41 changes: 28 additions & 13 deletions docarray/array/mixins/io/pushpull.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@ class PushPullMixin:
"""Transmitting :class:`DocumentArray` via Jina Cloud Service"""

_service_url = 'https://apihubble.jina.ai/v2/rpc/da.'
_max_bytes = 4 * 1024 * 1024 * 1024

def push(
self, token: str, show_progress: bool = False, compress: Optional[str] = None
) -> None:
def push(self, token: str, show_progress: bool = False) -> None:
"""Push this DocumentArray object to Jina Cloud which can be later retrieved via :meth:`.pull`

.. note::
Expand All @@ -30,6 +29,8 @@ def push(
"""
import requests

dict_data = self._get_dict_data(token, show_progress)

progress = _get_progressbar(show_progress)
task_id = progress.add_task('upload', start=False) if show_progress else None

Expand All @@ -52,14 +53,6 @@ def read(self, n=-1):
self._p_bar.update(self._task_id, advance=len(chunk))
return chunk

dict_data = {
'file': (
'DocumentArray',
self.to_bytes(protocol='protobuf', compress=compress),
),
'token': token,
}

(data, ctype) = requests.packages.urllib3.filepost.encode_multipart_formdata(
dict_data
)
Expand All @@ -75,7 +68,6 @@ def pull(
cls: Type['T'],
token: str,
show_progress: bool = False,
compress: Optional[str] = None,
) -> 'T':
"""Pulling a :class:`DocumentArray` from Jina Cloud Service to local.

Expand Down Expand Up @@ -110,10 +102,33 @@ def pull(
if show_progress:
progress.update(task_id, advance=len(chunk))

if show_progress:
progress.stop()
return cls.from_bytes(
f.getvalue(), protocol='protobuf', compress=compress
f.getvalue(),
protocol='protobuf',
compress='gzip',
_show_progress=show_progress,
)

def _get_dict_data(self, token, show_progress):
_serialized = self.to_bytes(
protocol='protobuf', compress='gzip', _show_progress=show_progress
)
if len(_serialized) > self._max_bytes:
raise ValueError(
f'DocumentArray is too big. '
f'Size of the serialization {len(_serialized)} is larger than {self._max_bytes}.'
)

return {
'file': (
'DocumentArray',
_serialized,
),
'token': token,
}


def _get_progressbar(show_progress):
if show_progress:
Expand Down
12 changes: 4 additions & 8 deletions docs/fundamentals/documentarray/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,9 @@ da = DocumentArray.from_dataframe(df)
This feature requires `rich` and `requests` dependency. You can do `pip install "docarray[full]"` to install it.
```

{meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.push` and {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.pull` allows you to share a DocumentArray object across machines.
{meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.push` and {meth}`~docarray.array.mixins.io.pushpull.PushPullMixin.pull` allow you to serialize a DocumentArray object to Jina Cloud and share it across machines.

Considering you are working on a GPU machine via Google Colab/Jupyter. After preprocessing and embedding, you got everything you need in a DocumentArray. You can easily transfer it to the local laptop via:
Suppose you are working on a GPU machine via Google Colab/Jupyter. After preprocessing and embedding, you have everything you need in a DocumentArray. You can easily store it in the cloud via:

```python
from docarray import DocumentArray
Expand All @@ -230,7 +230,7 @@ da.push(token='myda123')
```{figure} images/da-push.png
```

Then on your local laptop, simply
Then on your local laptop, simply pull it:

```python
from docarray import DocumentArray
Expand All @@ -240,8 +240,4 @@ da = DocumentArray.pull(token='myda123')

Now you can continue working locally, analyzing `da` or visualizing it. Your friends & colleagues who know the token `myda123` can also pull that DocumentArray. This is useful when you want to quickly share results with your colleagues & friends.

For more information of this feature, please refer to {class}`~jina.types.arrays.mixins.io.pushpull.PushPullMixin`.

```{danger}
The lifetime of the storage is not promised at the moment: it could be a day, it could be a week. Do not use it for persistence in production. Only consider this as temporary transmission or a clipboard.
```
The maximum size of an upload is 4GB under the `protocol='protobuf'` and `compress='gzip'` setting. The lifetime of an upload is one week after its creation.
9 changes: 9 additions & 0 deletions tests/unit/array/test_from_to_bytes.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,12 @@ def test_from_to_protobuf(target_da):
@pytest.mark.parametrize('target_da', [DocumentArray.empty(100), random_docs(100)])
def test_from_to_safe_list(target_da):
DocumentArray.from_list(target_da.to_list())


@pytest.mark.parametrize('protocol', ['protobuf', 'pickle'])
@pytest.mark.parametrize('show_progress', [True, False])
def test_push_pull_show_progress(show_progress, protocol):
    """Round-trip an array through ``to_bytes``/``from_bytes`` with the progress flag on and off."""
    # The same options are used for both directions of the round trip.
    options = dict(_show_progress=show_progress, protocol=protocol)

    source_da = DocumentArray.empty(1000)
    serialized = source_da.to_bytes(**options)
    restored_da = DocumentArray.from_bytes(serialized, **options)

    assert source_da == restored_da