Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions docarray/array/mixins/io/binary.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class BinaryIOMixin:
def load_binary(
cls: Type['T'],
file: Union[str, BinaryIO, bytes],
protocol: str = 'pickle-once',
protocol: str = 'pickle-array',
compress: Optional[str] = None,
) -> 'T':
"""Load array elements from a LZ4-compressed binary file.
Expand All @@ -45,14 +45,14 @@ def load_binary(
d = decompress_bytes(d, algorithm=compress)
compress = None

if protocol == 'protobuf-once':
if protocol == 'protobuf-array':
from ....proto.docarray_pb2 import DocumentArrayProto

dap = DocumentArrayProto()
dap.ParseFromString(d)

return cls.from_protobuf(dap)
elif protocol == 'pickle-once':
elif protocol == 'pickle-array':
return pickle.loads(d)
else:
_len = len(random_uuid().bytes)
Expand All @@ -66,15 +66,15 @@ def load_binary(
def from_bytes(
cls: Type['T'],
data: bytes,
protocol: str = 'pickle-once',
protocol: str = 'pickle-array',
compress: Optional[str] = None,
) -> 'T':
return cls.load_binary(data, protocol=protocol, compress=compress)

def save_binary(
self,
file: Union[str, BinaryIO],
protocol: str = 'pickle-once',
protocol: str = 'pickle-array',
compress: Optional[str] = None,
) -> None:
"""Save array elements into a LZ4 compressed binary file.
Expand All @@ -98,7 +98,7 @@ def save_binary(

def to_bytes(
self,
protocol: str = 'pickle-once',
protocol: str = 'pickle-array',
compress: Optional[str] = None,
_file_ctx: Optional[BinaryIO] = None,
) -> bytes:
Expand All @@ -121,9 +121,9 @@ def to_bytes(
fc = f
compress = None
with fc:
if protocol == 'protobuf-once':
if protocol == 'protobuf-array':
f.write(self.to_protobuf().SerializePartialToString())
elif protocol == 'pickle-once':
elif protocol == 'pickle-array':
f.write(pickle.dumps(self))
else:
for d in self:
Expand Down
72 changes: 39 additions & 33 deletions docs/fundamentals/document/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
DocArray is designed to be "ready-to-wire": it assumes you always want to send and receive Documents over the network across microservices. Hence, serialization of a Document is important. This chapter introduces multiple serialization methods for a single Document.

```{tip}
One should use DocumentArray for serializing multiple Documents, instead of looping over Documents one by one. The former is much faster and yields a more compact serialization.
One should use {ref}`DocumentArray for serializing multiple Documents<docarray-serialization>`, instead of looping over Documents one by one. The former is much faster and yields a more compact serialization.
```


Expand Down Expand Up @@ -47,38 +47,7 @@ print(d_as_json, d)
<Document ('id', 'mime_type', 'text', 'embedding') at 27d4fa4c6d5711ec8c831e008a366d49>
```


## From/to dict

```{important}
This feature requires `protobuf` dependency. You can do `pip install docarray[full]` to install it.
```

You can serialize a Document as a Python `dict` via {meth}`~docarray.document.mixins.porting.PortingMixin.to_dict`, and then read from it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_dict`.

```python
from docarray import Document
import numpy as np

d_as_dict = Document(text='hello, world', embedding=np.array([1, 2, 3])).to_dict()

d = Document.from_dict(d_as_dict)

print(d_as_dict, d)
```

```text
{'id': 'b29d39066d5611ec87661e008a366d49', 'text': 'hello, world', 'mime_type': 'text/plain', 'embedding': {'dense': {'buffer': 'AQAAAAAAAAACAAAAAAAAAAMAAAAAAAAA', 'shape': [3], 'dtype': '<i8'}, 'cls_name': 'numpy'}}

<Document ('id', 'mime_type', 'text', 'embedding') at b29d39066d5611ec87661e008a366d49>
```

```{note}
Note that the resulting dict is very "strict" in the sense that all fields and values boil down to very basic data types such as `int`, `float`, `string`. This behavior is by design, because serialization to `dict` is often an intermediate step of serializing into JSON/YAML. Hence all values in the `dict` must be schema-friendly. After all, a Python `dict` object means nothing if you are not working in Python.

You can use `to_dict(strict=False)` to override this behavior. This will preserve the original Python data type of every value, which may not be JSON-friendly. But hey, you want it.
```

(doc-in-bytes)=
## From/to bytes

```{important}
Expand Down Expand Up @@ -127,6 +96,43 @@ Note that when deserializing from a non-default binary serialization, you need t
d = Document.from_bytes(d_bytes, protocol='protobuf', compress='gzip')
```

```{tip}
If you go with the default `protocol` and `compress` settings, you can simply use `bytes(d)`, which is more Pythonic.
```


## From/to dict

```{important}
This feature requires `protobuf` dependency. You can do `pip install docarray[full]` to install it.
```

You can serialize a Document as a Python `dict` via {meth}`~docarray.document.mixins.porting.PortingMixin.to_dict`, and then read from it via {meth}`~docarray.document.mixins.porting.PortingMixin.from_dict`.

```python
from docarray import Document
import numpy as np

d_as_dict = Document(text='hello, world', embedding=np.array([1, 2, 3])).to_dict()

d = Document.from_dict(d_as_dict)

print(d_as_dict, d)
```

```text
{'id': 'b29d39066d5611ec87661e008a366d49', 'text': 'hello, world', 'mime_type': 'text/plain', 'embedding': {'dense': {'buffer': 'AQAAAAAAAAACAAAAAAAAAAMAAAAAAAAA', 'shape': [3], 'dtype': '<i8'}, 'cls_name': 'numpy'}}

<Document ('id', 'mime_type', 'text', 'embedding') at b29d39066d5611ec87661e008a366d49>
```

(strict-arg-explain)=
```{note}
Note that the resulting dict is very "strict" in the sense that all fields and values boil down to very basic data types such as `int`, `float`, `string`. This behavior is by design, because serialization to `dict` is often an intermediate step of serializing into JSON/YAML. Hence all values in the `dict` must be schema-friendly. After all, a Python `dict` object means nothing if you are not working in Python.

You can use `to_dict(strict=False)` to override this behavior. This will preserve the original Python data type of every value, which may not be JSON-friendly. But hey, you want it.
```

## From/to Protobuf

```{important}
Expand Down
54 changes: 53 additions & 1 deletion docs/fundamentals/documentarray/construct.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,62 @@
```python
from docarray import DocumentArray

da = DocumentArray()
```

```text
<DocumentArray (length=0) at 4453362704>
```

Now you can use list-like interfaces such as `.append()` and `.extend()`, just as you would when adding elements to a Python list.

```python
da.append(Document(text='hello world!'))
da.extend([Document(text='hello'), Document(text='world!')])
```

```text
<DocumentArray (length=3) at 4446140816>
```

Directly printing a DocumentArray does not show much useful information; for a more detailed overview, you can use {meth}`~docarray.array.mixins.plot.PlotMixin.summary`.

```{important}
This feature requires `rich` dependency. You can do `pip install docarray[full]` to install it.
```

```python
da.summary()
```

```text
Documents Summary

Length 3
Homogenous Documents True
Common Attributes ('id', 'mime_type', 'text')

Attributes Summary

Attribute Data type #Unique values Has empty value
──────────────────────────────────────────────────────────
id ('str',) 3 False
mime_type ('str',) 1 False
text ('str',) 3 False
```

## Construct with empty Documents

Like `numpy.zeros()`, you can quickly build a DocumentArray with only empty Documents:

```python
from docarray import DocumentArray

da = DocumentArray.empty(10)
```

```text
<DocumentArray (length=10) at 4456123280>
<DocumentArray (length=10) at 4453362704>
```

## Construct from list-like objects
Expand Down Expand Up @@ -41,6 +92,7 @@ da = DocumentArray((Document() for _ in range(10)))
```
````


As DocumentArray itself is also a "list-like object that yields `Document`", you can also construct DocumentArray from another DocumentArray:

```python
Expand Down
1 change: 1 addition & 0 deletions docs/fundamentals/documentarray/images/benchmark-size.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 1 addition & 0 deletions docs/fundamentals/documentarray/images/benchmark-time.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading