Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,32 @@ Now anyone who knows the token `my_shared_da` can pull and work on it.
left_da = DocumentArray.pull(token='my_shared_da')
```

Intrigued? That's only scratching the surface of what DocArray is capable of. [Read our docs to learn more](https://docarray.jina.ai).

## Get Started for NLP Engineers

In this 10-line code example, we search "Pride and Prejudice" for the top-5 sentences most similar to `she entered the room`.

```python
from docarray import Document, DocumentArray

d = Document(uri='https://www.gutenberg.org/files/1342/1342-0.txt').load_uri_to_text()
da = DocumentArray(Document(text=s.strip()) for s in d.text.split('\n') if s.strip())
da.apply(lambda d: d.embed_feature_hashing())

q = (Document(text='she entered the room')
.embed_feature_hashing()
.match(da, limit=5, exclude_self=True, metric='jaccard', use_scipy=True))

print(q.matches[:, ('text', 'scores__jaccard')])
```

```text
[['staircase, than she entered the breakfast-room, and congratulated', 'of the room.', 'She entered the room with an air more than usually ungracious,', 'entered the breakfast-room, where Mrs. Bennet was alone, than she', 'those in the room.'], [{'value': 0.6, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.6666666666666666, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.6666666666666666, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.6666666666666666, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.7142857142857143, 'ref_id': '6559c1f6709811eca8811e008a366d49'}]]
```

Here the feature embedding is done by simple [feature hashing](https://en.wikipedia.org/wiki/Feature_hashing) and the distance is measured by [Jaccard distance](https://en.wikipedia.org/wiki/Jaccard_index). With your own powerful deep learning models you can surely do much better — so go nuts!

Intrigued? That's only scratching the surface of what DocArray is capable of. [Read our docs to learn more](https://docarray.jina.ai).

<!-- start support-pitch -->
## Support
Expand Down
61 changes: 61 additions & 0 deletions docarray/document/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,69 @@
from typing import overload, Dict, Optional, List, TYPE_CHECKING

from .data import DocumentData, default_values
from .mixins import AllMixins
from ..base import BaseDCType

if TYPE_CHECKING:
from ..types import ArrayType, StructValueType, DocumentContentType
from .. import DocumentArray
from ..score import NamedScore


class Document(AllMixins, BaseDCType):
    """The basic data type in DocArray.

    The ``@overload`` stubs below exist only to give IDEs rich autocomplete
    for the supported constructor signatures; the actual initialization is
    delegated entirely to :class:`BaseDCType`.
    """

    # Concrete dataclass that stores this Document's fields.
    _data_class = DocumentData
    # Unknown constructor keyword arguments are collected into `.tags`.
    _unresolved_fields_dest = 'tags'

    @overload
    def __init__(self):
        """Create an empty Document."""
        ...

    @overload
    def __init__(self, doc: Optional['Document'] = None, copy: bool = False):
        """Create a Document from another Document.

        :param doc: the source Document
        :param copy: when ``True``, deep-copy the source data instead of sharing it
        """
        ...

    @overload
    def __init__(
        self,
        doc: Optional[Dict],
        field_resolver: Optional[Dict[str, str]] = None,
        unknown_fields_handler: str = 'catch',
    ):
        """Create a Document from a dict.

        :param doc: the source dict
        :param field_resolver: maps external field names to Document attribute names
        :param unknown_fields_handler: what to do with unrecognized keys
            (e.g. ``'catch'`` collects them into ``.tags``)
        """
        ...

    @overload
    def __init__(
        self,
        parent_id: Optional[str] = None,
        granularity: Optional[int] = None,
        adjacency: Optional[int] = None,
        buffer: Optional[bytes] = None,
        blob: Optional['ArrayType'] = None,
        mime_type: Optional[str] = None,
        text: Optional[str] = None,
        content: Optional['DocumentContentType'] = None,
        weight: Optional[float] = None,
        uri: Optional[str] = None,
        tags: Optional[Dict[str, 'StructValueType']] = None,
        offset: Optional[float] = None,
        location: Optional[List[float]] = None,
        embedding: Optional['ArrayType'] = None,
        modality: Optional[str] = None,
        evaluations: Optional[Dict[str, 'NamedScore']] = None,
        scores: Optional[Dict[str, 'NamedScore']] = None,
        chunks: Optional['DocumentArray'] = None,
        matches: Optional['DocumentArray'] = None,
    ):
        """Create a Document by explicitly setting any of its attributes."""
        ...

    def __init__(self, *args, **kwargs):
        """Dispatch all construction paths to :class:`BaseDCType`."""
        super().__init__(*args, **kwargs)
2 changes: 2 additions & 0 deletions docarray/document/mixins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .content import ContentPropertyMixin
from .convert import ConvertMixin
from .dump import UriFileMixin
from .featurehash import FeatureHashMixin
from .image import ImageDataMixin
from .mesh import MeshDataMixin
from .plot import PlotMixin
Expand All @@ -30,6 +31,7 @@ class AllMixins(
UriFileMixin,
SingletonSugarMixin,
PortingMixin,
FeatureHashMixin,
GetAttributesMixin,
):
"""All plugins that can be used in :class:`Document`. """
Expand Down
85 changes: 85 additions & 0 deletions docarray/document/mixins/featurehash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import hashlib
import json
from typing import Tuple, TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
from ...types import T


class FeatureHashMixin:
"""Provide helper functions for feature hashing."""

def embed_feature_hashing(
self: 'T',
n_dim: int = 256,
sparse: bool = False,
fields: Tuple[str, ...] = ('text', 'tags'),
max_value: int = 1_000_000,
) -> 'T':
"""Convert an arbitrary set of attributes into a fixed-dimensional matrix using the hashing trick.

:param n_dim: the dimensionality of each document in the output embedding.
Small numbers of features are likely to cause hash collisions,
but large numbers will cause larger overall parameter dimensions.
:param sparse: whether the resulting feature matrix should be a sparse csr_matrix or dense ndarray.
Note that this feature requires ``scipy``
:param fields: which attributes to be considered as for feature hashing.
"""
if sparse:
from scipy.sparse import csr_matrix

idxs, data = [], [] # sparse
table = np.zeros(n_dim) # dense

for f in fields:
if 'text' in fields:
all_tokens = self.get_vocabulary(('text',))
for f_id, val in all_tokens.items():
_hash_column(f_id, val, n_dim, max_value, idxs, data, table)

if 'tags' in fields:
for k, v in self.tags.items():
_hash_column(k, v, n_dim, max_value, idxs, data, table)

v = getattr(self, f, None)
if v:
_hash_column(f, v, n_dim, max_value, idxs, data, table)

if sparse:
self.embedding = csr_matrix((data, zip(*idxs)), shape=(1, n_dim))
else:
self.embedding = table
return self


def _hash_column(col_name, col_val, n_dim, max_value, idxs, data, table):
h = _any_hash(col_name)
col_val = _any_hash(col_val) % max_value
col = h % n_dim
idxs.append((0, col))
data.append(np.sign(h) * col_val)
table[col] += np.sign(h) * col_val


def _any_hash(v):
try:
return int(v) # parse int parameter
except ValueError:
try:
return float(v) # parse float parameter
except ValueError:
if not v:
# ignore it when the parameter is empty
return 0
if isinstance(v, str):
v = v.strip()
if v.lower() in {'true', 'yes'}: # parse boolean parameter
return 1
if v.lower() in {'false', 'no'}:
return 0
if isinstance(v, (tuple, dict, list)):
v = json.dumps(v, sort_keys=True)

return int(hashlib.md5(str(v).encode('utf-8')).hexdigest(), base=16)
14 changes: 2 additions & 12 deletions docarray/document/mixins/sugar.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,9 @@ def match(
...

def match(self: 'T', *args, **kwargs) -> 'T':
    """Match this single Document against a DocumentArray, in place.

    All positional and keyword arguments are forwarded to
    :meth:`DocumentArray.match`; the results are written to ``self.matches``.

    :return: itself after being modified
    """
    from ... import DocumentArray

    # Wrap this Document so the array-level matching machinery can run on it.
    # NOTE(review): the rendered diff showed both `DocumentArray([self])` and
    # `DocumentArray(self)`; the former was a dead store and is dropped here.
    _tmp = DocumentArray(self)
    _tmp.match(*args, **kwargs)
    return self

Expand All @@ -84,13 +79,8 @@ def embed(
"""

def embed(self: 'T', *args, **kwargs) -> 'T':
    """Embed this single Document, in place.

    All positional and keyword arguments are forwarded to
    :meth:`DocumentArray.embed`; the result is written to ``self.embedding``.

    :return: itself after being modified
    """
    from ... import DocumentArray

    # Wrap this Document so the array-level embedding machinery can run on it.
    # NOTE(review): the rendered diff showed both `DocumentArray([self])` and
    # `DocumentArray(self)`; the former was a dead store and is dropped here.
    _tmp = DocumentArray(self)
    _tmp.embed(*args, **kwargs)
    return self
32 changes: 32 additions & 0 deletions docs/datatypes/text/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,35 @@ this is a much longer sentence
```


## Simple text matching via feature hashing

Let's search for `"she entered the room"` in *Pride and Prejudice*:

```python
from docarray import Document, DocumentArray

d = Document(uri='https://www.gutenberg.org/files/1342/1342-0.txt').load_uri_to_text()
da = DocumentArray(Document(text=s.strip()) for s in d.text.split('\n') if s.strip())
da.apply(lambda d: d.embed_feature_hashing())

q = (
Document(text='she entered the room')
.embed_feature_hashing()
.match(da, limit=5, exclude_self=True, metric='jaccard', use_scipy=True)
)

print(q.matches[:, ('text', 'scores__jaccard')])
```

```text
[['staircase, than she entered the breakfast-room, and congratulated',
'of the room.',
'She entered the room with an air more than usually ungracious,',
'entered the breakfast-room, where Mrs. Bennet was alone, than she',
'those in the room.'],
[{'value': 0.6, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.6666666666666666, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.6666666666666666, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.6666666666666666, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.7142857142857143, 'ref_id': 'f47f7448709811ec960a1e008a366d49'}]]
```
8 changes: 8 additions & 0 deletions docs/fundamentals/document/construct.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ d4 = Document(uri='https://jina.ai',
tags={'foo': 'bar'})
```

Don't forget to leverage autocomplete in your IDE.

```{figure} images/ide-autocomplete.png
:width: 80%
```

```text
<Document ('id',) at my_id>
<Document ('id', 'mime_type', 'text') at a14effee6d3e11ec8bde1e008a366d49>
Expand All @@ -57,6 +63,8 @@ When you `print()` a Document, you get a string representation such as `<Documen
```
````



One can also wrap the keyword arguments into `dict`. The following ways of initialization have the same effect:

```python
Expand Down
5 changes: 5 additions & 0 deletions docs/fundamentals/document/fluent-interface.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ Provide sugary syntax for {class}`Document` by inheriting methods from {class}`D
- {meth}`~docarray.document.mixins.sugar.SingletonSugarMixin.match`


### FeatureHash
Provide helper functions for feature hashing.
- {meth}`~docarray.document.mixins.featurehash.FeatureHashMixin.embed_feature_hashing`


### Porting

- {meth}`~docarray.document.mixins.porting.PortingMixin.from_bytes`
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions docs/fundamentals/document/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

{class}`~docarray.document.Document` is the basic data type in DocArray. Whether you're working with text, image, video, audio, 3D meshes, or any nested or combined form of them, you can always represent them as a Document.

A Document object has a predefined data structure as below, each of the attributes can be set/get with the dot expression as you would do with any Python object.
A Document object has a predefined data schema as below, each of the attributes can be set/get with the dot expression as you would do with any Python object.

| Attribute | Type | Description |
|-------------|--------------------| ----------- |
Expand Down Expand Up @@ -30,7 +30,7 @@ A Document object has a predefined data structure as below, each of the attribut
An `ndarray`-like object can be a Python (nested) List/Tuple, Numpy ndarray, SciPy sparse matrix (spmatrix), TensorFlow dense and sparse tensor, PyTorch dense and sparse tensor, or PaddlePaddle dense tensor.
```

The data structure of the Document is comprehensive and well-organized. One can categorize those attributes into the following groups:
The data schema of the Document is comprehensive and well-organized. One can categorize those attributes into the following groups:

- Content related: `uri`, `text`, `blob`, `buffer`;
- Nest structure related: `chunks`, `matches`, `granularity`, `adjacency`, `parent_id`;
Expand Down
46 changes: 38 additions & 8 deletions docs/fundamentals/documentarray/access-attributes.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ Here `element_selector` are the ones introduced {ref}`in the last chapter<access

As in element selector, one can use attribute selector to **get/set/delete** attributes in a DocumentArray.

| Example | Return |
|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|
| `da[:, 'id']` | all `.id` in a List |
| `da['@m', 'id']` | all `.id` from all Documents `.matches` |
| Example | Return |
|----------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
| `da[:, 'id']` | all `.id` in a List |
| `da['@m', 'id']` | all `.id` from all Documents `.matches` |
| `da[1:3, ('id', 'scores')]` | a list of two lists: the first contains all `.id` and the second all `.scores` of the selected Documents |
| `da[1:3, 'embedding']`, `da[1:3].embeddings` | a NdArray-like object of the first three Documents embeddings |
| `da[:, 'blob']`, `da.blobs` | a NdArray-like object of the all top-level Documents blobs |
| `da[:, 'scores__cosine__value']` | all `.scores['cosine'].value` from all top-level Documents |
| `da[1:3, 'embedding']`, `da[1:3].embeddings` | a NdArray-like object of the first three Documents embeddings |
| `da[:, 'blob']`, `da.blobs` | a NdArray-like object of the all top-level Documents blobs |


Let's see an example.

Expand Down Expand Up @@ -155,10 +157,38 @@ for d in da:
<class 'scipy.sparse.coo.coo_matrix'> (1, 10)
```

## Dunder syntax for nested attributes

Some attributes are nested by nature, e.g. `.tags` and `.scores`. Accessing the deep nested value is easy thanks to the dunder syntax. You can access `.tags['key1']` via `d[:, 'tags__key1']`.

Let's see an example,

```python
import numpy as np

from docarray import DocumentArray

da = DocumentArray.empty(3)
da.embeddings = np.random.random([3, 2])
da.match(da)
```

Now to print `id` and matched score, one can simply do:

```python
print(da['@m', ('id', 'scores__cosine__value')])
```

```text
[['5164d792709a11ec9ae71e008a366d49', '5164d986709a11ec9ae71e008a366d49', '5164d922709a11ec9ae71e008a366d49', '5164d922709a11ec9ae71e008a366d49', '5164d986709a11ec9ae71e008a366d49', '5164d792709a11ec9ae71e008a366d49', '5164d986709a11ec9ae71e008a366d49', '5164d792709a11ec9ae71e008a366d49', '5164d922709a11ec9ae71e008a366d49'],
[0.0, 0.006942970007385196, 0.48303283924326845, 0.0, 0.3859268166910603, 0.48303283924326845, 2.220446049250313e-16, 0.006942970007385196, 0.3859268166910603]]
```


(da-content-embedding)=
## Content and embedding attributes
## Content and embedding sugary attributes

DocumentArray provides `.texts`, `.buffers`, `.blobs`, `.contents` and `.embeddings` attributes for quickly accessing the content and embedding of Documents. You can use them to get/set/delete attributes of all Documents at the top-level.
DocumentArray provides `.texts`, `.buffers`, `.blobs`, `.contents` and `.embeddings` sugary attributes for quickly accessing the content and embedding of Documents. You can use them to get/set/delete attributes of all Documents at the top-level.

```python
from docarray import DocumentArray
Expand Down
2 changes: 1 addition & 1 deletion docs/fundamentals/documentarray/access-elements.md
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ da = DocumentArray.empty(1000)
da.shuffle()
```

## Splitting by `.tags`
### Splitting by `.tags`

One can split a DocumentArray into multiple DocumentArrays according to the tag value (stored in `tags`) of each Document.
It returns a Python `dict` where Documents with the same `tag` value are grouped together in a new DocumentArray, with their orders preserved from the original DocumentArray.
Expand Down
1 change: 1 addition & 0 deletions docs/fundamentals/documentarray/images/docarray-array.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading