Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -232,9 +232,32 @@ Now anyone who knows the token `my_shared_da` can pull and work on it.
left_da = DocumentArray.pull(token='my_shared_da')
```

Intrigued? That's only scratching the surface of what DocArray is capable of. [Read our docs to learn more](https://docarray.jina.ai).

## Get Started for NLP Engineers

In this 10-line code example, we search "Pride and Prejudice" for the top-5 sentences most similar to `she entered the room`.

```python
from docarray import Document, DocumentArray

d = Document(uri='https://www.gutenberg.org/files/1342/1342-0.txt').load_uri_to_text()
da = DocumentArray(Document(text=s.strip()) for s in d.text.split('\n') if s.strip())
da.apply(lambda d: d.embed_feature_hashing())

q = (Document(text='she entered the room')
.embed_feature_hashing()
.match(da, limit=5, exclude_self=True, metric='jaccard', use_scipy=True))

print(q.matches[:, ('text', 'scores__jaccard')])
```

```text
[['staircase, than she entered the breakfast-room, and congratulated', 'of the room.', 'She entered the room with an air more than usually ungracious,', 'entered the breakfast-room, where Mrs. Bennet was alone, than she', 'those in the room.'], [{'value': 0.6, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.6666666666666666, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.6666666666666666, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.6666666666666666, 'ref_id': '6559c1f6709811eca8811e008a366d49'}, {'value': 0.7142857142857143, 'ref_id': '6559c1f6709811eca8811e008a366d49'}]]
```

Here the feature embedding is done by simple [feature hashing](https://en.wikipedia.org/wiki/Feature_hashing) and the distance is measured by [Jaccard distance](https://en.wikipedia.org/wiki/Jaccard_index). With your own powerful deep learning models you can surely do much better — so go nuts!

Intrigued? That's only scratching the surface of what DocArray is capable of. [Read our docs to learn more](https://docarray.jina.ai).

<!-- start support-pitch -->
## Support
Expand Down
61 changes: 61 additions & 0 deletions docarray/document/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,69 @@
from typing import overload, Dict, Optional, List, TYPE_CHECKING

from .data import DocumentData, default_values
from .mixins import AllMixins
from ..base import BaseDCType

if TYPE_CHECKING:
from ..types import ArrayType, StructValueType, DocumentContentType
from .. import DocumentArray
from ..score import NamedScore


class Document(AllMixins, BaseDCType):
    """The basic data type in DocArray.

    The ``@overload`` stubs below exist only to give IDEs rich autocomplete
    for the supported constructor signatures; the actual initialization is
    delegated entirely to :class:`BaseDCType`.
    """

    # Concrete dataclass that stores this Document's fields.
    _data_class = DocumentData
    # Unknown constructor keyword arguments are collected into `.tags`.
    _unresolved_fields_dest = 'tags'

    @overload
    def __init__(self):
        """Create an empty Document."""
        ...

    @overload
    def __init__(self, doc: Optional['Document'] = None, copy: bool = False):
        """Create a Document from another Document.

        :param doc: the source Document
        :param copy: when ``True``, deep-copy the source data instead of sharing it
        """
        ...

    @overload
    def __init__(
        self,
        doc: Optional[Dict],
        field_resolver: Optional[Dict[str, str]] = None,
        unknown_fields_handler: str = 'catch',
    ):
        """Create a Document from a dict.

        :param doc: the source dict
        :param field_resolver: maps external field names to Document attribute names
        :param unknown_fields_handler: what to do with unrecognized keys
            (e.g. ``'catch'`` collects them into ``.tags``)
        """
        ...

    @overload
    def __init__(
        self,
        parent_id: Optional[str] = None,
        granularity: Optional[int] = None,
        adjacency: Optional[int] = None,
        buffer: Optional[bytes] = None,
        blob: Optional['ArrayType'] = None,
        mime_type: Optional[str] = None,
        text: Optional[str] = None,
        content: Optional['DocumentContentType'] = None,
        weight: Optional[float] = None,
        uri: Optional[str] = None,
        tags: Optional[Dict[str, 'StructValueType']] = None,
        offset: Optional[float] = None,
        location: Optional[List[float]] = None,
        embedding: Optional['ArrayType'] = None,
        modality: Optional[str] = None,
        evaluations: Optional[Dict[str, 'NamedScore']] = None,
        scores: Optional[Dict[str, 'NamedScore']] = None,
        chunks: Optional['DocumentArray'] = None,
        matches: Optional['DocumentArray'] = None,
    ):
        """Create a Document by explicitly setting any of its attributes."""
        ...

    def __init__(self, *args, **kwargs):
        """Dispatch all construction paths to :class:`BaseDCType`."""
        super().__init__(*args, **kwargs)
2 changes: 2 additions & 0 deletions docarray/document/mixins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .content import ContentPropertyMixin
from .convert import ConvertMixin
from .dump import UriFileMixin
from .featurehash import FeatureHashMixin
from .image import ImageDataMixin
from .mesh import MeshDataMixin
from .plot import PlotMixin
Expand All @@ -30,6 +31,7 @@ class AllMixins(
UriFileMixin,
SingletonSugarMixin,
PortingMixin,
FeatureHashMixin,
GetAttributesMixin,
):
"""All plugins that can be used in :class:`Document`. """
Expand Down
85 changes: 85 additions & 0 deletions docarray/document/mixins/featurehash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import hashlib
import json
from typing import Tuple, TYPE_CHECKING

import numpy as np

if TYPE_CHECKING:
from ...types import T


class FeatureHashMixin:
"""Provide helper functions for feature hashing."""

def embed_feature_hashing(
self: 'T',
n_dim: int = 256,
sparse: bool = False,
fields: Tuple[str, ...] = ('text', 'tags'),
max_value: int = 1_000_000,
) -> 'T':
"""Convert an arbitrary set of attributes into a fixed-dimensional matrix using the hashing trick.

:param n_dim: the dimensionality of each document in the output embedding.
Small numbers of features are likely to cause hash collisions,
but large numbers will cause larger overall parameter dimensions.
:param sparse: whether the resulting feature matrix should be a sparse csr_matrix or dense ndarray.
Note that this feature requires ``scipy``
:param fields: which attributes to be considered as for feature hashing.
"""
if sparse:
from scipy.sparse import csr_matrix

idxs, data = [], [] # sparse
table = np.zeros(n_dim) # dense

for f in fields:
if 'text' in fields:
all_tokens = self.get_vocabulary(('text',))
for f_id, val in all_tokens.items():
_hash_column(f_id, val, n_dim, max_value, idxs, data, table)

if 'tags' in fields:
for k, v in self.tags.items():
_hash_column(k, v, n_dim, max_value, idxs, data, table)

v = getattr(self, f, None)
if v:
_hash_column(f, v, n_dim, max_value, idxs, data, table)

if sparse:
self.embedding = csr_matrix((data, zip(*idxs)), shape=(1, n_dim))
else:
self.embedding = table
return self


def _hash_column(col_name, col_val, n_dim, max_value, idxs, data, table):
h = _any_hash(col_name)
col_val = _any_hash(col_val) % max_value
col = h % n_dim
idxs.append((0, col))
data.append(np.sign(h) * col_val)
table[col] += np.sign(h) * col_val


def _any_hash(v):
try:
return int(v) # parse int parameter
except ValueError:
try:
return float(v) # parse float parameter
except ValueError:
if not v:
# ignore it when the parameter is empty
return 0
if isinstance(v, str):
v = v.strip()
if v.lower() in {'true', 'yes'}: # parse boolean parameter
return 1
if v.lower() in {'false', 'no'}:
return 0
if isinstance(v, (tuple, dict, list)):
v = json.dumps(v, sort_keys=True)

return int(hashlib.md5(str(v).encode('utf-8')).hexdigest(), base=16)
14 changes: 2 additions & 12 deletions docarray/document/mixins/sugar.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,14 +57,9 @@ def match(
...

def match(self: 'T', *args, **kwargs) -> 'T':
    """Match this single Document against a DocumentArray, in place.

    All positional and keyword arguments are forwarded to
    :meth:`DocumentArray.match`; the results are written to ``self.matches``.

    :return: itself after being modified
    """
    from ... import DocumentArray

    # Wrap this Document so the array-level matching machinery can run on it.
    # NOTE(review): the rendered diff showed both `DocumentArray([self])` and
    # `DocumentArray(self)`; the former was a dead store and is dropped here.
    _tmp = DocumentArray(self)
    _tmp.match(*args, **kwargs)
    return self

Expand All @@ -84,13 +79,8 @@ def embed(
"""

def embed(self: 'T', *args, **kwargs) -> 'T':
    """Embed this single Document, in place.

    All positional and keyword arguments are forwarded to
    :meth:`DocumentArray.embed`; the result is written to ``self.embedding``.

    :return: itself after being modified
    """
    from ... import DocumentArray

    # Wrap this Document so the array-level embedding machinery can run on it.
    # NOTE(review): the rendered diff showed both `DocumentArray([self])` and
    # `DocumentArray(self)`; the former was a dead store and is dropped here.
    _tmp = DocumentArray(self)
    _tmp.embed(*args, **kwargs)
    return self
32 changes: 32 additions & 0 deletions docs/datatypes/text/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,3 +159,35 @@ this is a much longer sentence
```


## Simple text matching via feature hashing

Let's search for `"she entered the room"` in *Pride and Prejudice*:

```python
from docarray import Document, DocumentArray

d = Document(uri='https://www.gutenberg.org/files/1342/1342-0.txt').load_uri_to_text()
da = DocumentArray(Document(text=s.strip()) for s in d.text.split('\n') if s.strip())
da.apply(lambda d: d.embed_feature_hashing())

q = (
Document(text='she entered the room')
.embed_feature_hashing()
.match(da, limit=5, exclude_self=True, metric='jaccard', use_scipy=True)
)

print(q.matches[:, ('text', 'scores__jaccard')])
```

```text
[['staircase, than she entered the breakfast-room, and congratulated',
'of the room.',
'She entered the room with an air more than usually ungracious,',
'entered the breakfast-room, where Mrs. Bennet was alone, than she',
'those in the room.'],
[{'value': 0.6, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.6666666666666666, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.6666666666666666, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.6666666666666666, 'ref_id': 'f47f7448709811ec960a1e008a366d49'},
{'value': 0.7142857142857143, 'ref_id': 'f47f7448709811ec960a1e008a366d49'}]]
```
8 changes: 8 additions & 0 deletions docs/fundamentals/document/construct.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ d4 = Document(uri='https://jina.ai',
tags={'foo': 'bar'})
```

Don't forget to leverage autocomplete in your IDE.

```{figure} images/ide-autocomplete.png
:width: 80%
```

```text
<Document ('id',) at my_id>
<Document ('id', 'mime_type', 'text') at a14effee6d3e11ec8bde1e008a366d49>
Expand All @@ -57,6 +63,8 @@ When you `print()` a Document, you get a string representation such as `<Documen
```
````



One can also wrap the keyword arguments into `dict`. The following ways of initialization have the same effect:

```python
Expand Down
5 changes: 5 additions & 0 deletions docs/fundamentals/document/fluent-interface.md
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,11 @@ Provide sugary syntax for {class}`Document` by inheriting methods from {class}`D
- {meth}`~docarray.document.mixins.sugar.SingletonSugarMixin.match`


### FeatureHash
Provide helper functions for feature hashing.
- {meth}`~docarray.document.mixins.featurehash.FeatureHashMixin.embed_feature_hashing`


### Porting

- {meth}`~docarray.document.mixins.porting.PortingMixin.from_bytes`
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions docs/fundamentals/document/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

{class}`~docarray.document.Document` is the basic data type in DocArray. Whether you're working with text, image, video, audio, 3D meshes, or any nested or combined form of them, you can always represent them as a Document.

A Document object has a predefined data structure as below, each of the attributes can be set/get with the dot expression as you would do with any Python object.
A Document object has a predefined data schema as below, each of the attributes can be set/get with the dot expression as you would do with any Python object.

| Attribute | Type | Description |
|-------------|--------------------| ----------- |
Expand Down Expand Up @@ -30,7 +30,7 @@ A Document object has a predefined data structure as below, each of the attribut
An `ndarray`-like object can be a Python (nested) List/Tuple, Numpy ndarray, SciPy sparse matrix (spmatrix), TensorFlow dense and sparse tensor, PyTorch dense and sparse tensor, or PaddlePaddle dense tensor.
```

The data structure of the Document is comprehensive and well-organized. One can categorize those attributes into the following groups:
The data schema of the Document is comprehensive and well-organized. One can categorize those attributes into the following groups:

- Content related: `uri`, `text`, `blob`, `buffer`;
- Nest structure related: `chunks`, `matches`, `granularity`, `adjacency`, `parent_id`;
Expand Down
46 changes: 38 additions & 8 deletions docs/fundamentals/documentarray/access-attributes.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,15 @@ Here `element_selector` are the ones introduced {ref}`in the last chapter<access

As in element selector, one can use attribute selector to **get/set/delete** attributes in a DocumentArray.

| Example | Return |
|----------------------------------------------|------------------------------------------------------------------------------------------------------------------------------|
| `da[:, 'id']` | all `.id` in a List |
| `da['@m', 'id']` | all `.id` from all Documents `.matches` |
| Example | Return |
|----------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------|
| `da[:, 'id']` | all `.id` in a List |
| `da['@m', 'id']` | all `.id` from all Documents `.matches` |
| `da[1:3, ('id', 'scores')]` | a list of two lists: the first contains all `.id` and the second all `.scores` of the selected Documents |
| `da[1:3, 'embedding']`, `da[1:3].embeddings` | a NdArray-like object of the first three Documents embeddings |
| `da[:, 'blob']`, `da.blobs` | a NdArray-like object of the all top-level Documents blobs |
| `da[:, 'scores__cosine__value']` | all `.scores['cosine'].value` from all top-level Documents |
| `da[1:3, 'embedding']`, `da[1:3].embeddings` | a NdArray-like object of the first three Documents embeddings |
| `da[:, 'blob']`, `da.blobs` | a NdArray-like object of the all top-level Documents blobs |


Let's see an example.

Expand Down Expand Up @@ -155,10 +157,38 @@ for d in da:
<class 'scipy.sparse.coo.coo_matrix'> (1, 10)
```

## Dunder syntax for nested attributes

Some attributes are nested by nature, e.g. `.tags` and `.scores`. Accessing the deep nested value is easy thanks to the dunder syntax. You can access `.tags['key1']` via `d[:, 'tags__key1']`.

Let's see an example,

```python
import numpy as np

from docarray import DocumentArray

da = DocumentArray.empty(3)
da.embeddings = np.random.random([3, 2])
da.match(da)
```

Now to print `id` and matched score, one can simply do:

```python
print(da['@m', ('id', 'scores__cosine__value')])
```

```text
[['5164d792709a11ec9ae71e008a366d49', '5164d986709a11ec9ae71e008a366d49', '5164d922709a11ec9ae71e008a366d49', '5164d922709a11ec9ae71e008a366d49', '5164d986709a11ec9ae71e008a366d49', '5164d792709a11ec9ae71e008a366d49', '5164d986709a11ec9ae71e008a366d49', '5164d792709a11ec9ae71e008a366d49', '5164d922709a11ec9ae71e008a366d49'],
[0.0, 0.006942970007385196, 0.48303283924326845, 0.0, 0.3859268166910603, 0.48303283924326845, 2.220446049250313e-16, 0.006942970007385196, 0.3859268166910603]]
```


(da-content-embedding)=
## Content and embedding attributes
## Content and embedding sugary attributes

DocumentArray provides `.texts`, `.buffers`, `.blobs`, `.contents` and `.embeddings` attributes for quickly accessing the content and embedding of Documents. You can use them to get/set/delete attributes of all Documents at the top-level.
DocumentArray provides `.texts`, `.buffers`, `.blobs`, `.contents` and `.embeddings` sugary attributes for quickly accessing the content and embedding of Documents. You can use them to get/set/delete attributes of all Documents at the top-level.

```python
from docarray import DocumentArray
Expand Down
2 changes: 1 addition & 1 deletion docs/fundamentals/documentarray/access-elements.md
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ da = DocumentArray.empty(1000)
da.shuffle()
```

## Splitting by `.tags`
### Splitting by `.tags`

One can split a DocumentArray into multiple DocumentArrays according to the tag value (stored in `tags`) of each Document.
It returns a Python `dict` where Documents with the same `tag` value are grouped together in a new DocumentArray, with their orders preserved from the original DocumentArray.
Expand Down
1 change: 1 addition & 0 deletions docs/fundamentals/documentarray/images/docarray-array.svg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading