docarray · anna-charlotte · Feb 23, 2023 · Feb 17, 2023 · Feb 20, 2023 · Feb 20, 2023
diff --git a/docarray/array/array/io.py b/docarray/array/array/io.py
@@ -16,8 +16,8 @@
     Dict,
     Generator,
     Iterable,
+    List,
     Optional,
-    Sequence,
     Tuple,
     Type,
     TypeVar,
@@ -26,14 +26,14 @@
 
 from docarray.base_document import AnyDocument, BaseDocument
 from docarray.helper import (
-    _access_path_to_dict,
+    _access_path_dict_to_nested_dict,
+    _all_access_paths_valid,
     _dict_to_access_paths,
-    _update_nested_dicts,
-    is_access_path_valid,
 )
 from docarray.utils.compress import _decompress_bytes, _get_compress_ctx
 
 if TYPE_CHECKING:
+    import pandas as pd
 
     from docarray import DocumentArray
     from docarray.proto import DocumentArrayProto
@@ -330,37 +330,37 @@ def from_csv(
         """
         from docarray import DocumentArray
 
-        doc_type = cls.document_type
-        if doc_type == AnyDocument:
+        if cls.document_type == AnyDocument:
             raise TypeError(
                 'There is no document schema defined. '
-                'To load from csv, please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.'
+                'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.'
             )
 
+        doc_type = cls.document_type
         da = DocumentArray.__class_getitem__(doc_type)()
+
         with open(file_path, 'r', encoding=encoding) as fp:
             rows = csv.DictReader(fp, dialect=dialect)
-            field_names: Optional[Sequence[Any]] = rows.fieldnames
-
-            if field_names is None:
+            field_names: List[str] = (
+                [] if rows.fieldnames is None else [str(f) for f in rows.fieldnames]
+            )
+            if field_names is None or len(field_names) == 0:
                 raise TypeError("No field names are given.")
 
-            valid = [is_access_path_valid(doc_type, field) for field in field_names]
-            if not all(valid):
+            valid_paths = _all_access_paths_valid(
+                doc_type=doc_type, access_paths=field_names
+            )
+            if not all(valid_paths):
                 raise ValueError(
-                    f'Fields provided in the csv file do not match the schema of the DocumentArray\'s '
-                    f'document type ({doc_type.__name__}): {list(compress(field_names, [not v for v in valid]))}'
+                    f'Column names do not match the schema of the DocumentArray\'s '
+                    f'document type ({cls.document_type.__name__}): '
+                    f'{list(compress(field_names, [not v for v in valid_paths]))}'
                 )
 
             for access_path2val in rows:
-                doc_dict: Dict[Any, Any] = {}
-                for access_path, value in access_path2val.items():
-                    field2val = _access_path_to_dict(
-                        access_path=access_path,
-                        value=value if value not in ['', 'None'] else None,
-                    )
-                    _update_nested_dicts(to_update=doc_dict, update_with=field2val)
-
+                doc_dict: Dict[Any, Any] = _access_path_dict_to_nested_dict(
+                    access_path2val
+                )
                 da.append(doc_type.parse_obj(doc_dict))
 
         return da
@@ -392,6 +392,101 @@ def to_csv(
                 doc_dict = _dict_to_access_paths(doc.dict())
                 writer.writerow(doc_dict)
 
+    @classmethod
+    def from_pandas(cls, df: 'pd.DataFrame') -> 'DocumentArray':
+        """
+        Load a DocumentArray from a `pandas.DataFrame` following the schema
+        defined in the :attr:`~docarray.DocumentArray.document_type` attribute.
+        Every row of the dataframe will be mapped to one Document in the array.
+        The column names of the dataframe have to match the field names of the
+        Document type.
+        For nested fields use "__"-separated access paths as column names,
+        such as 'image__url'.
+
+        List-like fields (including fields of type DocumentArray) are not supported.
+
+        EXAMPLE USAGE:
+
+        .. code-block:: python
+
+            import pandas as pd
+
+            from docarray import BaseDocument, DocumentArray
+
+
+            class Person(BaseDocument):
+                name: str
+                follower: int
+
+
+            df = pd.DataFrame(
+                data=[['Maria', 12345], ['Jake', 54321]], columns=['name', 'follower']
+            )
+
+            da = DocumentArray[Person].from_pandas(df)
+
+            assert da.name == ['Maria', 'Jake']
+            assert da.follower == [12345, 54321]
+
+
+        :param df: pandas.DataFrame to extract Document's information from
+        :return: DocumentArray where each Document contains the information of one
+            corresponding row of the `pandas.DataFrame`.
+        :raise TypeError: if no Document type is specified or `df` has no columns
+        :raise ValueError: if a column name is not a valid access path of the
+            Document type
+        """
+        from docarray import DocumentArray
+
+        if cls.document_type == AnyDocument:
+            raise TypeError(
+                'There is no document schema defined. '
+                'Please specify the DocumentArray\'s Document type using `DocumentArray[MyDoc]`.'
+            )
+
+        doc_type = cls.document_type
+        da = DocumentArray.__class_getitem__(doc_type)()
+        # Column labels are not necessarily strings (e.g. a default RangeIndex
+        # yields ints). Normalise them the same way `from_csv` does, so that a
+        # bad label is reported by the ValueError below instead of failing
+        # inside the path validation.
+        field_names = [str(column) for column in df.columns]
+
+        if not field_names:
+            raise TypeError("No field names are given.")
+
+        valid_paths = _all_access_paths_valid(
+            doc_type=doc_type, access_paths=field_names
+        )
+        if not all(valid_paths):
+            raise ValueError(
+                f'Column names do not match the schema of the DocumentArray\'s '
+                f'document type ({cls.document_type.__name__}): '
+                f'{list(compress(field_names, [not v for v in valid_paths]))}'
+            )
+
+        # Iterate plain tuples without the index and pair them with the column
+        # names ourselves: the namedtuples produced by a bare `itertuples()`
+        # rename columns that are not valid identifiers or are duplicated, and
+        # they carry an extra 'Index' entry that would have to be stripped again.
+        for row in df.itertuples(index=False, name=None):
+            access_path2val = dict(zip(field_names, row))
+            doc_dict = _access_path_dict_to_nested_dict(access_path2val)
+            da.append(doc_type.parse_obj(doc_dict))
+
+        return da
+
+    def to_pandas(self) -> 'pd.DataFrame':
+        """
+        Save a DocumentArray to a `pandas.DataFrame`.
+        The field names will be stored as column names. Each row of the dataframe corresponds
+        to the information of one Document.
+        Columns for nested fields will be named after the "__"-separated access paths,
+        such as `"image__url"` for `image.url`.
+
+        An empty DocumentArray results in an empty dataframe that still has one
+        column per access path.
+
+        :return: pandas.DataFrame
+        """
+        import pandas as pd
+
+        fields = self.document_type._get_access_paths()
+        # `DataFrame.append` was removed in pandas 2.0 and copied the whole frame
+        # on every call; collect one access-path dict per Document and build the
+        # frame in a single step instead.
+        rows = [_dict_to_access_paths(doc.dict()) for doc in self]
+
+        # dtype=object keeps the cell values as returned by `doc.dict()` rather
+        # than letting pandas coerce them (e.g. None in a numeric column -> NaN).
+        # NOTE(review): assumes the keys of `_dict_to_access_paths(doc.dict())`
+        # are covered by `_get_access_paths()`; other keys are not exported.
+        return pd.DataFrame(rows, columns=fields, dtype=object)
+
     # Methods to load from/to files in different formats
     @property
     def _stream_header(self) -> bytes:

diff --git a/docarray/helper.py b/docarray/helper.py
@@ -1,28 +1,37 @@
-from typing import TYPE_CHECKING, Any, Dict, Type
+from typing import TYPE_CHECKING, Any, Dict, List, Type
 
 if TYPE_CHECKING:
     from docarray import BaseDocument
 
 
-def is_access_path_valid(doc: Type['BaseDocument'], access_path: str) -> bool:
+def _is_access_path_valid(doc_type: Type['BaseDocument'], access_path: str) -> bool:
     """
     Check if a given access path ("__"-separated) is a valid path for a given Document class.
+
+    :param doc_type: Document class whose fields the path is resolved against
+    :param access_path: "__"-separated path, such as 'image__url'
+    :return: True if the path leads to a field of `doc_type`, where every
+        segment before the last one names a field whose type is a BaseDocument
+        subclass; False otherwise
     """
     from docarray import BaseDocument
 
     field, _, remaining = access_path.partition('__')
     if len(remaining) == 0:
-        return access_path in doc.__fields__.keys()
+        return access_path in doc_type.__fields__.keys()
     else:
-        valid_field = field in doc.__fields__.keys()
+        valid_field = field in doc_type.__fields__.keys()
         if not valid_field:
             return False
         else:
-            d = doc._get_field_type(field)
-            if not issubclass(d, BaseDocument):
+            d = doc_type._get_field_type(field)
+            # `issubclass` raises TypeError when its first argument is not a
+            # class. Presumably `_get_field_type` can return typing constructs
+            # such as `List[str]` (TODO confirm); those cannot be descended
+            # into, so the path is reported as invalid instead of crashing.
+            if not (isinstance(d, type) and issubclass(d, BaseDocument)):
                 return False
             else:
-                return is_access_path_valid(d, remaining)
+                return _is_access_path_valid(d, remaining)
+
+
+def _all_access_paths_valid(
+    doc_type: Type['BaseDocument'], access_paths: List[str]
+) -> List[bool]:
+    """
+    Validate each access path ("__"-separated) against a given Document class.
+
+    :param doc_type: Document class the paths are checked against
+    :param access_paths: access paths to validate
+    :return: one bool per access path, in input order; True where the path is valid
+    """
+    validity: List[bool] = []
+    for access_path in access_paths:
+        validity.append(_is_access_path_valid(doc_type, access_path))
+    return validity
 
 
 def _access_path_to_dict(access_path: str, value) -> Dict[str, Any]:
@@ -40,6 +49,32 @@ def _access_path_to_dict(access_path: str, value) -> Dict[str, Any]:
     return result
 
 
+def _access_path_dict_to_nested_dict(access_path2val: Dict[str, Any]) -> Dict[Any, Any]:
+    """
+    Convert a dict, where the keys are access paths ("__"-separated) to a nested dictionary.
+    Values that are the strings '' or 'None' are replaced by None.
+
+    EXAMPLE USAGE
+
+    .. code-block:: python
+
+        access_path2val = {'image__url': 'some.png'}
+        assert _access_path_dict_to_nested_dict(access_path2val) == {
+            'image': {'url': 'some.png'}
+        }
+
+    :param access_path2val: dict with access_paths as keys
+    :return: nested dict where the access path keys are split into separate field names and nested keys
+    """
+    nested_dict: Dict[Any, Any] = {}
+    for access_path, value in access_path2val.items():
+        # Only strings can be the '' / 'None' placeholders. Checking the type
+        # first avoids running `==` on arbitrary objects: values coming from a
+        # dataframe (e.g. numpy arrays, `pd.NA`) may not yield a plain bool
+        # when compared, which would make a bare `in` check raise.
+        is_placeholder = isinstance(value, str) and value in ('', 'None')
+        field2val = _access_path_to_dict(
+            access_path=access_path,
+            value=None if is_placeholder else value,
+        )
+        _update_nested_dicts(to_update=nested_dict, update_with=field2val)
+    return nested_dict
+
+
 def _dict_to_access_paths(d: dict) -> Dict[str, Any]:
     """
     Convert a (nested) dict to a Dict[access_path, value].

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -22,6 +22,7 @@ fastapi = {version = ">=0.87.0", optional = true }
 rich = ">=13.1.0"
 lz4 = {version= ">=1.0.0", optional = true}
 pydub = {version = "^0.25.1", optional = true }
+pandas = {version = ">=1.1.0", optional = true }
 
 [tool.poetry.extras]
 common = ["protobuf", "lz4"]
@@ -31,6 +32,7 @@ video = ["av"]
 audio = ["pydub"]
 mesh = ["trimesh"]
 web = ["fastapi"]
+pandas = ["pandas"]
 
 [tool.poetry.dev-dependencies]
 pytest = ">=6.1"
@@ -60,6 +62,10 @@ check_untyped_defs = true
 module = "av"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "pandas"
+ignore_missing_imports = true
+
 [[tool.mypy.overrides]]
 module = "trimesh"
 ignore_missing_imports = true

diff --git a/tests/units/array/test_array_from_to_csv.py b/tests/units/array/test_array_from_to_csv.py
@@ -93,9 +93,7 @@ def test_from_csv_without_schema_raise_exception():
 
 
 def test_from_csv_with_wrong_schema_raise_exception(nested_doc):
-    with pytest.raises(
-        ValueError, match='Fields provided in the csv file do not match the schema'
-    ):
+    with pytest.raises(ValueError, match='Column names do not match the schema'):
         DocumentArray[nested_doc.__class__].from_csv(
             file_path=str(TOYDATA_DIR / 'docs.csv')
         )