Skip to content

Commit b83db61

Browse files
committed
ARROW-3374: [Python] Implicitly set from_pandas=True when passing pandas.Categorical to pyarrow.array. Preserve ordered categories
Author: Wes McKinney <wesm+git@apache.org>

Closes apache#2670 from wesm/ARROW-3374 and squashes the following commits:

914c7dd <Wes McKinney> Implicitly set from_pandas=True when passing pandas.Categorical to pyarrow.array. Ensure that ordered categories are preserved
1 parent c522bea commit b83db61

3 files changed

Lines changed: 38 additions & 18 deletions

File tree

cpp/src/arrow/python/arrow_to_pandas.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1898,6 +1898,11 @@ class ArrowDeserializer {
18981898
PyDict_SetItemString(result_, "dictionary", block->dictionary());
18991899
RETURN_IF_PYERROR();
19001900

1901+
PyObject* py_ordered = type.ordered() ? Py_True : Py_False;
1902+
Py_INCREF(py_ordered);
1903+
PyDict_SetItemString(result_, "ordered", py_ordered);
1904+
RETURN_IF_PYERROR();
1905+
19011906
return Status::OK();
19021907
}
19031908

python/pyarrow/array.pxi

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -160,7 +160,7 @@ def array(object obj, type=None, mask=None, size=None, bint from_pandas=False,
160160
return DictionaryArray.from_arrays(
161161
values.codes, values.categories.values,
162162
mask=mask, ordered=values.ordered,
163-
from_pandas=from_pandas, safe=safe,
163+
from_pandas=True, safe=safe,
164164
memory_pool=memory_pool)
165165
else:
166166
import pyarrow.pandas_compat as pdcompat
@@ -811,7 +811,7 @@ cdef wrap_array_output(PyObject* output):
811811
if isinstance(obj, dict):
812812
return Categorical(obj['indices'],
813813
categories=obj['dictionary'],
814-
fastpath=True)
814+
ordered=obj['ordered'], fastpath=True)
815815
else:
816816
return obj
817817

python/pyarrow/tests/test_convert_pandas.py

Lines changed: 31 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1983,26 +1983,41 @@ def test_category(self):
19831983
v1 = ['foo', None, 'bar', 'qux', np.nan]
19841984
v2 = [4, 5, 6, 7, 8]
19851985
v3 = [b'foo', None, b'bar', b'qux', np.nan]
1986-
df = pd.DataFrame({'cat_strings': pd.Categorical(v1 * repeats),
1987-
'cat_ints': pd.Categorical(v2 * repeats),
1988-
'cat_binary': pd.Categorical(v3 * repeats),
1989-
'cat_strings_ordered': pd.Categorical(
1990-
v1 * repeats, categories=['bar', 'qux', 'foo'],
1991-
ordered=True),
1992-
'ints': v2 * repeats,
1993-
'ints2': v2 * repeats,
1994-
'strings': v1 * repeats,
1995-
'strings2': v1 * repeats,
1996-
'strings3': v3 * repeats})
1986+
1987+
arrays = {
1988+
'cat_strings': pd.Categorical(v1 * repeats),
1989+
'cat_strings_with_na': pd.Categorical(v1 * repeats,
1990+
categories=['foo', 'bar']),
1991+
'cat_ints': pd.Categorical(v2 * repeats),
1992+
'cat_binary': pd.Categorical(v3 * repeats),
1993+
'cat_strings_ordered': pd.Categorical(
1994+
v1 * repeats, categories=['bar', 'qux', 'foo'],
1995+
ordered=True),
1996+
'ints': v2 * repeats,
1997+
'ints2': v2 * repeats,
1998+
'strings': v1 * repeats,
1999+
'strings2': v1 * repeats,
2000+
'strings3': v3 * repeats}
2001+
df = pd.DataFrame(arrays)
19972002
_check_pandas_roundtrip(df)
19982003

2004+
for k in arrays:
2005+
_check_array_roundtrip(arrays[k])
2006+
2007+
def test_category_implicit_from_pandas(self):
2008+
# ARROW-3374
2009+
def _check(v):
2010+
arr = pa.array(v)
2011+
result = arr.to_pandas()
2012+
tm.assert_series_equal(pd.Series(result), pd.Series(v))
2013+
19992014
arrays = [
2000-
pd.Categorical(v1 * repeats),
2001-
pd.Categorical(v2 * repeats),
2002-
pd.Categorical(v3 * repeats)
2015+
pd.Categorical(['a', 'b', 'c'], categories=['a', 'b']),
2016+
pd.Categorical(['a', 'b', 'c'], categories=['a', 'b'],
2017+
ordered=True)
20032018
]
2004-
for values in arrays:
2005-
_check_array_roundtrip(values)
2019+
for arr in arrays:
2020+
_check(arr)
20062021

20072022
def test_empty_category(self):
20082023
# ARROW-2443

0 commit comments

Comments
 (0)