Skip to content

Commit 11b15a5

Browse files
pitrou authored and xhochy committed
ARROW-2357: [Python] Add microbenchmark for PandasObjectIsNull()
Also add Decimal fodder for other benchmarks. @cpcloud

Author: Antoine Pitrou <antoine@python.org>

Closes apache#1798 from pitrou/ARROW-2357-benchmarks-pandas-object-is-null and squashes the following commits:

b1c6460 <Antoine Pitrou> ARROW-2357: Add microbenchmark for PandasObjectIsNull()
1 parent 8fdad18 commit 11b15a5

10 files changed

Lines changed: 432 additions & 227 deletions

File tree

cpp/src/arrow/python/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ set(ARROW_PYTHON_TEST_LINK_LIBS ${ARROW_PYTHON_MIN_TEST_LIBS})
5050
set(ARROW_PYTHON_SRCS
5151
arrow_to_pandas.cc
5252
arrow_to_python.cc
53+
benchmark.cc
5354
builtin_convert.cc
5455
common.cc
5556
config.cc
@@ -99,6 +100,7 @@ install(FILES
99100
api.h
100101
arrow_to_pandas.h
101102
arrow_to_python.h
103+
benchmark.h
102104
builtin_convert.h
103105
common.h
104106
config.h

cpp/src/arrow/python/benchmark.cc

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#include <arrow/python/benchmark.h>
19+
#include <arrow/python/helpers.h>
20+
21+
namespace arrow {
22+
namespace py {
23+
namespace benchmark {
24+
25+
void Benchmark_PandasObjectIsNull(PyObject* list) {
26+
if (!PyList_CheckExact(list)) {
27+
PyErr_SetString(PyExc_TypeError, "expected a list");
28+
return;
29+
}
30+
Py_ssize_t i, n = PyList_GET_SIZE(list);
31+
for (i = 0; i < n; i++) {
32+
internal::PandasObjectIsNull(PyList_GET_ITEM(list, i));
33+
}
34+
}
35+
36+
} // namespace benchmark
37+
} // namespace py
38+
} // namespace arrow

cpp/src/arrow/python/benchmark.h

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#ifndef ARROW_PYTHON_BENCHMARK_H
19+
#define ARROW_PYTHON_BENCHMARK_H
20+
21+
#include "arrow/python/platform.h"
22+
23+
#include "arrow/util/visibility.h"
24+
25+
namespace arrow {
26+
namespace py {
27+
namespace benchmark {
28+
29+
// Micro-benchmark routines for use from ASV
30+
31+
// Run PandasObjectIsNull() once over every object in *list*
32+
ARROW_EXPORT
33+
void Benchmark_PandasObjectIsNull(PyObject* list);
34+
35+
} // namespace benchmark
36+
} // namespace py
37+
} // namespace arrow
38+
39+
#endif // ARROW_PYTHON_BENCHMARK_H

python/benchmarks/common.py

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,16 +16,23 @@
1616
# under the License.
1717

1818
import codecs
19+
import decimal
20+
from functools import partial
21+
import itertools
1922
import os
2023
import sys
2124
import unicodedata
2225

2326
import numpy as np
2427

28+
import pyarrow as pa
29+
2530

2631
KILOBYTE = 1 << 10
2732
MEGABYTE = KILOBYTE * KILOBYTE
2833

34+
DEFAULT_NONE_PROB = 0.3
35+
2936

3037
def _multiplicate_sequence(base, target_size):
3138
q, r = divmod(target_size, len(base))
@@ -97,3 +104,248 @@ def get_random_unicode(n, *, seed=42):
97104
result = ''.join(unicode_arr.tolist())
98105
assert len(result) == n, (len(result), len(unicode_arr))
99106
return result
107+
108+
109+
class BuiltinsGenerator(object):
    """
    Generate lists of random Python builtin objects for benchmarks.

    Every random draw made directly by this class goes through `self.rnd`,
    a `numpy.random.RandomState` created from the *seed* passed to the
    constructor.
    """

    def __init__(self, seed=42):
        self.rnd = np.random.RandomState(seed)

    def sprinkle(self, lst, prob, value):
        """
        Sprinkle *value* entries in list *lst* with likelihood *prob*.

        The list is modified in place; one random sample is drawn per entry.
        """
        for i, p in enumerate(self.rnd.random_sample(size=len(lst))):
            if p < prob:
                lst[i] = value

    def sprinkle_nones(self, lst, prob):
        """
        Sprinkle None entries in list *lst* with likelihood *prob*.
        """
        self.sprinkle(lst, prob, None)

    def generate_int_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of Python ints with *none_prob* probability of
        an entry being None.

        Before sprinkling, the list is simply `0, 1, ..., n - 1`.
        """
        data = list(range(n))
        self.sprinkle_nones(data, none_prob)
        return data

    def generate_float_list(self, n, none_prob=DEFAULT_NONE_PROB,
                            use_nan=False):
        """
        Generate a list of Python floats with *none_prob* probability of
        an entry being None (or NaN if *use_nan* is true).
        """
        # Make sure we get Python floats, not np.float64
        data = list(map(float, self.rnd.uniform(0.0, 1.0, n)))
        assert len(data) == n
        self.sprinkle(data, none_prob, value=float('nan') if use_nan else None)
        return data

    def generate_bool_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of Python bools with *none_prob* probability of
        an entry being None.
        """
        # Make sure we get Python bools, not np.bool_
        data = [bool(x >= 0.5) for x in self.rnd.uniform(0.0, 1.0, n)]
        assert len(data) == n
        self.sprinkle_nones(data, none_prob)
        return data

    def generate_decimal_list(self, n, none_prob=DEFAULT_NONE_PROB,
                              use_nan=False):
        """
        Generate a list of Python Decimals with *none_prob* probability of
        an entry being None (or NaN if *use_nan* is true).

        Each Decimal is built from a uniform float in [0, 1) formatted with
        nine fractional digits.
        """
        data = [decimal.Decimal('%.9f' % f)
                for f in self.rnd.uniform(0.0, 1.0, n)]
        assert len(data) == n
        self.sprinkle(data, none_prob,
                      value=decimal.Decimal('nan') if use_nan else None)
        return data

    def generate_object_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of generic Python objects with *none_prob*
        probability of an entry being None.
        """
        data = [object() for i in range(n)]
        self.sprinkle_nones(data, none_prob)
        return data

    def _generate_varying_sequences(self, random_factory, n, min_size,
                                    max_size, none_prob):
        """
        Generate a list of *n* sequences of varying size between *min_size*
        and *max_size*, with *none_prob* probability of an entry being None.
        The base material for each sequence is obtained by calling
        `random_factory(<some size>)`
        """
        base_size = 10000
        # A single pool of material is generated once; every sequence is a
        # slice of it.  The pool is `max_size` longer than the largest
        # possible offset so that a slice never runs off its end.
        base = random_factory(base_size + max_size)
        data = []
        for i in range(n):
            off = self.rnd.randint(base_size)
            if min_size == max_size:
                size = min_size
            else:
                # randint() excludes the upper bound, hence the + 1
                size = self.rnd.randint(min_size, max_size + 1)
            data.append(base[off:off + size])
        self.sprinkle_nones(data, none_prob)
        assert len(data) == n
        return data

    def generate_fixed_binary_list(self, n, size, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of bytestrings with a fixed *size*.
        """
        return self._generate_varying_sequences(get_random_bytes, n,
                                                size, size, none_prob)

    def generate_varying_binary_list(self, n, min_size, max_size,
                                     none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of bytestrings with a random size between
        *min_size* and *max_size*.
        """
        return self._generate_varying_sequences(get_random_bytes, n,
                                                min_size, max_size, none_prob)

    def generate_ascii_string_list(self, n, min_size, max_size,
                                   none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of ASCII strings with a random size between
        *min_size* and *max_size*.
        """
        return self._generate_varying_sequences(get_random_ascii, n,
                                                min_size, max_size, none_prob)

    def generate_unicode_string_list(self, n, min_size, max_size,
                                     none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of unicode strings with a random size between
        *min_size* and *max_size*.
        """
        return self._generate_varying_sequences(get_random_unicode, n,
                                                min_size, max_size, none_prob)

    def generate_int_list_list(self, n, min_size, max_size,
                               none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of lists of Python ints with a random size between
        *min_size* and *max_size*.

        *none_prob* applies both to the inner ints and to the outer entries.
        """
        return self._generate_varying_sequences(
            partial(self.generate_int_list, none_prob=none_prob),
            n, min_size, max_size, none_prob)

    def generate_tuple_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of tuples with random values.
        Each tuple has the form `(int value, float value, bool value)`

        The tuples are derived from `generate_dict_list()`; a key omitted
        from a dict becomes None in the corresponding tuple slot.
        """
        dicts = self.generate_dict_list(n, none_prob=none_prob)
        tuples = [(d.get('u'), d.get('v'), d.get('w'))
                  if d is not None else None
                  for d in dicts]
        assert len(tuples) == n
        return tuples

    def generate_dict_list(self, n, none_prob=DEFAULT_NONE_PROB):
        """
        Generate a list of dicts with random values.
        Each dict has the form
        `{'u': int value, 'v': float value, 'w': bool value}`
        """
        ints = self.generate_int_list(n, none_prob=none_prob)
        floats = self.generate_float_list(n, none_prob=none_prob)
        bools = self.generate_bool_list(n, none_prob=none_prob)
        dicts = []
        # Keep half the Nones, omit the other half
        keep_nones = itertools.cycle([True, False])
        for u, v, w in zip(ints, floats, bools):
            d = {}
            if u is not None or next(keep_nones):
                d['u'] = u
            if v is not None or next(keep_nones):
                d['v'] = v
            if w is not None or next(keep_nones):
                d['w'] = w
            dicts.append(d)
        self.sprinkle_nones(dicts, none_prob)
        assert len(dicts) == n
        return dicts

    def get_type_and_builtins(self, n, type_name):
        """
        Return a `(arrow type, list)` tuple where the arrow type
        corresponds to the given logical *type_name*, and the list
        is a list of *n* random-generated Python objects compatible
        with the arrow type.

        Raises ValueError if *type_name* is not recognized, including a
        fixed-size binary name ("binary<N>") whose size is not a positive
        integer.
        """
        size = None

        # 'struct from tuples' must be matched here, before the generic
        # startswith('struct') test below, otherwise it would be folded
        # into the dict-based 'struct' kind and its tuple factory would
        # never be used.
        if type_name in ('bool', 'decimal', 'ascii', 'unicode', 'int64 list',
                         'struct from tuples'):
            kind = type_name
        elif type_name.startswith(('int', 'uint')):
            kind = 'int'
        elif type_name.startswith('float'):
            kind = 'float'
        elif type_name.startswith('struct'):
            kind = 'struct'
        elif type_name == 'binary':
            kind = 'varying binary'
        elif type_name.startswith('binary'):
            kind = 'fixed binary'
            # The byte width follows the "binary" prefix, e.g. "binary10"
            try:
                size = int(type_name[6:])
            except ValueError:
                raise ValueError("unrecognized type %r" % (type_name,))
            if size <= 0:
                raise ValueError("fixed binary size must be positive in "
                                 "type %r" % (type_name,))
        else:
            raise ValueError("unrecognized type %r" % (type_name,))

        if kind in ('int', 'float'):
            # The logical name is also the name of the pyarrow type factory
            ty = getattr(pa, type_name)()
        elif kind == 'bool':
            ty = pa.bool_()
        elif kind == 'decimal':
            ty = pa.decimal128(9, 9)
        elif kind == 'fixed binary':
            ty = pa.binary(size)
        elif kind == 'varying binary':
            ty = pa.binary()
        elif kind in ('ascii', 'unicode'):
            ty = pa.string()
        elif kind == 'int64 list':
            ty = pa.list_(pa.int64())
        elif kind in ('struct', 'struct from tuples'):
            # Same arrow type whether the Python values are dicts or tuples
            ty = pa.struct([pa.field('u', pa.int64()),
                            pa.field('v', pa.float64()),
                            pa.field('w', pa.bool_())])

        factories = {
            'int': self.generate_int_list,
            'float': self.generate_float_list,
            'bool': self.generate_bool_list,
            'decimal': self.generate_decimal_list,
            'fixed binary': partial(self.generate_fixed_binary_list,
                                    size=size),
            'varying binary': partial(self.generate_varying_binary_list,
                                      min_size=3, max_size=40),
            'ascii': partial(self.generate_ascii_string_list,
                             min_size=3, max_size=40),
            'unicode': partial(self.generate_unicode_string_list,
                               min_size=3, max_size=40),
            'int64 list': partial(self.generate_int_list_list,
                                  min_size=0, max_size=20),
            'struct': self.generate_dict_list,
            'struct from tuples': self.generate_tuple_list,
        }
        data = factories[kind](n)
        return ty, data

0 commit comments

Comments
 (0)