Skip to content

Commit 277842e

Browse files
committed
Issue python#21424: Optimize heapq.nlargest() to make fewer tuple comparisons.
Consolidates the logic for nlargest() into a single function so that decoration tuples (elem, order) or (key, order, elem) only need to be formed when a new element is added to the heap. Formerly, a tuple was created for every element regardless of whether it was added to the heap. The change reduces the number of tuples created, the number of ordering integers created, and the total number of tuple comparisons.
1 parent d6a46ae commit 277842e

File tree

4 files changed

+41
-174
lines changed

4 files changed

+41
-174
lines changed

Lib/heapq.py

Lines changed: 37 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -192,81 +192,6 @@ def _heapify_max(x):
192192
for i in reversed(range(n//2)):
193193
_siftup_max(x, i)
194194

195-
196-
# Algorithm notes for nlargest() and nsmallest()
197-
# ==============================================
198-
#
199-
# Makes just one pass over the data while keeping the n most extreme values
200-
# in a heap. Memory consumption is limited to keeping n values in a list.
201-
#
202-
# Number of comparisons for n random inputs, keeping the k smallest values:
203-
# -----------------------------------------------------------
204-
# Step Comparisons Action
205-
# 1 1.66*k heapify the first k-inputs
206-
# 2 n - k compare new input elements to top of heap
207-
# 3 k*lg2(k)*(ln(n)-ln(k)) add new extreme values to the heap
208-
# 4 k*lg2(k) final sort of the k most extreme values
209-
#
210-
# number of comparisons
211-
# n-random inputs k-extreme values average of 5 trials % more than min()
212-
# --------------- ---------------- ------------------- -----------------
213-
# 10,000 100 14,046 40.5%
214-
# 100,000 100 105,749 5.7%
215-
# 1,000,000 100 1,007,751 0.8%
216-
#
217-
# Computing the number of comparisons for step 3:
218-
# -----------------------------------------------
219-
# * For the i-th new value from the iterable, the probability of being in the
220-
# k most extreme values is k/i. For example, the probability of the 101st
221-
# value seen being in the 100 most extreme values is 100/101.
222-
# * If the value is a new extreme value, the cost of inserting it into the
223-
# heap is log(k, 2).
224-
# * The probability times the cost gives:
225-
# (k/i) * log(k, 2)
226-
# * Summing across the remaining n-k elements gives:
227-
# sum((k/i) * log(k, 2) for i in range(k+1, n+1))
228-
# * This reduces to:
229-
# (H(n) - H(k)) * k * log(k, 2)
230-
# * Where H(n) is the n-th harmonic number estimated by:
231-
# H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
232-
# gamma = 0.5772156649
233-
# http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
234-
# * Substituting the H(n) formula and ignoring the 1/(2*n) term gives:
235-
# comparisons = k * log(k, 2) * (log(n,e) - log(k, e))
236-
#
237-
# Worst-case for step 3:
238-
# ----------------------
239-
# In the worst case, the input data is reverse sorted so that every new element
240-
# must be inserted in the heap:
241-
# comparisons = log(k, 2) * (n - k)
242-
#
243-
# Alternative Algorithms
244-
# ----------------------
245-
# Other algorithms were not used because they:
246-
# 1) Took much more auxiliary memory,
247-
# 2) Made multiple passes over the data.
248-
# 3) Made more comparisons in common cases (small k, large n, semi-random input).
249-
# See detailed comparisons at:
250-
# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
251-
252-
def nlargest(n, iterable):
253-
"""Find the n largest elements in a dataset.
254-
255-
Equivalent to: sorted(iterable, reverse=True)[:n]
256-
"""
257-
if n <= 0:
258-
return []
259-
it = iter(iterable)
260-
result = list(islice(it, n))
261-
if not result:
262-
return result
263-
heapify(result)
264-
_heappushpop = heappushpop
265-
for elem in it:
266-
_heappushpop(result, elem)
267-
result.sort(reverse=True)
268-
return result
269-
270195
def nsmallest(n, iterable):
271196
"""Find the n smallest elements in a dataset.
272197
@@ -480,7 +405,6 @@ def nsmallest(n, iterable, key=None):
480405
result = _nsmallest(n, it)
481406
return [r[2] for r in result] # undecorate
482407

483-
_nlargest = nlargest
484408
def nlargest(n, iterable, key=None):
485409
"""Find the n largest elements in a dataset.
486410
@@ -490,12 +414,12 @@ def nlargest(n, iterable, key=None):
490414
# Short-cut for n==1 is to use max() when len(iterable)>0
491415
if n == 1:
492416
it = iter(iterable)
493-
head = list(islice(it, 1))
494-
if not head:
495-
return []
417+
sentinel = object()
496418
if key is None:
497-
return [max(chain(head, it))]
498-
return [max(chain(head, it), key=key)]
419+
result = max(it, default=sentinel)
420+
else:
421+
result = max(it, default=sentinel, key=key)
422+
return [] if result is sentinel else [result]
499423

500424
# When n>=size, it's faster to use sorted()
501425
try:
@@ -508,15 +432,40 @@ def nlargest(n, iterable, key=None):
508432

509433
# When key is none, use simpler decoration
510434
if key is None:
511-
it = zip(iterable, count(0,-1)) # decorate
512-
result = _nlargest(n, it)
513-
return [r[0] for r in result] # undecorate
435+
it = iter(iterable)
436+
result = list(islice(zip(it, count(0, -1)), n))
437+
if not result:
438+
return result
439+
heapify(result)
440+
order = -n
441+
top = result[0][0]
442+
_heapreplace = heapreplace
443+
for elem in it:
444+
if top < elem:
445+
order -= 1
446+
_heapreplace(result, (elem, order))
447+
top = result[0][0]
448+
result.sort(reverse=True)
449+
return [r[0] for r in result]
514450

515451
# General case, slowest method
516-
in1, in2 = tee(iterable)
517-
it = zip(map(key, in1), count(0,-1), in2) # decorate
518-
result = _nlargest(n, it)
519-
return [r[2] for r in result] # undecorate
452+
it = iter(iterable)
453+
result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
454+
if not result:
455+
return result
456+
heapify(result)
457+
order = -n
458+
top = result[0][0]
459+
_heapreplace = heapreplace
460+
for elem in it:
461+
k = key(elem)
462+
if top < k:
463+
order -= 1
464+
_heapreplace(result, (k, order, elem))
465+
top = result[0][0]
466+
result.sort(reverse=True)
467+
return [r[2] for r in result]
468+
520469

521470
if __name__ == "__main__":
522471
# Simple sanity test

Lib/test/test_heapq.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
# _heapq.nlargest/nsmallest are saved in heapq._nlargest/_nsmallest when
1414
# _heapq is imported, so check them there
1515
func_names = ['heapify', 'heappop', 'heappush', 'heappushpop',
16-
'heapreplace', '_nlargest', '_nsmallest']
16+
'heapreplace', '_nsmallest']
1717

1818
class TestModules(TestCase):
1919
def test_py_functions(self):

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,9 @@ Library
8181
- Issue #21156: importlib.abc.InspectLoader.source_to_code() is now a
8282
staticmethod.
8383

84+
- Issue #21424: Simplified and optimized heapq.nlargest() to make fewer
85+
tuple comparisons.
86+
8487
- Issue #21396: Fix TextIOWrapper(..., write_through=True) to not force a
8588
flush() on the underlying binary stream. Patch by akira.
8689

Modules/_heapqmodule.c

Lines changed: 0 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -267,89 +267,6 @@ heapify(PyObject *self, PyObject *heap)
267267
PyDoc_STRVAR(heapify_doc,
268268
"Transform list into a heap, in-place, in O(len(heap)) time.");
269269

270-
static PyObject *
271-
nlargest(PyObject *self, PyObject *args)
272-
{
273-
PyObject *heap=NULL, *elem, *iterable, *sol, *it, *oldelem;
274-
Py_ssize_t i, n;
275-
int cmp;
276-
277-
if (!PyArg_ParseTuple(args, "nO:nlargest", &n, &iterable))
278-
return NULL;
279-
280-
it = PyObject_GetIter(iterable);
281-
if (it == NULL)
282-
return NULL;
283-
284-
heap = PyList_New(0);
285-
if (heap == NULL)
286-
goto fail;
287-
288-
for (i=0 ; i<n ; i++ ){
289-
elem = PyIter_Next(it);
290-
if (elem == NULL) {
291-
if (PyErr_Occurred())
292-
goto fail;
293-
else
294-
goto sortit;
295-
}
296-
if (PyList_Append(heap, elem) == -1) {
297-
Py_DECREF(elem);
298-
goto fail;
299-
}
300-
Py_DECREF(elem);
301-
}
302-
if (PyList_GET_SIZE(heap) == 0)
303-
goto sortit;
304-
305-
for (i=n/2-1 ; i>=0 ; i--)
306-
if(_siftup((PyListObject *)heap, i) == -1)
307-
goto fail;
308-
309-
sol = PyList_GET_ITEM(heap, 0);
310-
while (1) {
311-
elem = PyIter_Next(it);
312-
if (elem == NULL) {
313-
if (PyErr_Occurred())
314-
goto fail;
315-
else
316-
goto sortit;
317-
}
318-
cmp = PyObject_RichCompareBool(sol, elem, Py_LT);
319-
if (cmp == -1) {
320-
Py_DECREF(elem);
321-
goto fail;
322-
}
323-
if (cmp == 0) {
324-
Py_DECREF(elem);
325-
continue;
326-
}
327-
oldelem = PyList_GET_ITEM(heap, 0);
328-
PyList_SET_ITEM(heap, 0, elem);
329-
Py_DECREF(oldelem);
330-
if (_siftup((PyListObject *)heap, 0) == -1)
331-
goto fail;
332-
sol = PyList_GET_ITEM(heap, 0);
333-
}
334-
sortit:
335-
if (PyList_Sort(heap) == -1)
336-
goto fail;
337-
if (PyList_Reverse(heap) == -1)
338-
goto fail;
339-
Py_DECREF(it);
340-
return heap;
341-
342-
fail:
343-
Py_DECREF(it);
344-
Py_XDECREF(heap);
345-
return NULL;
346-
}
347-
348-
PyDoc_STRVAR(nlargest_doc,
349-
"Find the n largest elements in a dataset.\n\
350-
\n\
351-
Equivalent to: sorted(iterable, reverse=True)[:n]\n");
352-
353270
static int
354271
_siftdownmax(PyListObject *heap, Py_ssize_t startpos, Py_ssize_t pos)
355272
{
@@ -531,8 +448,6 @@ static PyMethodDef heapq_methods[] = {
531448
METH_VARARGS, heapreplace_doc},
532449
{"heapify", (PyCFunction)heapify,
533450
METH_O, heapify_doc},
534-
{"nlargest", (PyCFunction)nlargest,
535-
METH_VARARGS, nlargest_doc},
536451
{"nsmallest", (PyCFunction)nsmallest,
537452
METH_VARARGS, nsmallest_doc},
538453
{NULL, NULL} /* sentinel */

0 commit comments

Comments
 (0)