@@ -192,81 +192,6 @@ def _heapify_max(x):
192192 for i in reversed (range (n // 2 )):
193193 _siftup_max (x , i )
194194
195-
196- # Algorithm notes for nlargest() and nsmallest()
197- # ==============================================
198- #
199- # Makes just one pass over the data while keeping the n most extreme values
200- # in a heap. Memory consumption is limited to keeping n values in a list.
201- #
202- # Number of comparisons for n random inputs, keeping the k smallest values:
203- # -----------------------------------------------------------
204- # Step Comparisons Action
205- # 1 1.66*k heapify the first k-inputs
206- # 2 n - k compare new input elements to top of heap
207- # 3 k*lg2(k)*(ln(n)-ln(k)) add new extreme values to the heap
208- # 4 k*lg2(k) final sort of the k most extreme values
209- #
210- # number of comparisons
211- # n-random inputs k-extreme values average of 5 trials % more than min()
212- # --------------- ---------------- ------------------- -----------------
213- # 10,000 100 14,046 40.5%
214- # 100,000 100 105,749 5.7%
215- # 1,000,000 100 1,007,751 0.8%
216- #
217- # Computing the number of comparisons for step 3:
218- # -----------------------------------------------
219- # * For the i-th new value from the iterable, the probability of being in the
220- # k most extreme values is k/i. For example, the probability of the 101st
221- # value seen being in the 100 most extreme values is 100/101.
222- # * If the value is a new extreme value, the cost of inserting it into the
223- # heap is log(k, 2).
224- # * The probability times the cost gives:
225- # (k/i) * log(k, 2)
226- # * Summing across the remaining n-k elements gives:
227- # sum((k/i) * log(k, 2) for i in range(k+1, n+1))
228- # * This reduces to:
229- # (H(n) - H(k)) * k * log(k, 2)
230- # * Where H(n) is the n-th harmonic number estimated by:
231- # H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
232- # gamma = 0.5772156649
233- # http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
234- # * Substituting the H(n) formula and ignoring the 1/(2*n) fraction gives:
235- # comparisons = k * log(k, 2) * (log(n,e) - log(k, e))
236- #
237- # Worst-case for step 3:
238- # ----------------------
239- # In the worst case, the input data is reverse-sorted so that every new element
240- # must be inserted in the heap:
241- # comparisons = log(k, 2) * (n - k)
242- #
243- # Alternative Algorithms
244- # ----------------------
245- # Other algorithms were not used because they:
246- # 1) Took much more auxiliary memory,
247- # 2) Made multiple passes over the data.
248- # 3) Made more comparisons in common cases (small k, large n, semi-random input).
249- # See detailed comparisons at:
250- # http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
251-
252- def nlargest (n , iterable ):
253- """Find the n largest elements in a dataset.
254-
255- Equivalent to: sorted(iterable, reverse=True)[:n]
256- """
257- if n <= 0 :
258- return []
259- it = iter (iterable )
260- result = list (islice (it , n ))
261- if not result :
262- return result
263- heapify (result )
264- _heappushpop = heappushpop
265- for elem in it :
266- _heappushpop (result , elem )
267- result .sort (reverse = True )
268- return result
269-
270195def nsmallest (n , iterable ):
271196 """Find the n smallest elements in a dataset.
272197
@@ -480,7 +405,6 @@ def nsmallest(n, iterable, key=None):
480405 result = _nsmallest (n , it )
481406 return [r [2 ] for r in result ] # undecorate
482407
483- _nlargest = nlargest
484408def nlargest (n , iterable , key = None ):
485409 """Find the n largest elements in a dataset.
486410
@@ -490,12 +414,12 @@ def nlargest(n, iterable, key=None):
490414 # Short-cut for n==1 is to use max() when len(iterable)>0
491415 if n == 1 :
492416 it = iter (iterable )
493- head = list (islice (it , 1 ))
494- if not head :
495- return []
417+ sentinel = object ()
496418 if key is None :
497- return [max (chain (head , it ))]
498- return [max (chain (head , it ), key = key )]
419+ result = max (it , default = sentinel )
420+ else :
421+ result = max (it , default = sentinel , key = key )
422+ return [] if result is sentinel else [result ]
499423
500424 # When n>=size, it's faster to use sorted()
501425 try :
@@ -508,15 +432,40 @@ def nlargest(n, iterable, key=None):
508432
509433 # When key is none, use simpler decoration
510434 if key is None :
511- it = zip (iterable , count (0 ,- 1 )) # decorate
512- result = _nlargest (n , it )
513- return [r [0 ] for r in result ] # undecorate
435+ it = iter (iterable )
436+ result = list (islice (zip (it , count (0 , - 1 )), n ))
437+ if not result :
438+ return result
439+ heapify (result )
440+ order = - n
441+ top = result [0 ][0 ]
442+ _heapreplace = heapreplace
443+ for elem in it :
444+ if top < elem :
445+ order -= 1
446+ _heapreplace (result , (elem , order ))
447+ top = result [0 ][0 ]
448+ result .sort (reverse = True )
449+ return [r [0 ] for r in result ]
514450
515451 # General case, slowest method
516- in1 , in2 = tee (iterable )
517- it = zip (map (key , in1 ), count (0 ,- 1 ), in2 ) # decorate
518- result = _nlargest (n , it )
519- return [r [2 ] for r in result ] # undecorate
452+ it = iter (iterable )
453+ result = [(key (elem ), i , elem ) for i , elem in zip (range (0 , - n , - 1 ), it )]
454+ if not result :
455+ return result
456+ heapify (result )
457+ order = - n
458+ top = result [0 ][0 ]
459+ _heapreplace = heapreplace
460+ for elem in it :
461+ k = key (elem )
462+ if top < k :
463+ order -= 1
464+ _heapreplace (result , (k , order , elem ))
465+ top = result [0 ][0 ]
466+ result .sort (reverse = True )
467+ return [r [2 ] for r in result ]
468+
520469
521470if __name__ == "__main__" :
522471 # Simple sanity test
0 commit comments