Issue #21424: Optimize heaqp.nlargest() to make fewer tuple comparisons.

author Raymond Hettinger <python@rcn.com>

Sun, 11 May 2014 08:55:46 +0000 (01:55 -0700)

committer Raymond Hettinger <python@rcn.com>

Sun, 11 May 2014 08:55:46 +0000 (01:55 -0700)
author Raymond Hettinger <python@rcn.com>
Sun, 11 May 2014 08:55:46 +0000 (01:55 -0700)
committer Raymond Hettinger <python@rcn.com>
Sun, 11 May 2014 08:55:46 +0000 (01:55 -0700)
diff --git a/Lib/heapq.py b/Lib/heapq.py

index 88c7019abc0492a8a3607433688bd29c03f8104c..fc73df9fd3fe475599933cffcd24b5a64b01c5a3 100644 (file)
--- a/Lib/heapq.py
+++ b/Lib/heapq.py
@@ -192,81 +192,6 @@ def _heapify_max(x):
      for i in reversed(range(n//2)):
          _siftup_max(x, i)
  
-
-# Algorithm notes for nlargest() and nsmallest()
-# ==============================================
-#
-# Makes just one pass over the data while keeping the n most extreme values
-# in a heap.  Memory consumption is limited to keeping n values in a list.
-#
-# Number of comparisons for n random inputs, keeping the k smallest values:
-# -----------------------------------------------------------
-# Step   Comparisons                 Action
-#  1        1.66*k                   heapify the first k-inputs
-#  2        n - k                    compare new input elements to top of heap
-#  3        k*lg2(k)*(ln(n)-ln(k))   add new extreme values to the heap
-#  4        k*lg2(k)                 final sort of the k most extreme values
-#
-#                                      number of comparisons
-# n-random inputs   k-extreme values    average of 5 trials     % more than min()
-# ---------------   ----------------    -------------------     -----------------
-#       10,000            100                   14,046                 40.5%
-#      100,000            100                  105,749                  5.7%
-#    1,000,000            100                1,007,751                  0.8%
-#
-# Computing the number of comparisons for step 3:
-# -----------------------------------------------
-# * For the i-th new value from the iterable, the probability of being in the
-#   k most extreme values is k/i.  For example, the probability of the 101st
-#   value seen being in the 100 most extreme values is 100/101.
-# * If the value is a new extreme value, the cost of inserting it into the
-#   heap is log(k, 2).
-# * The probabilty times the cost gives:
-#            (k/i) * log(k, 2)
-# * Summing across the remaining n-k elements gives:
-#            sum((k/i) * log(k, 2) for xrange(k+1, n+1))
-# * This reduces to:
-#            (H(n) - H(k)) * k * log(k, 2)
-# * Where H(n) is the n-th harmonic number estimated by:
-#            H(n) = log(n, e) + gamma + 1.0 / (2.0 * n)
-#            gamma = 0.5772156649
-#   http://en.wikipedia.org/wiki/Harmonic_series_(mathematics)#Rate_of_divergence
-# * Substituting the H(n) formula and ignoring the (1/2*n) fraction gives:
-#            comparisons = k * log(k, 2) * (log(n,e) - log(k, e))
-#
-# Worst-case for step 3:
-# ----------------------
-# In the worst case, the input data is reversed sorted so that every new element
-# must be inserted in the heap:
-#             comparisons = log(k, 2) * (n - k)
-#
-# Alternative Algorithms
-# ----------------------
-# Other algorithms were not used because they:
-# 1) Took much more auxiliary memory,
-# 2) Made multiple passes over the data.
-# 3) Made more comparisons in common cases (small k, large n, semi-random input).
-# See detailed comparisons at:
-# http://code.activestate.com/recipes/577573-compare-algorithms-for-heapqsmallest
-
-def nlargest(n, iterable):
-    """Find the n largest elements in a dataset.
-
-    Equivalent to:  sorted(iterable, reverse=True)[:n]
-    """
-    if n <= 0:
-        return []
-    it = iter(iterable)
-    result = list(islice(it, n))
-    if not result:
-        return result
-    heapify(result)
-    _heappushpop = heappushpop
-    for elem in it:
-        _heappushpop(result, elem)
-    result.sort(reverse=True)
-    return result
-
  def nsmallest(n, iterable):
      """Find the n smallest elements in a dataset.
  
@@ -480,7 +405,6 @@ def nsmallest(n, iterable, key=None):
      result = _nsmallest(n, it)
      return [r[2] for r in result]                           # undecorate
  
-_nlargest = nlargest
  def nlargest(n, iterable, key=None):
      """Find the n largest elements in a dataset.
  
@@ -490,12 +414,12 @@ def nlargest(n, iterable, key=None):
      # Short-cut for n==1 is to use max() when len(iterable)>0
      if n == 1:
          it = iter(iterable)
-        head = list(islice(it, 1))
-        if not head:
-            return []
+        sentinel = object()
          if key is None:
-            return [max(chain(head, it))]
-        return [max(chain(head, it), key=key)]
+            result = max(it, default=sentinel)
+        else:
+            result = max(it, default=sentinel, key=key)
+        return [] if result is sentinel else [result]
  
      # When n>=size, it's faster to use sorted()
      try:
@@ -508,15 +432,40 @@ def nlargest(n, iterable, key=None):
  
      # When key is none, use simpler decoration
      if key is None:
-        it = zip(iterable, count(0,-1))                     # decorate
-        result = _nlargest(n, it)
-        return [r[0] for r in result]                       # undecorate
+        it = iter(iterable)
+        result = list(islice(zip(it, count(0, -1)), n))
+        if not result:
+            return result
+        heapify(result)
+        order = -n
+        top = result[0][0]
+        _heapreplace = heapreplace
+        for elem in it:
+            if top < elem:
+                order -= 1
+                _heapreplace(result, (elem, order))
+                top = result[0][0]
+        result.sort(reverse=True)
+        return [r[0] for r in result]
  
      # General case, slowest method
-    in1, in2 = tee(iterable)
-    it = zip(map(key, in1), count(0,-1), in2)               # decorate
-    result = _nlargest(n, it)
-    return [r[2] for r in result]                           # undecorate
+    it = iter(iterable)
+    result = [(key(elem), i, elem) for i, elem in zip(range(0, -n, -1), it)]
+    if not result:
+        return result
+    heapify(result)
+    order = -n
+    top = result[0][0]
+    _heapreplace = heapreplace
+    for elem in it:
+        k = key(elem)
+        if top < k:
+            order -= 1
+            _heapreplace(result, (k, order, elem))
+            top = result[0][0]
+    result.sort(reverse=True)
+    return [r[2] for r in result]
+
  
  if __name__ == "__main__":
      # Simple sanity test
diff --git a/Lib/test/test_heapq.py b/Lib/test/test_heapq.py

index b5a2fd803a808fa9c7e4a3f0f0e3679321d9fdd6..1735a19ca8a2d24a6798681df20774625ccec274 100644 (file)
--- a/Lib/test/test_heapq.py
+++ b/Lib/test/test_heapq.py
@@ -13,7 +13,7 @@ c_heapq = support.import_fresh_module('heapq', fresh=['_heapq'])
  # _heapq.nlargest/nsmallest are saved in heapq._nlargest/_smallest when
  # _heapq is imported, so check them there
  func_names = ['heapify', 'heappop', 'heappush', 'heappushpop',
-              'heapreplace', '_nlargest', '_nsmallest']
+              'heapreplace', '_nsmallest']
  
  class TestModules(TestCase):
      def test_py_functions(self):
diff --git a/Misc/NEWS b/Misc/NEWS

index b1f67ce65e61ea3268264e5d00a6d8172189f430..27b6eee08ab48e49920f53e3e1da10dcaa368909 100644 (file)
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@ -81,6 +81,9 @@ Library
  - Issue #21156: importlib.abc.InspectLoader.source_to_code() is now a
    staticmethod.
  
+- Issue #21424: Simplified and optimized heaqp.nlargest() to make fewer
+  tuple comparisons.
+
  - Issue #21396: Fix TextIOWrapper(..., write_through=True) to not force a
    flush() on the underlying binary stream.  Patch by akira.
  
diff --git a/Modules/_heapqmodule.c b/Modules/_heapqmodule.c

index b3e4753f2654725441cdb943637a578b28c55071..964f511646935b18c294eb643adc09be78a48b9e 100644 (file)
--- a/Modules/_heapqmodule.c
+++ b/Modules/_heapqmodule.c
@@ -267,89 +267,6 @@ heapify(PyObject *self, PyObject *heap)
  PyDoc_STRVAR(heapify_doc,
  "Transform list into a heap, in-place, in O(len(heap)) time.");
  
-static PyObject *
-nlargest(PyObject *self, PyObject *args)
-{
-    PyObject *heap=NULL, *elem, *iterable, *sol, *it, *oldelem;
-    Py_ssize_t i, n;
-    int cmp;
-
-    if (!PyArg_ParseTuple(args, "nO:nlargest", &n, &iterable))
-        return NULL;
-
-    it = PyObject_GetIter(iterable);
-    if (it == NULL)
-        return NULL;
-
-    heap = PyList_New(0);
-    if (heap == NULL)
-        goto fail;
-
-    for (i=0 ; i<n ; i++ ){
-        elem = PyIter_Next(it);
-        if (elem == NULL) {
-            if (PyErr_Occurred())
-                goto fail;
-            else
-                goto sortit;
-        }
-        if (PyList_Append(heap, elem) == -1) {
-            Py_DECREF(elem);
-            goto fail;
-        }
-        Py_DECREF(elem);
-    }
-    if (PyList_GET_SIZE(heap) == 0)
-        goto sortit;
-
-    for (i=n/2-1 ; i>=0 ; i--)
-        if(_siftup((PyListObject *)heap, i) == -1)
-            goto fail;
-
-    sol = PyList_GET_ITEM(heap, 0);
-    while (1) {
-        elem = PyIter_Next(it);
-        if (elem == NULL) {
-            if (PyErr_Occurred())
-                goto fail;
-            else
-                goto sortit;
-        }
-        cmp = PyObject_RichCompareBool(sol, elem, Py_LT);
-        if (cmp == -1) {
-            Py_DECREF(elem);
-            goto fail;
-        }
-        if (cmp == 0) {
-            Py_DECREF(elem);
-            continue;
-        }
-        oldelem = PyList_GET_ITEM(heap, 0);
-        PyList_SET_ITEM(heap, 0, elem);
-        Py_DECREF(oldelem);
-        if (_siftup((PyListObject *)heap, 0) == -1)
-            goto fail;
-        sol = PyList_GET_ITEM(heap, 0);
-    }
-sortit:
-    if (PyList_Sort(heap) == -1)
-        goto fail;
-    if (PyList_Reverse(heap) == -1)
-        goto fail;
-    Py_DECREF(it);
-    return heap;
-
-fail:
-    Py_DECREF(it);
-    Py_XDECREF(heap);
-    return NULL;
-}
-
-PyDoc_STRVAR(nlargest_doc,
-"Find the n largest elements in a dataset.\n\
-\n\
-Equivalent to:  sorted(iterable, reverse=True)[:n]\n");
-
  static int
  _siftdownmax(PyListObject *heap, Py_ssize_t startpos, Py_ssize_t pos)
  {
@@ -531,8 +448,6 @@ static PyMethodDef heapq_methods[] = {
          METH_VARARGS,           heapreplace_doc},
      {"heapify",         (PyCFunction)heapify,
          METH_O,                 heapify_doc},
-    {"nlargest",        (PyCFunction)nlargest,
-        METH_VARARGS,           nlargest_doc},
      {"nsmallest",       (PyCFunction)nsmallest,
          METH_VARARGS,           nsmallest_doc},
      {NULL,              NULL}           /* sentinel */
author	Raymond Hettinger <python@rcn.com>
	Sun, 11 May 2014 08:55:46 +0000 (01:55 -0700)
committer	Raymond Hettinger <python@rcn.com>
	Sun, 11 May 2014 08:55:46 +0000 (01:55 -0700)
Lib/heapq.py		patch \| blob \| blame \| history
Lib/test/test_heapq.py		patch \| blob \| blame \| history
Misc/NEWS		patch \| blob \| blame \| history
Modules/_heapqmodule.c		patch \| blob \| blame \| history