GH-81620: Add random.binomialvariate() (GH-94719)

author Raymond Hettinger <rhettinger@users.noreply.github.com>

Wed, 13 Jul 2022 14:46:04 +0000 (09:46 -0500)

committer GitHub <noreply@github.com>

Wed, 13 Jul 2022 14:46:04 +0000 (09:46 -0500)
author Raymond Hettinger <rhettinger@users.noreply.github.com>
Wed, 13 Jul 2022 14:46:04 +0000 (09:46 -0500)
committer GitHub <noreply@github.com>
Wed, 13 Jul 2022 14:46:04 +0000 (09:46 -0500)
diff --git a/Doc/library/random.rst b/Doc/library/random.rst

index 613fbce0fdf20d0a238ef40d1290a1b528656e67..78c2b030d6095fc8cd514d6786d463836b937c92 100644 (file)
--- a/Doc/library/random.rst
+++ b/Doc/library/random.rst
@@ -258,6 +258,28 @@ Functions for sequences
        The *population* must be a sequence.  Automatic conversion of sets
        to lists is no longer supported.
  
+Discrete distributions
+----------------------
+
+The following function generates a discrete distribution.
+
+.. function:: binomialvariate(n=1, p=0.5)
+
+   `Binomial distribution
+   <http://mathworld.wolfram.com/BinomialDistribution.html>`_.
+   Return the number of successes for *n* independent trials with the
+   probability of success in each trial being *p*:
+
+   Mathematically equivalent to::
+
+       sum(random() < p for i in range(n))
+
+   The number of trials *n* should be a non-negative integer.
+   The probability of success *p* should be between ``0.0 <= p <= 1.0``.
+   The result is an integer in the range ``0 <= X <= n``.
+
+   .. versionadded:: 3.12
+
  
  .. _real-valued-distributions:
  
@@ -452,16 +474,13 @@ Simulations::
     >>> # Deal 20 cards without replacement from a deck
     >>> # of 52 playing cards, and determine the proportion of cards
     >>> # with a ten-value:  ten, jack, queen, or king.
-   >>> dealt = sample(['tens', 'low cards'], counts=[16, 36], k=20)
-   >>> dealt.count('tens') / 20
+   >>> deal = sample(['tens', 'low cards'], counts=[16, 36], k=20)
+   >>> deal.count('tens') / 20
     0.15
  
     >>> # Estimate the probability of getting 5 or more heads from 7 spins
     >>> # of a biased coin that settles on heads 60% of the time.
-   >>> def trial():
-   ...     return choices('HT', cum_weights=(0.60, 1.00), k=7).count('H') >= 5
-   ...
-   >>> sum(trial() for i in range(10_000)) / 10_000
+   >>> sum(binomialvariate(n=7, p=0.6) >= 5 for i in range(10_000)) / 10_000
     0.4169
  
     >>> # Probability of the median of 5 samples being in middle two quartiles
diff --git a/Lib/random.py b/Lib/random.py

index 2166474af0554b6d77543eaf5d736318323310a3..00849bd7e732fb3d9fa3cb2c8157a411506865a0 100644 (file)
--- a/Lib/random.py
+++ b/Lib/random.py
@@ -24,6 +24,7 @@
             negative exponential
             gamma
             beta
+           binomial
             pareto
             Weibull
  
@@ -49,6 +50,7 @@ from warnings import warn as _warn
  from math import log as _log, exp as _exp, pi as _pi, e as _e, ceil as _ceil
  from math import sqrt as _sqrt, acos as _acos, cos as _cos, sin as _sin
  from math import tau as TWOPI, floor as _floor, isfinite as _isfinite
+from math import lgamma as _lgamma, fabs as _fabs
  from os import urandom as _urandom
  from _collections_abc import Sequence as _Sequence
  from operator import index as _index
@@ -68,6 +70,7 @@ __all__ = [
      "Random",
      "SystemRandom",
      "betavariate",
+    "binomialvariate",
      "choice",
      "choices",
      "expovariate",
@@ -725,6 +728,91 @@ class Random(_random.Random):
              return y / (y + self.gammavariate(beta, 1.0))
          return 0.0
  
+
+    def binomialvariate(self, n=1, p=0.5):
+        """Binomial random variable.
+
+        Gives the number of successes for *n* independent trials
+        with the probability of success in each trial being *p*:
+
+            sum(random() < p for i in range(n))
+
+        Returns an integer in the range:   0 <= X <= n
+
+        """
+        # Error check inputs and handle edge cases
+        if n < 0:
+            raise ValueError("n must be non-negative")
+        if p <= 0.0 or p >= 1.0:
+            if p == 0.0:
+                return 0
+            if p == 1.0:
+                return n
+            raise ValueError("p must be in the range 0.0 <= p <= 1.0")
+
+        random = self.random
+
+        # Fast path for a common case
+        if n == 1:
+            return _index(random() < p)
+
+        # Exploit symmetry to establish:  p <= 0.5
+        if p > 0.5:
+            return n - self.binomialvariate(n, 1.0 - p)
+
+        if n * p < 10.0:
+            # BG: Geometric method by Devroye with running time of O(np).
+            # https://dl.acm.org/doi/pdf/10.1145/42372.42381
+            x = y = 0
+            c = _log(1.0 - p)
+            if not c:
+                return x
+            while True:
+                y += _floor(_log(random()) / c) + 1
+                if y > n:
+                    return x
+                x += 1
+
+        # BTRS: Transformed rejection with squeeze method by Wolfgang Hörmann
+        # https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.47.8407&rep=rep1&type=pdf
+        assert n*p >= 10.0 and p <= 0.5
+        setup_complete = False
+
+        spq = _sqrt(n * p * (1.0 - p))  # Standard deviation of the distribution
+        b = 1.15 + 2.53 * spq
+        a = -0.0873 + 0.0248 * b + 0.01 * p
+        c = n * p + 0.5
+        vr = 0.92 - 4.2 / b
+
+        while True:
+
+            u = random()
+            v = random()
+            u -= 0.5
+            us = 0.5 - _fabs(u)
+            k = _floor((2.0 * a / us + b) * u + c)
+            if k < 0 or k > n:
+                continue
+
+            # The early-out "squeeze" test substantially reduces
+            # the number of acceptance condition evaluations.
+            if us >= 0.07 and v <= vr:
+                return k
+
+            # Acceptance-rejection test.
+            # Note, the original paper errorneously omits the call to log(v)
+            # when comparing to the log of the rescaled binomial distribution.
+            if not setup_complete:
+                alpha = (2.83 + 5.1 / b) * spq
+                lpq = _log(p / (1.0 - p))
+                m = _floor((n + 1) * p)         # Mode of the distribution
+                h = _lgamma(m + 1) + _lgamma(n - m + 1)
+                setup_complete = True           # Only needs to be done once
+            v *= alpha / (a / (us * us) + b)
+            if _log(v) <= h - _lgamma(k + 1) - _lgamma(n - k + 1) + (k - m) * lpq:
+                return k
+
+
      def paretovariate(self, alpha):
          """Pareto distribution.  alpha is the shape parameter."""
          # Jain, pg. 495
@@ -810,6 +898,7 @@ vonmisesvariate = _inst.vonmisesvariate
  gammavariate = _inst.gammavariate
  gauss = _inst.gauss
  betavariate = _inst.betavariate
+binomialvariate = _inst.binomialvariate
  paretovariate = _inst.paretovariate
  weibullvariate = _inst.weibullvariate
  getstate = _inst.getstate
@@ -834,15 +923,17 @@ def _test_generator(n, func, args):
      low = min(data)
      high = max(data)
  
-    print(f'{t1 - t0:.3f} sec, {n} times {func.__name__}')
+    print(f'{t1 - t0:.3f} sec, {n} times {func.__name__}{args!r}')
      print('avg %g, stddev %g, min %g, max %g\n' % (xbar, sigma, low, high))
  
  
-def _test(N=2000):
+def _test(N=10_000):
      _test_generator(N, random, ())
      _test_generator(N, normalvariate, (0.0, 1.0))
      _test_generator(N, lognormvariate, (0.0, 1.0))
      _test_generator(N, vonmisesvariate, (0.0, 1.0))
+    _test_generator(N, binomialvariate, (15, 0.60))
+    _test_generator(N, binomialvariate, (100, 0.75))
      _test_generator(N, gammavariate, (0.01, 1.0))
      _test_generator(N, gammavariate, (0.1, 1.0))
      _test_generator(N, gammavariate, (0.1, 2.0))
diff --git a/Lib/test/test_random.py b/Lib/test/test_random.py

index fcf17a949c2a62126f007f0da62cb9554c722b44..1e825c3572d20aa2e9d39ce967973d1a48ebbaed 100644 (file)
--- a/Lib/test/test_random.py
+++ b/Lib/test/test_random.py
@@ -1045,6 +1045,9 @@ class TestDistributions(unittest.TestCase):
                  (g.lognormvariate, (0.0, 0.0), 1.0),
                  (g.lognormvariate, (-float('inf'), 0.0), 0.0),
                  (g.normalvariate, (10.0, 0.0), 10.0),
+                (g.binomialvariate, (0, 0.5), 0),
+                (g.binomialvariate, (10, 0.0), 0),
+                (g.binomialvariate, (10, 1.0), 10),
                  (g.paretovariate, (float('inf'),), 1.0),
                  (g.weibullvariate, (10.0, float('inf')), 10.0),
                  (g.weibullvariate, (0.0, 10.0), 0.0),
@@ -1052,6 +1055,59 @@ class TestDistributions(unittest.TestCase):
              for i in range(N):
                  self.assertEqual(variate(*args), expected)
  
+    def test_binomialvariate(self):
+        B = random.binomialvariate
+
+        # Cover all the code paths
+        with self.assertRaises(ValueError):
+            B(n=-1)                            # Negative n
+        with self.assertRaises(ValueError):
+            B(n=1, p=-0.5)                     # Negative p
+        with self.assertRaises(ValueError):
+            B(n=1, p=1.5)                      # p > 1.0
+        self.assertEqual(B(10, 0.0), 0)        # p == 0.0
+        self.assertEqual(B(10, 1.0), 10)       # p == 1.0
+        self.assertTrue(B(1, 0.3) in {0, 1})   # n == 1 fast path
+        self.assertTrue(B(1, 0.9) in {0, 1})   # n == 1 fast path
+        self.assertTrue(B(1, 0.0) in {0})      # n == 1 fast path
+        self.assertTrue(B(1, 1.0) in {1})      # n == 1 fast path
+
+        # BG method p <= 0.5 and n*p=1.25
+        self.assertTrue(B(5, 0.25) in set(range(6)))
+
+        # BG method p >= 0.5 and n*(1-p)=1.25
+        self.assertTrue(B(5, 0.75) in set(range(6)))
+
+        # BTRS method p <= 0.5 and n*p=25
+        self.assertTrue(B(100, 0.25) in set(range(101)))
+
+        # BTRS method p > 0.5 and n*(1-p)=25
+        self.assertTrue(B(100, 0.75) in set(range(101)))
+
+        # Statistical tests chosen such that they are
+        # exceedingly unlikely to ever fail for correct code.
+
+        # BG code path
+        # Expected dist: [31641, 42188, 21094, 4688, 391]
+        c = Counter(B(4, 0.25) for i in range(100_000))
+        self.assertTrue(29_641 <= c[0] <= 33_641, c)
+        self.assertTrue(40_188 <= c[1] <= 44_188)
+        self.assertTrue(19_094 <= c[2] <= 23_094)
+        self.assertTrue(2_688  <= c[3] <= 6_688)
+        self.assertEqual(set(c), {0, 1, 2, 3, 4})
+
+        # BTRS code path
+        # Sum of c[20], c[21], c[22], c[23], c[24] expected to be 36,214
+        c = Counter(B(100, 0.25) for i in range(100_000))
+        self.assertTrue(34_214 <= c[20]+c[21]+c[22]+c[23]+c[24] <= 38_214)
+        self.assertTrue(set(c) <= set(range(101)))
+        self.assertEqual(c.total(), 100_000)
+
+        # Demonstrate the BTRS works for huge values of n
+        self.assertTrue(19_000_000 <= B(100_000_000, 0.2) <= 21_000_000)
+        self.assertTrue(89_000_000 <= B(100_000_000, 0.9) <= 91_000_000)
+
+
      def test_von_mises_range(self):
          # Issue 17149: von mises variates were not consistently in the
          # range [0, 2*PI].
diff --git a/Misc/NEWS.d/next/Library/2022-07-09-15-17-02.gh-issue-81620.L0O_bV.rst b/Misc/NEWS.d/next/Library/2022-07-09-15-17-02.gh-issue-81620.L0O_bV.rst

new file mode 100644 (file)

index 0000000..b4ccea4
--- /dev/null
+++ b/Misc/NEWS.d/next/Library/2022-07-09-15-17-02.gh-issue-81620.L0O_bV.rst
@@ -0,0 +1 @@
+Add random.binomialvariate().
author	Raymond Hettinger <rhettinger@users.noreply.github.com>
	Wed, 13 Jul 2022 14:46:04 +0000 (09:46 -0500)
committer	GitHub <noreply@github.com>
	Wed, 13 Jul 2022 14:46:04 +0000 (09:46 -0500)
Doc/library/random.rst		patch \| blob \| blame \| history
Lib/random.py		patch \| blob \| blame \| history
Lib/test/test_random.py		patch \| blob \| blame \| history
Misc/NEWS.d/next/Library/2022-07-09-15-17-02.gh-issue-81620.L0O_bV.rst	[new file with mode: 0644]	patch \| blob