Add more comments to hypot() (GH-102817)

author Raymond Hettinger <rhettinger@users.noreply.github.com>

Sat, 18 Mar 2023 17:21:48 +0000 (12:21 -0500)

committer GitHub <noreply@github.com>

Sat, 18 Mar 2023 17:21:48 +0000 (12:21 -0500)
author Raymond Hettinger <rhettinger@users.noreply.github.com>
Sat, 18 Mar 2023 17:21:48 +0000 (12:21 -0500)
committer GitHub <noreply@github.com>
Sat, 18 Mar 2023 17:21:48 +0000 (12:21 -0500)
diff --git a/Modules/mathmodule.c b/Modules/mathmodule.c

index 48cd9a642de1503474f09ceda23497f82890f08d..c9a2be6686399381565f5c6c1dfa5c14e88b0116 100644 (file)
--- a/Modules/mathmodule.c
+++ b/Modules/mathmodule.c
@@ -2447,9 +2447,8 @@ Since lo**2 is less than 1/2 ulp(csum), we have csum+lo*lo == csum.
  To minimize loss of information during the accumulation of fractional
  values, each term has a separate accumulator.  This also breaks up
  sequential dependencies in the inner loop so the CPU can maximize
-floating point throughput. [4]  On a 2.6 GHz Haswell, adding one
-dimension has an incremental cost of only 5ns -- for example when
-moving from hypot(x,y) to hypot(x,y,z).
+floating point throughput. [4]  On an Apple M1 Max, hypot(*vec)
+takes only 3.33 µsec when len(vec) == 1000.
  
  The square root differential correction is needed because a
  correctly rounded square root of a correctly rounded sum of
@@ -2473,7 +2472,7 @@ step is exact.  The Neumaier summation computes as if in doubled
  precision (106 bits) and has the advantage that its input squares
  are non-negative so that the condition number of the sum is one.
  The square root with a differential correction is likewise computed
-as if in double precision.
+as if in doubled precision.
  
  For n <= 1000, prior to the final addition that rounds the overall
  result, the internal accuracy of "h" together with its correction of
@@ -2514,12 +2513,9 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
      }
      frexp(max, &max_e);
      if (max_e < -1023) {
-        /* When max_e < -1023, ldexp(1.0, -max_e) would overflow.
-           So we first perform lossless scaling from subnormals back to normals,
-           then recurse back to vector_norm(), and then finally undo the scaling.
-        */
+        /* When max_e < -1023, ldexp(1.0, -max_e) would overflow. */
          for (i=0 ; i < n ; i++) {
-            vec[i] /= DBL_MIN;
+            vec[i] /= DBL_MIN;          // convert subnormals to normals
          }
          return DBL_MIN * vector_norm(n, vec, max / DBL_MIN, found_nan);
      }
@@ -2529,17 +2525,14 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
      for (i=0 ; i < n ; i++) {
          x = vec[i];
          assert(Py_IS_FINITE(x) && fabs(x) <= max);
-
-        x *= scale;
+        x *= scale;                     // lossless scaling
          assert(fabs(x) < 1.0);
-
-        pr = dl_mul(x, x);
+        pr = dl_mul(x, x);              // lossless squaring
          assert(pr.hi <= 1.0);
-
-        sm = dl_fast_sum(csum, pr.hi);
+        sm = dl_fast_sum(csum, pr.hi);  // lossless addition
          csum = sm.hi;
-        frac1 += pr.lo;
-        frac2 += sm.lo;
+        frac1 += pr.lo;                 // lossy addition
+        frac2 += sm.lo;                 // lossy addition
      }
      h = sqrt(csum - 1.0 + (frac1 + frac2));
      pr = dl_mul(-h, h);
@@ -2548,7 +2541,8 @@ vector_norm(Py_ssize_t n, double *vec, double max, int found_nan)
      frac1 += pr.lo;
      frac2 += sm.lo;
      x = csum - 1.0 + (frac1 + frac2);
-    return (h + x / (2.0 * h)) / scale;
+    h +=  x / (2.0 * h);                 // differential correction
+    return h / scale;
  }
  
  #define NUM_STACK_ELEMS 16
author	Raymond Hettinger <rhettinger@users.noreply.github.com>
	Sat, 18 Mar 2023 17:21:48 +0000 (12:21 -0500)
committer	GitHub <noreply@github.com>
	Sat, 18 Mar 2023 17:21:48 +0000 (12:21 -0500)