]> git.ipfire.org Git - thirdparty/chrony.git/blob - regress.c
cmdmon: save NTS cookies and server keys on dump command
[thirdparty/chrony.git] / regress.c
1 /*
2 chronyd/chronyc - Programs for keeping computer clocks accurate.
3
4 **********************************************************************
5 * Copyright (C) Richard P. Curnow 1997-2003
6 * Copyright (C) Miroslav Lichvar 2011, 2016-2017
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of version 2 of the GNU General Public License as
10 * published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 **********************************************************************
22
23 =======================================================================
24
25 Regression algorithms.
26
27 */
28
29 #include "config.h"
30
31 #include "sysincl.h"
32
33 #include "regress.h"
34 #include "logging.h"
35 #include "util.h"
36
37 #define MAX_POINTS 64
38
39 void
40 RGR_WeightedRegression
41 (double *x, /* independent variable */
42 double *y, /* measured data */
43 double *w, /* weightings (large => data
44 less reliable) */
45
46 int n, /* number of data points */
47
48 /* And now the results */
49
50 double *b0, /* estimated y axis intercept */
51 double *b1, /* estimated slope */
52 double *s2, /* estimated variance of data points */
53
54 double *sb0, /* estimated standard deviation of
55 intercept */
56 double *sb1 /* estimated standard deviation of
57 slope */
58
59 /* Could add correlation stuff later if required */
60 )
61 {
62 double P, Q, U, V, W;
63 double diff;
64 double u, ui, aa;
65 int i;
66
67 assert(n >= 3);
68
69 W = U = 0;
70 for (i=0; i<n; i++) {
71 U += x[i] / w[i];
72 W += 1.0 / w[i];
73 }
74
75 u = U / W;
76
77 /* Calculate statistics from data */
78 P = Q = V = 0.0;
79 for (i=0; i<n; i++) {
80 ui = x[i] - u;
81 P += y[i] / w[i];
82 Q += y[i] * ui / w[i];
83 V += ui * ui / w[i];
84 }
85
86 *b1 = Q / V;
87 *b0 = (P / W) - (*b1) * u;
88
89 *s2 = 0.0;
90 for (i=0; i<n; i++) {
91 diff = y[i] - *b0 - *b1*x[i];
92 *s2 += diff*diff / w[i];
93 }
94
95 *s2 /= (double)(n-2);
96
97 *sb1 = sqrt(*s2 / V);
98 aa = u * (*sb1);
99 *sb0 = sqrt(*s2 / W + aa * aa);
100
101 *s2 *= (n / W); /* Giving weighted average of variances */
102 }
103
104 /* ================================================== */
105 /* Get the coefficient to multiply the standard deviation by, to get a
106 particular size of confidence interval (assuming a t-distribution) */
107
/* Return the coefficient to multiply the standard deviation by, to get
   the 99.95% confidence interval half-width, assuming a t-distribution
   with 'dof' degrees of freedom.  Values above the table range fall
   back to a flat 3.5. */
double
RGR_GetTCoef(int dof)
{
  /* Assuming now the 99.95% quantile; entry k is for dof == k+1 */
  static const float coefs[] =
  { 636.6, 31.6, 12.92, 8.61, 6.869,
    5.959, 5.408, 5.041, 4.781, 4.587,
    4.437, 4.318, 4.221, 4.140, 4.073,
    4.015, 3.965, 3.922, 3.883, 3.850,
    3.819, 3.792, 3.768, 3.745, 3.725,
    3.707, 3.690, 3.674, 3.659, 3.646,
    3.633, 3.622, 3.611, 3.601, 3.591,
    3.582, 3.574, 3.566, 3.558, 3.551};

  if (dof < 1) {
    /* Guard against an out-of-bounds read (coefs[-1]) on invalid input;
       return the most conservative (largest) coefficient */
    return coefs[0];
  } else if (dof <= (int)(sizeof coefs / sizeof coefs[0])) {
    return coefs[dof - 1];
  } else {
    return 3.5; /* Until I can be bothered to do something better */
  }
}
128
129 /* ================================================== */
130 /* Get 90% quantile of chi-square distribution */
131
/* Return the 90% quantile of the chi-square distribution with 'dof'
   degrees of freedom.  Above the table range the quantile is
   approximated by 1.2 * dof. */
double
RGR_GetChi2Coef(int dof)
{
  /* Entry k is the quantile for dof == k+1 */
  static const float coefs[] = {
    2.706, 4.605, 6.251, 7.779, 9.236, 10.645, 12.017, 13.362,
    14.684, 15.987, 17.275, 18.549, 19.812, 21.064, 22.307, 23.542,
    24.769, 25.989, 27.204, 28.412, 29.615, 30.813, 32.007, 33.196,
    34.382, 35.563, 36.741, 37.916, 39.087, 40.256, 41.422, 42.585,
    43.745, 44.903, 46.059, 47.212, 48.363, 49.513, 50.660, 51.805,
    52.949, 54.090, 55.230, 56.369, 57.505, 58.641, 59.774, 60.907,
    62.038, 63.167, 64.295, 65.422, 66.548, 67.673, 68.796, 69.919,
    71.040, 72.160, 73.279, 74.397, 75.514, 76.630, 77.745, 78.860
  };

  if (dof < 1) {
    /* Guard against an out-of-bounds read (coefs[-1]) on invalid input */
    return coefs[0];
  } else if (dof <= (int)(sizeof coefs / sizeof coefs[0])) {
    return coefs[dof - 1];
  } else {
    return 1.2 * dof; /* Until I can be bothered to do something better */
  }
}
152
153 /* ================================================== */
154 /* Critical value for number of runs of residuals with same sign.
155 5% critical region for now. */
156
/* critical_runs[k] is the critical value for k residuals: the runs test
   is passed when the observed number of runs exceeds this entry.  The
   first eight entries are zero, so the test always passes for fewer
   than 8 residuals. */
static char critical_runs[] = {
  0,  0,  0,  0,  0,  0,  0,  0,  2,  3,
  3,  3,  4,  4,  5,  5,  5,  6,  6,  7,
  7,  7,  8,  8,  9,  9,  9, 10, 10, 11,
 11, 11, 12, 12, 13, 13, 14, 14, 14, 15,
 15, 16, 16, 17, 17, 18, 18, 18, 19, 19,
 20, 20, 21, 21, 21, 22, 22, 23, 23, 24,
 24, 25, 25, 26, 26, 26, 27, 27, 28, 28,
 29, 29, 30, 30, 30, 31, 31, 32, 32, 33,
 33, 34, 34, 35, 35, 35, 36, 36, 37, 37,
 38, 38, 39, 39, 40, 40, 40, 41, 41, 42,
 42, 43, 43, 44, 44, 45, 45, 46, 46, 46,
 47, 47, 48, 48, 49, 49, 50, 50, 51, 51,
 52, 52, 52, 53, 53, 54, 54, 55, 55, 56
};
172
173 /* ================================================== */
174
/* Count the runs of consecutive residuals with the same strict sign.
   A residual that is exactly zero always starts a new run, as does any
   sign change.  n residuals always contain at least one run. */
static int
n_runs_from_residuals(double *resid, int n)
{
  int count, j;

  count = 1;
  for (j = 1; j < n; j++) {
    int same_sign = (resid[j - 1] < 0.0 && resid[j] < 0.0) ||
                    (resid[j - 1] > 0.0 && resid[j] > 0.0);
    if (!same_sign)
      count++;
  }

  return count;
}
193
194 /* ================================================== */
195 /* Return a boolean indicating whether we had enough points for
196 regression */
197
int
RGR_FindBestRegression
(double *x,                     /* independent variable */
 double *y,                     /* measured data */
 double *w,                     /* weightings (large => data
                                   less reliable) */

 int n,                         /* number of data points */
 int m,                         /* number of extra samples in x and y arrays
                                   (negative index) which can be used to
                                   extend runs test */
 int min_samples,               /* minimum number of samples to be kept after
                                   changing the starting index to pass the runs
                                   test */

 /* And now the results */

 double *b0,                    /* estimated y axis intercept */
 double *b1,                    /* estimated slope */
 double *s2,                    /* estimated variance of data points */

 double *sb0,                   /* estimated standard deviation of
                                   intercept */
 double *sb1,                   /* estimated standard deviation of
                                   slope */

 int *new_start,                /* the new starting index to make the
                                   residuals pass the two tests */

 int *n_runs,                   /* number of runs amongst the residuals */

 int *dof                       /* degrees of freedom in statistics (needed
                                   to get confidence intervals later) */

)
{
  double P, Q, U, V, W;  /* total */
  double resid[MAX_POINTS * REGRESS_RUNS_RATIO];
  double ss;
  double a, b, u, ui, aa;

  int start, resid_start, nruns, npoints;
  int i;

  assert(n <= MAX_POINTS && m >= 0);
  assert(n * REGRESS_RUNS_RATIO < sizeof (critical_runs) / sizeof (critical_runs[0]));

  if (n < MIN_SAMPLES_FOR_REGRESS) {
    return 0;
  }

  /* Repeatedly fit over samples start..n-1, dropping the oldest sample
     until the residuals pass the runs test or no more samples can be
     dropped */
  start = 0;
  do {

    /* Weighted sums for the mean of x (weights are 1/w[i]) */
    W = U = 0;
    for (i=start; i<n; i++) {
      U += x[i] / w[i];
      W += 1.0 / w[i];
    }

    u = U / W;

    /* Weighted sums needed for slope and intercept, with x offset by
       its weighted mean */
    P = Q = V = 0.0;
    for (i=start; i<n; i++) {
      ui = x[i] - u;
      P += y[i] / w[i];
      Q += y[i] * ui / w[i];
      V += ui * ui / w[i];
    }

    b = Q / V;
    a = (P / W) - (b * u);

    /* Get residuals also for the extra samples before start, so the
       runs test covers REGRESS_RUNS_RATIO times the fitted interval
       (clamped to the m extra samples actually available) */
    resid_start = n - (n - start) * REGRESS_RUNS_RATIO;
    if (resid_start < -m)
      resid_start = -m;

    for (i=resid_start; i<n; i++) {
      resid[i - resid_start] = y[i] - a - b*x[i];
    }

    /* Count number of runs */
    nruns = n_runs_from_residuals(resid, n - resid_start);

    /* Accept the fit when the runs test passes, or when dropping more
       samples would go below either minimum */
    if (nruns > critical_runs[n - resid_start] ||
        n - start <= MIN_SAMPLES_FOR_REGRESS ||
        n - start <= min_samples) {
      if (start != resid_start) {
        /* Ignore extra samples in returned nruns */
        nruns = n_runs_from_residuals(resid + (start - resid_start), n - start);
      }
      break;
    } else {
      /* Try dropping one sample at a time until the runs test passes. */
      ++start;
    }

  } while (1);

  /* Work out statistics from full dataset */
  *b1 = b;
  *b0 = a;

  /* Weighted sum of squared residuals over the accepted interval */
  ss = 0.0;
  for (i=start; i<n; i++) {
    ss += resid[i - resid_start]*resid[i - resid_start] / w[i];
  }

  /* npoints - 2 degrees of freedom for a two-parameter fit */
  npoints = n - start;
  ss /= (double)(npoints - 2);
  *sb1 = sqrt(ss / V);
  aa = u * (*sb1);
  *sb0 = sqrt((ss / W) + (aa * aa));
  *s2 = ss * (double) npoints / W;

  *new_start = start;
  *dof = npoints - 2;
  *n_runs = nruns;

  return 1;

}
321
322 /* ================================================== */
323
/* Exchange two values via a variable 'temp' that must be declared in
   the calling scope.  Both arguments are evaluated twice, so they must
   not have side effects. */
#define EXCH(a,b) temp=(a); (a)=(b); (b)=temp
325
326 /* ================================================== */
327 /* Find the index'th biggest element in the array x of n elements.
328 flags is an array where a 1 indicates that the corresponding entry
329 in x is known to be sorted into its correct position and a 0
330 indicates that the corresponding entry is not sorted. However, if
331 flags[m] = flags[n] = 1 with m<n, then x[m] must be <= x[n] and for
332 all i with m<i<n, x[m] <= x[i] <= x[n]. In practice, this means
333 flags[] has to be the result of a previous call to this routine
334 with the same array x, and is used to remember which parts of the
335 x[] array we have already sorted.
336
337 The approach used is a cut-down quicksort, where we only bother to
338 keep sorting the partition that contains the index we are after.
339 The approach comes from Numerical Recipes in C (ISBN
340 0-521-43108-5). */
341
/* Return the index'th smallest element of x[0..n-1] (see the block
   comment above for the contract on flags[]).  Partially sorts x in
   place and records settled positions in flags so later calls can skip
   already-sorted regions. */
static double
find_ordered_entry_with_flags(double *x, int n, int index, char *flags)
{
  int u, v, l, r;
  double temp;                  /* scratch variable required by EXCH */
  double piv;
  int pivind;

  assert(index >= 0);

  /* If this bit of the array is already sorted, simple! */
  if (flags[index]) {
    return x[index];
  }

  /* Find subrange to look at: expand [u,v] outwards from index until a
     flagged (already sorted) element or an array boundary is hit */
  u = v = index;
  while (u > 0 && !flags[u]) u--;
  if (flags[u]) u++;

  while (v < (n-1) && !flags[v]) v++;
  if (flags[v]) v--;

  do {
    if (v - u < 2) {
      /* Subrange of one or two elements: sort directly and finish */
      if (x[v] < x[u]) {
        EXCH(x[v], x[u]);
      }
      flags[v] = flags[u] = 1;
      return x[index];
    } else {
      /* Take the middle element as pivot and park it at x[u] */
      pivind = (u + v) >> 1;
      EXCH(x[u], x[pivind]);
      piv = x[u]; /* New value */
      /* Partition x[u+1..v] around the pivot value */
      l = u + 1;
      r = v;
      do {
        while (l < v && x[l] < piv) l++;
        while (x[r] > piv) r--;
        if (r <= l) break;
        EXCH(x[l], x[r]);
        l++;
        r--;
      } while (1);
      /* Move the pivot to its final sorted position r */
      EXCH(x[u], x[r]);
      flags[r] = 1; /* Pivot now in correct place */
      if (index == r) {
        return x[r];
      } else if (index < r) {
        /* Keep searching the left partition only */
        v = r - 1;
      } else if (index > r) {
        /* Keep searching the right partition only */
        u = l;
      }
    }
  } while (1);
}
398
399 /* ================================================== */
400
401 #if 0
402 /* Not used, but this is how it can be done */
403 static double
404 find_ordered_entry(double *x, int n, int index)
405 {
406 char flags[MAX_POINTS];
407
408 memset(flags, 0, n * sizeof (flags[0]));
409 return find_ordered_entry_with_flags(x, n, index, flags);
410 }
411 #endif
412
413 /* ================================================== */
414 /* Find the median entry of an array x[] with n elements. */
415
416 static double
417 find_median(double *x, int n)
418 {
419 int k;
420 char flags[MAX_POINTS];
421
422 memset(flags, 0, n * sizeof (flags[0]));
423 k = n>>1;
424 if (n&1) {
425 return find_ordered_entry_with_flags(x, n, k, flags);
426 } else {
427 return 0.5 * (find_ordered_entry_with_flags(x, n, k, flags) +
428 find_ordered_entry_with_flags(x, n, k-1, flags));
429 }
430 }
431
432 /* ================================================== */
433
434 double
435 RGR_FindMedian(double *x, int n)
436 {
437 double tmp[MAX_POINTS];
438
439 assert(n > 0 && n <= MAX_POINTS);
440 memcpy(tmp, x, n * sizeof (tmp[0]));
441
442 return find_median(tmp, n);
443 }
444
445 /* ================================================== */
446 /* This function evaluates the equation
447
448 \sum_{i=0}^{n-1} x_i sign(y_i - a - b x_i)
449
450 and chooses the value of a that minimises the absolute value of the
451 result. (See pp703-704 of Numerical Recipes in C). */
452
453 static void
454 eval_robust_residual
455 (double *x, /* The independent points */
456 double *y, /* The dependent points */
457 int n, /* Number of points */
458 double b, /* Slope */
459 double *aa, /* Intercept giving smallest absolute
460 value for the above equation */
461 double *rr /* Corresponding value of equation */
462 )
463 {
464 int i;
465 double a, res, del;
466 double d[MAX_POINTS];
467
468 for (i=0; i<n; i++) {
469 d[i] = y[i] - b * x[i];
470 }
471
472 a = find_median(d, n);
473
474 res = 0.0;
475 for (i=0; i<n; i++) {
476 del = y[i] - a - b * x[i];
477 if (del > 0.0) {
478 res += x[i];
479 } else if (del < 0.0) {
480 res -= x[i];
481 }
482 }
483
484 *aa = a;
485 *rr = res;
486 }
487
488 /* ================================================== */
489 /* This routine performs a 'robust' regression, i.e. one which has low
490 susceptibility to outliers amongst the data. If one thinks of a
491 normal (least squares) linear regression in 2D being analogous to
492 the arithmetic mean in 1D, this algorithm in 2D is roughly
493 analogous to the median in 1D. This algorithm seems to work quite
494 well until the number of outliers is approximately half the number
495 of data points.
496
497 The return value is a status indicating whether there were enough
498 data points to run the routine or not. */
499
int
RGR_FindBestRobustRegression
(double *x,                     /* The independent axis points */
 double *y,                     /* The dependent axis points (which
                                   may contain outliers). */
 int n,                         /* The number of points */
 double tol,                    /* The tolerance required in
                                   determining the value of b1 */
 double *b0,                    /* The estimated Y-axis intercept */
 double *b1,                    /* The estimated slope */
 int *n_runs,                   /* The number of runs of residuals */
 int *best_start                /* The best starting index */
)
{
  int i;
  int start;
  int n_points;
  double a, b;
  double P, U, V, W, X;
  double resid, resids[MAX_POINTS];
  double blo, bhi, bmid, rlo, rhi, rmid;
  double s2, sb, incr;
  double mx, dx, my, dy;
  int nruns = 0;

  assert(n <= MAX_POINTS);

  if (n < 2) {
    return 0;
  } else if (n == 2) {
    /* Just a straight line fit (we need this for the manual mode) */
    *b1 = (y[1] - y[0]) / (x[1] - x[0]);
    *b0 = y[0] - (*b1) * x[0];
    *n_runs = 0;
    *best_start = 0;
    return 1;
  }

  /* else at least 3 points, apply normal algorithm */

  start = 0;

  /* Loop to strip oldest points that cause the regression residuals
     to fail the number of runs test */
  do {

    n_points = n - start;

    /* Use standard least squares regression to get starting estimate */

    P = U = 0.0;
    for (i=start; i<n; i++) {
      P += y[i];
      U += x[i];
    }

    W = (double) n_points;

    /* Unweighted means of y and x over the current interval */
    my = P/W;
    mx = U/W;

    X = V = 0.0;
    for (i=start; i<n; i++) {
      dy = y[i] - my;
      dx = x[i] - mx;
      X += dy * dx;
      V += dx * dx;
    }

    /* Least-squares slope and intercept */
    b = X / V;
    a = my - b*mx;

    /* Sum of squared residuals of the least-squares fit, used below to
       estimate the standard deviation of the slope */
    s2 = 0.0;
    for (i=start; i<n; i++) {
      resid = y[i] - a - b * x[i];
      s2 += resid * resid;
    }

    /* Need to expand range of b to get a root in the interval.
       Estimate standard deviation of b and expand range about b based
       on that. */
    sb = sqrt(s2 * W/V);
    incr = MAX(sb, tol);

    do {
      incr *= 2.0;

      /* Give up if the interval is too large */
      if (incr > 100.0)
        return 0;

      blo = b - incr;
      bhi = b + incr;

      /* We don't want 'a' yet */
      eval_robust_residual(x + start, y + start, n_points, blo, &a, &rlo);
      eval_robust_residual(x + start, y + start, n_points, bhi, &a, &rhi);

    } while (rlo * rhi >= 0.0); /* fn vals have same sign or one is zero,
                                   i.e. root not in interval (blo, bhi). */

    /* OK, so the root for b lies in (blo, bhi). Start bisecting */
    do {
      bmid = 0.5 * (blo + bhi);
      /* Stop if the interval can no longer be split (floating-point
         limit reached) */
      if (!(blo < bmid && bmid < bhi))
        break;
      eval_robust_residual(x + start, y + start, n_points, bmid, &a, &rmid);
      if (rmid == 0.0) {
        break;
      } else if (rmid * rlo > 0.0) {
        blo = bmid;
        rlo = rmid;
      } else if (rmid * rhi > 0.0) {
        bhi = bmid;
        rhi = rmid;
      } else {
        assert(0);
      }
    } while (bhi - blo > tol);

    *b0 = a;
    *b1 = bmid;

    /* Number of runs test, but not if we're already down to the
       minimum number of points */
    if (n_points == MIN_SAMPLES_FOR_REGRESS) {
      break;
    }

    /* Residuals of the robust fit over the current interval */
    for (i=start; i<n; i++) {
      resids[i] = y[i] - a - bmid * x[i];
    }

    nruns = n_runs_from_residuals(resids + start, n_points);

    if (nruns > critical_runs[n_points]) {
      break;
    } else {
      /* Drop the oldest point and try again */
      start++;
    }

  } while (1);

  *n_runs = nruns;
  *best_start = start;

  return 1;

}
649
650 /* ================================================== */
651 /* This routine performs linear regression with two independent variables.
652 It returns non-zero status if there were enough data points and there
653 was a solution. */
654
/* Linear regression of y on two independent variables x1 and x2,
   returning only the slope on x2.  Returns zero when there are too few
   points or the system is too close to singular to solve reliably. */
int
RGR_MultipleRegress
(double *x1,                    /* first independent variable */
 double *x2,                    /* second independent variable */
 double *y,                     /* measured data */

 int n,                         /* number of data points */

 /* The results */
 double *b2                     /* estimated second slope */
 /* other values are not needed yet */
)
{
  double s_x1, s_x2, s_x1x1, s_x1x2, s_x2x2, s_x1y, s_x2y, s_y;
  double num, den, d1, d2, d3;
  int j;

  if (n < 4)
    return 0;

  s_x1 = s_x2 = s_x1x1 = s_x1x2 = s_x2x2 = s_x1y = s_x2y = s_y = 0.0;

  /* Accumulate the raw sums of the normal equations */
  for (j = 0; j < n; j++) {
    s_x1 += x1[j];
    s_x2 += x2[j];
    s_x1x1 += x1[j] * x1[j];
    s_x1x2 += x1[j] * x2[j];
    s_x2x2 += x2[j] * x2[j];
    s_x1y += x1[j] * y[j];
    s_x2y += x2[j] * y[j];
    s_y += y[j];
  }

  /* Closed-form solution for the x2 slope: num/den */
  num = n * (s_x1x2 * s_x1y - s_x1x1 * s_x2y) +
        s_x1 * s_x1 * s_x2y - s_x1 * s_x2 * s_x1y +
        s_y * (s_x2 * s_x1x1 - s_x1 * s_x1x2);

  den = (d1 = n * (s_x1x2 * s_x1x2 - s_x1x1 * s_x2x2)) +
        (d2 = s_x1 * s_x1 * s_x2x2 + s_x2 * s_x2 * s_x1x1) +
        (d3 = -2.0 * s_x1 * s_x2 * s_x1x2);

  /* Check if there is a (numerically stable) solution: refuse it when
     the denominator is dominated by cancellation between its terms */
  if (fabs(den) * 1.0e10 <= -d1 + d2 + fabs(d3))
    return 0;

  *b2 = num / den;

  return 1;
}