]> git.ipfire.org Git - thirdparty/chrony.git/blob - regress.c
cmdmon: save NTS cookies and server keys on dump command
[thirdparty/chrony.git] / regress.c
1 /*
2 chronyd/chronyc - Programs for keeping computer clocks accurate.
3
4 **********************************************************************
5 * Copyright (C) Richard P. Curnow 1997-2003
6 * Copyright (C) Miroslav Lichvar 2011, 2016-2017
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of version 2 of the GNU General Public License as
10 * published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it will be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 **********************************************************************
22
23 =======================================================================
24
25 Regression algorithms.
26
27 */
28
29 #include "config.h"
30
31 #include "sysincl.h"
32
33 #include "regress.h"
34 #include "logging.h"
35 #include "util.h"
36
37 #define MAX_POINTS 64
38
39 void
40 RGR_WeightedRegression
41 (double *x, /* independent variable */
42 double *y, /* measured data */
43 double *w, /* weightings (large => data
44 less reliable) */
45
46 int n, /* number of data points */
47
48 /* And now the results */
49
50 double *b0, /* estimated y axis intercept */
51 double *b1, /* estimated slope */
52 double *s2, /* estimated variance of data points */
53
54 double *sb0, /* estimated standard deviation of
55 intercept */
56 double *sb1 /* estimated standard deviation of
57 slope */
58
59 /* Could add correlation stuff later if required */
60 )
61 {
62 double P, Q, U, V, W;
63 double diff;
64 double u, ui, aa;
65 int i;
66
67 assert(n >= 3);
68
69 W = U = 0;
70 for (i=0; i<n; i++) {
71 U += x[i] / w[i];
72 W += 1.0 / w[i];
73 }
74
75 u = U / W;
76
77 /* Calculate statistics from data */
78 P = Q = V = 0.0;
79 for (i=0; i<n; i++) {
80 ui = x[i] - u;
81 P += y[i] / w[i];
82 Q += y[i] * ui / w[i];
83 V += ui * ui / w[i];
84 }
85
86 *b1 = Q / V;
87 *b0 = (P / W) - (*b1) * u;
88
89 *s2 = 0.0;
90 for (i=0; i<n; i++) {
91 diff = y[i] - *b0 - *b1*x[i];
92 *s2 += diff*diff / w[i];
93 }
94
95 *s2 /= (double)(n-2);
96
97 *sb1 = sqrt(*s2 / V);
98 aa = u * (*sb1);
99 *sb0 = sqrt(*s2 / W + aa * aa);
100
101 *s2 *= (n / W); /* Giving weighted average of variances */
102 }
103
104 /* ================================================== */
105 /* Get the coefficient to multiply the standard deviation by, to get a
106 particular size of confidence interval (assuming a t-distribution) */
107
/* Return the coefficient to multiply the standard deviation by, to get
   the 99.95% confidence interval half-width, assuming a t-distribution
   with 'dof' degrees of freedom.  Values above the table range fall
   back to a flat 3.5. */
double
RGR_GetTCoef(int dof)
{
  /* Assuming now the 99.95% quantile; entry k is for dof == k+1 */
  static const float coefs[] =
  { 636.6, 31.6, 12.92, 8.61, 6.869,
    5.959, 5.408, 5.041, 4.781, 4.587,
    4.437, 4.318, 4.221, 4.140, 4.073,
    4.015, 3.965, 3.922, 3.883, 3.850,
    3.819, 3.792, 3.768, 3.745, 3.725,
    3.707, 3.690, 3.674, 3.659, 3.646,
    3.633, 3.622, 3.611, 3.601, 3.591,
    3.582, 3.574, 3.566, 3.558, 3.551};

  if (dof < 1) {
    /* Guard against an out-of-bounds read (coefs[-1]) on invalid input;
       return the most conservative (largest) coefficient */
    return coefs[0];
  } else if (dof <= (int)(sizeof coefs / sizeof coefs[0])) {
    return coefs[dof - 1];
  } else {
    return 3.5; /* Until I can be bothered to do something better */
  }
}
128
129 /* ================================================== */
130 /* Get 90% quantile of chi-square distribution */
131
/* Return the 90% quantile of the chi-square distribution with 'dof'
   degrees of freedom.  Above the table range the quantile is
   approximated by 1.2 * dof. */
double
RGR_GetChi2Coef(int dof)
{
  /* Entry k is the quantile for dof == k+1 */
  static const float coefs[] = {
    2.706, 4.605, 6.251, 7.779, 9.236, 10.645, 12.017, 13.362,
    14.684, 15.987, 17.275, 18.549, 19.812, 21.064, 22.307, 23.542,
    24.769, 25.989, 27.204, 28.412, 29.615, 30.813, 32.007, 33.196,
    34.382, 35.563, 36.741, 37.916, 39.087, 40.256, 41.422, 42.585,
    43.745, 44.903, 46.059, 47.212, 48.363, 49.513, 50.660, 51.805,
    52.949, 54.090, 55.230, 56.369, 57.505, 58.641, 59.774, 60.907,
    62.038, 63.167, 64.295, 65.422, 66.548, 67.673, 68.796, 69.919,
    71.040, 72.160, 73.279, 74.397, 75.514, 76.630, 77.745, 78.860
  };

  if (dof < 1) {
    /* Guard against an out-of-bounds read (coefs[-1]) on invalid input */
    return coefs[0];
  } else if (dof <= (int)(sizeof coefs / sizeof coefs[0])) {
    return coefs[dof - 1];
  } else {
    return 1.2 * dof; /* Until I can be bothered to do something better */
  }
}
152
153 /* ================================================== */
154 /* Critical value for number of runs of residuals with same sign.
155 5% critical region for now. */
156
/* critical_runs[k] is the critical value for k residuals: the runs test
   is passed when the observed number of runs exceeds this entry.  The
   first eight entries are zero, so the test always passes for fewer
   than 8 residuals. */
static char critical_runs[] = {
  0,  0,  0,  0,  0,  0,  0,  0,  2,  3,
  3,  3,  4,  4,  5,  5,  5,  6,  6,  7,
  7,  7,  8,  8,  9,  9,  9, 10, 10, 11,
 11, 11, 12, 12, 13, 13, 14, 14, 14, 15,
 15, 16, 16, 17, 17, 18, 18, 18, 19, 19,
 20, 20, 21, 21, 21, 22, 22, 23, 23, 24,
 24, 25, 25, 26, 26, 26, 27, 27, 28, 28,
 29, 29, 30, 30, 30, 31, 31, 32, 32, 33,
 33, 34, 34, 35, 35, 35, 36, 36, 37, 37,
 38, 38, 39, 39, 40, 40, 40, 41, 41, 42,
 42, 43, 43, 44, 44, 45, 45, 46, 46, 46,
 47, 47, 48, 48, 49, 49, 50, 50, 51, 51,
 52, 52, 52, 53, 53, 54, 54, 55, 55, 56
};
172
173 /* ================================================== */
174
/* Count the runs of consecutive residuals with the same strict sign.
   A residual that is exactly zero always starts a new run, as does any
   sign change.  n residuals always contain at least one run. */
static int
n_runs_from_residuals(double *resid, int n)
{
  int count, j;

  count = 1;
  for (j = 1; j < n; j++) {
    int same_sign = (resid[j - 1] < 0.0 && resid[j] < 0.0) ||
                    (resid[j - 1] > 0.0 && resid[j] > 0.0);
    if (!same_sign)
      count++;
  }

  return count;
}
193
194 /* ================================================== */
195 /* Return a boolean indicating whether we had enough points for
196 regression */
197
int
RGR_FindBestRegression
(double *x,                     /* independent variable */
 double *y,                     /* measured data */
 double *w,                     /* weightings (large => data
                                   less reliable) */

 int n,                         /* number of data points */
 int m,                         /* number of extra samples in x and y arrays
                                   (negative index) which can be used to
                                   extend runs test */
 int min_samples,               /* minimum number of samples to be kept after
                                   changing the starting index to pass the runs
                                   test */

 /* And now the results */

 double *b0,                    /* estimated y axis intercept */
 double *b1,                    /* estimated slope */
 double *s2,                    /* estimated variance of data points */

 double *sb0,                   /* estimated standard deviation of
                                   intercept */
 double *sb1,                   /* estimated standard deviation of
                                   slope */

 int *new_start,                /* the new starting index to make the
                                   residuals pass the two tests */

 int *n_runs,                   /* number of runs amongst the residuals */

 int *dof                       /* degrees of freedom in statistics (needed
                                   to get confidence intervals later) */

)
{
  double P, Q, U, V, W;  /* total */
  double resid[MAX_POINTS * REGRESS_RUNS_RATIO];
  double ss;
  double a, b, u, ui, aa;

  int start, resid_start, nruns, npoints;
  int i;

  assert(n <= MAX_POINTS && m >= 0);
  assert(n * REGRESS_RUNS_RATIO < sizeof (critical_runs) / sizeof (critical_runs[0]));

  if (n < MIN_SAMPLES_FOR_REGRESS) {
    return 0;
  }

  /* Repeatedly fit over samples start..n-1, dropping the oldest sample
     until the residuals pass the runs test or no more samples can be
     dropped */
  start = 0;
  do {

    /* Weighted sums for the mean of x (weights are 1/w[i]) */
    W = U = 0;
    for (i=start; i<n; i++) {
      U += x[i] / w[i];
      W += 1.0 / w[i];
    }

    u = U / W;

    /* Weighted sums needed for slope and intercept, with x offset by
       its weighted mean */
    P = Q = V = 0.0;
    for (i=start; i<n; i++) {
      ui = x[i] - u;
      P += y[i] / w[i];
      Q += y[i] * ui / w[i];
      V += ui * ui / w[i];
    }

    b = Q / V;
    a = (P / W) - (b * u);

    /* Get residuals also for the extra samples before start, so the
       runs test covers REGRESS_RUNS_RATIO times the fitted interval
       (clamped to the m extra samples actually available) */
    resid_start = n - (n - start) * REGRESS_RUNS_RATIO;
    if (resid_start < -m)
      resid_start = -m;

    for (i=resid_start; i<n; i++) {
      resid[i - resid_start] = y[i] - a - b*x[i];
    }

    /* Count number of runs */
    nruns = n_runs_from_residuals(resid, n - resid_start);

    /* Accept the fit when the runs test passes, or when dropping more
       samples would go below either minimum */
    if (nruns > critical_runs[n - resid_start] ||
        n - start <= MIN_SAMPLES_FOR_REGRESS ||
        n - start <= min_samples) {
      if (start != resid_start) {
        /* Ignore extra samples in returned nruns */
        nruns = n_runs_from_residuals(resid + (start - resid_start), n - start);
      }
      break;
    } else {
      /* Try dropping one sample at a time until the runs test passes. */
      ++start;
    }

  } while (1);

  /* Work out statistics from full dataset */
  *b1 = b;
  *b0 = a;

  /* Weighted sum of squared residuals over the accepted interval */
  ss = 0.0;
  for (i=start; i<n; i++) {
    ss += resid[i - resid_start]*resid[i - resid_start] / w[i];
  }

  /* npoints - 2 degrees of freedom for a two-parameter fit */
  npoints = n - start;
  ss /= (double)(npoints - 2);
  *sb1 = sqrt(ss / V);
  aa = u * (*sb1);
  *sb0 = sqrt((ss / W) + (aa * aa));
  *s2 = ss * (double) npoints / W;

  *new_start = start;
  *dof = npoints - 2;
  *n_runs = nruns;

  return 1;

}
321
322 /* ================================================== */
323
/* Exchange two values via a variable 'temp' that must be declared in
   the calling scope.  Both arguments are evaluated twice, so they must
   not have side effects. */
#define EXCH(a,b) temp=(a); (a)=(b); (b)=temp
325
326 /* ================================================== */
327 /* Find the index'th biggest element in the array x of n elements.
328 flags is an array where a 1 indicates that the corresponding entry
329 in x is known to be sorted into its correct position and a 0
330 indicates that the corresponding entry is not sorted. However, if
331 flags[m] = flags[n] = 1 with m<n, then x[m] must be <= x[n] and for
332 all i with m<i<n, x[m] <= x[i] <= x[n]. In practice, this means
333 flags[] has to be the result of a previous call to this routine
334 with the same array x, and is used to remember which parts of the
335 x[] array we have already sorted.
336
337 The approach used is a cut-down quicksort, where we only bother to
338 keep sorting the partition that contains the index we are after.
339 The approach comes from Numerical Recipes in C (ISBN
340 0-521-43108-5). */
341
/* Return the index'th smallest element of x[0..n-1] (see the block
   comment above for the contract on flags[]).  Partially sorts x in
   place and records settled positions in flags so later calls can skip
   already-sorted regions. */
static double
find_ordered_entry_with_flags(double *x, int n, int index, char *flags)
{
  int u, v, l, r;
  double temp;                  /* scratch variable required by EXCH */
  double piv;
  int pivind;

  assert(index >= 0);

  /* If this bit of the array is already sorted, simple! */
  if (flags[index]) {
    return x[index];
  }

  /* Find subrange to look at: expand [u,v] outwards from index until a
     flagged (already sorted) element or an array boundary is hit */
  u = v = index;
  while (u > 0 && !flags[u]) u--;
  if (flags[u]) u++;

  while (v < (n-1) && !flags[v]) v++;
  if (flags[v]) v--;

  do {
    if (v - u < 2) {
      /* Subrange of one or two elements: sort directly and finish */
      if (x[v] < x[u]) {
        EXCH(x[v], x[u]);
      }
      flags[v] = flags[u] = 1;
      return x[index];
    } else {
      /* Take the middle element as pivot and park it at x[u] */
      pivind = (u + v) >> 1;
      EXCH(x[u], x[pivind]);
      piv = x[u]; /* New value */
      /* Partition x[u+1..v] around the pivot value */
      l = u + 1;
      r = v;
      do {
        while (l < v && x[l] < piv) l++;
        while (x[r] > piv) r--;
        if (r <= l) break;
        EXCH(x[l], x[r]);
        l++;
        r--;
      } while (1);
      /* Move the pivot to its final sorted position r */
      EXCH(x[u], x[r]);
      flags[r] = 1; /* Pivot now in correct place */
      if (index == r) {
        return x[r];
      } else if (index < r) {
        /* Keep searching the left partition only */
        v = r - 1;
      } else if (index > r) {
        /* Keep searching the right partition only */
        u = l;
      }
    }
  } while (1);
}
398
399 /* ================================================== */
400
401 #if 0
402 /* Not used, but this is how it can be done */
403 static double
404 find_ordered_entry(double *x, int n, int index)
405 {
406 char flags[MAX_POINTS];
407
408 memset(flags, 0, n * sizeof (flags[0]));
409 return find_ordered_entry_with_flags(x, n, index, flags);
410 }
411 #endif
412
413 /* ================================================== */
414 /* Find the median entry of an array x[] with n elements. */
415
416 static double
417 find_median(double *x, int n)
418 {
419 int k;
420 char flags[MAX_POINTS];
421
422 memset(flags, 0, n * sizeof (flags[0]));
423 k = n>>1;
424 if (n&1) {
425 return find_ordered_entry_with_flags(x, n, k, flags);
426 } else {
427 return 0.5 * (find_ordered_entry_with_flags(x, n, k, flags) +
428 find_ordered_entry_with_flags(x, n, k-1, flags));
429 }
430 }
431
432 /* ================================================== */
433
434 double
435 RGR_FindMedian(double *x, int n)
436 {
437 double tmp[MAX_POINTS];
438
439 assert(n > 0 && n <= MAX_POINTS);
440 memcpy(tmp, x, n * sizeof (tmp[0]));
441
442 return find_median(tmp, n);
443 }
444
445 /* ================================================== */
446 /* This function evaluates the equation
447
448 \sum_{i=0}^{n-1} x_i sign(y_i - a - b x_i)
449
450 and chooses the value of a that minimises the absolute value of the
451 result. (See pp703-704 of Numerical Recipes in C). */
452
453 static void
454 eval_robust_residual
455 (double *x, /* The independent points */
456 double *y, /* The dependent points */
457 int n, /* Number of points */
458 double b, /* Slope */
459 double *aa, /* Intercept giving smallest absolute
460 value for the above equation */
461 double *rr /* Corresponding value of equation */
462 )
463 {
464 int i;
465 double a, res, del;
466 double d[MAX_POINTS];
467
468 for (i=0; i<n; i++) {
469 d[i] = y[i] - b * x[i];
470 }
471
472 a = find_median(d, n);
473
474 res = 0.0;
475 for (i=0; i<n; i++) {
476 del = y[i] - a - b * x[i];
477 if (del > 0.0) {
478 res += x[i];
479 } else if (del < 0.0) {
480 res -= x[i];
481 }
482 }
483
484 *aa = a;
485 *rr = res;
486 }
487
488 /* ================================================== */
489 /* This routine performs a 'robust' regression, i.e. one which has low
490 susceptibility to outliers amongst the data. If one thinks of a
491 normal (least squares) linear regression in 2D being analogous to
492 the arithmetic mean in 1D, this algorithm in 2D is roughly
493 analogous to the median in 1D. This algorithm seems to work quite
494 well until the number of outliers is approximately half the number
495 of data points.
496
497 The return value is a status indicating whether there were enough
498 data points to run the routine or not. */
499
int
RGR_FindBestRobustRegression
(double *x,                     /* The independent axis points */
 double *y,                     /* The dependent axis points (which
                                   may contain outliers). */
 int n,                         /* The number of points */
 double tol,                    /* The tolerance required in
                                   determining the value of b1 */
 double *b0,                    /* The estimated Y-axis intercept */
 double *b1,                    /* The estimated slope */
 int *n_runs,                   /* The number of runs of residuals */
 int *best_start                /* The best starting index */
)
{
  int i;
  int start;
  int n_points;
  double a, b;
  double P, U, V, W, X;
  double resid, resids[MAX_POINTS];
  double blo, bhi, bmid, rlo, rhi, rmid;
  double s2, sb, incr;
  double mx, dx, my, dy;
  int nruns = 0;

  assert(n <= MAX_POINTS);

  if (n < 2) {
    return 0;
  } else if (n == 2) {
    /* Just a straight line fit (we need this for the manual mode) */
    *b1 = (y[1] - y[0]) / (x[1] - x[0]);
    *b0 = y[0] - (*b1) * x[0];
    *n_runs = 0;
    *best_start = 0;
    return 1;
  }

  /* else at least 3 points, apply normal algorithm */

  start = 0;

  /* Loop to strip oldest points that cause the regression residuals
     to fail the number of runs test */
  do {

    n_points = n - start;

    /* Use standard least squares regression to get starting estimate */

    P = U = 0.0;
    for (i=start; i<n; i++) {
      P += y[i];
      U += x[i];
    }

    W = (double) n_points;

    /* Unweighted means of y and x over the current interval */
    my = P/W;
    mx = U/W;

    X = V = 0.0;
    for (i=start; i<n; i++) {
      dy = y[i] - my;
      dx = x[i] - mx;
      X += dy * dx;
      V += dx * dx;
    }

    /* Least-squares slope and intercept */
    b = X / V;
    a = my - b*mx;

    /* Sum of squared residuals of the least-squares fit, used below to
       estimate the standard deviation of the slope */
    s2 = 0.0;
    for (i=start; i<n; i++) {
      resid = y[i] - a - b * x[i];
      s2 += resid * resid;
    }

    /* Need to expand range of b to get a root in the interval.
       Estimate standard deviation of b and expand range about b based
       on that. */
    sb = sqrt(s2 * W/V);
    incr = MAX(sb, tol);

    do {
      incr *= 2.0;

      /* Give up if the interval is too large */
      if (incr > 100.0)
        return 0;

      blo = b - incr;
      bhi = b + incr;

      /* We don't want 'a' yet */
      eval_robust_residual(x + start, y + start, n_points, blo, &a, &rlo);
      eval_robust_residual(x + start, y + start, n_points, bhi, &a, &rhi);

    } while (rlo * rhi >= 0.0); /* fn vals have same sign or one is zero,
                                   i.e. root not in interval (blo, bhi). */

    /* OK, so the root for b lies in (blo, bhi). Start bisecting */
    do {
      bmid = 0.5 * (blo + bhi);
      /* Stop if the interval can no longer be split (floating-point
         limit reached) */
      if (!(blo < bmid && bmid < bhi))
        break;
      eval_robust_residual(x + start, y + start, n_points, bmid, &a, &rmid);
      if (rmid == 0.0) {
        break;
      } else if (rmid * rlo > 0.0) {
        blo = bmid;
        rlo = rmid;
      } else if (rmid * rhi > 0.0) {
        bhi = bmid;
        rhi = rmid;
      } else {
        assert(0);
      }
    } while (bhi - blo > tol);

    *b0 = a;
    *b1 = bmid;

    /* Number of runs test, but not if we're already down to the
       minimum number of points */
    if (n_points == MIN_SAMPLES_FOR_REGRESS) {
      break;
    }

    /* Residuals of the robust fit over the current interval */
    for (i=start; i<n; i++) {
      resids[i] = y[i] - a - bmid * x[i];
    }

    nruns = n_runs_from_residuals(resids + start, n_points);

    if (nruns > critical_runs[n_points]) {
      break;
    } else {
      /* Drop the oldest point and try again */
      start++;
    }

  } while (1);

  *n_runs = nruns;
  *best_start = start;

  return 1;

}
649
650 /* ================================================== */
651 /* This routine performs linear regression with two independent variables.
652 It returns non-zero status if there were enough data points and there
653 was a solution. */
654
/* Linear regression of y on two independent variables x1 and x2,
   returning only the slope on x2.  Returns zero when there are too few
   points or the system is too close to singular to solve reliably. */
int
RGR_MultipleRegress
(double *x1,                    /* first independent variable */
 double *x2,                    /* second independent variable */
 double *y,                     /* measured data */

 int n,                         /* number of data points */

 /* The results */
 double *b2                     /* estimated second slope */
 /* other values are not needed yet */
)
{
  double s_x1, s_x2, s_x1x1, s_x1x2, s_x2x2, s_x1y, s_x2y, s_y;
  double num, den, d1, d2, d3;
  int j;

  if (n < 4)
    return 0;

  s_x1 = s_x2 = s_x1x1 = s_x1x2 = s_x2x2 = s_x1y = s_x2y = s_y = 0.0;

  /* Accumulate the raw sums of the normal equations */
  for (j = 0; j < n; j++) {
    s_x1 += x1[j];
    s_x2 += x2[j];
    s_x1x1 += x1[j] * x1[j];
    s_x1x2 += x1[j] * x2[j];
    s_x2x2 += x2[j] * x2[j];
    s_x1y += x1[j] * y[j];
    s_x2y += x2[j] * y[j];
    s_y += y[j];
  }

  /* Closed-form solution for the x2 slope: num/den */
  num = n * (s_x1x2 * s_x1y - s_x1x1 * s_x2y) +
        s_x1 * s_x1 * s_x2y - s_x1 * s_x2 * s_x1y +
        s_y * (s_x2 * s_x1x1 - s_x1 * s_x1x2);

  den = (d1 = n * (s_x1x2 * s_x1x2 - s_x1x1 * s_x2x2)) +
        (d2 = s_x1 * s_x1 * s_x2x2 + s_x2 * s_x2 * s_x1x1) +
        (d3 = -2.0 * s_x1 * s_x2 * s_x1x2);

  /* Check if there is a (numerically stable) solution: refuse it when
     the denominator is dominated by cancellation between its terms */
  if (fabs(den) * 1.0e10 <= -d1 + d2 + fabs(d3))
    return 0;

  *b2 = num / den;

  return 1;
}