From: Martin Sperl <kernel@martin.sperl.org>
Date: Tue, 6 May 2014 20:12:44 +0000 (+0000)
Subject: added PREDICTPERC function
X-Git-Tag: v1.5.0-rc1~106^2~1
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b76b9c7bacf6b0017e8e8990559b4c62079edc9a;p=thirdparty%2Frrdtool-1.x.git

added PREDICTPERC function
There is still one open question: should we interpolate to get the final value?
Say we have 8 values and we want the 95th percentile; then
we should actually take the value at the 0-based index 6.65 (=95/100*(8-1)).
With the current implementation we round, so we actually return the 100th percentile.
What we could also do is:
value=val[floor(idx)]+(idx-floor(idx))*(val[floor(idx)+1]-val[floor(idx)])
(besides some boundary checking for the explicit percentile 100)

this should get decided prior to final merging
---

diff --git a/doc/rrdgraph_rpn.pod b/doc/rrdgraph_rpn.pod
index 733cfaa4..80b427b9 100644
--- a/doc/rrdgraph_rpn.pod
+++ b/doc/rrdgraph_rpn.pod
@@ -202,18 +202,20 @@ source value is NAN the complete sliding window is affected. The TRENDNAN
 operation ignores all NAN-values in a sliding window and computes the 
 average of the remaining values.
 
-B<PREDICT, PREDICTSIGMA>
+B<PREDICT, PREDICTSIGMA, PREDICTPERC>
 
-Create a "sliding window" average/sigma of another data series, that also
-shifts the data series by given amounts of of time as well
+Create a "sliding window" average/sigma/percentile of another data series,
+that also shifts the data series by given amounts of time
 
 Usage - explicit stating shifts:
 CDEF:predict=<shift n>,...,<shift 1>,n,<window>,x,PREDICT
 CDEF:sigma=<shift n>,...,<shift 1>,n,<window>,x,PREDICTSIGMA
+CDEF:perc=<shift n>,...,<shift 1>,n,<window>,<percentile>,x,PREDICTPERC
 
 Usage - shifts defined as a base shift and a number of time this is applied
 CDEF:predict=<shift multiplier>,-n,<window>,x,PREDICT
 CDEF:sigma=<shift multiplier>,-n,<window>,x,PREDICTSIGMA
+CDEF:perc=<shift multiplier>,-n,<window>,<percentile>,x,PREDICTPERC
 
 Example:
 CDEF:predict=172800,86400,2,1800,x,PREDICT
@@ -267,13 +269,18 @@ rrdtool graph image.png --imgformat=PNG \
  LINE1:upper#0000ff:upper\ certainty\ limit \
  LINE1:lower#0000ff:lower\ certainty\ limit \
  CDEF:exceeds=value,UN,0,value,lower,upper,LIMIT,UN,IF \
- TICK:exceeds#aa000080:1
+ TICK:exceeds#aa000080:1 \
+ CDEF:perc95=86400,-7,1800,95,value,PREDICTPERC \
+ LINE1:perc95#ffff00:95th_percentile
 
 Note: Experience has shown that a factor between 3 and 5 to scale sigma is a good 
 discriminator to detect abnormal behavior. This obviously depends also on the type 
 of data and how "noisy" the data series is.
 
-This prediction can only be used for short term extrapolations - say a few days into the future-
+Also note the explicit use of start= in the CDEF - this is necessary to load all
+the required data (even if it is not displayed).
+
+This prediction can only be used for short term extrapolations - say a few days into the future.
 
 =item Special values
 
diff --git a/src/rrd_graph.c b/src/rrd_graph.c
index ac2f3b8a..b3be4d96 100644
--- a/src/rrd_graph.c
+++ b/src/rrd_graph.c
@@ -997,6 +997,7 @@ long lcd(
     return num[i];
 }
 
+
 /* run the rpn calculator on all the VDEF and CDEF arguments */
 int data_calc(
     image_desc_t *im)
@@ -1008,6 +1009,7 @@ int data_calc(
     int       stepcnt;
     time_t    now;
     rpnstack_t rpnstack;
+    rpnp_t   *rpnp;
 
     rpnstack_init(&rpnstack);
 
@@ -1061,6 +1063,7 @@ int data_calc(
             steparray = NULL;
             stepcnt = 0;
             dataidx = -1;
+	    rpnp = im->gdes[gdi].rpnp;
 
             /* Find the variables in the expression.
              * - VDEF variables are substituted by their values
@@ -1173,7 +1176,6 @@ int data_calc(
              */
             for (now = im->gdes[gdi].start + im->gdes[gdi].step;
                  now <= im->gdes[gdi].end; now += im->gdes[gdi].step) {
-                rpnp_t   *rpnp = im->gdes[gdi].rpnp;
 
                 /* 3rd arg of rpn_calc is for OP_VARIABLE lookups;
                  * in this case we are advancing by timesteps;
@@ -1183,9 +1185,12 @@ int data_calc(
                              im->gdes[gdi].data, ++dataidx) == -1) {
                     /* rpn_calc sets the error string */
                     rpnstack_free(&rpnstack);
+		    rpnp_freeextra(rpnp);
                     return -1;
                 }
             }           /* enumerate over time steps within a CDEF */
+	    rpnp_freeextra(rpnp);
+	    
             break;
         default:
             continue;
diff --git a/src/rrd_rpncalc.c b/src/rrd_rpncalc.c
index aba6042f..80522717 100644
--- a/src/rrd_rpncalc.c
+++ b/src/rrd_rpncalc.c
@@ -78,6 +78,8 @@ rpnp_t   *rpn_expand(
     }
     for (i = 0; rpnc[i].op != OP_END; ++i) {
         rpnp[i].op = (enum op_en)rpnc[i].op;
+	rpnp[i].extra = NULL;
+	rpnp[i].free_extra = NULL;
         if (rpnp[i].op == OP_NUMBER) {
             rpnp[i].val = (double) rpnc[i].val;
         } else if (rpnp[i].op == OP_VARIABLE || rpnp[i].op == OP_PREV_OTHER) {
@@ -180,6 +182,7 @@ void rpn_compact2str(
             add_op(OP_TRENDNAN, TRENDNAN)
             add_op(OP_PREDICT, PREDICT)
             add_op(OP_PREDICTSIGMA, PREDICTSIGMA)
+            add_op(OP_PREDICTPERC, PREDICTPERC)
             add_op(OP_RAD2DEG, RAD2DEG)
             add_op(OP_DEG2RAD, DEG2RAD)
             add_op(OP_AVG, AVG)
@@ -241,9 +244,10 @@ void parseCDEF_DS(const char *def,
         if (rpnp[i].op == OP_TIME || rpnp[i].op == OP_LTIME ||
             rpnp[i].op == OP_PREV || rpnp[i].op == OP_COUNT ||
             rpnp[i].op == OP_TREND || rpnp[i].op == OP_TRENDNAN ||
-            rpnp[i].op == OP_PREDICT || rpnp[i].op ==  OP_PREDICTSIGMA ) {
+            rpnp[i].op == OP_PREDICT || rpnp[i].op ==  OP_PREDICTSIGMA ||
+            rpnp[i].op == OP_PREDICTPERC ) {
             rrd_set_error
-                ("operators TIME, LTIME, PREV COUNT TREND TRENDNAN PREDICT PREDICTSIGMA are not supported with DS COMPUTE");
+                ("operators TIME, LTIME, PREV COUNT TREND TRENDNAN PREDICT PREDICTSIGMA PREDICTPERC are not supported with DS COMPUTE");
             free(rpnp);
             return;
         }
@@ -385,6 +389,7 @@ rpnp_t   *rpn_parse(
             match_op(OP_TRENDNAN, TRENDNAN)
             match_op(OP_PREDICT, PREDICT)
             match_op(OP_PREDICTSIGMA, PREDICTSIGMA)
+            match_op(OP_PREDICTPERC, PREDICTPERC)
             match_op(OP_RAD2DEG, RAD2DEG)
             match_op(OP_DEG2RAD, DEG2RAD)
             match_op(OP_AVG, AVG)
@@ -409,6 +414,9 @@ rpnp_t   *rpn_parse(
             return NULL;
         }
 
+	rpnp[steps].extra = NULL;
+	rpnp[steps].free_extra = NULL;
+
         if (*expr == 0)
             break;
         if (*expr == ',')
@@ -435,11 +443,28 @@ void rpnstack_init(
 void rpnstack_free(
     rpnstack_t *rpnstack)
 {
-    if (rpnstack->s != NULL)
-        free(rpnstack->s);
+    free(rpnstack->s);
     rpnstack->dc_stacksize = 0;
 }
 
+void rpnp_freeextra(rpnp_t* rpnp)
+{
+    int rpi;
+    if (!rpnp)
+      return;
+    /* process each op from the rpn in turn */
+    for (rpi = 0; rpnp[rpi].op != OP_END; rpi++) {
+        if (rpnp[rpi].extra) {
+	    if (rpnp[rpi].free_extra) {
+	        rpnp[rpi].free_extra(rpnp[rpi].extra);
+	    } else {
+	        free(rpnp[rpi].extra);
+	    }
+	    rpnp[rpi].extra = NULL;
+	}
+    }
+}
+
 static int rpn_compare_double(
     const void *x,
     const void *y)
@@ -823,12 +848,27 @@ short rpn_calc(
             break;
         case OP_PREDICT:
         case OP_PREDICTSIGMA:
-            stackunderflow(2);
+        case OP_PREDICTPERC:
 	    {
-		/* the local averaging window (similar to trend, but better here, as we get better statistics thru numbers)*/
+	        /* the percentile requested */
+	        double  percentile = DNAN;
+		if (rpnp[rpi].op == OP_PREDICTPERC) {
+		    stackunderflow(1);
+		    percentile = rpnstack->s[--stptr];
+		    if ((percentile<0) || (percentile > 100)) {
+		        rrd_set_error("unsupported percentile: %f",percentile);
+			return -1;
+		    }
+		    percentile/=100;
+		}
+		/* the local averaging window (similar to trend,
+		 * but better here, as we get better statistics 
+		 * thru numbers)*/
+	        stackunderflow(2);
 		int   locstepsize = rpnstack->s[--stptr];
 		/* the number of shifts and range-checking*/
 		int     shifts = rpnstack->s[--stptr];
+
                 stackunderflow(shifts);
 		// handle negative shifts special
 		if (shifts<0) {
@@ -839,9 +879,9 @@ short rpn_calc(
 		/* the real calculation */
 		double val=DNAN;
 		/* the info on the datasource */
-		time_t  dsstep = (time_t) rpnp[rpi - 1].step;
-		int    dscount = rpnp[rpi - 1].ds_cnt;
-		int   locstep = (int)ceil((float)locstepsize/(float)dsstep);
+		time_t  dsstep  = (time_t) rpnp[rpi - 1].step;
+		int     dscount = rpnp[rpi - 1].ds_cnt;
+		int     locstep = (int)ceil((float)locstepsize/(float)dsstep);
 
 		/* the sums */
                 double    sum = 0;
@@ -850,6 +890,16 @@ short rpn_calc(
 		/* now loop for each position */
 		int doshifts=shifts;
 		if (shifts<0) { doshifts=-shifts; }
+		/* alloc memory */
+		double *extra = rpnp[rpi].extra;
+		if (rpnp[rpi].op == OP_PREDICTPERC) {
+		    if (! extra) {
+		      int size = (doshifts + 1) * (locstep + 2);
+		      rpnp[rpi].extra =
+			  extra =  malloc(sizeof(double) * size);
+		    }
+		}
+		/* loop the shifts */
 		for(int loop=0;loop<doshifts;loop++) {
 		    /* calculate shift step */
 		    int shiftstep=1;
@@ -865,7 +915,8 @@ short rpn_calc(
 		    shiftstep=(int)ceil((float)shiftstep/(float)dsstep);
 		    /* loop all local shifts */
 		    for(int i=0;i<=locstep;i++) {
-			/* now calculate offset into data-array - relative to output_idx*/
+			/* now calculate offset into data-array 
+			 * - relative to output_idx */
 			int offset=shiftstep+i;
 			/* and process if we have index 0 of above */
 			if ((offset>=0)&&(offset<output_idx)) {
@@ -875,6 +926,9 @@ short rpn_calc(
 			    if (! isnan(val)) {
 				sum+=val;
 				sum2+=val*val;
+				if (extra) {
+				    extra[count]=val;
+				}
 				count++;
 			    }
 			}
@@ -882,11 +936,13 @@ short rpn_calc(
 		}
 		/* do the final calculations */
 		val=DNAN;
-		if (rpnp[rpi].op == OP_PREDICT) {  /* the average */
+		switch (rpnp[rpi].op) {
+		case OP_PREDICT:
 		    if (count>0) {
 			val = sum/(double)count;
 		    } 
-		} else {
+		    break;
+		case OP_PREDICTSIGMA:
 		    if (count>1) { /* the sigma case */
 			val=count*sum2-sum*sum;
 			if (val<0) {
@@ -895,6 +951,21 @@ short rpn_calc(
 			    val=sqrt(val/((float)count*((float)count-1.0)));
 			}
 		    }
+		    break;
+		case OP_PREDICTPERC:
+		    if ((count>0) && extra) {
+		        /* sort the numbers */
+		        qsort(extra,count,sizeof(double),rpn_compare_double);
+			/* get the percentile selected */
+			int idx=(int)round(percentile * ((float)count-1.0));
+			/* maybe we should also do an interpolation between the 2
+			 * neighboring fields, similar to what we do with MEDIAN 
+			 */
+			val = extra[idx];
+		    }
+		    break;
+		default: /* should not get here ... */
+		    break; 
 		}
 		rpnstack->s[stptr] = val;
 	    }
diff --git a/src/rrd_rpncalc.h b/src/rrd_rpncalc.h
index 26b52cb4..2072bfc7 100644
--- a/src/rrd_rpncalc.h
+++ b/src/rrd_rpncalc.h
@@ -21,7 +21,7 @@ enum op_en { OP_NUMBER = 0, OP_VARIABLE, OP_INF, OP_PREV, OP_NEGINF,
     OP_PREDICT,OP_PREDICTSIGMA,
     OP_AVG, OP_ABS, OP_ADDNAN,
     OP_MINNAN, OP_MAXNAN,
-    OP_MEDIAN
+    OP_MEDIAN, OP_PREDICTPERC
  };
 
 typedef struct rpnp_t {
@@ -31,8 +31,14 @@ typedef struct rpnp_t {
     double   *data;     /* pointer to the current value from OP_VAR DAS */
     long      ds_cnt;   /* data source count for data pointer */
     long      step;     /* time step for OP_VAR das */
+    void     *extra;    /* some extra data for longer setups */
+    void      (*free_extra)(void *); /* function pointer used to free extra 
+				      * - NULL for "simple" free(extra); */
 } rpnp_t;
 
+void      rpnp_freeextra(
+    rpnp_t *rpnp);
+
 /* a compact representation of rpnp_t for computed data sources */
 typedef struct rpn_cdefds_t {
     char      op;       /* rpn operator type */
diff --git a/src/rrd_update.c b/src/rrd_update.c
index efa04824..a26005e0 100644
--- a/src/rrd_update.c
+++ b/src/rrd_update.c
@@ -1713,13 +1713,18 @@ static int process_pdp_st(
                 rpnp[i].op = OP_NUMBER;
                 rpnp[i].val = pdp_temp[rpnp[i].ptr];
             }
+	    /* just in case */
+	    rpnp[i].extra = NULL;
+	    rpnp[i].free_extra = NULL;
         }
         /* run the rpn calculator */
         if (rpn_calc(rpnp, &rpnstack, 0, pdp_temp, ds_idx) == -1) {
+	    rpnp_freeextra(rpnp);
             free(rpnp);
             rpnstack_free(&rpnstack);
             return -1;
         }
+	rpnp_freeextra(rpnp);
         free(rpnp);
     }