Commit
6498287696d improved corr()'s final function to cope with
overflow/underflow in the final calculation, and clamped its result to
[-1, 1] in case of roundoff error. Improve regr_r2() in a similar way,
clamping its result to [0, 1].
Arguably this is a bug fix, but given the lack of prior complaints,
refrain from back-patching, as we did with
6498287696d.
Reported-by: Chengpeng Yan <chengpeng_yan@outlook.com>
Author: Chengpeng Yan <chengpeng_yan@outlook.com>
Reviewed-by: Dean Rasheed <dean.a.rasheed@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Discussion: https://postgr.es/m/
33E01656-BB3B-46E9-A41F-
24A01A7C35F4@outlook.com
float8 N,
Sxx,
Syy,
- Sxy;
+ Sxy,
+ numerator,
+ denominator,
+ sqrtdenominator,
+ sqrtresult,
+ result;
transvalues = check_float8_array(transarray, "float8_regr_r2", 8);
N = transvalues[0];
if (Syy == 0)
PG_RETURN_FLOAT8(1.0);
- PG_RETURN_FLOAT8((Sxy * Sxy) / (Sxx * Syy));
+ /*
+ * The products Sxy * Sxy and/or Sxx * Syy might underflow or overflow. If
+ * so, we can recover by computing Sxy / (sqrt(Sxx) * sqrt(Syy)) and
+ * squaring it instead. However, the double sqrt() calculation is a bit
+ * slower and less accurate, so don't do it if we don't have to.
+ */
+ numerator = Sxy * Sxy;
+ denominator = Sxx * Syy;
+ if (numerator == 0 || isinf(numerator) ||
+ denominator == 0 || isinf(denominator))
+ {
+ sqrtdenominator = sqrt(Sxx) * sqrt(Syy);
+ sqrtresult = Sxy / sqrtdenominator;
+ result = sqrtresult * sqrtresult;
+ }
+ else
+ result = numerator / denominator;
+
+ /*
+ * Despite all these precautions, this formula can yield results outside
+ * [0, 1] due to roundoff error. Clamp it to the expected range.
+ *
+ * Note that result is guaranteed to be non-negative becase Sxx and Syy
+ * are non-negative, so we only need to clamp the upper end of the range.
+ */
+ if (result > 1)
+ result = 1;
+
+ PG_RETURN_FLOAT8(result);
}
Datum
(1 row)
-- check some cases that formerly had poor roundoff-error behavior
+-- note: regr_r2() differs from corr() for a horizontal line, per spec
SELECT corr(0.09, g), regr_r2(0.09, g)
FROM generate_series(1, 30) g;
corr | regr_r2
(1 row)
-SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+-- check some cases that formerly suffered from internal overflow/underflow
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
+ regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
FROM generate_series(1, 3) g;
- corr
-------
- 1
+ corr | regr_r2
+------+---------
+ 1 | 1
(1 row)
-SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
+ regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
FROM generate_series(1, 30) g;
- corr
-------
- 1
+ corr | regr_r2
+------+---------
+ 1 | 1
+(1 row)
+
+SELECT corr(1e100 + g * 1e95, 1e100 + g * 1e95),
+ regr_r2(1e100 + g * 1e95, 1e100 + g * 1e95)
+ FROM generate_series(1, 2) g;
+ corr | regr_r2
+------+---------
+ 1 | 1
(1 row)
-- these examples pose definitional questions for NaN inputs,
-- which we resolve by saying that an all-NaN input column is not all equal
-SELECT corr(g, 'NaN') FROM generate_series(1, 30) g;
- corr
-------
- NaN
+SELECT corr(g, 'NaN'), regr_r2(g, 'NaN') FROM generate_series(1, 30) g;
+ corr | regr_r2
+------+---------
+ NaN | NaN
(1 row)
-SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g;
- corr
-------
-
+SELECT corr(0.1, 'NaN'), regr_r2(0.1, 'NaN') FROM generate_series(1, 30) g;
+ corr | regr_r2
+------+---------
+ | 1
(1 row)
-SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g;
- corr
-------
- NaN
+SELECT corr('NaN', 0.1), regr_r2('NaN', 0.1) FROM generate_series(1, 30) g;
+ corr | regr_r2
+------+---------
+ |
+(1 row)
+
+SELECT corr('NaN', 'NaN'), regr_r2('NaN', 'NaN') FROM generate_series(1, 30) g;
+ corr | regr_r2
+------+---------
+ NaN | NaN
(1 row)
-- test accum and combine functions directly
SELECT covar_pop(1::float8,'nan'::float8), covar_samp(3::float8,'nan'::float8);
-- check some cases that formerly had poor roundoff-error behavior
+-- note: regr_r2() differs from corr() for a horizontal line, per spec
SELECT corr(0.09, g), regr_r2(0.09, g)
FROM generate_series(1, 30) g;
SELECT corr(g, 0.09), regr_r2(g, 0.09), regr_slope(g, 0.09), regr_intercept(g, 0.09)
FROM generate_series(1, 30) g;
SELECT corr(1.3 + g * 1e-16, 1.3 + g * 1e-16)
FROM generate_series(1, 3) g;
-SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+
+-- check some cases that formerly suffered from internal overflow/underflow
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
+ regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
FROM generate_series(1, 3) g;
-SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
+SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105),
+ regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105)
FROM generate_series(1, 30) g;
+SELECT corr(1e100 + g * 1e95, 1e100 + g * 1e95),
+ regr_r2(1e100 + g * 1e95, 1e100 + g * 1e95)
+ FROM generate_series(1, 2) g;
-- these examples pose definitional questions for NaN inputs,
-- which we resolve by saying that an all-NaN input column is not all equal
-SELECT corr(g, 'NaN') FROM generate_series(1, 30) g;
-SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g;
-SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g;
+SELECT corr(g, 'NaN'), regr_r2(g, 'NaN') FROM generate_series(1, 30) g;
+SELECT corr(0.1, 'NaN'), regr_r2(0.1, 'NaN') FROM generate_series(1, 30) g;
+SELECT corr('NaN', 0.1), regr_r2('NaN', 0.1) FROM generate_series(1, 30) g;
+SELECT corr('NaN', 'NaN'), regr_r2('NaN', 'NaN') FROM generate_series(1, 30) g;
-- test accum and combine functions directly
CREATE TABLE regr_test (x float8, y float8);