From: Dean Rasheed Date: Wed, 3 Jun 2026 08:01:13 +0000 (+0100) Subject: Improve overflow/underflow handling in regr_r2(). X-Git-Url: http://git.ipfire.org/gitweb/index.cgi?a=commitdiff_plain;h=d58ec50e0faae364fa59cdb88eba7e23b55ab583;p=thirdparty%2Fpostgresql.git Improve overflow/underflow handling in regr_r2(). Commit 6498287696d improved corr()'s final function to cope with overflow/underflow in the final calculation, and clamped its result to [-1, 1] in case of roundoff error. Improve regr_r2() in a similar way, clamping its result to [0, 1]. Arguably this is a bug fix, but given the lack of prior complaints, refrain from back-patching, as we did with 6498287696d. Reported-by: Chengpeng Yan Author: Chengpeng Yan Reviewed-by: Dean Rasheed Reviewed-by: Tom Lane Discussion: https://postgr.es/m/33E01656-BB3B-46E9-A41F-24A01A7C35F4@outlook.com --- diff --git a/src/backend/utils/adt/float.c b/src/backend/utils/adt/float.c index 362c29ab803..cc00c10c0d4 100644 --- a/src/backend/utils/adt/float.c +++ b/src/backend/utils/adt/float.c @@ -3916,7 +3916,12 @@ float8_regr_r2(PG_FUNCTION_ARGS) float8 N, Sxx, Syy, - Sxy; + Sxy, + numerator, + denominator, + sqrtdenominator, + sqrtresult, + result; transvalues = check_float8_array(transarray, "float8_regr_r2", 8); N = transvalues[0]; @@ -3938,7 +3943,35 @@ float8_regr_r2(PG_FUNCTION_ARGS) if (Syy == 0) PG_RETURN_FLOAT8(1.0); - PG_RETURN_FLOAT8((Sxy * Sxy) / (Sxx * Syy)); + /* + * The products Sxy * Sxy and/or Sxx * Syy might underflow or overflow. If + * so, we can recover by computing Sxy / (sqrt(Sxx) * sqrt(Syy)) and + * squaring it instead. However, the double sqrt() calculation is a bit + * slower and less accurate, so don't do it if we don't have to. + */ + numerator = Sxy * Sxy; + denominator = Sxx * Syy; + if (numerator == 0 || isinf(numerator) || + denominator == 0 || isinf(denominator)) + { + sqrtdenominator = sqrt(Sxx) * sqrt(Syy); + sqrtresult = Sxy / sqrtdenominator; + result = sqrtresult * sqrtresult; + } + else + result = numerator / denominator; + + /* + * Despite all these precautions, this formula can yield results outside + * [0, 1] due to roundoff error. Clamp it to the expected range. + * + * Note that result is guaranteed to be non-negative becase Sxx and Syy + * are non-negative, so we only need to clamp the upper end of the range. + */ + if (result > 1) + result = 1; + + PG_RETURN_FLOAT8(result); } Datum diff --git a/src/test/regress/expected/aggregates.out b/src/test/regress/expected/aggregates.out index fbda0e3bbc2..1ccdf7dfdd7 100644 --- a/src/test/regress/expected/aggregates.out +++ b/src/test/regress/expected/aggregates.out @@ -516,6 +516,7 @@ SELECT covar_pop(1::float8,'nan'::float8), covar_samp(3::float8,'nan'::float8); (1 row) -- check some cases that formerly had poor roundoff-error behavior +-- note: regr_r2() differs from corr() for a horizontal line, per spec SELECT corr(0.09, g), regr_r2(0.09, g) FROM generate_series(1, 30) g; corr | regr_r2 @@ -537,38 +538,55 @@ SELECT corr(1.3 + g * 1e-16, 1.3 + g * 1e-16) (1 row) -SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) +-- check some cases that formerly suffered from internal overflow/underflow +SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105), + regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) FROM generate_series(1, 3) g; - corr ------- - 1 + corr | regr_r2 +------+--------- + 1 | 1 (1 row) -SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) +SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105), + regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) FROM generate_series(1, 30) g; - corr ------- - 1 + corr | regr_r2 +------+--------- + 1 | 1 +(1 row) + +SELECT corr(1e100 + g * 1e95, 1e100 + g * 1e95), + regr_r2(1e100 + g * 1e95, 1e100 + g * 1e95) + FROM generate_series(1, 2) g; + corr | regr_r2 +------+--------- + 1 | 1 (1 row) -- these examples pose definitional questions for NaN inputs, -- which we resolve by saying that an all-NaN input column is not all equal -SELECT corr(g, 'NaN') FROM generate_series(1, 30) g; - corr ------- - NaN +SELECT corr(g, 'NaN'), regr_r2(g, 'NaN') FROM generate_series(1, 30) g; + corr | regr_r2 +------+--------- + NaN | NaN (1 row) -SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g; - corr ------- - +SELECT corr(0.1, 'NaN'), regr_r2(0.1, 'NaN') FROM generate_series(1, 30) g; + corr | regr_r2 +------+--------- + | 1 (1 row) -SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g; - corr ------- - NaN +SELECT corr('NaN', 0.1), regr_r2('NaN', 0.1) FROM generate_series(1, 30) g; + corr | regr_r2 +------+--------- + | +(1 row) + +SELECT corr('NaN', 'NaN'), regr_r2('NaN', 'NaN') FROM generate_series(1, 30) g; + corr | regr_r2 +------+--------- + NaN | NaN (1 row) -- test accum and combine functions directly diff --git a/src/test/regress/sql/aggregates.sql b/src/test/regress/sql/aggregates.sql index 580f364ba97..a310b39e7b8 100644 --- a/src/test/regress/sql/aggregates.sql +++ b/src/test/regress/sql/aggregates.sql @@ -141,22 +141,31 @@ SELECT covar_pop(1::float8,'inf'::float8), covar_samp(3::float8,'inf'::float8); SELECT covar_pop(1::float8,'nan'::float8), covar_samp(3::float8,'nan'::float8); -- check some cases that formerly had poor roundoff-error behavior +-- note: regr_r2() differs from corr() for a horizontal line, per spec SELECT corr(0.09, g), regr_r2(0.09, g) FROM generate_series(1, 30) g; SELECT corr(g, 0.09), regr_r2(g, 0.09), regr_slope(g, 0.09), regr_intercept(g, 0.09) FROM generate_series(1, 30) g; SELECT corr(1.3 + g * 1e-16, 1.3 + g * 1e-16) FROM generate_series(1, 3) g; -SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) + +-- check some cases that formerly suffered from internal overflow/underflow +SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105), + regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) FROM generate_series(1, 3) g; -SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) +SELECT corr(1e-100 + g * 1e-105, 1e-100 + g * 1e-105), + regr_r2(1e-100 + g * 1e-105, 1e-100 + g * 1e-105) FROM generate_series(1, 30) g; +SELECT corr(1e100 + g * 1e95, 1e100 + g * 1e95), + regr_r2(1e100 + g * 1e95, 1e100 + g * 1e95) + FROM generate_series(1, 2) g; -- these examples pose definitional questions for NaN inputs, -- which we resolve by saying that an all-NaN input column is not all equal -SELECT corr(g, 'NaN') FROM generate_series(1, 30) g; -SELECT corr(0.1, 'NaN') FROM generate_series(1, 30) g; -SELECT corr('NaN', 'NaN') FROM generate_series(1, 30) g; +SELECT corr(g, 'NaN'), regr_r2(g, 'NaN') FROM generate_series(1, 30) g; +SELECT corr(0.1, 'NaN'), regr_r2(0.1, 'NaN') FROM generate_series(1, 30) g; +SELECT corr('NaN', 0.1), regr_r2('NaN', 0.1) FROM generate_series(1, 30) g; +SELECT corr('NaN', 'NaN'), regr_r2('NaN', 'NaN') FROM generate_series(1, 30) g; -- test accum and combine functions directly CREATE TABLE regr_test (x float8, y float8);