From 3960eabca781e892eb8fb12cde5bb3272f0ba366 Mon Sep 17 00:00:00 2001 From: Douglas Bagnall Date: Wed, 5 Jul 2023 14:32:05 +1200 Subject: [PATCH] libutil/iconv: avoid overflow in surrogate pairs Consider the non-conformant utf-8 sequence "\xf5\x80\x80\x80", which would encode 0x140000. We would set the high byte of the first surrogate to 0xd8 | (0x130000 >> 18), or 0xdc, which is an invalid start for a high surrogate, making the sequence as a whole invalid (as you would expect -- the Unicode range was set precisely to that covered by utf-16 surrogates). Signed-off-by: Douglas Bagnall Reviewed-by: Andrew Bartlett --- lib/util/charset/iconv.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/util/charset/iconv.c b/lib/util/charset/iconv.c index 952b9e7911b..131df640986 100644 --- a/lib/util/charset/iconv.c +++ b/lib/util/charset/iconv.c @@ -923,6 +923,16 @@ static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft, errno = EILSEQ; goto error; } + if (codepoint > 0x10ffff) { + /* + * Unicode stops at 0x10ffff, and if + * we ignore that, we'll end up + * encoding the wrong characters in + * the surrogate pair. + */ + errno = EILSEQ; + goto error; + } codepoint -= 0x10000; -- 2.47.3