[utf8] Add ability to accumulate Unicode characters from UTF-8 bytes

author Michael Brown <mcb30@ipxe.org>

Mon, 28 Feb 2022 13:37:40 +0000 (13:37 +0000)

committer Michael Brown <mcb30@ipxe.org>

Tue, 1 Mar 2022 15:57:33 +0000 (15:57 +0000)
author Michael Brown <mcb30@ipxe.org>
Mon, 28 Feb 2022 13:37:40 +0000 (13:37 +0000)
committer Michael Brown <mcb30@ipxe.org>
Tue, 1 Mar 2022 15:57:33 +0000 (15:57 +0000)
diff --git a/src/core/utf8.c b/src/core/utf8.c

new file mode 100644 (file)

index 0000000..4ee01ba
--- /dev/null
+++ b/src/core/utf8.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2022 Michael Brown <mbrown@fensystems.co.uk>.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * You can also choose to distribute this program under the terms of
+ * the Unmodified Binary Distribution Licence (as given in the file
+ * COPYING.UBDL), provided that you have satisfied its requirements.
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+#include <assert.h>
+#include <ipxe/utf8.h>
+
+/** @file
+ *
+ * UTF-8 Unicode encoding
+ *
+ */
+
+/**
+ * Accumulate Unicode character from UTF-8 byte sequence
+ *
+ * @v utf8             UTF-8 accumulator
+ * @v byte             UTF-8 byte
+ * @ret character      Unicode character, or 0 if incomplete
+ *
+ * Bytes are supplied one at a time.  An ASCII byte is returned
+ * unchanged (so a NUL byte also yields 0).  Each byte of a multi-byte
+ * sequence except the last yields 0, and the last yields the decoded
+ * character.
+ *
+ * UTF8_INVALID is returned for an unexpected continuation byte, for
+ * an initial byte announcing more than UTF8_MAX_LEN bytes, for an
+ * overlong encoding, for a UTF-16 surrogate (U+D800 to U+DFFF), and
+ * for a value above U+10FFFF.
+ *
+ * The accumulator is expected to start with @c remaining equal to
+ * zero (e.g. zero-initialised), since this field is read before it
+ * is ever written.
+ */
+unsigned int utf8_accumulate ( struct utf8_accumulator *utf8, uint8_t byte ) {
+       /* Minimum legal value for each multi-byte sequence length,
+        * indexed by ( number of continuation bytes - 1 ).
+        */
+       static const unsigned int min[] = {
+               UTF8_MIN_TWO,
+               UTF8_MIN_THREE,
+               UTF8_MIN_FOUR,
+       };
+       /* Limits imposed by RFC 3629: code points end at U+10FFFF,
+        * and the UTF-16 surrogate range may not be encoded.
+        */
+       static const unsigned int max = 0x10ffff;
+       static const unsigned int surrogate_first = 0xd800;
+       static const unsigned int surrogate_last = 0xdfff;
+       unsigned int shift;
+       unsigned int len;
+       uint8_t tmp;
+
+       /* Handle continuation bytes */
+       if ( UTF8_IS_CONTINUATION ( byte ) ) {
+
+               /* Fail if this is an unexpected continuation byte */
+               if ( utf8->remaining == 0 ) {
+                       DBGC ( utf8, "UTF8 %p unexpected %02x\n", utf8, byte );
+                       return UTF8_INVALID;
+               }
+
+               /* Apply continuation byte */
+               utf8->character <<= UTF8_CONTINUATION_BITS;
+               utf8->character |= ( byte & UTF8_CONTINUATION_MASK );
+
+               /* Return 0 if more continuation bytes are expected */
+               if ( --utf8->remaining != 0 )
+                       return 0;
+
+               /* Fail if sequence is illegal (overlong encoding) */
+               if ( utf8->character < utf8->min ) {
+                       DBGC ( utf8, "UTF8 %p illegal %02x\n", utf8,
+                              utf8->character );
+                       return UTF8_INVALID;
+               }
+
+               /* Fail if value is a surrogate or lies beyond the
+                * Unicode range.  Four-byte sequences can carry up to
+                * 21 bits, so this is not implied by the length check.
+                */
+               if ( ( utf8->character > max ) ||
+                    ( ( utf8->character >= surrogate_first ) &&
+                      ( utf8->character <= surrogate_last ) ) ) {
+                       DBGC ( utf8, "UTF8 %p invalid %02x\n", utf8,
+                              utf8->character );
+                       return UTF8_INVALID;
+               }
+
+               /* Sanity check */
+               assert ( utf8->character != 0 );
+
+               /* Return completed character */
+               DBGC2 ( utf8, "UTF8 %p accumulated %02x\n",
+                       utf8, utf8->character );
+               return utf8->character;
+       }
+
+       /* Reset state and report failure if this is an unexpected
+        * non-continuation byte.  Do not return UTF8_INVALID since
+        * doing so could cause us to drop a valid ASCII character.
+        */
+       if ( utf8->remaining != 0 ) {
+               shift = ( utf8->remaining * UTF8_CONTINUATION_BITS );
+               DBGC ( utf8, "UTF8 %p unexpected %02x (partial %02x/%02x)\n",
+                      utf8, byte, ( utf8->character << shift ),
+                      ( ( 1 << shift ) - 1 ) );
+               utf8->remaining = 0;
+       }
+
+       /* Handle initial bytes */
+       if ( ! UTF8_IS_ASCII ( byte ) ) {
+
+               /* Sanity check */
+               assert ( utf8->remaining == 0 );
+
+               /* Count total number of bytes in sequence: one per
+                * leading set bit.  Continuation bytes were handled
+                * above, so at least two leading bits are set here.
+                */
+               tmp = byte;
+               len = 0;
+               while ( tmp & UTF8_HIGH_BIT ) {
+                       tmp <<= 1;
+                       len++;
+               }
+
+               /* Check for illegal length */
+               if ( len > UTF8_MAX_LEN ) {
+                       DBGC ( utf8, "UTF8 %p illegal %02x length %d\n",
+                              utf8, byte, len );
+                       return UTF8_INVALID;
+               }
+
+               /* Store initial bits of character.  The length prefix
+                * has been shifted out of the top of tmp, so shifting
+                * back down leaves only the data bits.
+                */
+               utf8->character = ( tmp >> len );
+
+               /* Store number of bytes remaining */
+               len--;
+               utf8->remaining = len;
+               assert ( utf8->remaining > 0 );
+
+               /* Store minimum legal value */
+               utf8->min = min[ len - 1 ];
+               assert ( utf8->min > 0 );
+
+               /* Await continuation bytes */
+               return 0;
+       }
+
+       /* Handle ASCII bytes */
+       return byte;
+}
diff --git a/src/include/ipxe/utf8.h b/src/include/ipxe/utf8.h

new file mode 100644 (file)

index 0000000..299c255
--- /dev/null
+++ b/src/include/ipxe/utf8.h
@@ -0,0 +1,69 @@
+#ifndef _IPXE_UTF8_H
+#define _IPXE_UTF8_H
+
+/** @file
+ *
+ * UTF-8 Unicode encoding
+ *
+ */
+
+FILE_LICENCE ( GPL2_OR_LATER_OR_UBDL );
+
+#include <stdint.h>
+
+/** Maximum length of UTF-8 sequence
+ *
+ * Counted in bytes, including the initial byte.  Initial bytes
+ * announcing a longer sequence are rejected by utf8_accumulate().
+ */
+#define UTF8_MAX_LEN 4
+
+/** Minimum legal value for two-byte UTF-8 sequence
+ *
+ * A two-byte sequence decoding to a smaller value is treated as
+ * illegal by utf8_accumulate().
+ */
+#define UTF8_MIN_TWO 0x80
+
+/** Minimum legal value for three-byte UTF-8 sequence
+ *
+ * A three-byte sequence decoding to a smaller value is treated as
+ * illegal by utf8_accumulate().
+ */
+#define UTF8_MIN_THREE 0x800
+
+/** Minimum legal value for four-byte UTF-8 sequence
+ *
+ * A four-byte sequence decoding to a smaller value is treated as
+ * illegal by utf8_accumulate().
+ */
+#define UTF8_MIN_FOUR 0x10000
+
+/** High bit of UTF-8 bytes
+ *
+ * Clear for ASCII bytes; set for every other byte.
+ */
+#define UTF8_HIGH_BIT 0x80
+
+/** Number of data bits in each continuation byte */
+#define UTF8_CONTINUATION_BITS 6
+
+/** Bit mask for data bits in a continuation byte
+ *
+ * Selects the low UTF8_CONTINUATION_BITS bits (0x3f).
+ */
+#define UTF8_CONTINUATION_MASK ( ( 1 << UTF8_CONTINUATION_BITS ) - 1 )
+
+/** Non-data bits in a continuation byte
+ *
+ * The two bits above the data bits, which read binary 10 in a
+ * continuation byte.
+ */
+#define UTF8_CONTINUATION 0x80
+
+/** Check for a continuation byte
+ *
+ * @v byte             UTF-8 byte
+ * @ret is_continuation        Byte is a continuation byte
+ *
+ * The inverted mask keeps every bit above the data bits, so @c byte
+ * should be an unsigned 8-bit value: a sign-extended negative value
+ * would never match.
+ */
+#define UTF8_IS_CONTINUATION( byte ) \
+       ( ( (byte) & ~UTF8_CONTINUATION_MASK ) == UTF8_CONTINUATION )
+
+/** Check for an ASCII byte
+ *
+ * @v byte             UTF-8 byte
+ * @ret is_ascii       Byte is an ASCII byte
+ *
+ * A byte is ASCII when UTF8_HIGH_BIT is clear.
+ */
+#define UTF8_IS_ASCII( byte ) ( ! ( (byte) & UTF8_HIGH_BIT ) )
+
+/** Invalid character returned when decoding fails
+ *
+ * This is U+FFFD, the Unicode replacement character.
+ */
+#define UTF8_INVALID 0xfffd
+
+/** A UTF-8 character accumulator
+ *
+ * Holds the decoding state that utf8_accumulate() carries from one
+ * byte to the next while a multi-byte sequence is in progress.
+ */
+struct utf8_accumulator {
+       /** Character in progress
+        *
+        * Data bits gathered so far: seeded from the initial byte and
+        * extended by UTF8_CONTINUATION_BITS bits for each
+        * continuation byte.
+        */
+       unsigned int character;
+       /** Number of remaining continuation bytes
+        *
+        * Zero when no multi-byte sequence is in progress.
+        */
+       unsigned int remaining;
+       /** Minimum legal character
+        *
+        * Set from the sequence length when the initial byte is seen.
+        * A completed character below this value is rejected as
+        * illegal.
+        */
+       unsigned int min;
+};
+
+/* Feed one UTF-8 byte into an accumulator (defined in core/utf8.c) */
+extern unsigned int utf8_accumulate ( struct utf8_accumulator *utf8,
+                                     uint8_t byte );
+
+#endif /* _IPXE_UTF8_H */
author	Michael Brown <mcb30@ipxe.org>
	Mon, 28 Feb 2022 13:37:40 +0000 (13:37 +0000)
committer	Michael Brown <mcb30@ipxe.org>
	Tue, 1 Mar 2022 15:57:33 +0000 (15:57 +0000)
src/core/utf8.c	[new file with mode: 0644]	patch | blob
src/include/ipxe/utf8.h	[new file with mode: 0644]	patch | blob