git.ipfire.org Git - thirdparty/plymouth.git/commitdiff
utils: Rework UTF-8 handling
author: Ray Strode <rstrode@redhat.com>
Sun, 10 Dec 2023 15:18:30 +0000 (10:18 -0500)
committer: Ray Strode <rstrode@redhat.com>
Sun, 10 Dec 2023 16:11:01 +0000 (11:11 -0500)
ply_utf8_character_get_size currently has this odd argument at
the end that is often just set to PLY_UTF8_CHARACTER_SIZE_MAX,
and the function also returns magic values for cases where it
can't figure out the size because the byte isn't a leading
byte or is otherwise not valid UTF-8.

That means the API has a nuance to it that makes the code hard
to follow on a casual read.

This commit attempts to improve the situation by dropping the
extra argument and adding a way to get the byte type separately
from the size, for clarity.

At the same time, this commit updates all the callers to use the
new API. There are two cases where the callers are trying to
remove the last character from a UTF-8 string, so this commit
adds a new function to consolidate that logic as well.

src/libply-splash-core/ply-keyboard.c
src/libply/ply-utils.c
src/libply/ply-utils.h
src/main.c

index 128d3ceda1891dbd936e5de91deb15d6f3145f7b..ab375f37b738d379512c6a7d8ae67f1d83db5716 100644 (file)
@@ -153,26 +153,15 @@ ply_keyboard_new_for_renderer (ply_renderer_t *renderer)
 static void
 process_backspace (ply_keyboard_t *keyboard)
 {
-        size_t bytes_to_remove;
-        ssize_t previous_character_size;
-        const char *bytes;
+        char *bytes;
         size_t size;
+        size_t capacity;
         ply_list_node_t *node;
 
-        bytes = ply_buffer_get_bytes (keyboard->line_buffer);
-        size = ply_buffer_get_size (keyboard->line_buffer);
-
-        bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX);
-        while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < (ssize_t) bytes_to_remove) {
-                if (previous_character_size > 0)
-                        bytes_to_remove -= previous_character_size;
-                else
-                        bytes_to_remove--;
+        ply_buffer_borrow_bytes (keyboard->line_buffer, &bytes, &size, &capacity) {
+                ply_utf8_string_remove_last_character (&bytes, &size);
         }
 
-        if (bytes_to_remove <= size)
-                ply_buffer_remove_bytes_at_end (keyboard->line_buffer, bytes_to_remove);
-
         for (node = ply_list_get_first_node (keyboard->backspace_handler_list);
              node; node = ply_list_get_next_node (keyboard->backspace_handler_list, node)) {
                 ply_keyboard_closure_t *closure = ply_list_node_get_data (node);
@@ -277,6 +266,7 @@ on_key_event (ply_keyboard_t *keyboard,
 
         i = 0;
         while (i < size) {
+                ply_utf8_character_byte_type_t character_byte_type;
                 ssize_t character_size;
                 char *keyboard_input;
                 size_t bytes_left = size - i;
@@ -318,18 +308,23 @@ on_key_event (ply_keyboard_t *keyboard,
                         continue;
                 }
 
-                character_size = (ssize_t) ply_utf8_character_get_size (bytes + i, bytes_left);
+                character_byte_type = ply_utf8_character_get_byte_type (bytes[i]);
 
-                if (character_size < 0)
+                if (PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING (character_byte_type))
                         break;
 
                 /* If we're at a NUL character walk through it
                  */
-                if (character_size == 0) {
+                if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING) {
                         i++;
                         continue;
                 }
 
+                character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type);
+
+                if (character_size > bytes_left)
+                        break;
+
                 keyboard_input = strndup (bytes + i, character_size);
 
                 process_keyboard_input (keyboard, keyboard_input, character_size);
@@ -665,4 +660,4 @@ ply_keyboard_get_capslock_state (ply_keyboard_t *keyboard)
         }
 
         return NULL;
-}
\ No newline at end of file
+}
index cebeacc66b182fad6b8aa15d0226bcd125cacafb..8acc79e1ef6d5d6c99d8266119a6cce31ed55711 100644 (file)
@@ -742,21 +742,101 @@ ply_detach_daemon (ply_daemon_handle_t *handle,
  * 11100000-11101111    E0-EF   Start of 3-byte sequence
  * 11110000-11110100    F0-F4   Start of 4-byte sequence
  */
-int
-ply_utf8_character_get_size (const char *string,
-                             size_t      n)
+ply_utf8_character_byte_type_t
+ply_utf8_character_get_byte_type (const char byte)
+{
+        ply_utf8_character_byte_type_t byte_type;
+
+        if (byte == '\0')
+                byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING;
+        else if ((byte & 0x80) == 0x00)
+                byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE;
+        else if ((byte & 0xE0) == 0xC0)
+                byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES;
+        else if ((byte & 0xF0) == 0xE0)
+                byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES;
+        else if ((byte & 0xF8) == 0xF0)
+                byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES;
+        else
+                byte_type = PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION;
+
+        return byte_type;
+}
+
+ssize_t
+ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type)
+{
+        size_t size;
+
+        switch (byte_type) {
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE:
+                size = 1;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES:
+                size = 2;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES:
+                size = 3;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES:
+                size = 4;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION:
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID:
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING:
+                size = 0;
+                break;
+        }
+        return size;
+}
+
+ssize_t
+ply_utf8_character_get_size (const char *bytes)
+{
+        ply_utf8_character_byte_type_t byte_type;
+        ssize_t size;
+
+        byte_type = ply_utf8_character_get_byte_type (bytes[0]);
+        size = ply_utf8_character_get_size_from_byte_type (byte_type);
+
+        return size;
+}
+
+void
+ply_utf8_string_remove_last_character (char   **string,
+                                       size_t  *n)
 {
-        int length;
-
-        if (n < 1) return -1;
-        if (string[0] == 0x00) length = 0;
-        else if ((string[0] & 0x80) == 0x00) length = 1;
-        else if ((string[0] & 0xE0) == 0xC0) length = 2;
-        else if ((string[0] & 0xF0) == 0xE0) length = 3;
-        else if ((string[0] & 0xF8) == 0xF0) length = 4;
-        else return -2;
-        if (length > (int) n) return -1;
-        return length;
+        char *bytes = *string;
+        size_t bytes_to_remove;
+        ssize_t previous_character_size;
+        size_t size = *n;
+
+        bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX);
+        do {
+                ply_utf8_character_byte_type_t previous_character_byte_type;
+                const char *previous_character = bytes + size - bytes_to_remove;
+
+                previous_character_byte_type = ply_utf8_character_get_byte_type (*previous_character);
+                previous_character_size = ply_utf8_character_get_size_from_byte_type (previous_character_byte_type);
+
+                if (bytes_to_remove < previous_character_size)
+                        break;
+
+                if (PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING (previous_character_byte_type))
+                        bytes_to_remove--;
+                else if (previous_character_size > bytes_to_remove)
+                        bytes_to_remove--;
+                else
+                        bytes_to_remove -= previous_character_size;
+
+        } while (previous_character_size <= bytes_to_remove);
+
+        if (bytes_to_remove <= size) {
+                size_t new_size;
+                new_size = size - bytes_to_remove;
+                bytes[new_size - 1] = '\0';
+                *n = new_size;
+        }
 }
 
 int
@@ -766,10 +846,16 @@ ply_utf8_string_get_length (const char *string,
         size_t count = 0;
 
         while (true) {
-                int charlen = ply_utf8_character_get_size (string, n);
-                if (charlen <= 0) break;
-                string += charlen;
-                n -= charlen;
+                size_t size = ply_utf8_character_get_size (string);
+
+                if (size == 0)
+                        break;
+
+                if (size > n)
+                        break;
+
+                string += size;
+                n -= size;
                 count++;
         }
         return count;
@@ -783,7 +869,7 @@ ply_utf8_string_get_byte_offset_from_character_offset (const char *string,
         size_t i;
 
         for (i = 0; i < character_offset && string[byte_offset] != '\0'; i++) {
-                byte_offset += ply_utf8_character_get_size (string + byte_offset, PLY_UTF8_CHARACTER_SIZE_MAX);
+                byte_offset += ply_utf8_character_get_size (string + byte_offset);
         }
 
         return byte_offset;
@@ -818,8 +904,7 @@ ply_utf8_string_iterator_next (ply_utf8_string_iterator_t *iterator,
         if (iterator->string[iterator->current_byte_offset] == '\0')
                 return false;
 
-        size_of_current_character = ply_utf8_character_get_size (iterator->string + iterator->current_byte_offset,
-                                                                 PLY_UTF8_CHARACTER_SIZE_MAX);
+        size_of_current_character = ply_utf8_character_get_size (iterator->string + iterator->current_byte_offset);
 
         if (size_of_current_character == 0)
                 return false;
index 3e487b4e4516af73137e11dfe55ea28978b124e9..d93cdb414f37c7a00d02ed79b2d9ea3fbad5eb2f 100644 (file)
@@ -55,6 +55,19 @@ typedef enum
         PLY_UNIX_SOCKET_TYPE_TRIMMED_ABSTRACT
 } ply_unix_socket_type_t;
 
+typedef enum {
+        PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION = -2,
+        PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID = -1,
+        PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING = 0,
+        PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE = 1,
+        PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES = 2,
+        PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES = 3,
+        PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES = 4
+} ply_utf8_character_byte_type_t;
+
+#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING(t) ((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION)
+#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_MULTI_BYTE(t) (((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES))
+
 typedef struct
 {
         const char *string;
@@ -120,8 +133,12 @@ ply_daemon_handle_t *ply_create_daemon (void);
 bool ply_detach_daemon (ply_daemon_handle_t *handle,
                         int                  exit_code);
 
-int ply_utf8_character_get_size (const char *string,
-                                 size_t      n);
+ply_utf8_character_byte_type_t ply_utf8_character_get_byte_type (const char byte);
+ssize_t ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type);
+ssize_t ply_utf8_character_get_size (const char *bytes);
+
+void ply_utf8_string_remove_last_character (char   **string,
+                                            size_t  *n);
 int ply_utf8_string_get_length (const char *string,
                                 size_t      n);
 
index 4617dfcf8973e06bbbcf829e590a4662b2d9928f..407ca490237bb2d61226acb7f9a2cfed32c67050 100644 (file)
@@ -1652,28 +1652,17 @@ on_keyboard_input (state_t    *state,
 static void
 on_backspace (state_t *state)
 {
-        ssize_t bytes_to_remove;
-        ssize_t previous_character_size;
-        const char *bytes;
+        char *bytes;
         size_t size;
+        size_t capacity;
         ply_list_node_t *node = ply_list_get_first_node (state->entry_triggers);
 
         if (!node) return;
 
-        bytes = ply_buffer_get_bytes (state->entry_buffer);
-        size = ply_buffer_get_size (state->entry_buffer);
-        if (size == 0)
-                return;
-
-        bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX);
-        while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < bytes_to_remove) {
-                if (previous_character_size > 0)
-                        bytes_to_remove -= previous_character_size;
-                else
-                        bytes_to_remove--;
+        ply_buffer_borrow_bytes (state->entry_buffer, &bytes, &size, &capacity) {
+                ply_utf8_string_remove_last_character (&bytes, &size);
         }
 
-        ply_buffer_remove_bytes_at_end (state->entry_buffer, bytes_to_remove);
         update_display (state);
 }