]> git.ipfire.org Git - thirdparty/plymouth.git/commitdiff
utils: Rework UTF-8 handling
author Ray Strode <rstrode@redhat.com>
Sun, 10 Dec 2023 15:18:30 +0000 (10:18 -0500)
committer Ray Strode <rstrode@redhat.com>
Mon, 11 Dec 2023 03:13:58 +0000 (22:13 -0500)
ply_utf8_character_get_size currently has this odd argument at
the end that is often just set to PLY_UTF8_CHARACTER_SIZE_MAX,
and the function also returns magic values for cases where it
can't figure out the size because the byte isn't a leading
byte or is otherwise not valid UTF-8.

That means the API has nuances that make the code hard to
follow on a casual read.

This commit attempts to improve the situation by dropping the
extra argument, and adds a way to get the type separate from the
size for clarity.

At the same time, this commit updates all the callers to use the
new API. There are two cases where the callers are trying to
remove the last character from a UTF-8 string, so this commit
adds a new function to consolidate that logic as well.

src/libply-splash-core/ply-keyboard.c
src/libply/ply-utils.c
src/libply/ply-utils.h
src/main.c

index 128d3ceda1891dbd936e5de91deb15d6f3145f7b..ab375f37b738d379512c6a7d8ae67f1d83db5716 100644 (file)
@@ -153,26 +153,15 @@ ply_keyboard_new_for_renderer (ply_renderer_t *renderer)
 static void
 process_backspace (ply_keyboard_t *keyboard)
 {
-        size_t bytes_to_remove;
-        ssize_t previous_character_size;
-        const char *bytes;
+        char *bytes;
         size_t size;
+        size_t capacity;
         ply_list_node_t *node;
 
-        bytes = ply_buffer_get_bytes (keyboard->line_buffer);
-        size = ply_buffer_get_size (keyboard->line_buffer);
-
-        bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX);
-        while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < (ssize_t) bytes_to_remove) {
-                if (previous_character_size > 0)
-                        bytes_to_remove -= previous_character_size;
-                else
-                        bytes_to_remove--;
+        ply_buffer_borrow_bytes (keyboard->line_buffer, &bytes, &size, &capacity) {
+                ply_utf8_string_remove_last_character (&bytes, &size);
         }
 
-        if (bytes_to_remove <= size)
-                ply_buffer_remove_bytes_at_end (keyboard->line_buffer, bytes_to_remove);
-
         for (node = ply_list_get_first_node (keyboard->backspace_handler_list);
              node; node = ply_list_get_next_node (keyboard->backspace_handler_list, node)) {
                 ply_keyboard_closure_t *closure = ply_list_node_get_data (node);
@@ -277,6 +266,7 @@ on_key_event (ply_keyboard_t *keyboard,
 
         i = 0;
         while (i < size) {
+                ply_utf8_character_byte_type_t character_byte_type;
                 ssize_t character_size;
                 char *keyboard_input;
                 size_t bytes_left = size - i;
@@ -318,18 +308,23 @@ on_key_event (ply_keyboard_t *keyboard,
                         continue;
                 }
 
-                character_size = (ssize_t) ply_utf8_character_get_size (bytes + i, bytes_left);
+                character_byte_type = ply_utf8_character_get_byte_type (bytes[i]);
 
-                if (character_size < 0)
+                if (PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING (character_byte_type))
                         break;
 
                 /* If we're at a NUL character walk through it
                  */
-                if (character_size == 0) {
+                if (character_byte_type == PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING) {
                         i++;
                         continue;
                 }
 
+                character_size = ply_utf8_character_get_size_from_byte_type (character_byte_type);
+
+                if (character_size > bytes_left)
+                        break;
+
                 keyboard_input = strndup (bytes + i, character_size);
 
                 process_keyboard_input (keyboard, keyboard_input, character_size);
@@ -665,4 +660,4 @@ ply_keyboard_get_capslock_state (ply_keyboard_t *keyboard)
         }
 
         return NULL;
-}
\ No newline at end of file
+}
index c5b0847e47c15093e14a4bea60a458f4284c52c1..95b505b170f6ee40e60c2bbf6cafccd238a8c99f 100644 (file)
@@ -742,21 +742,93 @@ ply_detach_daemon (ply_daemon_handle_t *handle,
  * 11100000-11101111    E0-EF   Start of 3-byte sequence
  * 11110000-11110100    F0-F4   Start of 4-byte sequence
  */
-int
-ply_utf8_character_get_size (const char *string,
-                             size_t      n)
+ply_utf8_character_byte_type_t
+ply_utf8_character_get_byte_type (const char byte)
+{
+        /* Classify one byte by its high bits; a byte that is neither a leading
+         * byte nor a 10xxxxxx continuation byte (0xF8-0xFF) is invalid. */
+        if (byte == '\0')
+                return PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING;
+        if ((byte & 0x80) == 0x00)
+                return PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE;
+        if ((byte & 0xE0) == 0xC0)
+                return PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES;
+        if ((byte & 0xF0) == 0xE0)
+                return PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES;
+        if ((byte & 0xF8) == 0xF0)
+                return PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES;
+        if ((byte & 0xC0) == 0x80)
+                return PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION;
+
+        return PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID;
+}
+
+ssize_t
+ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type)
+{
+        /* Size in bytes of a character led by a byte of this type; 0 for types
+         * that cannot lead a character and for any value outside the enum. */
+        ssize_t size = 0;
+        switch (byte_type) {
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE:
+                size = 1;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES:
+                size = 2;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES:
+                size = 3;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES:
+                size = 4;
+                break;
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION:
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID:
+        case PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING:
+                break;
+        }
+        return size;
+}
+
+ssize_t
+ply_utf8_character_get_size (const char *bytes)
+{
+        /* Size in bytes of the UTF-8 character whose first byte is bytes[0],
+         * or 0 when that byte cannot start a character. Only bytes[0] is
+         * examined here, so a caller that knows how many bytes are available
+         * has to compare the result against that count itself. */
+        const ply_utf8_character_byte_type_t leading_byte_type = ply_utf8_character_get_byte_type (bytes[0]);
+
+        return ply_utf8_character_get_size_from_byte_type (leading_byte_type);
+}
+
+void
+ply_utf8_string_remove_last_character (char  **string,
+                                       size_t *size)
 {
-        int length;
-
-        if (n < 1) return -1;
-        if (string[0] == 0x00) length = 0;
-        else if ((string[0] & 0x80) == 0x00) length = 1;
-        else if ((string[0] & 0xE0) == 0xC0) length = 2;
-        else if ((string[0] & 0xF0) == 0xE0) length = 3;
-        else if ((string[0] & 0xF8) == 0xF0) length = 4;
-        else return -2;
-        if (length > (int) n) return -1;
-        return length;
+        char *bytes = *string;
+        size_t size_in = *size, end_offset;
+        /* Drops the last character of *string in place and shrinks *size; no-op when empty. */
+        if (size_in == 0)
+                return;
+        /* Walk backwards from the final byte to the first non-continuation byte. */
+        end_offset = size_in - 1;
+        do {
+                ply_utf8_character_byte_type_t byte_type;
+
+                byte_type = ply_utf8_character_get_byte_type (bytes[end_offset]);
+
+                if (byte_type != PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION) {
+                        memset (bytes + end_offset, '\0', size_in - end_offset); /* zero that byte and all after it */
+                        *size = end_offset;
+                        break;
+                }
+                /* Offset 0 was a continuation byte too: stop, leaving string and size unchanged. */
+                if (end_offset == 0)
+                        break;
+
+                end_offset--;
+        } while (true);
 }
 
 int
@@ -766,10 +838,16 @@ ply_utf8_string_get_length (const char *string,
         size_t count = 0;
 
         while (true) {
-                int charlen = ply_utf8_character_get_size (string, n);
-                if (charlen <= 0) break;
-                string += charlen;
-                n -= charlen;
+                size_t size = ply_utf8_character_get_size (string);
+
+                if (size == 0)
+                        break;
+
+                if (size > n)
+                        break;
+
+                string += size;
+                n -= size;
                 count++;
         }
         return count;
@@ -783,7 +861,7 @@ ply_utf8_string_get_byte_offset_from_character_offset (const char *string,
         size_t i;
 
         for (i = 0; i < character_offset && string[byte_offset] != '\0'; i++) {
-                byte_offset += ply_utf8_character_get_size (string + byte_offset, PLY_UTF8_CHARACTER_SIZE_MAX);
+                byte_offset += ply_utf8_character_get_size (string + byte_offset);
         }
 
         return byte_offset;
@@ -818,8 +896,7 @@ ply_utf8_string_iterator_next (ply_utf8_string_iterator_t *iterator,
         if (iterator->string[iterator->current_byte_offset] == '\0')
                 return false;
 
-        size_of_current_character = ply_utf8_character_get_size (iterator->string + iterator->current_byte_offset,
-                                                                 PLY_UTF8_CHARACTER_SIZE_MAX);
+        size_of_current_character = ply_utf8_character_get_size (iterator->string + iterator->current_byte_offset);
 
         if (size_of_current_character == 0)
                 return false;
index b99d2b23e1bc30c4b480e7222f2535ee7640507f..7cbbb2f4d34139dca3453d371a4607af4f3c9f07 100644 (file)
@@ -55,6 +55,20 @@ typedef enum
         PLY_UNIX_SOCKET_TYPE_TRIMMED_ABSTRACT
 } ply_unix_socket_type_t;
 
+typedef enum
+{
+        PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION  = -2, /* cannot start a character */
+        PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID       = -1, /* cannot start a character */
+        PLY_UTF8_CHARACTER_BYTE_TYPE_END_OF_STRING = 0,  /* the NUL byte */
+        PLY_UTF8_CHARACTER_BYTE_TYPE_1_BYTE        = 1,  /* leads a 1-byte character */
+        PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES       = 2,  /* leads a 2-byte character */
+        PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES       = 3,  /* leads a 3-byte character */
+        PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES       = 4   /* leads a 4-byte character */
+} ply_utf8_character_byte_type_t;
+/* Predicates over ply_utf8_character_byte_type_t: "not a leading byte" and "leads a multi-byte character". */
+#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_NOT_LEADING(t) ((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_INVALID || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_CONTINUATION)
+#define PLY_UTF8_CHARACTER_BYTE_TYPE_IS_MULTI_BYTE(t) (((t) == PLY_UTF8_CHARACTER_BYTE_TYPE_2_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_3_BYTES || (t) == PLY_UTF8_CHARACTER_BYTE_TYPE_4_BYTES))
+
 typedef struct
 {
         const char *string;
@@ -120,8 +134,12 @@ ply_daemon_handle_t *ply_create_daemon (void);
 bool ply_detach_daemon (ply_daemon_handle_t *handle,
                         int                  exit_code);
 
-int ply_utf8_character_get_size (const char *string,
-                                 size_t      n);
+ply_utf8_character_byte_type_t ply_utf8_character_get_byte_type (const char byte);
+ssize_t ply_utf8_character_get_size_from_byte_type (ply_utf8_character_byte_type_t byte_type);
+ssize_t ply_utf8_character_get_size (const char *bytes);
+
+void ply_utf8_string_remove_last_character (char  **string,
+                                            size_t *n);
 int ply_utf8_string_get_length (const char *string,
                                 size_t      n);
 
index e301051b898734129ef407f201d6901bf2a7f373..09ca685440db5a3bde4d60cfcf2073253bc9e801 100644 (file)
@@ -1654,28 +1654,17 @@ on_keyboard_input (state_t    *state,
 static void
 on_backspace (state_t *state)
 {
-        ssize_t bytes_to_remove;
-        ssize_t previous_character_size;
-        const char *bytes;
+        char *bytes;
         size_t size;
+        size_t capacity;
         ply_list_node_t *node = ply_list_get_first_node (state->entry_triggers);

         if (!node) return;

-        bytes = ply_buffer_get_bytes (state->entry_buffer);
-        size = ply_buffer_get_size (state->entry_buffer);
-        if (size == 0)
-                return;
-
-        bytes_to_remove = MIN (size, PLY_UTF8_CHARACTER_SIZE_MAX);
-        while ((previous_character_size = ply_utf8_character_get_size (bytes + size - bytes_to_remove, bytes_to_remove)) < bytes_to_remove) {
-                if (previous_character_size > 0)
-                        bytes_to_remove -= previous_character_size;
-                else
-                        bytes_to_remove--;
+        ply_buffer_borrow_bytes (state->entry_buffer, &bytes, &size, &capacity) { /* NOTE(review): presumably a scoped macro that writes size back to the buffer -- confirm */
+                ply_utf8_string_remove_last_character (&bytes, &size); /* trim the entry's final character in place */
         }

-        ply_buffer_remove_bytes_at_end (state->entry_buffer, bytes_to_remove);
         update_display (state);
 }