return i;
}
+
+/* Returns the byte offset of the first position in src that can't be passed
+   through unchanged: scanning stops at the terminating NUL, after max_chars
+   characters have been accepted, at a byte below 32, or where
+   uni_utf8_get_char() reports a non-positive length. */
+static size_t
+str_sanitize_skip_start_utf8(const char *src, uintmax_t max_chars)
+{
+	uintmax_t seen = 0;
+	size_t pos = 0;
+
+	while (seen < max_chars && src[pos] != '\0') {
+		unichar_t unused;
+		int char_len = uni_utf8_get_char(src + pos, &unused);
+
+		/* stop at anything that isn't a decodable, non-control
+		   character */
+		if (char_len <= 0 || (unsigned char)src[pos] < 32)
+			break;
+		pos += char_len;
+		seen++;
+	}
+	i_assert(seen <= max_chars);
+	return pos;
+}
+
static void str_sanitize_truncate_char(string_t *dest, unsigned int initial_pos)
{
const unsigned char *data = str_data(dest);
}
}
+/* Append a sanitized copy of src to dest.
+
+   Bytes below 32 and bytes that uni_utf8_get_char() rejects (negative return)
+   are appended as UNICODE_REPLACEMENT_CHAR_UTF8. Rejected bytes are consumed
+   one at a time and don't count towards max_cps. If src isn't fully consumed,
+   the output ends with UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8: when the
+   max_cps limit was reached the ellipsis replaces the last appended
+   character, so the limit is still honored.
+
+   max_cps must be larger than 0. Data already in dest is never modified. */
+void str_sanitize_append_utf8(string_t *dest, const char *src,
+			      uintmax_t max_cps)
+{
+	/* Position to truncate dest to before appending the ellipsis. It
+	   starts at the current end of dest, so that breaking out of the loop
+	   before anything was appended can never cut away data the caller
+	   already had in dest. */
+	size_t last_pos = str_len(dest);
+	unichar_t chr;
+	uintmax_t c;
+	size_t i;
+
+	i_assert(max_cps > 0);
+
+	for (i = 0, c = 0; c < max_cps && src[i] != '\0'; ) {
+		int len = uni_utf8_get_char(src+i, &chr);
+		if (len == 0) {
+			/* input ended too early. c < max_cps here, so there
+			   is still room for the ellipsis: keep everything
+			   appended so far instead of dropping the last valid
+			   character. */
+			last_pos = str_len(dest);
+			break;
+		}
+
+		/* remember where this character starts in dest, in case it
+		   has to be replaced by the ellipsis */
+		last_pos = str_len(dest);
+		if (len < 0) {
+			/* invalid UTF-8 */
+			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
+			i++;
+			continue;
+		}
+		if ((unsigned char)src[i] < 32)
+			str_append(dest, UNICODE_REPLACEMENT_CHAR_UTF8);
+		else
+			str_append_n(dest, src+i, len);
+		i += len;
+		c++;
+	}
+
+	if (src[i] != '\0') {
+		/* src wasn't fully consumed: mark the cut with an ellipsis */
+		str_truncate(dest, last_pos);
+		str_append(dest, UNICODE_HORIZONTAL_ELLIPSIS_CHAR_UTF8);
+	}
+}
+
const char *str_sanitize(const char *src, size_t max_bytes)
{
string_t *str;
str_sanitize_append(str, src, max_bytes);
return str_c(str);
}
+
+/* Return src sanitized with str_sanitize_append_utf8(). The src pointer
+   itself is returned when str_sanitize_skip_start_utf8() reaches the end of
+   src, i.e. when nothing needs changing. NULL src returns NULL. */
+const char *str_sanitize_utf8(const char *src, uintmax_t max_cps)
+{
+	string_t *sanitized;
+	size_t clean_len;
+
+	if (src == NULL)
+		return NULL;
+
+	clean_len = str_sanitize_skip_start_utf8(src, max_cps);
+	if (src[clean_len] == '\0') {
+		/* whole input passed the scan - no copy needed */
+		return src;
+	}
+
+	sanitized = t_str_new(I_MIN(max_cps, 256));
+	str_sanitize_append_utf8(sanitized, src, max_cps);
+	return str_c(sanitized);
+}
+
src is treated as UTF-8 input, but max_bytes is in bytes instead of
UTF-8 characters. */
void str_sanitize_append(string_t *dest, const char *src, size_t max_bytes);
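+/* All control characters (bytes below 32) and invalid UTF-8 bytes in src will
+   be appended as the unicode replacement character (U+FFFD). If src has more
+   than max_cps unicode code points, it's truncated with a horizontal ellipsis
+   character (U+2026) appended to the end. The ellipsis takes the place of the
+   last code point, so it is included in the max_cps limit. max_cps must be
+   larger than 0.
+ */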
+void str_sanitize_append_utf8(string_t *dest, const char *src,
+ uintmax_t max_cps);
/* Return src sanitized. If there are no changes, src pointer is returned.
If src is NULL, returns NULL. */
const char *str_sanitize(const char *src, size_t max_bytes);
+/* The unicode version of str_sanitize() using str_sanitize_append_utf8()
+   internally. If there are no changes, src pointer is returned. If src is
+   NULL, returns NULL. */
+const char *str_sanitize_utf8(const char *src, uintmax_t max_cps);
#endif
const char *sanitized; /* NULL for no change */
};
-void test_str_sanitize(void)
+static void test_str_sanitize_max_bytes(void)
{
static struct str_sanitize_test tests[] = {
{ NULL, 2, NULL },
}
test_end();
}
+
+/* Tests str_sanitize_utf8() and str_sanitize_append_utf8() against a shared
+   table of inputs where the limit is given in code points. */
+static void test_str_sanitize_max_codepoints(void)
+{
+ /* { input, max code points, expected output }. A NULL expected output
+    means the input must come back unchanged. \xE2\x80\xA6 is the UTF-8
+    encoding of U+2026 (horizontal ellipsis) and \xEF\xBF\xBD of U+FFFD
+    (replacement character). */
+ static const struct str_sanitize_test tests[] = {
+ { NULL, 2, NULL },
+ { "", 2, NULL },
+ { "a", 2, NULL },
+ { "ab", 2, NULL },
+ { "abc", 2, "a\xE2\x80\xA6" },
+ { "abcd", 3, "ab\xE2\x80\xA6" },
+ { "abcde", 4, "abc\xE2\x80\xA6" },
+ { "\xD1\x81", 1, "\xD1\x81" },
+ { "\xD1\x81", 2, "\xD1\x81" },
+ { "\xD1\x81", 3, NULL },
+ { "\xC3\xA4\xC3\xA4zyxa", 1, "\xE2\x80\xA6" },
+ { "\xC3\xA4\xC3\xA4zyxa", 2, "\xC3\xA4\xE2\x80\xA6" },
+ { "\xC3\xA4\xC3\xA4zyxa", 3, "\xC3\xA4\xC3\xA4\xE2\x80\xA6" },
+ { "\xC3\xA4\xC3\xA4zyxa", 4, "\xC3\xA4\xC3\xA4z\xE2\x80\xA6" },
+ { "\xC3\xA4\xC3\xA4zyxa", 5, "\xC3\xA4\xC3\xA4zy\xE2\x80\xA6" },
+ { "\xC3\xA4\xC3\xA4zyxa", 6, "\xC3\xA4\xC3\xA4zyxa" },
+ { "\xC3\xA4\xC3\xA4zyxa", 7, "\xC3\xA4\xC3\xA4zyxa" },
+ { "\xC3\xA4\xC3\xA4zyxa", 8, "\xC3\xA4\xC3\xA4zyxa" },
+ { "\001x\x1fy\x81", 10, "\xEF\xBF\xBDx\xEF\xBF\xBDy\xEF\xBF\xBD" }
+ };
+ const char *str;
+ string_t *str2;
+ unsigned int i;
+
+ test_begin("str_sanitize_utf8");
+ for (i = 0; i < N_ELEMENTS(tests); i++) {
+ str = str_sanitize_utf8(tests[i].str, tests[i].max_len);
+ /* with an expected output the contents are compared; without one
+    the very same pointer must be returned */
+ if (tests[i].sanitized != NULL)
+ test_assert_idx(null_strcmp(str, tests[i].sanitized) == 0, i);
+ else
+ test_assert_idx(str == tests[i].str, i);
+ }
+ test_end();
+
+ test_begin("str_sanitize_append_utf8");
+ str2 = t_str_new(128);
+ for (i = 0; i < N_ELEMENTS(tests); i++) {
+ /* the NULL input entry is skipped for the append variant */
+ if (tests[i].str == NULL)
+ continue;
+ /* start every round with a known 10 byte prefix in the buffer */
+ str_truncate(str2, 0);
+ str_append(str2, "1234567890");
+ str_sanitize_append_utf8(str2, tests[i].str, tests[i].max_len);
+
+ /* the prefix must be left intact and the sanitized text must follow
+    directly after it */
+ test_assert_idx(strncmp(str_c(str2), "1234567890", 10) == 0, i);
+ if (tests[i].sanitized != NULL)
+ test_assert_idx(strcmp(str_c(str2)+10, tests[i].sanitized) == 0, i);
+ else
+ test_assert_idx(strcmp(str_c(str2)+10, tests[i].str) == 0, i);
+ }
+ test_end();
+}
+
+/* Test entry point: runs the byte-limited tests followed by the code point
+   limited tests. */
+void test_str_sanitize(void)
+{
+ test_str_sanitize_max_bytes();
+ test_str_sanitize_max_codepoints();
+}