git.ipfire.org Git - thirdparty/dovecot/core.git/commitdiff
lib-mail: message_header_encode() now preserves folding whitespace
author Timo Sirainen <tss@iki.fi>
Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)
committer Timo Sirainen <tss@iki.fi>
Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)
This function could still use some cleaning up, but good enough for now..

The function should also try to minimize the encoded words instead of
encoding everything between the first and the last word that needs encoding.

src/lib-mail/message-header-encode.c
src/lib-mail/message-header-encode.h
src/lib-mail/test-message-header-encode.c

index 1dabffb374a90fbec2b41e459d53fbf2f1e2eb3c..b0c511d2f7eedf4392786e698dd653ec081417b0 100644 (file)
 static bool input_idx_need_encoding(const unsigned char *input,
                                     unsigned int i, unsigned int len)
 {
-       /* 8bit chars */
-       if ((input[i] & 0x80) != 0)
-               return TRUE;
-       /* control chars */
-       if (input[i] < 32)
-               return TRUE;
-
-       /* <LWSP>=? */
-       if (input[i] == '=' && i+1 < len && input[i+1] == '?' &&
-           (i == 0 || IS_LWSP(input[i-1])))
-               return TRUE;
+       switch (input[i]) {
+       case '\r':
+               if (i+1 == len || input[i+1] != '\n')
+                       return TRUE;
+               i++;
+               /* CRLF pair - fall through and verify the LF as well */
+       case '\n':
+               if (i+1 == len) {
+                       /* trailing LF - flag it so the caller can drop it */
+                       return TRUE;
+               }
+               if (input[i+1] != '\t' && input[i+1] != ' ') {
+                       /* LF not followed by whitespace - flag it so the
+                          caller can add the missing whitespace */
+                       return TRUE;
+               }
+               break;
+       case '\t':
+               /* TAB doesn't need to be encoded */
+               break;
+       case '=':
+               /* <LWSP>=? - we need to check backwards a bit to see if
+                  there is LWSP (note that we don't want to return TRUE for
+                  the LWSP itself yet, so we need to do this backwards
+                  check) */
+               if ((i == 0 || IS_LWSP(input[i-1])) && i+2 <= len &&
+                   memcmp(input + i, "=?", 2) == 0)
+                       return TRUE;
+               break;
+       default:
+               /* 8bit chars */
+               if ((input[i] & 0x80) != 0)
+                       return TRUE;
+               /* control chars */
+               if (input[i] < 32)
+                       return TRUE;
+               break;
+       }
        return FALSE;
 }
 
@@ -45,9 +72,9 @@ void message_header_encode_q(const unsigned char *input, unsigned int len,
        str_append(output, "=?utf-8?q?");
        for (i = 0; i < len; i++) {
                if (line_len_left < 3) {
-                       /* if we're not at the beginning of a character,
+                       /* if we're not at the beginning of an UTF8 character,
                           go backwards until we are */
-                       while ((input[i] & 0xc0) == 0x80) {
+                       while (i > 0 && (input[i] & 0xc0) == 0x80) {
                                str_truncate(output, str_len(output)-3);
                                i--;
                        }
@@ -131,9 +158,11 @@ void message_header_encode(const char *input, string_t *output)
 void message_header_encode_data(const unsigned char *input, unsigned int len,
                                string_t *output)
 {
-       unsigned int i, first_idx, last_idx;
+       unsigned int i, j, first_line_len, cur_line_len, last_idx;
        unsigned int enc_chars, enc_len, base64_len, q_len;
-       bool use_q;
+       const unsigned char *next_line_input;
+       unsigned int next_line_len;
+       bool use_q, cr;
 
        /* find the first word that needs encoding */
        for (i = 0; i < len; i++) {
@@ -145,13 +174,36 @@ void message_header_encode_data(const unsigned char *input, unsigned int len,
                str_append_data(output, input, len);
                return;
        }
-       first_idx = i;
-       while (first_idx > 0 && !IS_LWSP(input[first_idx-1]))
-               first_idx--;
+       /* go back to the beginning of the word so it is fully encoded */
+       if (input[i] != '\r' && input[i] != '\n') {
+               while (i > 0 && !IS_LWSP(input[i-1]))
+                       i--;
+       }
+
+       /* write the prefix */
+       str_append_data(output, input, i);
+       first_line_len = j = i;
+       while (j > 0 && input[j-1] != '\n') j--;
+       if (j != 0)
+               first_line_len = j;
+
+       input += i;
+       len -= i;
+
+       /* we'll encode data only up to the next LF, the rest is handled
+          recursively. */
+       next_line_input = memchr(input, '\n', len);
+       if (next_line_input != NULL) {
+               if (next_line_input != input && next_line_input[-1] == '\r')
+                       next_line_input--;
+               cur_line_len = next_line_input - input;
+               next_line_len = len - cur_line_len;
+               len = cur_line_len;
+       }
 
        /* find the last word that needs encoding */
-       last_idx = ++i; enc_chars = 1;
-       for (; i < len; i++) {
+       last_idx = 0; enc_chars = 0;
+       for (i = 0; i < len; i++) {
                if (input_idx_need_encoding(input, i, len)) {
                        last_idx = i + 1;
                        enc_chars++;
@@ -162,19 +214,43 @@ void message_header_encode_data(const unsigned char *input, unsigned int len,
 
        /* figure out if we should use Q or B encoding. Prefer Q if it's not
           too much larger. */
-       enc_len = last_idx - first_idx;
+       enc_len = last_idx;
        base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
        q_len = enc_len + enc_chars*3;
        use_q = q_len*2/3 <= base64_len;
 
        /* and do it */
-       str_append_data(output, input, first_idx);
-       if (use_q) {
-               message_header_encode_q(input + first_idx, enc_len,
-                                       output, first_idx);
-       } else {
-               message_header_encode_b(input + first_idx, enc_len,
-                                       output, first_idx);
-       }
+       if (enc_len == 0)
+               ;
+       else if (use_q)
+               message_header_encode_q(input, enc_len, output, first_line_len);
+       else
+               message_header_encode_b(input, enc_len, output, first_line_len);
        str_append_data(output, input + last_idx, len - last_idx);
+
+       if (next_line_input != NULL) {
+               /* we're at [CR]LF */
+               i = 0;
+               if (next_line_input[0] == '\r') {
+                       cr = TRUE;
+                       i++;
+               }
+               i_assert(next_line_input[i] == '\n');
+               if (++i == next_line_len)
+                       return; /* drop trailing [CR]LF */
+
+               if (cr)
+                       str_append_c(output, '\r');
+               str_append_c(output, '\n');
+
+               if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
+                       str_append_c(output, next_line_input[i]);
+                       i++;
+               } else {
+                       /* make it valid folding whitespace by adding a TAB */
+                       str_append_c(output, '\t');
+               }
+               message_header_encode_data(next_line_input+i, next_line_len-i,
+                                          output);
+       }
 }
index ba7c54988aeac2aea84c4ff19b2fc220cee3cdb7..ae5aa5b1bdc5f755a354ddd7db741fe7bdb0ec39 100644 (file)
@@ -2,14 +2,17 @@
 #define MESSAGE_HEADER_ENCODE_H
 
 /* Encode UTF-8 input into output wherever necessary using either Q or B
-   encoding depending on which takes less space (approximately). */
+   encoding depending on which takes less space (approximately). Folding
+   whitespace is preserved. A bare [CR]LF gets a TAB added after it to make
+   it valid folding whitespace. A trailing [CR]LF is dropped. */
 void message_header_encode(const char *input, string_t *output);
 void message_header_encode_data(const unsigned char *input, unsigned int len,
                                 string_t *output);
 
 /* Encode the whole UTF-8 input using "Q" or "B" encoding into output.
    The output is split into multiple lines if necessary (max 76 chars/line).
-   The first line's length is given as parameter. */
+   The first line's length is given as parameter. All the control characters
+   are encoded, including NUL, CR and LF. */
 void message_header_encode_q(const unsigned char *input, unsigned int len,
                              string_t *output, unsigned int first_line_len);
 void message_header_encode_b(const unsigned char *input, unsigned int len,
index 13df05adf7c86739205792aa0c2b2e2bac026e1b..895ad772610a512237b04f1fa86c16f83f797baf 100644 (file)
@@ -170,8 +170,27 @@ static void test_message_header_encode(void)
                "a ää ä b", "a =?utf-8?b?w6TDpCDDpA==?= b",
                "ä a ä", "=?utf-8?q?=C3=A4_a_=C3=A4?=",
                "ää a ä", "=?utf-8?b?w6TDpCBhIMOk?=",
+               "=", "=",
+               "?", "?",
+               "a=?", "a=?",
+               "=?", "=?utf-8?q?=3D=3F?=",
+               "=?x", "=?utf-8?q?=3D=3Fx?=",
+               "a\n=?", "a\n\t=?utf-8?q?=3D=3F?=",
+               "a\t=?", "a\t=?utf-8?q?=3D=3F?=",
+               "a =?", "a =?utf-8?q?=3D=3F?=",
                "foo\001bar", "=?utf-8?q?foo=01bar?=",
-               "\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?="
+               "\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?=",
+
+               "a\r\n b", "a\r\n b",
+               "a\r\n\tb", "a\r\n\tb",
+               "a\r\nb", "a\r\n\tb",
+               "a\n b", "a\n b",
+               "a\n  b", "a\n  b",
+               "a\nb", "a\n\tb",
+               "a\r\n", "a",
+               "a\n", "a",
+               "foo\n \001bar", "foo\n =?utf-8?q?=01bar?=",
+               "foo\001\n bar", "=?utf-8?q?foo=01?=\n bar"
        };                          
        string_t *str = t_str_new(128);
        unsigned int i;