lib-mail: message_header_encode() now preserves folding whitespace

author Timo Sirainen <tss@iki.fi>

Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)

committer Timo Sirainen <tss@iki.fi>

Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)
author Timo Sirainen <tss@iki.fi>
Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)
committer Timo Sirainen <tss@iki.fi>
Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)
diff --git a/src/lib-mail/message-header-encode.c b/src/lib-mail/message-header-encode.c

index 1dabffb374a90fbec2b41e459d53fbf2f1e2eb3c..b0c511d2f7eedf4392786e698dd653ec081417b0 100644 (file)
--- a/src/lib-mail/message-header-encode.c
+++ b/src/lib-mail/message-header-encode.c
@@ -14,17 +14,44 @@
  static bool input_idx_need_encoding(const unsigned char *input,
                                     unsigned int i, unsigned int len)
  {
-       /* 8bit chars */
-       if ((input[i] & 0x80) != 0)
-               return TRUE;
-       /* control chars */
-       if (input[i] < 32)
-               return TRUE;
-
-       /* <LWSP>=? */
-       if (input[i] == '=' && i+1 < len && input[i+1] == '?' &&
-           (i == 0 || IS_LWSP(input[i-1])))
-               return TRUE;
+       switch (input[i]) {
+       case '\r':
+               if (i+1 == len || input[i+1] != '\n')
+                       return TRUE;
+               i++;
+               /* fall through and verify the LF as well */
+       case '\n':
+               if (i+1 == len) {
+                       /* trailing LF - we need to drop it */
+                       return TRUE;
+               }
+               if (input[i+1] != '\t' && input[i+1] != ' ') {
+                       /* LF not followed by whitespace - we need to
+                          add the whitespace */
+                       return TRUE;
+               }
+               break;
+       case '\t':
+               /* TAB doesn't need to be encoded */
+               break;
+       case '=':
+               /* <LWSP>=? - we need to check backwards a bit to see if
+                  there is LWSP (note that we don't want to return TRUE for
+                  the LWSP itself yet, so we need to do this backwards
+                  check) */
+               if ((i == 0 || IS_LWSP(input[i-1])) && i+2 <= len &&
+                   memcmp(input + i, "=?", 2) == 0)
+                       return TRUE;
+               break;
+       default:
+               /* 8bit chars */
+               if ((input[i] & 0x80) != 0)
+                       return TRUE;
+               /* control chars */
+               if (input[i] < 32)
+                       return TRUE;
+               break;
+       }
         return FALSE;
  }
  
@@ -45,9 +72,9 @@ void message_header_encode_q(const unsigned char *input, unsigned int len,
         str_append(output, "=?utf-8?q?");
         for (i = 0; i < len; i++) {
                 if (line_len_left < 3) {
-                       /* if we're not at the beginning of a character,
+                       /* if we're not at the beginning of a UTF-8 character,
                            go backwards until we are */
-                       while ((input[i] & 0xc0) == 0x80) {
+                       while (i > 0 && (input[i] & 0xc0) == 0x80) {
                                 str_truncate(output, str_len(output)-3);
                                 i--;
                         }
@@ -131,9 +158,11 @@ void message_header_encode(const char *input, string_t *output)
  void message_header_encode_data(const unsigned char *input, unsigned int len,
                                 string_t *output)
  {
-       unsigned int i, first_idx, last_idx;
+       unsigned int i, j, first_line_len, cur_line_len, last_idx;
         unsigned int enc_chars, enc_len, base64_len, q_len;
-       bool use_q;
+       const unsigned char *next_line_input;
+       unsigned int next_line_len;
+       bool use_q, cr;
  
         /* find the first word that needs encoding */
         for (i = 0; i < len; i++) {
@@ -145,13 +174,36 @@ void message_header_encode_data(const unsigned char *input, unsigned int len,
                 str_append_data(output, input, len);
                 return;
         }
-       first_idx = i;
-       while (first_idx > 0 && !IS_LWSP(input[first_idx-1]))
-               first_idx--;
+       /* go back to the beginning of the word so it is fully encoded */
+       if (input[i] != '\r' && input[i] != '\n') {
+               while (i > 0 && !IS_LWSP(input[i-1]))
+                       i--;
+       }
+
+       /* write the prefix */
+       str_append_data(output, input, i);
+       first_line_len = j = i;
+       while (j > 0 && input[j-1] != '\n') j--;
+       if (j != 0)
+               first_line_len = j;
+
+       input += i;
+       len -= i;
+
+       /* we'll encode data only up to the next LF, the rest is handled
+          recursively. */
+       next_line_input = memchr(input, '\n', len);
+       if (next_line_input != NULL) {
+               if (next_line_input != input && next_line_input[-1] == '\r')
+                       next_line_input--;
+               cur_line_len = next_line_input - input;
+               next_line_len = len - cur_line_len;
+               len = cur_line_len;
+       }
  
         /* find the last word that needs encoding */
-       last_idx = ++i; enc_chars = 1;
-       for (; i < len; i++) {
+       last_idx = 0; enc_chars = 0;
+       for (i = 0; i < len; i++) {
                 if (input_idx_need_encoding(input, i, len)) {
                         last_idx = i + 1;
                         enc_chars++;
@@ -162,19 +214,45 @@
  
         /* figure out if we should use Q or B encoding. Prefer Q if it's not
            too much larger. */
-       enc_len = last_idx - first_idx;
+       enc_len = last_idx;
         base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
         q_len = enc_len + enc_chars*3;
         use_q = q_len*2/3 <= base64_len;
  
         /* and do it */
-       str_append_data(output, input, first_idx);
-       if (use_q) {
-               message_header_encode_q(input + first_idx, enc_len,
-                                       output, first_idx);
-       } else {
-               message_header_encode_b(input + first_idx, enc_len,
-                                       output, first_idx);
-       }
+       if (enc_len == 0)
+               ;
+       else if (use_q)
+               message_header_encode_q(input, enc_len, output, first_line_len);
+       else
+               message_header_encode_b(input, enc_len, output, first_line_len);
         str_append_data(output, input + last_idx, len - last_idx);
+
+       if (next_line_input != NULL) {
+               /* we're at [CR]LF; remember whether the CR was there */
+               i = 0;
+               if (next_line_input[0] == '\r') {
+                       cr = TRUE;
+                       i++;
+               } else {
+                       cr = FALSE;
+               }
+               i_assert(next_line_input[i] == '\n');
+               if (++i == next_line_len)
+                       return; /* drop trailing [CR]LF */
+
+               if (cr)
+                       str_append_c(output, '\r');
+               str_append_c(output, '\n');
+
+               if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
+                       str_append_c(output, next_line_input[i]);
+                       i++;
+               } else {
+                       /* make it valid folding whitespace by adding a TAB */
+                       str_append_c(output, '\t');
+               }
+               message_header_encode_data(next_line_input+i, next_line_len-i,
+                                          output);
+       }
  }
diff --git a/src/lib-mail/message-header-encode.h b/src/lib-mail/message-header-encode.h

index ba7c54988aeac2aea84c4ff19b2fc220cee3cdb7..ae5aa5b1bdc5f755a354ddd7db741fe7bdb0ec39 100644 (file)
--- a/src/lib-mail/message-header-encode.h
+++ b/src/lib-mail/message-header-encode.h
@@ -2,14 +2,17 @@
  #define MESSAGE_HEADER_ENCODE_H
  
  /* Encode UTF-8 input into output wherever necessary using either Q or B
-   encoding depending on which takes less space (approximately). */
+   encoding depending on which takes less space (approximately). Folding
+   whitespace is preserved. Bare [CR]LF is kept and followed by a TAB to make
+   it valid folding whitespace; a trailing [CR]LF is dropped. */
  void message_header_encode(const char *input, string_t *output);
  void message_header_encode_data(const unsigned char *input, unsigned int len,
                                 string_t *output);
  
  /* Encode the whole UTF-8 input using "Q" or "B" encoding into output.
     The output is split into multiple lines if necessary (max 76 chars/line).
-   The first line's length is given as parameter. */
+   The first line's length is given as parameter. All the control characters
+   are encoded, including NUL, CR and LF. */
  void message_header_encode_q(const unsigned char *input, unsigned int len,
                              string_t *output, unsigned int first_line_len);
  void message_header_encode_b(const unsigned char *input, unsigned int len,
diff --git a/src/lib-mail/test-message-header-encode.c b/src/lib-mail/test-message-header-encode.c

index 13df05adf7c86739205792aa0c2b2e2bac026e1b..895ad772610a512237b04f1fa86c16f83f797baf 100644 (file)
--- a/src/lib-mail/test-message-header-encode.c
+++ b/src/lib-mail/test-message-header-encode.c
@@ -170,8 +170,27 @@ static void test_message_header_encode(void)
                 "a ää ä b", "a =?utf-8?b?w6TDpCDDpA==?= b",
                 "ä a ä", "=?utf-8?q?=C3=A4_a_=C3=A4?=",
                 "ää a ä", "=?utf-8?b?w6TDpCBhIMOk?=",
+               "=", "=",
+               "?", "?",
+               "a=?", "a=?",
+               "=?", "=?utf-8?q?=3D=3F?=",
+               "=?x", "=?utf-8?q?=3D=3Fx?=",
+               "a\n=?", "a\n\t=?utf-8?q?=3D=3F?=",
+               "a\t=?", "a\t=?utf-8?q?=3D=3F?=",
+               "a =?", "a =?utf-8?q?=3D=3F?=",
                 "foo\001bar", "=?utf-8?q?foo=01bar?=",
-               "\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?="
+               "\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?=",
+
+               "a\r\n b", "a\r\n b",
+               "a\r\n\tb", "a\r\n\tb",
+               "a\r\nb", "a\r\n\tb",
+               "a\n b", "a\n b",
+               "a\n  b", "a\n  b",
+               "a\nb", "a\n\tb",
+               "a\r\n", "a",
+               "a\n", "a",
+               "foo\n \001bar", "foo\n =?utf-8?q?=01bar?=",
+               "foo\001\n bar", "=?utf-8?q?foo=01?=\n bar"
         };                          
         string_t *str = t_str_new(128);
         unsigned int i;
author	Timo Sirainen <tss@iki.fi>
	Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)
committer	Timo Sirainen <tss@iki.fi>
	Sun, 11 May 2014 19:15:08 +0000 (22:15 +0300)
src/lib-mail/message-header-encode.c		patch \| blob \| blame \| history
src/lib-mail/message-header-encode.h		patch \| blob \| blame \| history
src/lib-mail/test-message-header-encode.c		patch \| blob \| blame \| history