Added message header (RFC 2047) encoder.

author Timo Sirainen <tss@iki.fi>

Wed, 28 Oct 2009 02:44:39 +0000 (22:44 -0400)

committer Timo Sirainen <tss@iki.fi>

Wed, 28 Oct 2009 02:44:39 +0000 (22:44 -0400)
author Timo Sirainen <tss@iki.fi>
Wed, 28 Oct 2009 02:44:39 +0000 (22:44 -0400)
committer Timo Sirainen <tss@iki.fi>
Wed, 28 Oct 2009 02:44:39 +0000 (22:44 -0400)
diff --git a/src/lib-mail/Makefile.am b/src/lib-mail/Makefile.am

index 4db24a4731b94008c74d68f86ec9b0b7c1a4ad01..a5513be2be800773b0bea6e45f073ea7e2d9431f 100644 (file)
--- a/src/lib-mail/Makefile.am
+++ b/src/lib-mail/Makefile.am
@@ -13,6 +13,7 @@ libmail_la_SOURCES = \
         message-date.c \
         message-decoder.c \
         message-header-decode.c \
+       message-header-encode.c \
         message-header-parser.c \
         message-id.c \
         message-parser.c \
@@ -33,6 +34,7 @@ headers = \
         message-date.h \
         message-decoder.h \
         message-header-decode.h \
+       message-header-encode.h \
         message-header-parser.h \
         message-id.h \
         message-parser.h \
@@ -59,6 +61,7 @@ test_programs = \
         test-message-date \
         test-message-decoder \
         test-message-header-decode \
+       test-message-header-encode \
         test-message-header-parser \
         test-message-id \
         test-message-parser \
@@ -99,6 +102,10 @@ test_message_header_decode_SOURCES = test-message-header-decode.c
  test_message_header_decode_LDADD = message-header-decode.lo quoted-printable.lo $(test_libs)
  test_message_header_decode_DEPENDENCIES = message-header-decode.lo quoted-printable.lo $(test_libs)
  
+test_message_header_encode_SOURCES = test-message-header-encode.c
+test_message_header_encode_LDADD = message-header-encode.lo $(test_libs)
+test_message_header_encode_DEPENDENCIES = message-header-encode.lo $(test_libs)
+
  test_message_header_parser_SOURCES = test-message-header-parser.c
  test_message_header_parser_LDADD = message-header-parser.lo $(test_libs)
  test_message_header_parser_DEPENDENCIES = message-header-parser.lo $(test_libs)
diff --git a/src/lib-mail/message-header-encode.c b/src/lib-mail/message-header-encode.c

new file mode 100644 (file)

index 0000000..0b4cb50
--- /dev/null
+++ b/src/lib-mail/message-header-encode.c
@@ -0,0 +1,173 @@
+/* Copyright (c) 2009 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "str.h"
+#include "base64.h"
+#include "message-header-encode.h"
+
+/* Number of bytes that the encoded-word framing ("=?utf-8?q?" in front and
+   "?=" behind) adds around the encoded text. The "=?utf-8?b?" framing used
+   for base64 has the same length. */
+#define MIME_WRAPPER_LEN (strlen("=?utf-8?q?""?="))
+/* Line length that the encoders try to stay within when folding output. */
+#define MIME_MAX_LINE_LEN 76
+
+/* Non-zero if c is a space, a tab or a newline. */
+#define IS_LWSP(c) \
+       ((c) == ' ' || (c) == '\t' || (c) == '\n')
+
+/* Returns TRUE if the byte at input[i] forces encoding: it is either a
+   non-ASCII byte, or it starts a "=?" sequence at the beginning of the input
+   or right after whitespace. The input has to be NUL-terminated, because
+   input[i+1] is examined whenever input[i] is '='. */
+static bool input_idx_need_encoding(const unsigned char *input, unsigned int i)
+{
+       const unsigned char c = input[i];
+
+       if (c >= 0x80)
+               return TRUE;
+       if (c != '=' || input[i+1] != '?')
+               return FALSE;
+       /* "=?" matters only when it begins a word */
+       return i == 0 || IS_LWSP(input[i-1]);
+}
+
+/* Returns the number of bytes in str that follow its last '\n', or the whole
+   length of str when it contains no '\n'. */
+static unsigned int str_last_line_len(string_t *str)
+{
+       const unsigned char *data = str_data(str);
+       unsigned int total = str_len(str);
+       unsigned int count;
+
+       /* count bytes backwards from the end until a newline is met */
+       for (count = 0; count < total; count++) {
+               if (data[total - count - 1] == '\n')
+                       break;
+       }
+       return count;
+}
+
+/* "Q" encode input[0..len-1] and append it to output as one or more
+   "=?utf-8?q?...?=" encoded-words.
+
+   Spaces are written as '_'. The bytes '=', '?' and '_', control bytes
+   (below 32) and bytes with the high bit set are written as "=XX" with two
+   uppercase hex digits. Everything else is copied as is.
+
+   The length of the last line already in output is taken into account.
+   Whenever the current line has room for fewer than three more characters,
+   the encoded-word is closed and a new one is started on a "\n\t"
+   continuation line. A well-formed multibyte UTF-8 character is never split
+   between two encoded-words. */
+void message_header_encode_q(const unsigned char *input, unsigned int len,
+                            string_t *output)
+{
+       unsigned int i, back, line_len, line_len_left;
+
+       line_len = str_last_line_len(output);
+       if (line_len >= MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 3) {
+               /* not even a single "=XX" fits on the current line */
+               str_append(output, "\n\t");
+               line_len = 1;
+       }
+
+       str_append(output, "=?utf-8?q?");
+       line_len_left = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - line_len;
+       for (i = 0; i < len; i++) {
+               if (line_len_left < 3) {
+                       /* if we're not at the beginning of a character,
+                          go backwards until we are. A UTF-8 character has
+                          at most 3 continuation bytes, so never look back
+                          further than that and never past the input's
+                          beginning. */
+                       back = 0;
+                       while (back < 3 && back < i &&
+                              (input[i-back] & 0xc0) == 0x80)
+                               back++;
+                       if ((input[i-back] & 0xc0) != 0xc0) {
+                               /* either we already are at a character
+                                  boundary, or the input isn't valid UTF-8
+                                  (no lead byte found). split right here. */
+                               back = 0;
+                       }
+                       /* the lead byte and the continuation bytes after it
+                          all have the high bit set, so each one of them
+                          was written as a 3 character "=XX" */
+                       str_truncate(output, str_len(output) - back*3);
+                       i -= back;
+
+                       str_append(output, "?=\n\t=?utf-8?q?");
+                       line_len_left = MIME_MAX_LINE_LEN -
+                               MIME_WRAPPER_LEN - 1;
+               }
+               /* line_len_left >= 3 here, so the subtractions below can't
+                  wrap around */
+               switch (input[i]) {
+               case ' ':
+                       str_append_c(output, '_');
+                       break;
+               case '=':
+               case '?':
+               case '_':
+                       /* "=XX" takes three characters, not one */
+                       line_len_left -= 2;
+                       str_printfa(output, "=%02X", input[i]);
+                       break;
+               default:
+                       if (input[i] < 32 || (input[i] & 0x80) != 0) {
+                               line_len_left -= 2;
+                               /* %02X: bytes below 0x10 need a leading
+                                  zero, not a space */
+                               str_printfa(output, "=%02X", input[i]);
+                       } else {
+                               str_append_c(output, input[i]);
+                       }
+                       break;
+               }
+               line_len_left--;
+       }
+       str_append(output, "?=");
+}
+
+/* "B" (base64) encode input[0..len-1] and append it to output as one or more
+   "=?utf-8?b?...?=" encoded-words.
+
+   The length of the last line already in output is taken into account. Each
+   encoded-word gets as many input bytes as fit on its line, and the following
+   encoded-word is started on a "\n\t" continuation line. A well-formed
+   multibyte UTF-8 character is never split between two encoded-words.
+   Nothing is appended when len is 0. */
+void message_header_encode_b(const unsigned char *input, unsigned int len,
+                            string_t *output)
+{
+       unsigned int line_len, line_len_left, max, split;
+
+       line_len = str_last_line_len(output);
+       if (line_len >= MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN) {
+               /* no room for any encoded text on the current line */
+               str_append(output, "\n\t");
+               line_len = 1;
+       }
+
+       for (;;) {
+               line_len_left = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - line_len;
+               max = MAX_BASE64_DECODED_SIZE(line_len_left);
+               /* shrink max until its base64 encoding fits on the line */
+               do {
+                       max--;
+                       if (max >= len) {
+                               /* all of the remaining input. there's no
+                                  split point to look for, and input[len]
+                                  must not be read. */
+                               max = len;
+                       } else {
+                               /* all of it doesn't fit. find a character where we
+                                  can split it from. A UTF-8 character has at
+                                  most 3 continuation bytes, so don't go
+                                  back further than that. */
+                               split = max;
+                               while (split > 0 && max - split < 3 &&
+                                      (input[split] & 0xc0) == 0x80)
+                                       split--;
+                               /* if no character boundary was found, the
+                                  input isn't valid UTF-8. keep max as it
+                                  is, so that we're guaranteed to advance. */
+                               if ((input[split] & 0xc0) != 0x80)
+                                       max = split;
+                       }
+               } while (MAX_BASE64_ENCODED_SIZE(max) > line_len_left &&
+                        max > 0);
+
+               if (max > 0) {
+                       str_append(output, "=?utf-8?b?");
+                       base64_encode(input, max, output);
+                       str_append(output, "?=");
+               }
+
+               input += max;
+               len -= max;
+
+               if (len == 0)
+                       break;
+
+               /* continue on a new line. the tab counts as one character. */
+               str_append(output, "\n\t");
+               line_len = 1;
+       }
+}
+
+/* Append the NUL-terminated _input to output, encoding it where needed.
+
+   If no byte needs encoding, the input is appended unchanged. Otherwise the
+   span from the beginning of the first word needing encoding to the end of
+   the last such word is encoded with message_header_encode_q() or
+   message_header_encode_b(), and the text before and after that span is
+   appended unchanged. */
+void message_header_encode(const char *_input, string_t *output)
+{
+       const unsigned char *input = (const unsigned char *)_input;
+       unsigned int pos, span_start = 0, span_end = 0;
+       unsigned int enc_chars = 0, span_len, b_size, q_size;
+
+       /* a single scan: remember the first and the last byte that need
+          encoding and count how many there are */
+       for (pos = 0; input[pos] != '\0'; pos++) {
+               if (!input_idx_need_encoding(input, pos))
+                       continue;
+               if (enc_chars == 0)
+                       span_start = pos;
+               span_end = pos + 1;
+               enc_chars++;
+       }
+       if (enc_chars == 0) {
+               /* nothing to encode */
+               str_append(output, _input);
+               return;
+       }
+
+       /* widen the span to cover whole words */
+       while (span_start > 0 && !IS_LWSP(input[span_start-1]))
+               span_start--;
+       while (input[span_end] != '\0' && !IS_LWSP(input[span_end]))
+               span_end++;
+       span_len = span_end - span_start;
+
+       /* estimate the size of both encodings. Q is preferred as long as
+          it's not too much larger than B. */
+       b_size = MAX_BASE64_ENCODED_SIZE(span_len);
+       q_size = span_len + enc_chars*3;
+
+       str_append_n(output, input, span_start);
+       if (q_size*2/3 <= b_size)
+               message_header_encode_q(input + span_start, span_len, output);
+       else
+               message_header_encode_b(input + span_start, span_len, output);
+       str_append(output, _input + span_end);
+}
diff --git a/src/lib-mail/message-header-encode.h b/src/lib-mail/message-header-encode.h

new file mode 100644 (file)

index 0000000..13ef7e1
--- /dev/null
+++ b/src/lib-mail/message-header-encode.h
@@ -0,0 +1,15 @@
+#ifndef MESSAGE_HEADER_ENCODE_H
+#define MESSAGE_HEADER_ENCODE_H
+
+/* Append the NUL-terminated UTF-8 input to output, encoding it wherever
+   necessary. If nothing in the input needs encoding (no non-ASCII bytes and
+   no "=?" at the beginning of a word), it's appended unchanged. Otherwise
+   everything from the first word needing encoding to the last one is encoded
+   with message_header_encode_q() or message_header_encode_b(). "Q" is
+   preferred unless it's estimated to be much larger than "B". */
+void message_header_encode(const char *input, string_t *output);
+
+/* Encode the whole UTF-8 input (len bytes) using "Q" or "B" encoding and
+   append the result to output as "=?utf-8?q?...?=" / "=?utf-8?b?...?="
+   encoded-words. The output is split into multiple lines, separated by
+   "\n\t", if necessary. The first line length is looked up from the output
+   string, i.e. whatever follows the last newline already in output is counted
+   as being on the same line. */
+void message_header_encode_q(const unsigned char *input, unsigned int len,
+                            string_t *output);
+void message_header_encode_b(const unsigned char *input, unsigned int len,
+                            string_t *output);
+
+#endif
diff --git a/src/lib-mail/test-message-header-encode.c b/src/lib-mail/test-message-header-encode.c

new file mode 100644 (file)

index 0000000..d1bed3c
--- /dev/null
+++ b/src/lib-mail/test-message-header-encode.c
@@ -0,0 +1,193 @@
+/* Copyright (c) 2009 Dovecot authors, see the included COPYING file */
+
+#include "lib.h"
+#include "base64.h"
+#include "buffer.h"
+#include "str.h"
+#include "message-header-encode.h"
+#include "test-common.h"
+
+/* Checks the Q-encoded output that begins at str[i]. It must be a sequence of
+   "=?utf-8?q?...?=" words separated by "\n\t", optionally preceded by one
+   "\n\t". The first word begins with 'a' if starts_with_a is set, and
+   otherwise the words contain nothing but "=C3=A4". No line may be longer
+   than 76 characters, measured from str[i] for the first line and from the
+   tab for continuation lines. Returns TRUE only if exactly 40 "=C3=A4"
+   sequences were seen. */
+static bool verify_q(const char *str, unsigned int i, bool starts_with_a)
+{
+       const char *pos = str + i;
+       const char *line_start = pos;
+       unsigned int char_count = 0;
+
+       /* the encoder may have folded the line before the first word */
+       if (strncmp(pos, "\n\t", 2) == 0) {
+               pos += 2;
+               line_start = pos - 1;
+       }
+
+       while (strncmp(pos, "=?utf-8?q?", 10) == 0) {
+               pos += 10;
+
+               if (starts_with_a) {
+                       if (*pos != 'a')
+                               return FALSE;
+                       pos++;
+                       starts_with_a = FALSE;
+               }
+               for (; strncmp(pos, "?=", 2) != 0; pos += 6) {
+                       if (strncmp(pos, "=C3=A4", 6) != 0)
+                               return FALSE;
+                       char_count++;
+               }
+               pos += 2;
+               if (pos - line_start > 76)
+                       return FALSE;
+
+               if (*pos == '\0')
+                       return char_count == 40;
+               if (strncmp(pos, "\n\t", 2) != 0)
+                       return FALSE;
+               pos += 2;
+               line_start = pos - 1;
+       }
+       /* an encoded-word was expected here */
+       return FALSE;
+}
+
+/* Q-encodes 'a' followed by 40 'ä' characters, with and without the leading
+   'a', after existing output of every length from 0 to 79 bytes, and checks
+   each result with verify_q(). */
+static void test_message_header_encode_q(void)
+{
+       string_t *text = t_str_new(100);
+       string_t *str = t_str_new(512);
+       unsigned int i, skip, n;
+
+       test_begin("message header encode q");
+
+       str_append_c(text, 'a');
+       for (n = 0; n < 40; n++)
+               str_append(text, "ä");
+
+       for (i = 0; i < 80; i++) {
+               for (skip = 0; skip < 2; skip++) {
+                       /* existing output: i-1 'X' characters and a space,
+                          or nothing at all when i is 0 */
+                       str_truncate(str, 0);
+                       if (i != 0) {
+                               for (n = 1; n < i; n++)
+                                       str_append_c(str, 'X');
+                               str_append_c(str, ' ');
+                       }
+
+                       message_header_encode_q(str_data(text) + skip,
+                                               str_len(text) - skip, str);
+                       test_assert(verify_q(str_c(str), i, !skip));
+               }
+       }
+       test_end();
+}
+
+/* Checks the B-encoded output that begins at str[i]. It must be a sequence of
+   "=?utf-8?b?...?=" words separated by "\n\t", optionally preceded by one
+   "\n\t". Each word's base64 payload is decoded and must consist of "\xc3\xa4"
+   byte pairs only, except that the first word must begin with 'a' when
+   starts_with_a is set. No line may be longer than 76 characters, measured
+   from str[i] for the first line and from the tab for continuation lines.
+   Returns TRUE only if exactly 40 "\xc3\xa4" pairs were seen. */
+static bool verify_b(const char *str, unsigned int i, bool starts_with_a)
+{
+       unsigned int line_start = i, start, j, char_count = 0;
+       char bufdata[1000];
+       buffer_t buf;
+
+       /* NOTE(review): presumably this makes buf store its data in bufdata,
+          since the decoded bytes are read from bufdata below - confirm
+          against the buffer API */
+       buffer_create_data(&buf, bufdata, sizeof(bufdata));
+       /* the encoder may have folded the line before the first word */
+       if (strncmp(str+i, "\n\t", 2) == 0) {
+               i += 2;
+               line_start = i - 1;
+       }
+
+       for (;;) {
+               if (strncmp(str+i, "=?utf-8?b?", 10) != 0)
+                       return FALSE;
+               i += 10;
+
+               /* the payload extends up to the next '?' */
+               start = i;
+               for (; str[i] != '?'; i++) {
+                       if (str[i] == '\0')
+                               return FALSE;
+               }
+               /* decode every word into an emptied buffer */
+               buffer_set_used_size(&buf, 0);
+               if (base64_decode(str+start, i-start, NULL, &buf) < 0)
+                       return FALSE;
+               i++;
+
+               if (!starts_with_a)
+                       j = 0;
+               else {
+                       /* only the very first word begins with 'a' */
+                       if (bufdata[0] != 'a')
+                               return FALSE;
+                       starts_with_a = FALSE;
+                       j = 1;
+               }
+               for (; j < buf.used; j += 2) {
+                       if (bufdata[j] != '\xc3' || bufdata[j+1] != '\xa4')
+                               return FALSE;
+                       char_count++;
+               }
+               /* j overshoots buf.used if the word ended in the middle of
+                  a two byte character */
+               if (j != buf.used)
+                       return FALSE;
+
+               /* the '?' was skipped above, so this is the closing "?=" */
+               if (str[i++] != '=')
+                       return FALSE;
+
+               if (i - line_start > 76)
+                       return FALSE;
+
+               if (str[i] == '\0')
+                       break;
+               if (strncmp(str+i, "\n\t", 2) != 0)
+                       return FALSE;
+               i += 2;
+               /* the tab is counted as part of the continuation line */
+               line_start = i - 1;
+       }
+       return char_count == 40;
+}
+
+/* B-encodes 'a' followed by 40 'ä' characters, with and without the leading
+   'a', after existing output of every length from 0 to 79 bytes, and checks
+   each result with verify_b(). */
+static void test_message_header_encode_b(void)
+{
+       string_t *text = t_str_new(100);
+       string_t *str = t_str_new(512);
+       unsigned int i, skip, n;
+
+       test_begin("message header encode b");
+
+       str_append_c(text, 'a');
+       for (n = 0; n < 40; n++)
+               str_append(text, "ä");
+
+       for (i = 0; i < 80; i++) {
+               for (skip = 0; skip < 2; skip++) {
+                       /* existing output: i-1 'X' characters and a space,
+                          or nothing at all when i is 0 */
+                       str_truncate(str, 0);
+                       if (i != 0) {
+                               for (n = 1; n < i; n++)
+                                       str_append_c(str, 'X');
+                               str_append_c(str, ' ');
+                       }
+
+                       message_header_encode_b(str_data(text) + skip,
+                                               str_len(text) - skip, str);
+                       test_assert(verify_b(str_c(str), i, !skip));
+               }
+       }
+       test_end();
+}
+
+/* Runs message_header_encode() on fixed inputs and compares the results to
+   the expected outputs. */
+static void test_message_header_encode(void)
+{
+       /* pairs of strings: the input followed by its expected output */
+       const char *data[] = {
+               "a b", "a b",
+               "a bcäde f", "a =?utf-8?q?bc=C3=A4de?= f",
+               "a ää ä b", "a =?utf-8?b?w6TDpCDDpA==?= b",
+               "ä a ä", "=?utf-8?q?=C3=A4_a_=C3=A4?=",
+               "ää a ä", "=?utf-8?b?w6TDpCBhIMOk?=",
+       };
+       string_t *str = t_str_new(128);
+       unsigned int i = 0;
+
+       test_begin("message header encode");
+       while (i < N_ELEMENTS(data)) {
+               str_truncate(str, 0);
+               message_header_encode(data[i], str);
+               test_assert(strcmp(str_c(str), data[i+1]) == 0);
+               i += 2;
+       }
+       test_end();
+}
+
+/* Hands all the tests in this file to test_run() and returns its result. */
+int main(void)
+{
+       /* NULL-terminated list of the test functions */
+       static void (*all_tests[])(void) = {
+               test_message_header_encode_q,
+               test_message_header_encode_b,
+               test_message_header_encode,
+               NULL
+       };
+
+       return test_run(all_tests);
+}
author	Timo Sirainen <tss@iki.fi>
	Wed, 28 Oct 2009 02:44:39 +0000 (22:44 -0400)
committer	Timo Sirainen <tss@iki.fi>
	Wed, 28 Oct 2009 02:44:39 +0000 (22:44 -0400)
src/lib-mail/Makefile.am		patch \| blob \| blame \| history
src/lib-mail/message-header-encode.c	[new file with mode: 0644]	patch \| blob
src/lib-mail/message-header-encode.h	[new file with mode: 0644]	patch \| blob
src/lib-mail/test-message-header-encode.c	[new file with mode: 0644]	patch \| blob