/* Decide whether the byte at input[i] (input holds len bytes) can be copied
   to the output as is. Returns TRUE when it cannot, FALSE otherwise.

   The removed lines flagged 8bit bytes, every control character below 32 and
   a "=?" that starts the input or follows LWSP. The added switch keeps those
   checks but lets TAB and well-formed line breaks through: CR must be
   followed by LF, and the LF must be followed by TAB or space and must not
   be the last byte. */
static bool input_idx_need_encoding(const unsigned char *input,
unsigned int i, unsigned int len)
{
- /* 8bit chars */
- if ((input[i] & 0x80) != 0)
- return TRUE;
- /* control chars */
- if (input[i] < 32)
- return TRUE;
-
- /* <LWSP>=? */
- if (input[i] == '=' && i+1 < len && input[i+1] == '?' &&
- (i == 0 || IS_LWSP(input[i-1])))
- return TRUE;
+ switch (input[i]) {
+ case '\r': /* CR at the end of input or not followed by LF is flagged */
+ if (i+1 == len || input[i+1] != '\n')
+ return TRUE;
+ i++;
+ /* i is a by-value parameter, so the caller's index is unaffected;
+ it now points at the LF. fall through and verify the LF as well */
+ case '\n':
+ if (i+1 == len) {
+ /* trailing LF - we need to drop it */
+ return TRUE;
+ }
+ if (input[i+1] != '\t' && input[i+1] != ' ') {
+ /* LF not followed by whitespace - we need to
+ add the whitespace */
+ return TRUE;
+ }
+ break;
+ case '\t':
+ /* TAB doesn't need to be encoded */
+ break;
+ case '=':
+ /* <LWSP>=? - we need to check backwards a bit to see if
+ there is LWSP (note that we don't want to return TRUE for
+ the LWSP itself yet, so we need to do this backwards
+ check). The i+2 <= len test keeps the 2-byte memcmp inside
+ the input. */
+ if ((i == 0 || IS_LWSP(input[i-1])) && i+2 <= len &&
+ memcmp(input + i, "=?", 2) == 0)
+ return TRUE;
+ break;
+ default:
+ /* 8bit chars */
+ if ((input[i] & 0x80) != 0)
+ return TRUE;
+ /* control chars (CR, LF and TAB have their own cases above) */
+ if (input[i] < 32)
+ return TRUE;
+ break;
+ }
 return FALSE;
}
str_append(output, "=?utf-8?q?");
for (i = 0; i < len; i++) {
if (line_len_left < 3) {
- /* if we're not at the beginning of a character,
+ /* if we're not at the beginning of an UTF8 character,
go backwards until we are */
- while ((input[i] & 0xc0) == 0x80) {
+ while (i > 0 && (input[i] & 0xc0) == 0x80) {
str_truncate(output, str_len(output)-3);
i--;
}
+ /* Append input (len bytes) to output, encoding the span from the first to
+ the last word that needs it with Q or B encoding. Only the data up to
+ the next [CR]LF is handled by this call; the line break is written
+ with folding whitespace (a TAB is added if the input has none) and the
+ remainder is handled by a recursive call. A trailing [CR]LF is
+ dropped. */
void message_header_encode_data(const unsigned char *input, unsigned int len,
string_t *output)
{
- unsigned int i, first_idx, last_idx;
+ unsigned int i, j, first_line_len, cur_line_len, last_idx;
unsigned int enc_chars, enc_len, base64_len, q_len;
- bool use_q;
+ const unsigned char *next_line_input;
+ unsigned int next_line_len;
+ bool use_q, cr;
/* find the first word that needs encoding */
for (i = 0; i < len; i++) {
str_append_data(output, input, len);
return;
}
- first_idx = i;
- while (first_idx > 0 && !IS_LWSP(input[first_idx-1]))
- first_idx--;
+ /* go back to the beginning of the word so it is fully encoded */
+ if (input[i] != '\r' && input[i] != '\n') {
+ while (i > 0 && !IS_LWSP(input[i-1]))
+ i--;
+ }
+
+ /* write the prefix */
+ str_append_data(output, input, i);
+ /* NOTE(review): after the loop j is the offset at which the last line
+ of the prefix starts, so the length of that line would be i - j.
+ Assigning j itself looks unintended - confirm against how
+ message_header_encode_q/_b use first_line_len. */
+ first_line_len = j = i;
+ while (j > 0 && input[j-1] != '\n') j--;
+ if (j != 0)
+ first_line_len = j;
+
+ input += i;
+ len -= i;
+
+ /* we'll encode data only up to the next LF, the rest is handled
+ recursively. */
+ next_line_input = memchr(input, '\n', len);
+ if (next_line_input != NULL) {
+ /* include a preceding CR in the line break */
+ if (next_line_input != input && next_line_input[-1] == '\r')
+ next_line_input--;
+ cur_line_len = next_line_input - input;
+ next_line_len = len - cur_line_len;
+ len = cur_line_len;
+ }
/* find the last word that needs encoding */
- last_idx = ++i; enc_chars = 1;
- for (; i < len; i++) {
+ last_idx = 0; enc_chars = 0;
+ for (i = 0; i < len; i++) {
if (input_idx_need_encoding(input, i, len)) {
last_idx = i + 1;
enc_chars++;
/* figure out if we should use Q or B encoding. Prefer Q if it's not
too much larger. */
- enc_len = last_idx - first_idx;
+ enc_len = last_idx;
base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
q_len = enc_len + enc_chars*3;
use_q = q_len*2/3 <= base64_len;
/* and do it */
- str_append_data(output, input, first_idx);
- if (use_q) {
- message_header_encode_q(input + first_idx, enc_len,
- output, first_idx);
- } else {
- message_header_encode_b(input + first_idx, enc_len,
- output, first_idx);
- }
+ /* enc_len is 0 when nothing on this line needs encoding */
+ if (enc_len == 0)
+ ;
+ else if (use_q)
+ message_header_encode_q(input, enc_len, output, first_line_len);
+ else
+ message_header_encode_b(input, enc_len, output, first_line_len);
str_append_data(output, input + last_idx, len - last_idx);
+
+ if (next_line_input != NULL) {
+ /* we're at [CR]LF */
+ i = 0;
+ if (next_line_input[0] == '\r') {
+ cr = TRUE;
+ i++;
+ } else {
+ /* bare LF: cr is read below, so it must not be left
+ uninitialized */
+ cr = FALSE;
+ }
+ i_assert(next_line_input[i] == '\n');
+ if (++i == next_line_len)
+ return; /* drop trailing [CR]LF */
+
+ if (cr)
+ str_append_c(output, '\r');
+ str_append_c(output, '\n');
+
+ if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
+ str_append_c(output, next_line_input[i]);
+ i++;
+ } else {
+ /* make it valid folding whitespace by adding a TAB */
+ str_append_c(output, '\t');
+ }
+ message_header_encode_data(next_line_input+i, next_line_len-i,
+ output);
+ }
}
#define MESSAGE_HEADER_ENCODE_H
/* Encode UTF-8 input into output wherever necessary using either Q or B
- encoding depending on which takes less space (approximately). */
+ encoding depending on which takes less space (approximately). Folding
+ whitespace is preserved. Bare [CR]LF will be preserved by adding a TAB
+ after it to make it a valid folding whitespace. */
void message_header_encode(const char *input, string_t *output);
void message_header_encode_data(const unsigned char *input, unsigned int len,
string_t *output);
/* Encode the whole UTF-8 input using "Q" or "B" encoding into output.
The output is split into multiple lines if necessary (max 76 chars/line).
- The first line's length is given as parameter. */
+ The first line's length is given as parameter. All the control characters
+ are encoded, including NUL, CR and LF. */
void message_header_encode_q(const unsigned char *input, unsigned int len,
string_t *output, unsigned int first_line_len);
void message_header_encode_b(const unsigned char *input, unsigned int len,
"a ää ä b", "a =?utf-8?b?w6TDpCDDpA==?= b",
"ä a ä", "=?utf-8?q?=C3=A4_a_=C3=A4?=",
"ää a ä", "=?utf-8?b?w6TDpCBhIMOk?=",
+ "=", "=",
+ "?", "?",
+ "a=?", "a=?",
+ "=?", "=?utf-8?q?=3D=3F?=",
+ "=?x", "=?utf-8?q?=3D=3Fx?=",
+ "a\n=?", "a\n\t=?utf-8?q?=3D=3F?=",
+ "a\t=?", "a\t=?utf-8?q?=3D=3F?=",
+ "a =?", "a =?utf-8?q?=3D=3F?=",
"foo\001bar", "=?utf-8?q?foo=01bar?=",
- "\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?="
+ "\x01\x02\x03\x04\x05\x06\x07\x08", "=?utf-8?b?AQIDBAUGBwg=?=",
+
+ "a\r\n b", "a\r\n b",
+ "a\r\n\tb", "a\r\n\tb",
+ "a\r\nb", "a\r\n\tb",
+ "a\n b", "a\n b",
+ "a\n b", "a\n b",
+ "a\nb", "a\n\tb",
+ "a\r\n", "a",
+ "a\n", "a",
+ "foo\n \001bar", "foo\n =?utf-8?q?=01bar?=",
+ "foo\001\n bar", "=?utf-8?q?foo=01?=\n bar"
};
string_t *str = t_str_new(128);
unsigned int i;