/* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
#include "lib.h"
+#include "base64.h"
#include "buffer.h"
#include "str.h"
#include "unichar.h"
#include "word-boundary-data.c"
#include "word-break-data.c"
+/* see comments below between is_base64() and skip_base64() */
+#define FTS_SKIP_BASE64_MIN_SEQUENCES 1
+#define FTS_SKIP_BASE64_MIN_CHARS 50
+
#define FTS_DEFAULT_TOKEN_MAX_LENGTH 30
#define FTS_WB5A_PREFIX_MAX_LENGTH 3 /* Including apostrophe */
tok->untruncated_length += size;
}
+inline static bool
+is_base64(const unsigned char ch)
+{
+ return base64_scheme.decmap[ch] != 0xff;
+}
+
/* So far the following rule seems to give good results in avoiding the
   indexing of base64 as keywords. It also seems to perform well against
   base64-embedded headers, like ARC-Seal, DKIM-Signature, X-SG-EID,
   X-SG-ID, including encoded parts (e.g. =?us-ascii?Q?...?= sequences).

   leader characters    : [ \t\r\n=:;?]*
   matching characters  : base64_scheme.decmap[ch] != 0xff
   trailing characters  : none or [ \t\r\n=:;?] (other characters cause
			  the run to be ignored)
   minimum run length   : 50
   minimum runs count   : 1

   i.e. (single or multiple) 50-char runs of characters in the base64 set
   - excluding the trailing '=' - are recognized as base64 and ignored
   in indexing. */
+
/* The trailer set is identical to the leader set; keep a single table and
   alias it under both names. */
#define allowed_base64_trailers allowed_base64_leaders
/* Characters that may legitimately precede (or follow) a base64 run:
   whitespace, '=' padding/separator and common header punctuation.
   The table is only ever searched with memchr(), hence const. */
static const unsigned char allowed_base64_leaders[] = {
	' ', '\t', '\r', '\n', '=', ';', ':', '?'
};
+
/* skip_base64() works by doing lookahead on the data available in the
   tokenizer buffer, i.e. it is not able to see "what will come next" to
   perform more extensive matches. This implies that a very long base64
   sequence, which is split halfway into two different chunks while feeding
   the tokenizer, will be matched separately as the trailing part of the
   first buffer and as the leading part of the second. Each of these two
   segments must fulfill the match criteria on its own to be discarded. The
   price we pay is that we will fail to reject small base64 segments, while
   we would have rejected the whole sequence had we seen it at once.

   When skip_base64() is invoked in fts_tokenizer_generic_XX_next(), we know
   that we are not halfway through the collection of a token.

   As (after the previous token) the buffer will contain non-token characters
   (i.e. token separators of some kind), we try to move forward among those
   until we find a base64 character. If we don't find one, there's nothing we
   can skip in the buffer and the skip phase terminates.

   If we found a base64 character, we check that the previous one is in
   allowed_base64_leaders[]; otherwise, the skip phase terminates.

   Now we try to determine how long the base64 sequence is. If it is too
   short, the skip phase terminates. It also terminates if there's a
   character in the buffer after the sequence and this is not in
   allowed_base64_trailers[].

   At this point we know that we have:
   - possibly a skipped sequence of non-base64 characters ending with an
     allowed leader character, followed by:
   - a skipped sequence of base64 characters, possibly followed by an allowed
     trailer character
   we advance the start pointer to after the last skipped base64 character,
   and scan again to see if we can skip further chunks in the same way. */
+
+static size_t
+skip_base64(const unsigned char *data, size_t size)
+{
+ if (data == NULL) {
+ i_assert(size == 0);
+ return 0;
+ }
+
+ const unsigned char *start, *end = data + size;
+ unsigned int matches = 0;
+ for (start = data; start < end; ) {
+ const unsigned char *first;
+ for (first = start; first < end && !is_base64(*first); first++);
+ if (first > start && memchr(allowed_base64_leaders, *(first - 1),
+ N_ELEMENTS(allowed_base64_leaders)) == NULL)
+ break;
+
+ const unsigned char *past;
+ for (past = first; past < end && is_base64(*past); past++);
+ if (past - first < FTS_SKIP_BASE64_MIN_CHARS)
+ break;
+ if (past < end && memchr(allowed_base64_trailers, *past,
+ N_ELEMENTS(allowed_base64_trailers)) == NULL)
+ break;
+ start = past;
+ matches++;
+ }
+ return matches < FTS_SKIP_BASE64_MIN_SEQUENCES ? 0 : start - data;
+}
+
static int
fts_tokenizer_generic_simple_next(struct fts_tokenizer *_tok,
const unsigned char *data, size_t size,
{
struct generic_fts_tokenizer *tok =
container_of(_tok, struct generic_fts_tokenizer, tokenizer);
- size_t i, start = 0;
+ size_t i, start;
int char_size;
unichar_t c;
bool apostrophe;
enum fts_break_type break_type;
- for (i = 0; i < size; i += char_size) {
+ start = tok->token->used > 0 ? 0 : skip_base64(data, size);
+ for (i = start; i < size; i += char_size) {
char_size = uni_utf8_get_char_n(data + i, size - i, &c);
i_assert(char_size > 0);
struct generic_fts_tokenizer *tok =
container_of(_tok, struct generic_fts_tokenizer, tokenizer);
unichar_t c;
- size_t i, char_start_i, start_pos = 0;
+ size_t i, char_start_i, start_pos;
enum letter_type lt;
int char_size;
- for (i = 0; i < size; ) {
+ start_pos = tok->token->used > 0 ? 0 : skip_base64(data, size);
+ for (i = start_pos; i < size; ) {
char_start_i = i;
char_size = uni_utf8_get_char_n(data + i, size - i, &c);
i_assert(char_size > 0);
}
}
+static void test_fts_tokenizer_skip_base64(void)
+{
+ /* The skip_base64 works on the data already available in the buffer
+ of the tokenizer, it does not pull more data to see if a base64
+ sequence long enough would match or not. This is why it does not
+ use test_tokenizer_inputoutput that also tests with one-byte-at-once
+ or random chunking, as those are known to fail with the current
+ implementation */
+ struct fts_tokenizer *tok;
+ const char *error;
+ const char *token;
+
+ static const char *input =
+ ",/dirtyleader/456789012345678901234567890123456789/\r\n"
+
+ " /cleanleader/456789012345678901234567890123456789/\r\n"
+ "\t/cleanleader/456789012345678901234567890123456789/\r\n"
+ "\r/cleanleader/456789012345678901234567890123456789/\r\n"
+ "\n/cleanleader/456789012345678901234567890123456789/\r\n"
+ "=/cleanleader/456789012345678901234567890123456789/\r\n"
+ ";/cleanleader/456789012345678901234567890123456789/\r\n"
+ ":/cleanleader/456789012345678901234567890123456789/\r\n"
+ ";/cleanleader/456789012345678901234567890123456789/\r\n"
+
+ "/23456789012345678901234567890123456/dirtytrailer/,\r\n"
+
+ "/23456789012345678901234567890123456/cleantrailer/ \r\n"
+ "/23456789012345678901234567890123456/cleantrailer/\t\r\n"
+ "/23456789012345678901234567890123456/cleantrailer/\r\r\n"
+ "/23456789012345678901234567890123456/cleantrailer/\n\r\n"
+ "/23456789012345678901234567890123456/cleantrailer/=\r\n"
+ "/23456789012345678901234567890123456/cleantrailer/;\r\n"
+ "/23456789012345678901234567890123456/cleantrailer/:\r\n"
+ "/23456789012345678901234567890123456/cleantrailer/?\r\n"
+
+ "J1RrDrZSWxIAphKpYckeKNs10iTeiGMY0hNI32SMoSqCTgH96\r\n" // 49
+ "MziUaLMK6FAOQws3OIuX0tgvQcyhu06ILAWWB1nGPy/bSEAEYg\r\n" // 50
+ "ljWSJo8kxsm4/CiZBpwFfWkd64y+5ZytnKqgkQD87UbQ7FcpZgj\r\n" // 51
+ "pTXUOBszCfdAgfZpWpPiOEQSthPxN9XMaS7HnOTyXtRBPVt96vw=\r\n" // 51=
+ "MJmsWlDKXo7NCSt1wvazf9Xad18qOzpLJkVs/sxKsvLYyPD/zv=\r\n" // 50=
+ "CBLsZ5dUybAEWcDkQwytSL348U/2lvadma7lF4wdNOc8sjUL8=\r\n" // 49=
+
+ "4HWw7lJ15ZW3G1GtH9/NQbylcThN2IJo1kr83Fa2c9z2GFK1/NF+DpAkjbhDA3Al\r\n"
+
+ "alpha bravo charlie delta echo foxtrot golf hotel india\r\n"
+ "=juliet=kilo=lima=mike=november=oscar=papa=qebec=romeo=\r\n";
+
+ static const char *const expected_output[] = {
+ "dirtyleader", "456789012345678901234567890123",
+ "234567890123456789012345678901", "dirtytrailer",
+ "J1RrDrZSWxIAphKpYckeKNs10iTeiG", // 49
+ "CBLsZ5dUybAEWcDkQwytSL348U", "2lvadma7lF4wdNOc8sjUL8", // 49=
+ "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", "india",
+ "juliet", "kilo", "lima", "mike", "november", "oscar", "papa", "qebec", "romeo",
+ NULL
+ };
+
+ test_begin("fts tokenizer skip base64");
+ test_assert(fts_tokenizer_create(fts_tokenizer_generic, NULL, tr29_settings, &tok, &error) == 0);
+
+ size_t index = 0;
+ while (fts_tokenizer_next(tok, (const unsigned char *) input, strlen(input), &token, &error) > 0) {
+ i_assert(index < N_ELEMENTS(expected_output));
+ test_assert_strcmp(token, expected_output[index]);
+ ++index;
+ }
+ while (fts_tokenizer_next(tok, NULL, 0, &token, &error) > 0) {
+ i_assert(index < N_ELEMENTS(expected_output));
+ test_assert_strcmp(token, expected_output[index]);
+ ++index;
+ }
+ i_assert(index < N_ELEMENTS(expected_output));
+ test_assert_idx(expected_output[index] == NULL, index);
+
+ fts_tokenizer_unref(&tok);
+ test_end();
+}
+
int main(void)
{
static void (*const test_functions[])(void) = {
+ test_fts_tokenizer_skip_base64,
test_fts_tokenizer_find,
test_fts_tokenizer_generic_only,
test_fts_tokenizer_generic_tr29_only,