cur_ucs = ucs_elt->s;
nsym = 0;
+ uc_err = U_ZERO_ERROR;
- while (keylen > 0) {
+ while (cur_utf < end) {
*cur_ucs++ = ucnv_getNextUChar (d->uchar_converter, &cur_utf,
end, &uc_err);
if (!U_SUCCESS (uc_err)) {
}
nsym ++;
- keylen --;
}
if (!U_SUCCESS (uc_err)) {
- msg_warn_config ("cannot convert key to unicode: %s",
- u_errorName (uc_err));
+ msg_warn_config ("cannot convert key %*s to unicode: %s",
+ (gint)keylen, key, u_errorName (uc_err));
continue;
}
{
guint nparts = MIN (words->len, nwords);
goffset *selected_words;
- rspamd_stat_token_t *tok, ucs_w;
+ rspamd_stat_token_t *tok;
guint i;
selected_words = g_new0 (goffset, nparts);
if (part->utf_words) {
for (j = 0; j < part->utf_words->len; j ++) {
- tok = &g_array_index (part->utf_words, rspamd_stat_token_t, j);
- scvec[cnt] = tok->begin;
- lenvec[cnt++] = tok->len;
+ tok = &g_array_index (part->utf_words,
+ rspamd_stat_token_t, j);
+ scvec[cnt] = tok->normalized.begin;
+ lenvec[cnt++] = tok->normalized.len;
}
}
}
case RSPAMD_RE_SELECTOR:
ret = "selector";
break;
+ case RSPAMD_RE_WORDS:
+ ret = "words";
+ break;
case RSPAMD_RE_MAX:
ret = "invalid class";
break;
"HSET %b_tokens %b %b:%b",
prefix, (size_t) prefix_len,
n0, (size_t) l0,
- tok->t1->begin, tok->t1->len,
- tok->t2->begin, tok->t2->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len,
+ tok->t2->stemmed.begin, tok->t2->stemmed.len);
} else if (tok->t1) {
redisAsyncCommand (rt->redis, NULL, NULL,
"HSET %b_tokens %b %b",
prefix, (size_t) prefix_len,
n0, (size_t) l0,
- tok->t1->begin, tok->t1->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len);
}
}
else {
"HSET %b %s %b:%b",
n0, (size_t) l0,
"tokens",
- tok->t1->begin, tok->t1->len,
- tok->t2->begin, tok->t2->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len,
+ tok->t2->stemmed.begin, tok->t2->stemmed.len);
} else if (tok->t1) {
redisAsyncCommand (rt->redis, NULL, NULL,
"HSET %b %s %b",
n0, (size_t) l0,
"tokens",
- tok->t1->begin, tok->t1->len);
+ tok->t1->stemmed.begin, tok->t1->stemmed.len);
}
}
msg_debug_bayes (
"token(meta) %uL <%*s:%*s> probabilistically skipped",
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin);
+ (int) tok->t1->original.len, tok->t1->original.begin,
+ (int) tok->t2->original.len, tok->t2->original.begin);
}
return;
msg_debug_bayes (
"token %uL <%*s:%*s> skipped, prob not in range: %f",
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin, bayes_spam_prob);
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
+ bayes_spam_prob);
return;
}
"current spam prob: %.3f, current ham prob: %.3f",
token_type,
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin,
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
fw, w, total_count, spam_count, ham_count,
spam_prob, ham_prob,
bayes_spam_prob, bayes_ham_prob,
msg_debug_bayes ("token %uL <%*s:%*s>: window: %d, total_count: %d, "
"spam_count: %d, ham_count: %d",
tok->data,
- (int) tok->t1->len, tok->t1->begin,
- (int) tok->t2->len, tok->t2->begin,
+ (int) tok->t1->stemmed.len, tok->t1->stemmed.begin,
+ (int) tok->t2->stemmed.len, tok->t2->stemmed.begin,
tok->window_idx, total_cnt, spam_cnt, ham_cnt);
}
else {
lua_State *L = task->cfg->lua_state;
ar = g_array_sized_new (FALSE, FALSE, sizeof (elt), 16);
+ memset (&elt, 0, sizeof (elt));
elt.flags = RSPAMD_STAT_TOKEN_FLAG_META;
if (st_ctx->lua_stat_tokens_ref != -1) {
tok.begin = lua_tolstring (L, -1, &tok.len);
if (tok.begin && tok.len > 0) {
- elt.begin = rspamd_mempool_ftokdup (task->task_pool, &tok);
- elt.len = tok.len;
+ elt.original.begin =
+ rspamd_mempool_ftokdup (task->task_pool, &tok);
+ elt.original.len = tok.len;
+ elt.stemmed.begin = elt.original.begin;
+ elt.stemmed.len = elt.original.len;
+ elt.normalized.begin = elt.original.begin;
+ elt.normalized.len = elt.original.len;
g_array_append_val (ar, elt);
}
for (w = 0; w < words->len; w ++) {
token = &g_array_index (words, rspamd_stat_token_t, w);
token_flags = token->flags;
+ const gchar *begin;
+ gsize len;
- if (task->lang_det) {
- if (token->flags & RSPAMD_STAT_TOKEN_FLAG_STOP_WORD) {
- /* Skip stop word */
- continue;
- }
+ if (token->flags &
+ (RSPAMD_STAT_TOKEN_FLAG_STOP_WORD|RSPAMD_STAT_TOKEN_FLAG_SKIPPED)) {
+ /* Skip stop/skipped words */
+ continue;
+ }
+
+ if (token->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT) {
+ begin = token->stemmed.begin;
+ len = token->stemmed.len;
+ }
+ else {
+ begin = token->original.begin;
+ len = token->original.len;
}
if (osb_cf->ht == RSPAMD_OSB_HASH_COMPAT) {
rspamd_ftok_t ftok;
- ftok.begin = token->begin;
- ftok.len = token->len;
+ ftok.begin = begin;
+ ftok.len = len;
cur = rspamd_fstrhash_lc (&ftok, is_utf);
}
else {
/* We know that the words are normalized */
if (osb_cf->ht == RSPAMD_OSB_HASH_XXHASH) {
cur = rspamd_cryptobox_fast_hash_specific (RSPAMD_CRYPTOBOX_XXHASH64,
- token->begin, token->len, osb_cf->seed);
+ begin, len, osb_cf->seed);
}
else {
- rspamd_cryptobox_siphash ((guchar *)&cur, token->begin,
- token->len, osb_cf->sk);
+ rspamd_cryptobox_siphash ((guchar *)&cur, begin,
+ len, osb_cf->sk);
if (prefix) {
cur ^= seed;
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
for (j = beg; j < i; j ++) {
word = &g_array_index (input, rspamd_stat_token_t, j);
- row = rspamd_fstring_append (row, word->begin, word->len);
+ row = rspamd_fstring_append (row, word->stemmed.begin,
+ word->stemmed.len);
}
/* Now we need to create a new row here */
}
}
else {
- guint64 res[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
+ guint64 window[SHINGLES_WINDOW * RSPAMD_SHINGLE_SIZE], seed;
switch (alg) {
case RSPAMD_SHINGLES_XXHASH:
break;
}
- memset (res, 0, sizeof (res));
+ memset (window, 0, sizeof (window));
for (i = 0; i <= (gint)input->len; i ++) {
if (i - beg >= SHINGLES_WINDOW || i == (gint)input->len) {
for (j = 0; j < RSPAMD_SHINGLE_SIZE; j ++) {
/* Shift the hashes window one slot to the left (towards index 0) */
for (k = 0; k < SHINGLES_WINDOW - 1; k ++) {
- res[j * SHINGLES_WINDOW + k] =
- res[j * SHINGLES_WINDOW + k + 1];
+ window[j * SHINGLES_WINDOW + k] =
+ window[j * SHINGLES_WINDOW + k + 1];
}
word = &g_array_index (input, rspamd_stat_token_t, beg);
/* Insert the last element to the pipe */
memcpy (&seed, keys[j], sizeof (seed));
- res[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
+ window[j * SHINGLES_WINDOW + SHINGLES_WINDOW - 1] =
rspamd_cryptobox_fast_hash_specific (ht,
- word->begin, word->len,
+ word->stemmed.begin, word->stemmed.len,
seed);
val = 0;
for (k = 0; k < SHINGLES_WINDOW; k ++) {
- val ^= res[j * SHINGLES_WINDOW + k] >>
+ val ^= window[j * SHINGLES_WINDOW + k] >>
(8 * (SHINGLES_WINDOW - k - 1));
}
for (i = 0; i < part->utf_words->len; i ++) {
w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
- lua_pushlstring (L, w->begin, w->len);
+ lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
lua_rawseti (L, -2, i + 1);
}
}
for (i = 0; i < part->utf_words->len; i ++) {
word = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
- rspamd_cryptobox_hash_update (&st, word->begin, word->len);
+ rspamd_cryptobox_hash_update (&st,
+ word->stemmed.begin, word->stemmed.len);
}
rspamd_cryptobox_hash_final (&st, digest);
if (tok->t1) {
lua_pushstring (L, "t1");
- lua_pushlstring (L, tok->t1->begin, tok->t1->len);
+ lua_pushlstring (L, tok->t1->stemmed.begin, tok->t1->stemmed.len);
lua_settable (L, -3);
}
if (tok->t2) {
lua_pushstring (L, "t2");
- lua_pushlstring (L, tok->t2->begin, tok->t2->len);
+ lua_pushlstring (L, tok->t2->stemmed.begin, tok->t2->stemmed.len);
lua_settable (L, -3);
}
for (i = 0; i < res->len; i ++) {
w = &g_array_index (res, rspamd_stat_token_t, i);
- lua_pushlstring (L, w->begin, w->len);
+ lua_pushlstring (L, w->stemmed.begin, w->stemmed.len);
lua_rawseti (L, -2, i + 1);
}
}
guint *ncap,
struct chartable_ctx *chartable_module_ctx)
{
- const gchar *p, *end;
+ const UChar32 *p, *end;
gdouble badness = 0.0;
UChar32 uc;
UBlockCode sc;
gint last_is_latin = -1;
- guint same_script_count = 0, nsym = 0, i = 0;
+ guint same_script_count = 0, nsym = 0;
enum {
start_process = 0,
got_alpha,
got_unknown,
} state = start_process, prev_state = start_process;
- p = w->begin;
- end = p + w->len;
+ p = w->unicode.begin;
+ end = p + w->unicode.len;
/* We assume that w is normalized */
- while (p + i < end) {
- U8_NEXT (p, i, w->len, uc);
+ while (p < end) {
+ uc = *p++;
if (((gint32)uc) < 0) {
break;
}
}
- msg_debug_chartable ("word %*s, badness: %.2f", (gint)w->len, w->begin,
+ msg_debug_chartable ("word %*s, badness: %.2f",
+ (gint)w->normalized.len, w->normalized.begin,
badness);
return badness;
got_unknown,
} state = start_process;
- p = w->begin;
- end = p + w->len;
+ p = w->normalized.begin;
+ end = p + w->normalized.len;
last_sc = 0;
- if (w->len > chartable_module_ctx->max_word_len) {
+ if (w->normalized.len > chartable_module_ctx->max_word_len) {
return 0.0;
}
badness = 4.0;
}
- msg_debug_chartable ("word %*s, badness: %.2f", (gint)w->len, w->begin,
+ msg_debug_chartable ("word %*s, badness: %.2f",
+ (gint)w->normalized.len, w->normalized.begin,
badness);
return badness;
for (i = 0; i < part->utf_words->len; i++) {
w = &g_array_index (part->utf_words, rspamd_stat_token_t, i);
- if (w->len > 0 && (w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
+ if ((w->flags & RSPAMD_STAT_TOKEN_FLAG_TEXT)) {
- if (IS_PART_UTF (part)) {
+ if (w->flags & RSPAMD_STAT_TOKEN_FLAG_UTF) {
cur_score += rspamd_chartable_process_word_utf (task, w, FALSE,
&ncap, chartable_module_ctx);
}
struct rspamd_symcache_item *item,
void *unused)
{
+ /* XXX: TODO: unbreak module once URLs unicode project is over */
+#if 0
struct rspamd_url *u;
GHashTableIter it;
gpointer k, v;
}
if (u->hostlen > 0) {
- w.begin = u->host;
- w.len = u->hostlen;
+ w.stemmed.begin = u->host;
+ w.stemmed.len = u->hostlen;
- if (g_utf8_validate (w.begin, w.len, NULL)) {
+ if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, NULL)) {
cur_score += rspamd_chartable_process_word_utf (task, &w,
TRUE, NULL, chartable_module_ctx);
}
}
if (u->hostlen > 0) {
- w.begin = u->host;
- w.len = u->hostlen;
+ w.stemmed.begin = u->host;
+ w.stemmed.len = u->hostlen;
- if (g_utf8_validate (w.begin, w.len, NULL)) {
+ if (g_utf8_validate (w.stemmed.begin, w.stemmed.len, NULL)) {
cur_score += rspamd_chartable_process_word_utf (task, &w,
TRUE, NULL, chartable_module_ctx);
}
cur_score, NULL);
}
-
+#endif
rspamd_symcache_finalize_item (task, item);
}
for (i = 0; i < words->len; i ++) {
word = &g_array_index (words, rspamd_stat_token_t, i);
- rspamd_cryptobox_hash_update (&st, word->begin, word->len);
+ rspamd_cryptobox_hash_update (&st, word->stemmed.begin,
+ word->stemmed.len);
}
rspamd_cryptobox_hash_final (&st, shcmd->basic.digest);