From 93f085f6f1893bf075945e23ff3a269dfac2c1cb Mon Sep 17 00:00:00 2001 From: Vsevolod Stakhov Date: Tue, 6 Aug 2013 17:31:21 +0100 Subject: [PATCH] Implement strings parser. --- src/rcl/rcl.h | 5 +- src/rcl/rcl_internal.h | 14 +- src/rcl/rcl_parser.c | 442 ++++++++++++++++++++++++++++++++--------- 3 files changed, 362 insertions(+), 99 deletions(-) diff --git a/src/rcl/rcl.h b/src/rcl/rcl.h index 76f9523fb7..54c3c81e54 100644 --- a/src/rcl/rcl.h +++ b/src/rcl/rcl.h @@ -38,7 +38,8 @@ enum rspamd_cl_error { RSPAMD_CL_EOK = 0, RSPAMD_CL_ESYNTAX, RSPAMD_CL_EIO, - RSPAMD_CL_ESTATE + RSPAMD_CL_ESTATE, + RSPAMD_CL_ENESTED }; enum rspamd_cl_type { @@ -57,10 +58,12 @@ enum rspamd_cl_emitter { }; typedef struct rspamd_cl_object_s { + gchar *key; /**< the key of an object */ union { gint64 iv; /**< int value of an object */ gchar *sv; /**< string value of an object */ gdouble dv; /**< double value of an object */ + struct rspamd_cl_object_s *ov; /**< array or hash */ } value; enum rspamd_cl_type type; /**< real type */ struct rspamd_cl_object_s *next; /**< array handle */ diff --git a/src/rcl/rcl_internal.h b/src/rcl/rcl_internal.h index 48047b19f4..33256119dd 100644 --- a/src/rcl/rcl_internal.h +++ b/src/rcl/rcl_internal.h @@ -64,16 +64,24 @@ struct rspamd_cl_stack { struct rspamd_cl_stack *next; }; +struct rspamd_cl_chunk { + const guchar *begin; + const guchar *end; + const guchar *pos; + gsize remain; + guint line; + guint column; + struct rspamd_cl_chunk *next; +}; + struct rspamd_cl_parser { enum rspamd_cl_parser_state state; enum rspamd_cl_parser_state prev_state; - gint comments_nested; rspamd_cl_object_t *top_obj; rspamd_cl_object_t *cur_obj; struct rspamd_cl_macro *macroes; struct rspamd_cl_stack *stack; - guint line; - guint column; + struct rspamd_cl_chunk *chunks; }; #endif /* RCL_INTERNAL_H_ */ diff --git a/src/rcl/rcl_parser.c b/src/rcl/rcl_parser.c index 8ace95cf03..f727991de1 100644 --- a/src/rcl/rcl_parser.c +++ b/src/rcl/rcl_parser.c @@ -24,6 +24,7 @@ #include "config.h" #include "rcl.h" #include "rcl_internal.h" +#include "util.h" /** * @file rcl_parser.c @@ -47,67 +48,93 @@ rspamd_cl_object_new (void) * @param len * @return new position in chunk */ -static inline const guchar * -rspamd_cl_chunk_getc (struct rspamd_cl_parser *parser, const guchar *begin, gsize len) +static inline void +rspamd_cl_chunk_skipc (struct rspamd_cl_chunk *chunk, guchar c) { - while (len > 0) { - len --; - if (*begin == '\n') { - parser->line ++; - parser->column = 0; - } - else { - parser->column ++; - } - begin ++; + if (c == '\n') { + chunk->line ++; + chunk->column = 0; + } + else { + chunk->column ++; } - return begin; + + chunk->pos ++; + chunk->remain --; +} + +static inline void +rspamd_cl_set_err (struct rspamd_cl_chunk *chunk, gint code, const char *str, GError **err) +{ + g_set_error (err, RCL_ERROR, code, "Error detected on line %d at pos %d: '%s'", + chunk->line, chunk->column, str); } static gboolean -rspamd_cl_check_open_comment (struct rspamd_cl_parser *parser, const guchar **begin, gsize *len) +rspamd_cl_skip_comments (struct rspamd_cl_parser *parser, GError **err) { - const guchar *p = *begin; + struct rspamd_cl_chunk *chunk = parser->chunks; + const guchar *p; + gint comments_nested = 0; + + p = chunk->pos; if (*p == '#') { if (parser->state != RSPAMD_RCL_STATE_SCOMMENT && parser->state != RSPAMD_RCL_STATE_MCOMMENT) { - parser->prev_state = parser->state; - parser->state = RSPAMD_RCL_STATE_SCOMMENT; - *begin = rspamd_cl_chunk_getc (parser, *begin, 1); - (*len) --; - return TRUE; + while (p < chunk->end) { + if (*p == '\n') { + rspamd_cl_chunk_skipc (chunk, *p); + break; + } + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + } } } - else if (*p == '/' && *len >= 2) { + else if (*p == '/' && chunk->remain >= 2) { + p ++; if (*p == '/' && parser->state != RSPAMD_RCL_STATE_SCOMMENT && parser->state != RSPAMD_RCL_STATE_MCOMMENT) { - parser->prev_state = parser->state; - parser->state = RSPAMD_RCL_STATE_SCOMMENT; - *begin = rspamd_cl_chunk_getc (parser, *begin, 2); - (*len) -= 2; - return TRUE; + chunk->pos = p; + while (p < chunk->end) { + if (*p == '\n') { + rspamd_cl_chunk_skipc (chunk, *p); + break; + } + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + } } else if (*p == '*') { - /* Multiline comment */ - if (parser->state == RSPAMD_RCL_STATE_SCOMMENT) { - /* Immediately finish single line comment and start multiline one */ - parser->state = RSPAMD_RCL_STATE_MCOMMENT; - parser->comments_nested ++; - } - else if (parser->state == RSPAMD_RCL_STATE_MCOMMENT) { - parser->comments_nested ++; + comments_nested ++; + chunk->pos = p; + + while (p < chunk->end) { + if (*p == '*') { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + rspamd_cl_chunk_skipc (chunk, *p); + if (*p == '/') { + comments_nested --; + if (comments_nested == 0) { + break; + } + } + p ++; + rspamd_cl_chunk_skipc (chunk, *p); + } + rspamd_cl_chunk_skipc (chunk, *p); + p ++; } - else { - parser->prev_state = parser->state; - parser->state = RSPAMD_RCL_STATE_SCOMMENT; + if (comments_nested != 0) { + rspamd_cl_set_err (chunk, RSPAMD_CL_ENESTED, "comments nesting is invalid", err); + return FALSE; } - *begin = rspamd_cl_chunk_getc (parser, *begin, 2); - (*len) -= 2; } } - return FALSE; + return TRUE; } /** @@ -138,91 +165,304 @@ rspamd_cl_includes_handler (const guchar *data, gsize len, gpointer ud, GError * return TRUE; } -static const guchar * -rspamd_cl_skip_spaces (struct rspamd_cl_parser *parser, const guchar *data, gsize *len) +/** + * Parse quoted string with possible escapes + * @param parser + * @param chunk + * @param err + * @return TRUE if a string has been parsed + */ +static gboolean +rspamd_cl_lex_json_string (struct rspamd_cl_parser *parser, + struct rspamd_cl_chunk *chunk, GError **err) { - const guchar *p, *end; - - p = data; - end = data + *len; + const guchar *p = chunk->pos; + guchar c; + gint i; - if (parser->state == RSPAMD_RCL_STATE_KEY) { - /* Skip any space character */ - while (p < end) { - if (!g_ascii_isspace (*p)) { - break; + while (p < chunk->end) { + c = *p; + if (c < 0x1F) { + /* Unmasked control character */ + if (c == '\n') { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected newline", err); } - p = rspamd_cl_chunk_getc (parser, p, 1); - (*len) --; + else { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected control character", err); + } + return FALSE; + } + if (c == '\\') { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + c = *p; + if (p >= chunk->end) { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished escape character", err); + return FALSE; + } + if (*p == 'u') { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + for (i = 0; i < 4 && p < chunk->end; i ++) { + if (!g_ascii_isxdigit (*p)) { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "invalid utf escape", err); + return FALSE; + } + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + } + if (p >= chunk->end) { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished escape character", err); + return FALSE; + } + } + else if (c == '"' || c == '\\' || c == '/' || c == 'b' || + c == 'f' || c == 'n' || c == 'r' || c == 't') { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + } + else { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "invalid escape character", err); + return FALSE; + } + continue; + } + else if (c == '"') { + return TRUE; } } - else { - while (p < end) { - if (!g_ascii_isspace (*p) || *p == '\n' || *p == '\r') { + + return FALSE; +} + +/** + * Unescape json string inplace + * @param str + */ +static void +rspamd_cl_unescape_json_string (gchar *str) +{ + gchar *t = str, *h = str; + gint i, uval; + + /* t is target (tortoise), h is source (hare) */ + + while (*h != '\0') { + if (*h == '\\') { + h ++; + switch (*h) { + case 'n': + *t++ = '\n'; + break; + case 'r': + *t++ = '\r'; + break; + case 'b': + *t++ = '\b'; + break; + case 't': + *t++ = '\t'; + break; + case 'f': + *t++ = '\f'; + break; + case '\\': + *t++ = '\\'; + break; + case '"': + *t++ = '"'; + break; + case 'u': + /* Unicode escape */ + uval = 0; + for (i = 0; i < 4; i++) { + uval <<= 4; + if (g_ascii_isdigit (h[i])) { + uval += h[i] - '0'; + } + else if (h[i] >= 'a' && h[i] <= 'f') { + uval += h[i] - 'a' + 10; + } + else if (h[i] >= 'A' && h[i] <= 'F') { + uval += h[i] - 'A' + 10; + } + } + /* Encode */ + if(uval < 0x80) { + t[0] = (char)uval; + t ++; + } + else if(uval < 0x800) { + t[0] = 0xC0 + ((uval & 0x7C0) >> 6); + t[1] = 0x80 + ((uval & 0x03F)); + t += 2; + } + else if(uval < 0x10000) { + t[0] = 0xE0 + ((uval & 0xF000) >> 12); + t[1] = 0x80 + ((uval & 0x0FC0) >> 6); + t[2] = 0x80 + ((uval & 0x003F)); + t += 3; + } + else if(uval <= 0x10FFFF) { + t[0] = 0xF0 + ((uval & 0x1C0000) >> 18); + t[1] = 0x80 + ((uval & 0x03F000) >> 12); + t[2] = 0x80 + ((uval & 0x000FC0) >> 6); + t[3] = 0x80 + ((uval & 0x00003F)); + t += 4; + } + else { + *t++ = '?'; + } + break; + default: + *t++ = '?'; break; } - p = rspamd_cl_chunk_getc (parser, p, 1); - (*len) --; + } + else { + *t++ = *h++; } } - - return p; } +/** + * Parse a key in an object + * @param parser + * @param chunk + * @param err + * @return TRUE if a key has been parsed + */ static gboolean -rspamd_cl_parse_key (struct rspamd_cl_parser *parser, const guchar **data, - gsize *len, GError **err) +rspamd_cl_parse_key (struct rspamd_cl_parser *parser, + struct rspamd_cl_chunk *chunk, GError **err) { const guchar *p, *c = NULL, *end; + gboolean got_quote = FALSE, got_eq = FALSE, got_semicolon = FALSE; + rspamd_cl_object_t *nobj; + p = chunk->pos; - p = *data; - end = p + *len; + /* Skip any spaces */ + while (p < chunk->end && g_ascii_isspace (*p)) { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + } - while (p < end) { + while (p < chunk->end) { /* * A key must start with alpha and end with space character */ if (*p == '.') { /* It is macro actually */ - p = rspamd_cl_chunk_getc (parser, p, 1); - len --; + rspamd_cl_chunk_skipc (chunk, *p); parser->state = RSPAMD_RCL_STATE_MACRO_NAME; - *data = p; return TRUE; } else if (c == NULL) { if (g_ascii_isalpha (*p)) { /* The first symbol */ c = p; - p = rspamd_cl_chunk_getc (parser, p, 1); - (*len) --; + rspamd_cl_chunk_skipc (chunk, *p); + p ++; } else if (*p == '"') { /* JSON style key */ c = p + 1; - p = rspamd_cl_chunk_getc (parser, p, 2); - (*len) -= 2; + got_quote = TRUE; + rspamd_cl_chunk_skipc (chunk, *p); + p ++; } else { /* Invalid identifier */ parser->state = RSPAMD_RCL_STATE_ERROR; - g_set_error (err, RCL_ERROR, RSPAMD_CL_ESYNTAX, "key must start with a letter, " - "line %d, pos: %d", parser->line, parser->column); + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "key must begin with a letter", err); return FALSE; } } else { - if (g_ascii_isalnum (*p)) { - p = rspamd_cl_chunk_getc (parser, p, 1); - (*len) --; + /* Parse the body of a key */ + if (!got_quote) { + if (g_ascii_isalnum (*p)) { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + } + else if (*p == ' ' || *p == '\t' || *p == ':' || *p == '=') { + end = p; + break; + } + else { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "invalid character in a key", err); + return FALSE; + } } - else if (*p == ' ' || *p == '\t') { - p = rspamd_cl_skip_spaces (parser, p, len); + else { + /* We need to parse json like quoted string */ + if (!rspamd_cl_lex_json_string (parser, chunk, err)) { + return FALSE; + } + end = chunk->pos; + p = end; + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + break; } } } - *data = p; + + if (p >= chunk->end) { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished key", err); + return FALSE; + } + + /* We are now at the end of the key, need to parse the rest */ + while (p < chunk->end) { + if (*p == ' ' || *p == '\t') { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + } + else if (*p == '=') { + if (!got_eq && !got_semicolon) { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + got_eq = TRUE; + } + else { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected '=' character", err); + return FALSE; + } + } + else if (*p == ':') { + if (!got_eq && !got_semicolon) { + rspamd_cl_chunk_skipc (chunk, *p); + p ++; + got_semicolon = TRUE; + } + else { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unexpected ':' character", err); + return FALSE; + } + } + else { + /* Start value */ + break; + } + } + + if (p >= chunk->end) { + rspamd_cl_set_err (chunk, RSPAMD_CL_ESYNTAX, "unfinished key", err); + return FALSE; + } + + /* Create a new object */ + nobj = rspamd_cl_object_new (); + nobj->key = g_malloc (end - c + 1); + rspamd_strlcpy (nobj->key, c, end - c + 1); + + if (got_quote) { + rspamd_cl_unescape_json_string (nobj->key); + } + + HASH_ADD_KEYPTR (hh, parser->cur_obj->value.ov, nobj->key, strlen (nobj->key), nobj); return TRUE; } @@ -236,15 +476,14 @@ rspamd_cl_parse_key (struct rspamd_cl_parser *parser, const guchar **data, * @return TRUE if chunk has been parsed and FALSE in case of error */ static gboolean -rspamd_cl_state_machine (struct rspamd_cl_parser *parser, const guchar *data, - gsize len, GError **err) +rspamd_cl_state_machine (struct rspamd_cl_parser *parser, GError **err) { - const guchar *p, *end; rspamd_cl_object_t *obj; + struct rspamd_cl_chunk *chunk = parser->chunks; + const guchar *p; - p = data; - end = p + len; - while (p < end) { + p = chunk->pos; + while (chunk->pos < chunk->end) { switch (parser->state) { case RSPAMD_RCL_STATE_INIT: /* @@ -252,29 +491,33 @@ rspamd_cl_state_machine (struct rspamd_cl_parser *parser, const guchar *data, * if we got [ or { correspondingly or can just treat new data as * a key of newly created object */ - if (!rspamd_cl_check_open_comment (parser, &p, &len)) { + if (!rspamd_cl_skip_comments (parser, err)) { + parser->state = RSPAMD_RCL_STATE_ERROR; + return FALSE; + } + else { obj = rspamd_cl_object_new (); if (*p == '[') { parser->state = RSPAMD_RCL_STATE_ARRAY; obj->type = RSPAMD_CL_ARRAY; - p = rspamd_cl_chunk_getc (parser, p, 1); - len --; + rspamd_cl_chunk_skipc (chunk, *p); + p ++; } else { parser->state = RSPAMD_RCL_STATE_KEY; obj->type = RSPAMD_CL_OBJECT; if (*p == '{') { - p = rspamd_cl_chunk_getc (parser, p, 1); - len --; + rspamd_cl_chunk_skipc (chunk, *p); + p ++; } } parser->cur_obj = obj; parser->top_obj = obj; - p = rspamd_cl_skip_spaces (parser, p, &len); } break; case RSPAMD_RCL_STATE_KEY: - if (!rspamd_cl_parse_key (parser, &p, &len, err)) { + if (!rspamd_cl_parse_key (parser, chunk, err)) { + parser->state = RSPAMD_RCL_STATE_ERROR; return FALSE; } break; @@ -294,7 +537,6 @@ rspamd_cl_parser_new (void) new = g_slice_alloc0 (sizeof (struct rspamd_cl_parser)); - new->line = 1; rspamd_cl_parser_register_macro (new, "include", rspamd_cl_include_handler, new); rspamd_cl_parser_register_macro (new, "includes", rspamd_cl_includes_handler, new); @@ -319,8 +561,18 @@ gboolean rspamd_cl_parser_add_chunk (struct rspamd_cl_parser *parser, const guchar *data, gsize len, GError **err) { + struct rspamd_cl_chunk *chunk; + if (parser->state != RSPAMD_RCL_STATE_ERROR) { - return rspamd_cl_state_machine (parser, data, len, err); + chunk = g_slice_alloc (sizeof (struct rspamd_cl_chunk)); + chunk->begin = data; + chunk->remain = len; + chunk->pos = chunk->begin; + chunk->end = chunk->begin + len; + chunk->line = 1; + chunk->column = 0; + LL_PREPEND (parser->chunks, chunk); + return rspamd_cl_state_machine (parser, err); } g_set_error (err, RCL_ERROR, RSPAMD_CL_ESTATE, "a parser is in an invalid state"); -- 2.47.3