From: Alan T. DeKok Date: Tue, 1 Feb 2022 14:40:27 +0000 (-0500) Subject: start of regex handling X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=c43b572195b08a402fc46df541f40cff34eea0a5;p=thirdparty%2Ffreeradius-server.git start of regex handling --- diff --git a/src/lib/unlang/xlat_expr.c b/src/lib/unlang/xlat_expr.c index a2aa79167f2..a13521f20bd 100644 --- a/src/lib/unlang/xlat_expr.c +++ b/src/lib/unlang/xlat_expr.c @@ -576,6 +576,9 @@ static const fr_sbuff_term_elem_t binary_ops[T_TOKEN_LAST] = { [ T_OP_LE ] = L("cmp_le"), [ T_OP_GT ] = L("cmp_gt"), [ T_OP_GE ] = L("cmp_ge"), + + [ T_OP_REG_EQ ] = L("reg_eq"), + [ T_OP_REG_NE ] = L("reg_ne"), }; @@ -699,7 +702,7 @@ static xlat_exp_t *xlat_exp_cast_alloc(TALLOC_CTX *ctx, fr_type_t type, xlat_exp return node; } -static ssize_t tokenize_expression(TALLOC_CTX *ctx, xlat_exp_t **head, xlat_flags_t *flags, fr_sbuff_t *input, +static ssize_t tokenize_expression(TALLOC_CTX *ctx, xlat_exp_t **head, xlat_flags_t *flags, fr_sbuff_t *in, fr_sbuff_parse_rules_t const *p_rules, tmpl_rules_t const *t_rules, fr_token_t prev, fr_type_t type, fr_sbuff_parse_rules_t const *bracket_rules, fr_dict_attr_t const *da); @@ -713,6 +716,76 @@ static fr_table_num_sorted_t const expr_quote_table[] = { }; static size_t expr_quote_table_len = NUM_ELEMENTS(expr_quote_table); +#ifdef HAVE_REGEX +static ssize_t tokenize_regex(TALLOC_CTX *ctx, xlat_exp_t **head, UNUSED xlat_flags_t *flags, fr_sbuff_t *in, + fr_sbuff_parse_rules_t const *p_rules, tmpl_rules_t const *t_rules) +{ + ssize_t slen; + char quote = '/'; + xlat_exp_t *node; + fr_sbuff_t our_in = FR_SBUFF(in); + fr_sbuff_marker_t marker; + + fr_sbuff_skip_whitespace(&our_in); + + fr_sbuff_marker(&marker, &our_in); + + /* + * Allow m:foo: + */ + if (fr_sbuff_next_if_char(&our_in, 'm')) { + quote = *fr_sbuff_current(&our_in); /* screw UTF-8! Who needs emojis? */ + fr_sbuff_advance(&our_in, 1); + + // @todo - update the terminal rules to use this character, too! + + } else { + if (!fr_sbuff_next_if_char(&our_in, '/')) { + fr_strerror_const("Regular expression does not start with '/'"); + FR_SBUFF_ERROR_RETURN(&our_in); + } + } + + MEM(node = xlat_exp_alloc_null(ctx)); + xlat_exp_set_type(node, XLAT_TMPL); + + slen = tmpl_afrom_substr(node, &node->vpt, &our_in, T_SOLIDUS_QUOTED_STRING, + value_parse_rules_quoted[T_SOLIDUS_QUOTED_STRING], t_rules); + if (slen <= 0) { + error: + fr_sbuff_advance(&our_in, slen * -1); + talloc_free(node); + return -(fr_sbuff_used_total(&our_in)); + } + + /* + * Check for, and skip, the trailing quote if we had a leading quote. + */ + if (!fr_sbuff_next_if_char(&our_in, quote)) { + fr_strerror_printf("Regular expression does not edit with '%c'", quote); + FR_SBUFF_ERROR_RETURN(&our_in); + } + + fr_assert(node->vpt != NULL); + + slen = tmpl_regex_flags_substr(node->vpt, &our_in, p_rules->terminals); + if (slen < 0) goto error; + + /* + * We've now got the expressions and + * the flags. Try to compile the + * regex. + */ + if (tmpl_is_regex_uncompiled(node->vpt)) { + slen = tmpl_regex_compile(node->vpt, true); + if (slen <= 0) goto error; + } + + *head = node; + return fr_sbuff_used(&our_in); +} +#endif + /* * Look for prefix operators @@ -727,7 +800,7 @@ static size_t expr_quote_table_len = NUM_ELEMENTS(expr_quote_table); * to parse the next thing we get. Otherwise, parse the thing as * int64_t. */ -static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_flags_t *flags, fr_sbuff_t *input, +static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_flags_t *flags, fr_sbuff_t *in, fr_sbuff_parse_rules_t const *p_rules, tmpl_rules_t const *t_rules, fr_type_t type, fr_sbuff_parse_rules_t const *bracket_rules, fr_dict_attr_t const *da) { @@ -738,22 +811,22 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla fr_type_t cast_type = FR_TYPE_NULL; TALLOC_CTX *ctx = input_ctx; TALLOC_CTX *free_ctx = NULL; - fr_sbuff_t in = FR_SBUFF(input); + fr_sbuff_t our_in = FR_SBUFF(in); /* * Handle !-~ by adding a unary function to the xlat * node, with the first argument being the _next_ thing * we allocate. */ - if (fr_sbuff_next_if_char(&in, '!')) { /* unary not */ + if (fr_sbuff_next_if_char(&our_in, '!')) { /* unary not */ func = xlat_func_find("unary_not", 9); fr_assert(func != NULL); } - else if (fr_sbuff_next_if_char(&in, '-')) { /* unary minus */ + else if (fr_sbuff_next_if_char(&our_in, '-')) { /* unary minus */ func = xlat_func_find("unary_minus", 11); fr_assert(func != NULL); } - else if (fr_sbuff_next_if_char(&in, '+')) { /* ignore unary + */ + else if (fr_sbuff_next_if_char(&our_in, '+')) { /* ignore unary + */ /* nothing */ } @@ -788,7 +861,7 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla { tmpl_rules_t tmp_rules = {}; - slen = tmpl_cast_from_substr(&tmp_rules, &in); + slen = tmpl_cast_from_substr(&tmp_rules, &our_in); cast_type = tmp_rules.cast; } @@ -826,7 +899,7 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla } } - fr_sbuff_skip_whitespace(&in); + fr_sbuff_skip_whitespace(&our_in); /* * If we have '(', then recurse for other expressions @@ -845,17 +918,17 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla * The double casting is technically invalid, and will likely cause breakages at run * time. */ - if (fr_sbuff_next_if_char(&in, '(')) { - slen = tokenize_expression(ctx, &node, flags, &in, bracket_rules, t_rules, T_INVALID, FR_TYPE_NULL, bracket_rules, da); + if (fr_sbuff_next_if_char(&our_in, '(')) { + slen = tokenize_expression(ctx, &node, flags, &our_in, bracket_rules, t_rules, T_INVALID, FR_TYPE_NULL, bracket_rules, da); if (slen <= 0) { talloc_free(free_ctx); - FR_SBUFF_ERROR_RETURN_ADJ(&in, slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, slen); } - if (!fr_sbuff_next_if_char(&in, ')')) { + if (!fr_sbuff_next_if_char(&our_in, ')')) { fr_strerror_printf("Failed to find trailing ')'"); talloc_free(free_ctx); - FR_SBUFF_ERROR_RETURN_ADJ(&in, -slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, -slen); } goto done; @@ -866,17 +939,17 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla * * @todo - this case is arguably handled by tmpl_afrom_substr() */ - if (fr_sbuff_is_char(&in, '&')) { + if (fr_sbuff_is_char(&our_in, '&')) { tmpl_t *vpt = NULL; MEM(node = xlat_exp_alloc_null(ctx)); xlat_exp_set_type(node, XLAT_TMPL); - slen = tmpl_afrom_attr_substr(node, NULL, &vpt, &in, p_rules, t_rules); + slen = tmpl_afrom_attr_substr(node, NULL, &vpt, &our_in, p_rules, t_rules); if (slen <= 0) { talloc_free(node); talloc_free(free_ctx); - FR_SBUFF_ERROR_RETURN_ADJ(&in, slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, slen); } /* @@ -900,8 +973,8 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla * * @todo - optimization - do we want to create a cast node here, instead of later? */ - if (fr_sbuff_adv_past_str_literal(&in, "%{")) { - if (xlat_tokenize_expansion(ctx, &node, flags, &in, &t_rules->attr) < 0) { + if (fr_sbuff_adv_past_str_literal(&our_in, "%{")) { + if (xlat_tokenize_expansion(ctx, &node, flags, &our_in, &t_rules->attr) < 0) { talloc_free(free_ctx); return -1; } @@ -922,8 +995,8 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla * * @todo - optimization - do we want to create a cast node here, instead of later? */ - if (fr_sbuff_adv_past_str_literal(&in, "%(")) { - if (xlat_tokenize_function_args(ctx, &node, flags, &in, &t_rules->attr) < 0) { + if (fr_sbuff_adv_past_str_literal(&our_in, "%(")) { + if (xlat_tokenize_function_args(ctx, &node, flags, &our_in, &t_rules->attr) < 0) { talloc_free(free_ctx); return -1; } @@ -985,21 +1058,21 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla MEM(node = xlat_exp_alloc_null(ctx)); xlat_exp_set_type(node, XLAT_TMPL); - fr_sbuff_marker(&marker, &in); + fr_sbuff_marker(&marker, &our_in); /* * This thing is a value of some kind. Try to parse it as that. */ - fr_sbuff_out_by_longest_prefix(&slen, &token, expr_quote_table, &in, T_BARE_WORD); + fr_sbuff_out_by_longest_prefix(&slen, &token, expr_quote_table, &our_in, T_BARE_WORD); switch (token) { fr_dict_enum_value_t *enumv; case T_BARE_WORD: if (da) { - slen = fr_dict_enum_by_name_substr(&enumv, da, &in); + slen = fr_dict_enum_by_name_substr(&enumv, da, &our_in); if (slen < 0) { fr_strerror_printf("Failed parsing value - %s", fr_strerror()); - FR_SBUFF_ERROR_RETURN_ADJ(&in, slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, slen); } if (slen > 0) { @@ -1020,43 +1093,43 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla * doesn't stop at anything. Instead, we have to pass in our bracket rules, * which stops at any of the operators / brackets we care about. */ - slen = tmpl_afrom_substr(node, &node->vpt, &in, token, + slen = tmpl_afrom_substr(node, &node->vpt, &our_in, token, bracket_rules, &my_rules); if (slen <= 0) { - FR_SBUFF_ERROR_RETURN_ADJ(&in, slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, slen); } break; case T_DOUBLE_QUOTED_STRING: case T_SINGLE_QUOTED_STRING: case T_BACK_QUOTED_STRING: - slen = tmpl_afrom_substr(node, &node->vpt, &in, token, + slen = tmpl_afrom_substr(node, &node->vpt, &our_in, token, value_parse_rules_quoted[token], &my_rules); if (slen <= 0) { - FR_SBUFF_ERROR_RETURN_ADJ(&in, slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, slen); } /* * Check for, and skip, the trailing quote if we had a leading quote. */ - if (!fr_sbuff_is_char(&in, fr_token_quote[token])) { + if (!fr_sbuff_is_char(&our_in, fr_token_quote[token])) { fr_strerror_const("Unexpected end of quoted string"); - FR_SBUFF_ERROR_RETURN(&in); + FR_SBUFF_ERROR_RETURN(&our_in); } - fr_sbuff_advance(&in, 1); + fr_sbuff_advance(&our_in, 1); fr_assert(node->vpt != NULL); break; case T_SOLIDUS_QUOTED_STRING: fr_strerror_const("Unexpected regular expression"); - fr_sbuff_set(&in, &marker); - FR_SBUFF_ERROR_RETURN(&in); + fr_sbuff_set(&our_in, &marker); + FR_SBUFF_ERROR_RETURN(&our_in); default: fr_strerror_const("Unexpected token"); - fr_sbuff_set(&in, &marker); - FR_SBUFF_ERROR_RETURN(&in); + fr_sbuff_set(&our_in, &marker); + FR_SBUFF_ERROR_RETURN(&our_in); } /* @@ -1065,8 +1138,8 @@ static ssize_t tokenize_field(TALLOC_CTX *input_ctx, xlat_exp_t **head, xlat_fla * creates TMPL_TYPE_UNRESOLVED. */ if (tmpl_resolve(node->vpt, NULL) < 0) { - fr_sbuff_set(&in, &marker); - FR_SBUFF_ERROR_RETURN(&in); + fr_sbuff_set(&our_in, &marker); + FR_SBUFF_ERROR_RETURN(&our_in); } fr_assert(tmpl_value_type(node->vpt) != FR_TYPE_NULL); @@ -1088,7 +1161,7 @@ done: node = cast; } - fr_sbuff_skip_whitespace(&in); + fr_sbuff_skip_whitespace(&our_in); /* * Purify things in place, where we can. @@ -1096,7 +1169,7 @@ done: if (flags->pure) { if (xlat_purify_expr(node) < 0) { talloc_free(node); - FR_SBUFF_ERROR_RETURN(&in); /* @todo m_lhs ? */ + FR_SBUFF_ERROR_RETURN(&our_in); /* @todo m_lhs ? */ } } @@ -1111,7 +1184,7 @@ done: fr_assert(node != NULL); *head = node; - return fr_sbuff_set(input, &in); + return fr_sbuff_set(in, &our_in); } @@ -1138,6 +1211,9 @@ static fr_table_num_ordered_t const expr_assignment_op_table[] = { { L("="), T_OP_EQ }, { L("=="), T_OP_CMP_EQ }, + { L("=~"), T_OP_REG_EQ }, + { L("!="), T_OP_REG_NE }, + { L(">"), T_OP_GT }, { L(">="), T_OP_GE }, { L(">>"), T_RSHIFT }, @@ -1153,7 +1229,7 @@ static size_t const expr_assignment_op_table_len = NUM_ELEMENTS(expr_assignment_ * !EXPR * A OP B */ -static ssize_t tokenize_expression(TALLOC_CTX *ctx, xlat_exp_t **head, xlat_flags_t *flags, fr_sbuff_t *input, +static ssize_t tokenize_expression(TALLOC_CTX *ctx, xlat_exp_t **head, xlat_flags_t *flags, fr_sbuff_t *in, fr_sbuff_parse_rules_t const *p_rules, tmpl_rules_t const *t_rules, fr_token_t prev, fr_type_t type, fr_sbuff_parse_rules_t const *bracket_rules, fr_dict_attr_t const *da) @@ -1163,28 +1239,28 @@ static ssize_t tokenize_expression(TALLOC_CTX *ctx, xlat_exp_t **head, xlat_flag fr_token_t op; ssize_t slen; fr_sbuff_marker_t marker; - fr_sbuff_t in = FR_SBUFF(input); + fr_sbuff_t our_in = FR_SBUFF(in); - fr_sbuff_skip_whitespace(&in); + fr_sbuff_skip_whitespace(&our_in); /* * Get the LHS of the operation. */ - slen = tokenize_field(ctx, &lhs, flags, &in, p_rules, t_rules, type, bracket_rules, da); + slen = tokenize_field(ctx, &lhs, flags, &our_in, p_rules, t_rules, type, bracket_rules, da); if (slen <= 0) return slen; redo: fr_assert(lhs != NULL); - fr_sbuff_skip_whitespace(&in); + fr_sbuff_skip_whitespace(&our_in); /* * No more input, we're done. */ - if (fr_sbuff_extend(&in) == 0) { + if (fr_sbuff_extend(&our_in) == 0) { done: *head = lhs; - return fr_sbuff_set(input, &in); + return fr_sbuff_set(in, &our_in); } /* @@ -1194,65 +1270,79 @@ redo: * If we did expect it, then we return whatever we found, * and let the caller eat the ')'. */ - if (fr_sbuff_is_char(&in, ')')) { + if (fr_sbuff_is_char(&our_in, ')')) { if (!bracket_rules) { fr_strerror_printf("Unexpected ')'"); - FR_SBUFF_ERROR_RETURN(&in); + FR_SBUFF_ERROR_RETURN(&our_in); } goto done; } - fr_sbuff_skip_whitespace(&in); + fr_sbuff_skip_whitespace(&our_in); /* * Remember where we were after parsing the LHS. */ - fr_sbuff_marker(&marker, &in); + fr_sbuff_marker(&marker, &our_in); /* * Get the operator. */ - fr_sbuff_out_by_longest_prefix(&slen, &op, expr_assignment_op_table, &in, T_INVALID); + fr_sbuff_out_by_longest_prefix(&slen, &op, expr_assignment_op_table, &our_in, T_INVALID); if (op == T_INVALID) { talloc_free(lhs); - fr_strerror_printf("Expected operator at '%.4s'", fr_sbuff_current(&in)); - FR_SBUFF_ERROR_RETURN(&in); - } - - if (!binary_ops[op].str) { - fr_strerror_printf("Invalid operator '%s'", fr_tokens[op]); - FR_SBUFF_ERROR_RETURN_ADJ(&in, -slen); + fr_strerror_printf("Expected operator at '%.4s'", fr_sbuff_current(&our_in)); + FR_SBUFF_ERROR_RETURN(&our_in); } - fr_assert(precedence[op] != 0); - -#if 0 /* - * @todo - handle regexes as a special case. The LHS ideally should be a simple xlat (i.e. not a - * comparison). The RHS MUST be a solidus-quoted string. + * Regular expressions are a special case, and have precedence over everything else. Because it + * makes zero sense to do things like: + * + * Foo-Bar =~ (a | b) ???????? + * + * @todo - make sure that the LHS is something "real", and isn't (for example) a comparison? Tho + * TBH why not allow that: + * + * (1 < 3) =~ /foo/ + * + * will get the LHS to be evaluated, then printed to a string, and then the string will be used + * for the regex. If the user is stupid enough to do this, why not allow it? */ if ((op == T_OP_REG_EQ) || (op == T_OP_REG_NE)) { +#ifdef HAVE_REGEX /* * @todo - if we have * * &Foo =~ s/foo/bar/... * * then do substitution, ala %(subst:...), or maybe just create a %(subst:...) node? + * + * It's syntactic sugar, but it's *nice* syntactic sugar. */ -// slen = tokenize_regex(ctx, &rhs, &in, p_rules, t_rules); + slen = tokenize_regex(ctx, &rhs, flags, &our_in, bracket_rules, t_rules); if (slen <= 0) { - FR_SBUFF_ERROR_RETURN_ADJ(&in, slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, slen); } - /* - * xlat_func_regex() takes a LHS FR_TYPE_STRING, and RHS FR_TYPE_STRING - * - * or RHS FR_TYPE_VOID, which is a pre-compiled regex? - */ + fr_sbuff_advance(&our_in, slen); - goto alloc_func; - } + // @todo - get regex func! + fr_assert(0); + +#else + fr_sbuff_set(&our_in, &marker); + fr_strerror_printf("Invalid operator '%s' - regular expressions are not supported in this build.", fr_tokens[op]); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, -slen); #endif + } + + if (!binary_ops[op].str) { + fr_strerror_printf("Invalid operator '%s'", fr_tokens[op]); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, -slen); + } + + fr_assert(precedence[op] != 0); /* * a * b + c ... = (a * b) + c ... @@ -1261,10 +1351,13 @@ redo: * take care of continuing. */ if (precedence[op] <= precedence[prev]) { - fr_sbuff_set(&in, &marker); + fr_sbuff_set(&our_in, &marker); goto done; } + if ((op == T_OP_REG_EQ) || (op == T_OP_REG_NE)) { + } + /* * By default we don't parse enums on the RHS, and we're also flexible about what we see on the * RHS. @@ -1353,16 +1446,16 @@ redo: /* * We now parse the RHS, allowing a (perhaps different) cast on the RHS. */ - slen = tokenize_expression(ctx, &rhs, flags, &in, p_rules, t_rules, op, type, bracket_rules, da); + slen = tokenize_expression(ctx, &rhs, flags, &our_in, p_rules, t_rules, op, type, bracket_rules, da); if (slen <= 0) { talloc_free(lhs); - FR_SBUFF_ERROR_RETURN_ADJ(&in, slen); + FR_SBUFF_ERROR_RETURN_ADJ(&our_in, slen); } #ifdef __clang_analyzer__ if (!rhs) { talloc_free(lhs); - FR_SBUFF_ERROR_RETURN(&in); + FR_SBUFF_ERROR_RETURN(&our_in); } #endif @@ -1397,7 +1490,7 @@ redo: if (flags->pure) { if (xlat_purify_expr(node) < 0) { talloc_free(node); - FR_SBUFF_ERROR_RETURN(&in); /* @todo m_lhs ? */ + FR_SBUFF_ERROR_RETURN(&our_in); /* @todo m_lhs ? */ } } @@ -1473,9 +1566,23 @@ ssize_t xlat_tokenize_expression(TALLOC_CTX *ctx, xlat_exp_t **head, xlat_flags_ if (!t_rules) t_rules = &my_rules; + *head = NULL; + slen = tokenize_expression(ctx, head, flags, in, terminal_rules, t_rules, T_INVALID, FR_TYPE_NULL, bracket_rules, NULL); talloc_free(bracket_rules); talloc_free(terminal_rules); + + if (slen <= 0) return slen; + + /* + * Add nodes that need to be bootstrapped to + * the registry. + */ + if (xlat_bootstrap(*head) < 0) { + TALLOC_FREE(*head); + return 0; + } + return slen; } diff --git a/src/tests/unit/xlat/expr.txt b/src/tests/unit/xlat/expr.txt index bac3e29f2b9..11e39154a90 100644 --- a/src/tests/unit/xlat/expr.txt +++ b/src/tests/unit/xlat/expr.txt @@ -116,6 +116,9 @@ match (1 + 2) xlat_expr (((1 + 2)) * ((3 + 4))) match ((1 + 2) * (3 + 4)) +#xlat_expr &Filter-Id =~ /foo/ +#match bar + count match 55