/**
 * Emits the C expression for a regular expression literal.
 *
 * The literal value is split on "/" into (empty, modifiers, pattern); the
 * split limit of 3 keeps any further slashes inside the pattern. Each
 * literal gets its own file-scope static GRegex* that is compiled by
 * g_regex_new () the first time the expression is evaluated and reused
 * afterwards.
 *
 * @param expr the regex literal to generate code for
 */
public override void visit_regex_literal (RegexLiteral expr) {
	string[] parts = expr.value.split ("/", 3);
	if (parts.length < 3) {
		// Never index past the split result if the value does not have
		// the "/<modifiers>/<pattern>" shape.
		expr.error = true;
		Report.error (expr.source_reference, "malformed regular expression literal");
		return;
	}

	// The pattern ends up inside a C string literal, so escape it.
	string re = parts[2].escape ("");

	var flags = new StringBuilder ("0");
	if (parts[1].contains ("i")) {
		flags.append (" | G_REGEX_CASELESS");
	}
	if (parts[1].contains ("m")) {
		flags.append (" | G_REGEX_MULTILINE");
	}
	if (parts[1].contains ("s")) {
		flags.append (" | G_REGEX_DOTALL");
	}
	if (parts[1].contains ("x")) {
		flags.append (" | G_REGEX_EXTENDED");
	}

	// The cache variable is a file-scope static, not a local temporary, so
	// no temp variable needs to be registered on the expression.
	var cname = "_tmp_regex_%d".printf (next_regex_id);
	this.next_regex_id++;

	var cdecl = new CCodeDeclaration ("GRegex*");
	cdecl.add_declarator (new CCodeVariableDeclarator (cname + " = NULL"));
	cdecl.modifiers = CCodeModifiers.STATIC;
	source_declarations.add_constant_declaration (cdecl);

	// NOTE(review): the NULL check and assignment below are not atomic, so
	// two threads evaluating the same literal for the first time may both
	// call g_regex_new () -- confirm whether that matters for callers.
	//
	// The pattern and flags are concatenated rather than passed through
	// printf so that a '%' in the pattern is not taken as a format
	// specifier. The outer parentheses keep the conditional intact when the
	// constant is embedded in a larger C expression.
	var regex_const = new CCodeConstant ("((%s == NULL) ? (%s = g_regex_new (\"".printf (cname, cname)
		+ re + "\", " + flags.str + ", 0, NULL)) : %s)".printf (cname));

	expr.ccodenode = regex_const;
}
using GLib;


/* Returns one of four version-number patterns; index must lie in 0..3. */
static Regex get_from_array (int index)
{
	Regex[] patterns = {
		/(\d+\.\d+\.\d+)/,
		/(\d+)\.\d+\.\d+/,
		/\d+\.(\d+)\.\d+/,
		/(\d+)\.\d+\.(\d+)/
	};

	assert (0 <= index <= 3);
	return patterns[index];
}

/* Returns a regex literal directly as a function result. */
static Regex get_fixed ()
{
	return /(is.*ip)/;
}

/* Helper object whose signal is used to run regex literals inside a closure. */
class Test : Object {
	public signal void regexTest (string str);
	public void run (string s)
	{
		regexTest (s);
	}
}

void main ()
{
	MatchInfo info;

	// Greedy match, with the regex obtained from a function call.
	var river = "mississippi";
	bool river_matched = get_fixed ().match (river, 0, out info);
	if (river_matched) {
		stdout.printf ("Part of %s is '%s'...\n", river, info.fetch (1));
	} else {
		stdout.printf ("Did not match at all.\n");
	}

	// Case-insensitive match.
	var mixed_case = "demonStration";
	bool caseless_matched = /mon(str.*o)n/i.match (mixed_case, 0, out info);
	if (caseless_matched) {
		stdout.printf ("Part of %s is '%s'...\n", mixed_case, info.fetch (1));
	} else {
		stdout.printf ("%s did not match at all.\n", mixed_case);
	}

	// Match and extract several groups.
	var timestamp = "Time: 10:42:12";
	bool time_matched = /Time: (..):(..):(..)/.match (timestamp, 0, out info);
	if (time_matched) {
		stdout.printf ("%s\n\thours = %s\n\tminutes = %s\n\tseconds = %s\n\n", timestamp, info.fetch (1), info.fetch (2), info.fetch (3));
	}

	// Replace demo: swap two words.
	try {
		var fruits = "apple grape";
		stdout.printf ("'%s' becomes '%s'\n", fruits, /^([^ ]*) *([^ ]*)/.replace (fruits, -1, 0, """\2 \1"""));
	} catch (RegexError e) {
		// replace () can still throw, so it needs a handler
		message (e.message);
	}

	// Regex literals stored in an array.
	for (int round = 0; round < 4; round++) {
		bool version_matched = get_from_array (round).match ("23.3.2010", 0, out info);
		if (version_matched) {
			stdout.printf ("Round %d: %s\n", round, info.fetch (1));
		}
	}

	// Regex literal as the last operand of a ?? chain.
	Regex? r = null;
	Regex? r1 = null;
	Regex? r2 = null;

	r = r1 ?? r2 ?? /match (this)/i;
	bool coalesced_matched = r.match ("match THIS", 0, out info);
	if (coalesced_matched) {
		stdout.printf ("Match: %s\n", info.fetch (1));
	}

	// Escape sequences inside the literal.
	bool escapes_matched = /\.\+\(\)\-\?\/\"\$\[\]\*\^/.match (".+()-?/\"$[]*^");
	if (escapes_matched) {
		stdout.printf ("Matches\n");
	} else {
		stdout.printf ("Does not match.\n");
	}

	// Regex captured by a closure, and a literal written inside the lambda.
	Regex? rc = /foo(bar)/i;
	var emitter = new Test ();
	emitter.regexTest.connect ((s) => {
		bool captured_matched = rc.match (s, 0, out info);
		if (captured_matched) {
			stdout.printf ("Lambda (closure var.): %s -> %s\n", s, info.fetch (1));
		} else {
			stdout.printf ("Does not match.\n");
		}

		bool literal_matched = /foo(bar)/i.match (s, 0, out info);
		if (literal_matched) {
			stdout.printf ("Lambda (lit.): %s -> %s\n", s, info.fetch (1));
		} else {
			stdout.printf ("Does not match.\n");
		}
	});
	emitter.run ("fooBar");
	emitter.run ("foobAr");
}
/**
 * Parses a regular expression literal.
 *
 * Consumes the opening regex token and hands the rest of the literal to
 * parse_literal ().
 */
Expression parse_regex_literal () throws ParseError {
	expect (TokenType.OPEN_REGEX_LITERAL);
	return parse_literal ();
}
/**
 * Represents a regular expression literal in the source code.
 */
public class Vala.RegexLiteral : Literal {
	/**
	 * The literal value.
	 *
	 * The parser assembles it as "/<modifiers>/<pattern>", where the
	 * modifiers are any of i, m, s and x.
	 */
	public string value { get; set; }

	/**
	 * Creates a new regular expression literal.
	 *
	 * @param value            the literal value
	 * @param source_reference reference to source code
	 * @return                 newly created regex literal
	 */
	public RegexLiteral (string value, SourceReference? source_reference = null) {
		this.value = value;
		this.source_reference = source_reference;
	}

	public override void accept (CodeVisitor visitor) {
		visitor.visit_regex_literal (this);

		visitor.visit_expression (this);
	}

	public override bool is_pure () {
		return true;
	}

	public override bool is_non_null () {
		return true;
	}

	public override string to_string () {
		return value;
	}

	/**
	 * Checks that the literal is a valid regular expression and assigns
	 * its value type.
	 *
	 * The pattern is compiled with the literal's own modifiers so that the
	 * check agrees with the regex that is compiled at run time.
	 *
	 * @param analyzer the semantic analyzer driving the check
	 * @return         true if no error was found
	 */
	public override bool check (SemanticAnalyzer analyzer) {
		if (checked) {
			return !error;
		}

		checked = true;

		if (!analyzer.context.experimental) {
			Report.warning (source_reference, "regular expression literals are experimental");
		}

		// Split "/<modifiers>/<pattern>"; the limit of 3 keeps slashes that
		// belong to the pattern. If the value does not have that shape, fall
		// back to validating it as a whole.
		string pattern = value;
		RegexCompileFlags flags = 0;
		string[] parts = value.split ("/", 3);
		if (parts.length == 3 && parts[0] == "") {
			pattern = parts[2];
			if (parts[1].contains ("i")) {
				flags |= RegexCompileFlags.CASELESS;
			}
			if (parts[1].contains ("m")) {
				flags |= RegexCompileFlags.MULTILINE;
			}
			if (parts[1].contains ("s")) {
				flags |= RegexCompileFlags.DOTALL;
			}
			if (parts[1].contains ("x")) {
				flags |= RegexCompileFlags.EXTENDED;
			}
		}

		try {
			var regex = new GLib.Regex (pattern, flags);
			if (regex != null) { /* Regex is valid. */ }
		} catch (RegexError err) {
			error = true;
			Report.error (source_reference, "Invalid regular expression `%s'.".printf (value));
			return false;
		}

		value_type = analyzer.regex_type.copy ();

		return !error;
	}
}
type; + char* begin = current; + token_begin.pos = begin; + token_begin.line = line; + token_begin.column = column; + + int token_length_in_chars = -1; + + if (current >= end) { + type = TokenType.EOF; + } else { + switch (current[0]) { + case '/': + type = TokenType.CLOSE_REGEX_LITERAL; + current++; + state_stack.length--; + var fl_i = false; + var fl_s = false; + var fl_m = false; + var fl_x = false; + while (current[0] == 'i' || current[0] == 's' || current[0] == 'm' || current[0] == 'x') { + switch (current[0]) { + case 'i': + if (fl_i) { + Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "modifier 'i' used more than once"); + } + fl_i = true; + break; + case 's': + if (fl_s) { + Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "modifier 's' used more than once"); + } + fl_s = true; + break; + case 'm': + if (fl_m) { + Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "modifier 'm' used more than once"); + } + fl_m = true; + break; + case 'x': + if (fl_x) { + Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "modifier 'x' used more than once"); + } + fl_x = true; + break; + } + current++; + token_length_in_chars++; + } + break; + default: + type = TokenType.REGEX_LITERAL; + token_length_in_chars = 0; + while (current < end && current[0] != '/') { + if (current[0] == '\\') { + current++; + token_length_in_chars++; + if (current >= end) { + break; + } + + switch (current[0]) { + case '\'': + case '"': + case '\\': + case '/': + case '^': + case '$': + case '.': + case '[': + case ']': + case '{': + case '}': + case '(': + case ')': + case '?': + case '*': + case '+': + case '-': + case '#': + case '&': + case '~': + case ':': + case ';': + case '<': + case '>': + 
case '|': + case '%': + case '=': + case '@': + case '0': + case 'b': + case 'B': + case 'f': + case 'n': + case 'r': + case 't': + case 'a': + case 'A': + case 'p': + case 'P': + case 'e': + case 'd': + case 'D': + case 's': + case 'S': + case 'w': + case 'W': + case 'G': + case 'z': + case 'Z': + current++; + token_length_in_chars++; + break; + case 'x': + // hexadecimal escape character + current++; + token_length_in_chars++; + while (current < end && current[0].isxdigit ()) { + current++; + token_length_in_chars++; + } + break; + default: + Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "invalid escape sequence"); + break; + } + } else if (current[0] == '\n') { + break; + } else { + unichar u = ((string) current).get_char_validated ((long) (end - current)); + if (u != (unichar) (-1)) { + current += u.to_utf8 (null); + token_length_in_chars++; + } else { + current++; + Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "invalid UTF-8 character"); + } + } + } + if (current >= end || current[0] == '\n') { + Report.error (new SourceReference (source_file, line, column + token_length_in_chars, line, column + token_length_in_chars), "syntax error, expected \""); + state_stack.length--; + return read_token (out token_begin, out token_end); + } + break; + } + } + + if (token_length_in_chars < 0) { + column += (int) (current - begin); + } else { + column += token_length_in_chars; + } + + token_end.pos = current; + token_end.line = line; + token_end.column = column - 1; + + return type; + } + public static TokenType get_identifier_or_keyword (char* begin, int len) { switch (len) { case 2: @@ -585,6 +756,8 @@ public class Vala.Scanner { token_end.column = column - 1; return TokenType.COMMA; + } else if (in_regex_literal ()) { + return read_regex_token (out token_begin, out token_end); } space (); @@ -843,11 +1016,18 @@ 
public class Vala.Scanner { } break; case '/': - type = TokenType.DIV; - current++; - if (current < end && current[0] == '=') { - type = TokenType.ASSIGN_DIV; + if (previous == TokenType.OPEN_PARENS || previous == TokenType.ASSIGN || previous == TokenType.OP_COALESCING + || previous == TokenType.COMMA || previous == TokenType.RETURN || previous == TokenType.OPEN_BRACE) { + type = TokenType.OPEN_REGEX_LITERAL; + state_stack += State.REGEX_LITERAL; + current++; + } else { + type = TokenType.DIV; current++; + if (current < end && current[0] == '=') { + type = TokenType.ASSIGN_DIV; + current++; + } } break; case '%': @@ -979,6 +1159,7 @@ public class Vala.Scanner { token_end.pos = current; token_end.line = line; token_end.column = column - 1; + previous = type; return type; } diff --git a/vala/valasemanticanalyzer.vala b/vala/valasemanticanalyzer.vala index 818ea7d50..54e6ced91 100644 --- a/vala/valasemanticanalyzer.vala +++ b/vala/valasemanticanalyzer.vala @@ -132,6 +132,7 @@ public class Vala.SemanticAnalyzer : CodeVisitor { public DataType void_type = new VoidType (); public DataType bool_type; public DataType string_type; + public DataType regex_type; public DataType uchar_type; public DataType short_type; public DataType ushort_type; @@ -176,6 +177,7 @@ public class Vala.SemanticAnalyzer : CodeVisitor { bool_type = new BooleanType ((Struct) root_symbol.scope.lookup ("bool")); string_type = new ObjectType ((Class) root_symbol.scope.lookup ("string")); + regex_type = new ObjectType ((Class) root_symbol.scope.lookup ("GLib").scope.lookup ("Regex")); short_type = new IntegerType ((Struct) root_symbol.scope.lookup ("short")); ushort_type = new IntegerType ((Struct) root_symbol.scope.lookup ("ushort")); diff --git a/vala/valatokentype.vala b/vala/valatokentype.vala index 88b876708..8a246841f 100644 --- a/vala/valatokentype.vala +++ b/vala/valatokentype.vala @@ -49,6 +49,7 @@ public enum Vala.TokenType { CLOSE_BRACE, CLOSE_BRACKET, CLOSE_PARENS, + CLOSE_REGEX_LITERAL, 
CLOSE_TEMPLATE, COLON, COMMA, @@ -109,6 +110,7 @@ public enum Vala.TokenType { OPEN_BRACE, OPEN_BRACKET, OPEN_PARENS, + OPEN_REGEX_LITERAL, OPEN_TEMPLATE, OVERRIDE, OWNED, @@ -120,6 +122,7 @@ public enum Vala.TokenType { PUBLIC, REAL_LITERAL, REF, + REGEX_LITERAL, REQUIRES, RETURN, SEMICOLON, @@ -177,6 +180,7 @@ public enum Vala.TokenType { case CLOSE_BRACE: return "`}'"; case CLOSE_BRACKET: return "`]'"; case CLOSE_PARENS: return "`)'"; + case CLOSE_REGEX_LITERAL: return "`/'"; case COLON: return "`:'"; case COMMA: return "`,'"; case CONST: return "`const'"; @@ -232,6 +236,7 @@ public enum Vala.TokenType { case OPEN_BRACE: return "`{'"; case OPEN_BRACKET: return "`['"; case OPEN_PARENS: return "`('"; + case OPEN_REGEX_LITERAL: return "`/'"; case OVERRIDE: return "`override'"; case OWNED: return "`owned'"; case PARAMS: return "`params'"; @@ -242,6 +247,7 @@ public enum Vala.TokenType { case PUBLIC: return "`public'"; case REAL_LITERAL: return "real literal"; case REF: return "`ref'"; + case REGEX_LITERAL: return "regex literal"; case REQUIRES: return "`requires'"; case RETURN: return "`return'"; case SEMICOLON: return "`;'";