From: Alex Coyte
Date: Mon, 19 Jun 2017 01:03:05 +0000 (+1000)
Subject: Treat characters between \Q \E as codepoints in UTF8 mode.
X-Git-Tag: v4.5.2^2~5
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a185be5a4f684c9bdbd90a2b9716ca02dde9e7b2;p=thirdparty%2Fvectorscan.git

Treat characters between \Q \E as codepoints in UTF8 mode.

fixes github issue #57
---

diff --git a/src/parser/Parser.rl b/src/parser/Parser.rl
index 52b3340c..05a084bb 100644
--- a/src/parser/Parser.rl
+++ b/src/parser/Parser.rl
@@ -1155,6 +1155,35 @@ unichar readUtf8CodePoint4c(const char *s) {
               '\\E' => {
                   fgoto main;
               };
+
+              #unicode chars
+              utf8_2c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* leverage ComponentClass to generate the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint2c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(move(cc));
+              };
+
+              utf8_3c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* leverage ComponentClass to generate the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint3c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(move(cc));
+              };
+
+              utf8_4c when is_utf8 => {
+                  assert(mode.utf8);
+                  /* leverage ComponentClass to generate the vertices */
+                  auto cc = getComponentClass(mode);
+                  cc->add(readUtf8CodePoint4c(ts));
+                  cc->finalize();
+                  currentSeq->addComponent(move(cc));
+              };
+
               # Literal character
               any => {
                   addLiteral(currentSeq, *ts, mode);