From: Mike Stepanek (mstepane) Date: Tue, 23 Mar 2021 13:05:23 +0000 (+0000) Subject: Merge pull request #2778 in SNORT/snort3 from ~OSERHIIE/snort3:javascript_normalizati... X-Git-Tag: 3.1.3.0~8 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=8f60341a896291945acbb81fdb2b1ea5189016d3;p=thirdparty%2Fsnort3.git Merge pull request #2778 in SNORT/snort3 from ~OSERHIIE/snort3:javascript_normalization to master Squashed commit of the following: commit 5371730d74442a199d46ed862639172f18437193 Author: Oleksandr Serhiienko Date: Mon Feb 1 16:01:38 2021 +0200 http_inspect: add JavaScript whitespace normalization http_inspect: integrate JSNormalizer (whitespace normalization) keeping the old one http_inspect: add normalization_depth config option utils: add JSNormalizer cmake: add flex build dependency doc: update http_inspect feature doc --- diff --git a/cmake/include_libraries.cmake b/cmake/include_libraries.cmake index 811c95ac6..82ac24529 100644 --- a/cmake/include_libraries.cmake +++ b/cmake/include_libraries.cmake @@ -3,6 +3,7 @@ find_package(Threads REQUIRED) find_package(DAQ REQUIRED) find_package(DNET REQUIRED) +find_package(FLEX REQUIRED) find_package(HWLOC REQUIRED) find_package(LuaJIT REQUIRED) find_package(OpenSSL REQUIRED) diff --git a/doc/user/http_inspect.txt b/doc/user/http_inspect.txt index f0c706864..00ce605dc 100755 --- a/doc/user/http_inspect.txt +++ b/doc/user/http_inspect.txt @@ -153,6 +153,20 @@ decodeURIComponent are normalized. The different encodings handled within the unescape, decodeURI, or decodeURIComponent are %XX, %uXXXX, XX and uXXXXi. http_inspect also replaces consecutive whitespaces with a single space and normalizes the plus by concatenating the strings. +Such normalizations refer to basic JavaScript normalization. + +===== normalization_depth + +normalization_depth = N {-1 : 65535} will set a number of input JavaScript +bytes to normalize and enable the whitespace normalizer instead of the +basic one. 
Meanwhile, normalize_javascript = true must be configured as +well. When the depth is reached, normalization will be stopped. It's +implemented per-script. normalization_depth = -1 will configure max depth +value. By default, the value is set to 0. Configure this option to enable +more precise whitespace normalization of JavaScript, that removes all +redundant whitespaces and line terminators from the JavaScript syntax point +of view (between identifier and punctuator, between identifier and operator, +etc.) according to ECMAScript 5.1 standard. ===== xff_headers diff --git a/src/service_inspectors/http_inspect/http_js_norm.cc b/src/service_inspectors/http_inspect/http_js_norm.cc index da46624f3..405b3afbe 100644 --- a/src/service_inspectors/http_inspect/http_js_norm.cc +++ b/src/service_inspectors/http_inspect/http_js_norm.cc @@ -23,29 +23,86 @@ #include "http_js_norm.h" -#include "utils/util_jsnorm.h" +#include "utils/js_normalizer.h" #include "utils/safec.h" +#include "utils/util_jsnorm.h" #include "http_enum.h" using namespace HttpEnums; using namespace snort; -HttpJsNorm::HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_) : - max_javascript_whitespaces(max_javascript_whitespaces_), uri_param(uri_param_), - javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {} +class JsNormBase +{ +public: + virtual ~JsNormBase() = default; + + virtual int normalize(const char*, uint16_t, char*, uint16_t, const char**, int*, JSState*, + uint8_t*) = 0; + +}; + +class UtilJsNorm : public JsNormBase +{ +public: + UtilJsNorm() : JsNormBase() {} + +protected: + virtual int normalize(const char* src, uint16_t srclen, char* dst, uint16_t destlen, + const char** ptr, int* bytes_copied, JSState* js, uint8_t* iis_unicode_map) override + { + return JSNormalizeDecode(src, srclen, dst, destlen, ptr, bytes_copied, js, iis_unicode_map); + } + +}; + +class JsNorm : public JsNormBase +{ +public: + JsNorm(int normalization_depth) + : JsNormBase(), + 
norm_depth(normalization_depth) + {} + +protected: + virtual int normalize(const char* src, uint16_t srclen, char* dst, uint16_t destlen, + const char** ptr, int* bytes_copied, JSState*, uint8_t*) override + { + return JSNormalizer::normalize(src, srclen, dst, destlen, ptr, bytes_copied, norm_depth); + } + +private: + int norm_depth; + +}; + +HttpJsNorm::HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_, + int normalization_depth) : + normalizer(nullptr), max_javascript_whitespaces(max_javascript_whitespaces_), + uri_param(uri_param_), normalization_depth(normalization_depth), + javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) +{} HttpJsNorm::~HttpJsNorm() { + delete normalizer; delete javascript_search_mpse; delete htmltype_search_mpse; } void HttpJsNorm::configure() { - if ( javascript_search_mpse || htmltype_search_mpse ) + if ( configure_once ) return; + // Based on this option configuration, default or whitespace normalizer will be initialized + // normalization_depth = 0 means to initialize default normalizer + // normalization_depth != 0 means to initialize whitespace normalizer with specified depth + if ( normalization_depth != 0 ) + normalizer = new JsNorm(normalization_depth); + else + normalizer = new UtilJsNorm; + javascript_search_mpse = new SearchTool; htmltype_search_mpse = new SearchTool; @@ -72,6 +129,8 @@ void HttpJsNorm::configure() htmltype_search_mpse->add(tmp->name, tmp->name_len, tmp->search_id); } htmltype_search_mpse->prep(); + + configure_once = true; } void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* infractions, @@ -100,7 +159,7 @@ void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* i const char* js_start = ptr + mindex; const char* const angle_bracket = (const char*)SnortStrnStr(js_start, end - js_start, ">"); - if (angle_bracket == nullptr) + if (angle_bracket == nullptr || (end - angle_bracket) == 0) break; bool type_js = false; 
@@ -110,7 +169,7 @@ void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* i const int script_found = htmltype_search_mpse->find( js_start, (angle_bracket-js_start), search_html_found, false, &mid); - js_start = angle_bracket; + js_start = angle_bracket + 1; if (script_found > 0) { switch (mid) @@ -144,7 +203,7 @@ void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* i if (!type_js) continue; - JSNormalizeDecode(js_start, (uint16_t)(end-js_start), (char*)buffer+index, + normalizer->normalize(js_start, (uint16_t)(end-js_start), (char*)buffer+index, (uint16_t)(input.length() - index), &ptr, &bytes_copied, &js, uri_param.iis_unicode ? uri_param.unicode_map : nullptr); diff --git a/src/service_inspectors/http_inspect/http_js_norm.h b/src/service_inspectors/http_inspect/http_js_norm.h index 05da0719b..2e1dbe2df 100644 --- a/src/service_inspectors/http_inspect/http_js_norm.h +++ b/src/service_inspectors/http_inspect/http_js_norm.h @@ -32,15 +32,22 @@ // HttpJsNorm class //------------------------------------------------------------------------- +class JsNormBase; + class HttpJsNorm { public: - HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_); + HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_, + int normalization_depth); ~HttpJsNorm(); void normalize(const Field& input, Field& output, HttpInfractions* infractions, HttpEventGen* events) const; void configure(); private: + bool configure_once = false; + + JsNormBase* normalizer; + enum JsSearchId { JS_JAVASCRIPT }; enum HtmlSearchId { HTML_JS, HTML_EMA, HTML_VB }; @@ -49,6 +56,7 @@ private: const int max_javascript_whitespaces; const HttpParaList::UriParam& uri_param; + const int normalization_depth; snort::SearchTool* javascript_search_mpse; snort::SearchTool* htmltype_search_mpse; diff --git a/src/service_inspectors/http_inspect/http_module.cc b/src/service_inspectors/http_inspect/http_module.cc index 
d60171a43..88d370c47 100755 --- a/src/service_inspectors/http_inspect/http_module.cc +++ b/src/service_inspectors/http_inspect/http_module.cc @@ -89,6 +89,9 @@ const Parameter HttpModule::http_params[] = { "normalize_javascript", Parameter::PT_BOOL, nullptr, "false", "normalize JavaScript in response bodies" }, + { "normalization_depth", Parameter::PT_INT, "-1:65535", "0", + "number of input JavaScript bytes to normalize" }, + { "max_javascript_whitespaces", Parameter::PT_INT, "1:65535", "200", "maximum consecutive whitespaces allowed within the JavaScript obfuscated data" }, @@ -214,6 +217,11 @@ bool HttpModule::set(const char*, Value& val, SnortConfig*) { params->js_norm_param.normalize_javascript = val.get_bool(); } + else if (val.is("normalization_depth")) + { + int v = val.get_int32(); + params->js_norm_param.normalization_depth = (v == -1) ? 65535 : v; + } else if (val.is("max_javascript_whitespaces")) { params->js_norm_param.max_javascript_whitespaces = val.get_uint16(); @@ -393,7 +401,8 @@ bool HttpModule::end(const char*, int, SnortConfig*) if (params->js_norm_param.normalize_javascript) { params->js_norm_param.js_norm = - new HttpJsNorm(params->js_norm_param.max_javascript_whitespaces, params->uri_param); + new HttpJsNorm(params->js_norm_param.max_javascript_whitespaces, params->uri_param, + params->js_norm_param.normalization_depth); } prepare_http_header_list(params); diff --git a/src/service_inspectors/http_inspect/http_module.h b/src/service_inspectors/http_inspect/http_module.h index 4b0adfe7d..f7d76f650 100755 --- a/src/service_inspectors/http_inspect/http_module.h +++ b/src/service_inspectors/http_inspect/http_module.h @@ -52,6 +52,7 @@ public: public: ~JsNormParam(); bool normalize_javascript = false; + int normalization_depth = 0; int max_javascript_whitespaces = 200; class HttpJsNorm* js_norm = nullptr; }; diff --git a/src/service_inspectors/http_inspect/test/http_module_test.cc b/src/service_inspectors/http_inspect/test/http_module_test.cc 
index 48776f604..ec5d19029 100755 --- a/src/service_inspectors/http_inspect/test/http_module_test.cc +++ b/src/service_inspectors/http_inspect/test/http_module_test.cc @@ -64,9 +64,9 @@ int32_t substr_to_code(const uint8_t*, const int32_t, const StrCode []) { return long HttpTestManager::print_amount {}; bool HttpTestManager::print_hex {}; -HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_) : - max_javascript_whitespaces(0), uri_param(uri_param_), javascript_search_mpse(nullptr), - htmltype_search_mpse(nullptr) {} +HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_, int) : + normalizer(nullptr), max_javascript_whitespaces(0), uri_param(uri_param_), + normalization_depth(0), javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {} HttpJsNorm::~HttpJsNorm() = default; void HttpJsNorm::configure(){} diff --git a/src/service_inspectors/http_inspect/test/http_uri_norm_test.cc b/src/service_inspectors/http_inspect/test/http_uri_norm_test.cc index 295f7f5df..83a2041ef 100755 --- a/src/service_inspectors/http_inspect/test/http_uri_norm_test.cc +++ b/src/service_inspectors/http_inspect/test/http_uri_norm_test.cc @@ -53,9 +53,9 @@ LiteralSearch* LiteralSearch::instantiate(LiteralSearch::Handle*, const uint8_t* void show_stats(PegCount*, const PegInfo*, unsigned, const char*) { } void show_stats(PegCount*, const PegInfo*, const IndexVec&, const char*, FILE*) { } -HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_) : - max_javascript_whitespaces(0), uri_param(uri_param_), javascript_search_mpse(nullptr), - htmltype_search_mpse(nullptr) {} +HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_, int) : + normalizer(nullptr), max_javascript_whitespaces(0), uri_param(uri_param_), + normalization_depth(0), javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {} HttpJsNorm::~HttpJsNorm() = default; void HttpJsNorm::configure() {} diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt 
index a8ebc8e6f..bbaf23380 100644 --- a/src/utils/CMakeLists.txt +++ b/src/utils/CMakeLists.txt @@ -15,18 +15,28 @@ set( UTIL_INCLUDES util.h util_ber.h util_cstring.h - util_jsnorm.h util_unfold.h util_utf.h ) +set (FLEX_EXECUTABLE flex++) + +FLEX_TARGET ( js_tokenizer ${CMAKE_CURRENT_SOURCE_DIR}/js_tokenizer.l + ${CMAKE_CURRENT_BINARY_DIR}/js_tokenizer.cc + COMPILE_FLAGS -Ca +) + add_library ( utils OBJECT ${UTIL_INCLUDES} ${SNPRINTF_SOURCES} + ${FLEX_js_tokenizer_OUTPUTS} boyer_moore.cc dnet_header.h dyn_array.cc dyn_array.h + js_normalizer.cc + js_normalizer.h + js_tokenizer.h kmap.cc segment_mem.cc sflsq.cc @@ -36,6 +46,7 @@ add_library ( utils OBJECT util_ber.cc util_cstring.cc util_jsnorm.cc + util_jsnorm.h util_net.cc util_net.h util_unfold.cc diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc new file mode 100644 index 000000000..f5793b9ac --- /dev/null +++ b/src/utils/js_normalizer.cc @@ -0,0 +1,42 @@ +//-------------------------------------------------------------------------- +// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved. +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License Version 2 as published +// by the Free Software Foundation. You may not use, modify or distribute +// this program under any other version of the GNU General Public License. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+//-------------------------------------------------------------------------- +// js_normalizer.cc author Oleksandr Serhiienko + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "js_normalizer.h" + +#include + +#include "js_tokenizer.h" + +using namespace snort; + +int JSNormalizer::normalize(const char* srcbuf, uint16_t srclen, char* dstbuf, uint16_t dstlen, + const char** ptr, int* bytes_copied, int norm_depth) +{ + std::stringstream in, out; + + in.rdbuf()->pubsetbuf(const_cast(srcbuf), (norm_depth >= srclen) ? srclen : norm_depth); + JSTokenizer tokenizer(in, out, dstbuf, dstlen, ptr, bytes_copied); + + return tokenizer.yylex(); +} + diff --git a/src/utils/js_normalizer.h b/src/utils/js_normalizer.h new file mode 100644 index 000000000..4688e98d1 --- /dev/null +++ b/src/utils/js_normalizer.h @@ -0,0 +1,36 @@ +//-------------------------------------------------------------------------- +// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved. +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License Version 2 as published +// by the Free Software Foundation. You may not use, modify or distribute +// this program under any other version of the GNU General Public License. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+//-------------------------------------------------------------------------- +// js_normalizer.h author Oleksandr Serhiienko + +#ifndef JS_NORMALIZER_H +#define JS_NORMALIZER_H + +#include "main/snort_types.h" + +namespace snort +{ +class JSNormalizer +{ +public: + static int normalize(const char* srcbuf, uint16_t srclen, char* dstbuf, uint16_t dstlen, + const char** ptr, int* bytes_copied, int norm_depth); +}; +} + +#endif //JS_NORMALIZER_H + diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h new file mode 100644 index 000000000..892fdc425 --- /dev/null +++ b/src/utils/js_tokenizer.h @@ -0,0 +1,106 @@ +//-------------------------------------------------------------------------- +// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved. +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License Version 2 as published +// by the Free Software Foundation. You may not use, modify or distribute +// this program under any other version of the GNU General Public License. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+//-------------------------------------------------------------------------- +// js_tokenizer.h author Oleksandr Serhiienko + +#ifndef JS_TOKENIZER_H +#define JS_TOKENIZER_H + +#include + +#include "log/messages.h" + +class JSTokenizer : public yyFlexLexer +{ +private: + enum JSToken + { + UNDEFINED = 0, + IDENTIFIER, + KEYWORD, + PUNCTUATOR, + OPERATOR, + LITERAL, + DIRECTIVE, + TAG_SCRIPT_OPEN + }; + +public: + // we need an out stream because yyFlexLexer API strongly requires that + JSTokenizer(std::stringstream& in, std::stringstream& out, char* dstbuf, + const uint16_t dstlen, const char** ptr, int* bytes_copied); + ~JSTokenizer() override; + + // so, Flex will treat this class as yyclass + // must come with yyclass Flex option + // don't need to define this method, it'll be substituted by Flex + // returns 0 if OK, 1 otherwise + int yylex() override; + +protected: + [[noreturn]] void LexerError(const char* msg) override + { snort::FatalError("%s", msg); } + +private: + void init(); + + // scan buffers control + void switch_to_temporal(const std::string& data); + void switch_to_initial(); + + bool eval_identifier(const char* lexeme); + bool eval_string_literal(const char* match_prefix, const char quotes); + bool eval_regex_literal(const char* match_prefix); + bool eval_eof(); + void skip_single_line_comment(); + void skip_multi_line_comment(); + + bool parse_literal(const std::string& match_prefix, const char sentinel_ch, + std::string& result, bool is_regex = false); + + // main lexeme handler + // all scanned tokens must pass here + bool eval(const JSToken tok, const char* lexeme); + + bool normalize_identifier(const JSToken prev_tok, const char* lexeme); + bool normalize_punctuator(const JSToken prev_tok, const char* lexeme); + bool normalize_operator(const JSToken prev_tok, const char* lexeme); + bool normalize_directive(const JSToken prev_tok, const char* lexeme); + bool normalize_tag_script_open(const JSToken prev_tok, const char* lexeme); + bool 
normalize_undefined(const JSToken prev_tok, const char* lexeme); + bool normalize_lexeme(const JSToken prev_tok, const char* lexeme); + + bool write_output(const std::string& str); + + void update_ptr(); + +private: + char* dstbuf; + const uint16_t dstlen; + const char** ptr; + int* bytes_copied; + + struct ScanBuffers; + ScanBuffers* buffers = nullptr; + std::stringstream temporal; + + JSToken prev_tok = UNDEFINED; + +}; + +#endif // JS_TOKENIZER_H + diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l new file mode 100644 index 000000000..f4f51fbf5 --- /dev/null +++ b/src/utils/js_tokenizer.l @@ -0,0 +1,1348 @@ +/*-------------------------------------------------------------------------- +// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved. +// +// This program is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License Version 2 as published +// by the Free Software Foundation. You may not use, modify or distribute +// this program under any other version of the GNU General Public License. +// +// This program is distributed in the hope that it will be useful, but +// WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// General Public License for more details. +// +// You should have received a copy of the GNU General Public License along +// with this program; if not, write to the Free Software Foundation, Inc., +// 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+//-------------------------------------------------------------------------- +// js_tokenizer.l author Oleksandr Serhiienko +*/ + +/* Define JSTokenizer as yyClass */ +%option yyclass="JSTokenizer" +/* Disable yywrap() generation */ +%option noyywrap + +%{ + #ifdef HAVE_CONFIG_H + #include "config.h" + #endif + + #include "utils/js_tokenizer.h" +%} + +/* The following grammar was created based on ECMAScript specification */ +/* source https://ecma-international.org/ecma-262/5.1/ */ + +/* whitespaces */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.2 */ +TAB \x9 +VT \xB +FF \xC +SP \x20 +NBSP \xA0 +BOM \xEF\xBB\xBF +WHITESPACES {TAB}|{VT}|{FF}|{SP}|{NBSP}|{BOM} + +/* single char escape sequences */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 */ +NUL \x0 +BS \x8 +HT \x9 +CHAR_ESCAPE_SEQUENCES {NUL}|{BS}|{HT} + +/* line terminators */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.3 */ +LF \xA +CR \xD +LS \xE2\x80\xA8 +PS \xE2\x80\xA9 +LINE_TERMINATORS {LF}|{CR}|{LS}|{PS} + +/* comments */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.4 */ +SINGLE_LINE_COMMENT "//" +MULTI_LINE_COMMENT "/\*" + +/* directives */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-14.1 */ +USE_STRICT_DIRECTIVE "\"use strict\"";*|"\'use strict\'";* + +/* keywords */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6.1.1 */ +KEYWORD break|case|debugger|in|import|protected|do|else|function|try|implements|static|instanceof|new|this|class|let|typeof|var|with|enum|private|catch|continue|default|extends|public|finally|for|if|super|yield|return|switch|throw|const|interface|void|while|delete|export|package + +/* punctuators */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.7 */ +CLOSING_BRACES ")"|"]" +PUNCTUATOR 
"{"|"}"|"("|"["|">="|"=="|"!="|"==="|"!=="|"."|";"|","|"<"|">"|"<="|"<<"|">>"|">>>"|"&"|"|"|"^"|"!"|"&&"|"||"|"?"|":"|"="|"+="|"-="|"*="|"%="|"<<="|">>="|">>>="|"&="|"|="|"^="|"~" +OPERATOR "+"|"-"|"*"|"++"|"--"|"%" +DIV_OPERATOR "/" +DIV_ASSIGNMENT_OPERATOR "/=" + +/* Unicode letter ranges (categories Lu, Ll, Lt, Lm, Lo and Nl) */ +/* generated with unicode_range_generator.l */ +/* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */ +/* the script above converts Unicode multi-byte ranges into UTF-8 encoding regex ranges since Flex doesn't support Unicode */ +/* for example, the Unicode range from 0x00D1 to 0x00D6 will look like this: \xC3[\x91-\x96] */ +/* just because each character in this range consists of two UTF-8 characters: \xC3 and the one of the range [\x91-\x96] */ +/* using this trick it's possible to handle unicode character ranges within the Flex regular expressions */ +/* i.e. the idea is to represent Unicode as a UTF-8 character sequence */ +LETTER_RNG_1 [A-Z] +LETTER_RNG_2 [a-z] +LETTER_RNG_3 \xC2\xAA +LETTER_RNG_4 \xC2\xB5 +LETTER_RNG_5 \xC2\xBA +LETTER_RNG_6 \xC3[\x80-\x96] +LETTER_RNG_7 \xC3[\x98-\xB6] +LETTER_RNG_8 \xC3[\xB8-\xBF]|\xCB[\x80-\x81]|[\xC4-\xCA][\x80-\xBF] +LETTER_RNG_9 \xCB[\x86-\x91] +LETTER_RNG_10 \xCB[\xA0-\xA4] +LETTER_RNG_11 \xCB\xAC +LETTER_RNG_12 \xCB\xAE +LETTER_RNG_13 \xCD[\xB0-\xB4] +LETTER_RNG_14 \xCD[\xB6-\xBD] +LETTER_RNG_15 \xCD\xBF +LETTER_RNG_16 \xCE\x86 +LETTER_RNG_17 \xCE[\x88-\xBF]|\xCF[\x80-\xB5] +LETTER_RNG_18 \xCF[\xB7-\xBF]|\xD2[\x80-\x81]|[\xD0-\xD1][\x80-\xBF] +LETTER_RNG_19 \xD2[\x8A-\xBF]|\xD5[\x80-\x99]|[\xD3-\xD4][\x80-\xBF] +LETTER_RNG_20 \xD5[\xA0-\xBF]|\xD6[\x80-\x88] +LETTER_RNG_21 \xD7[\x90-\xB2] +LETTER_RNG_22 \xD8[\xA0-\xBF]|\xD9[\x80-\x8A] +LETTER_RNG_23 \xD9[\xAE-\xAF] +LETTER_RNG_24 \xD9[\xB1-\xBF]|\xDB[\x80-\x93]|\xDA[\x80-\xBF] +LETTER_RNG_25 \xDB\x95 +LETTER_RNG_26 \xDB[\xA5-\xA6] +LETTER_RNG_27 \xDB[\xAE-\xAF] +LETTER_RNG_28 \xDB[\xBA-\xBC] 
+LETTER_RNG_29 \xDB\xBF +LETTER_RNG_30 \xDC\x90 +LETTER_RNG_31 \xDC[\x92-\xAF] +LETTER_RNG_32 \xDD[\x8D-\xBF]|\xDE[\x80-\xA5] +LETTER_RNG_33 \xDE\xB1 +LETTER_RNG_34 \xDF[\x8A-\xAA] +LETTER_RNG_35 \xDF[\xB4-\xB5] +LETTER_RNG_36 \xDF\xBA +LETTER_RNG_37 \xE0\xA0[\x80-\x95] +LETTER_RNG_38 \xE0\xA0\x9A +LETTER_RNG_39 \xE0\xA0\xA4 +LETTER_RNG_40 \xE0\xA0\xA8 +LETTER_RNG_41 \xE0\xA1[\x80-\x98] +LETTER_RNG_42 \xE0(\xA1[\xA0-\xBF]|\xA3[\x80-\x87]|\xA2[\x80-\xBF]) +LETTER_RNG_43 \xE0\xA4[\x84-\xB9] +LETTER_RNG_44 \xE0\xA4\xBD +LETTER_RNG_45 \xE0\xA5\x90 +LETTER_RNG_46 \xE0\xA5[\x98-\xA1] +LETTER_RNG_47 \xE0(\xA5[\xB1-\xBF]|\xA6\x80) +LETTER_RNG_48 \xE0\xA6[\x85-\xB9] +LETTER_RNG_49 \xE0\xA6\xBD +LETTER_RNG_50 \xE0\xA7\x8E +LETTER_RNG_51 \xE0\xA7[\x9C-\xA1] +LETTER_RNG_52 \xE0\xA7[\xB0-\xB1] +LETTER_RNG_53 \xE0\xA7\xBC +LETTER_RNG_54 \xE0\xA8[\x85-\xB9] +LETTER_RNG_55 \xE0\xA9[\x99-\x9E] +LETTER_RNG_56 \xE0\xA9[\xB2-\xB4] +LETTER_RNG_57 \xE0\xAA[\x85-\xB9] +LETTER_RNG_58 \xE0\xAA\xBD +LETTER_RNG_59 \xE0\xAB[\x90-\xA1] +LETTER_RNG_60 \xE0\xAB\xB9 +LETTER_RNG_61 \xE0\xAC[\x85-\xB9] +LETTER_RNG_62 \xE0\xAC\xBD +LETTER_RNG_63 \xE0\xAD[\x9C-\xA1] +LETTER_RNG_64 \xE0\xAD\xB1 +LETTER_RNG_65 \xE0\xAE[\x83-\xB9] +LETTER_RNG_66 \xE0\xAF\x90 +LETTER_RNG_67 \xE0\xB0[\x85-\xBD] +LETTER_RNG_68 \xE0\xB1[\x98-\xA1] +LETTER_RNG_69 \xE0\xB2\x80 +LETTER_RNG_70 \xE0\xB2[\x85-\xB9] +LETTER_RNG_71 \xE0\xB2\xBD +LETTER_RNG_72 \xE0\xB3[\x9E-\xA1] +LETTER_RNG_73 \xE0\xB3[\xB1-\xB2] +LETTER_RNG_74 \xE0\xB4[\x84-\xBA] +LETTER_RNG_75 \xE0\xB4\xBD +LETTER_RNG_76 \xE0\xB5\x8E +LETTER_RNG_77 \xE0\xB5[\x94-\x96] +LETTER_RNG_78 \xE0\xB5[\x9F-\xA1] +LETTER_RNG_79 \xE0\xB5[\xBA-\xBF] +LETTER_RNG_80 \xE0(\xB6[\x85-\xBF]|\xB7[\x80-\x86]) +LETTER_RNG_81 \xE0\xB8[\x81-\xB0] +LETTER_RNG_82 \xE0\xB8[\xB2-\xB3] +LETTER_RNG_83 \xE0\xB9[\x80-\x86] +LETTER_RNG_84 \xE0\xBA[\x81-\xB0] +LETTER_RNG_85 \xE0\xBA[\xB2-\xB3] +LETTER_RNG_86 \xE0(\xBA[\xBD-\xBF]|\xBB[\x80-\x86]) +LETTER_RNG_87 \xE0(\xBB[\x9C-\xBF]|\xBC\x80) 
+LETTER_RNG_88 \xE0\xBD[\x80-\xAC] +LETTER_RNG_89 \xE0\xBE[\x88-\x8C] +LETTER_RNG_90 \xE1\x80[\x80-\xAA] +LETTER_RNG_91 \xE1\x80\xBF +LETTER_RNG_92 \xE1\x81[\x90-\x95] +LETTER_RNG_93 \xE1\x81[\x9A-\x9D] +LETTER_RNG_94 \xE1\x81\xA1 +LETTER_RNG_95 \xE1\x81[\xA5-\xA6] +LETTER_RNG_96 \xE1\x81[\xAE-\xB0] +LETTER_RNG_97 \xE1(\x81[\xB5-\xBF]|\x82[\x80-\x81]) +LETTER_RNG_98 \xE1\x82\x8E +LETTER_RNG_99 \xE1(\x82[\xA0-\xBF]|\x83[\x80-\xBA]) +LETTER_RNG_100 \xE1(\x83[\xBC-\xBF]|\x8D[\x80-\x9A]|[\x84-\x8C][\x80-\xBF]) +LETTER_RNG_101 \xE1\x8E[\x80-\x8F] +LETTER_RNG_102 \xE1(\x8E[\xA0-\xBF]|\x8F[\x80-\xBD]) +LETTER_RNG_103 \xE1(\x90[\x81-\xBF]|\x99[\x80-\xAC]|[\x91-\x98][\x80-\xBF]) +LETTER_RNG_104 \xE1\x99[\xAF-\xBF] +LETTER_RNG_105 \xE1\x9A[\x81-\x9A] +LETTER_RNG_106 \xE1(\x9A[\xA0-\xBF]|\x9B[\x80-\xAA]) +LETTER_RNG_107 \xE1(\x9B[\xAE-\xBF]|\x9C[\x80-\x91]) +LETTER_RNG_108 \xE1\x9C[\xA0-\xB1] +LETTER_RNG_109 \xE1\x9D[\x80-\x91] +LETTER_RNG_110 \xE1\x9D[\xA0-\xB0] +LETTER_RNG_111 \xE1\x9E[\x80-\xB3] +LETTER_RNG_112 \xE1\x9F\x97 +LETTER_RNG_113 \xE1\x9F\x9C +LETTER_RNG_114 \xE1(\xA0[\xA0-\xBF]|\xA2[\x80-\x84]|\xA1[\x80-\xBF]) +LETTER_RNG_115 \xE1\xA2[\x87-\xA8] +LETTER_RNG_116 \xE1(\xA2[\xAA-\xBF]|\xA4[\x80-\x9E]|\xA3[\x80-\xBF]) +LETTER_RNG_117 \xE1(\xA5[\x90-\xBF]|\xA7[\x80-\x89]|\xA6[\x80-\xBF]) +LETTER_RNG_118 \xE1\xA8[\x80-\x96] +LETTER_RNG_119 \xE1(\xA8[\xA0-\xBF]|\xA9[\x80-\x94]) +LETTER_RNG_120 \xE1\xAA\xA7 +LETTER_RNG_121 \xE1\xAC[\x85-\xB3] +LETTER_RNG_122 \xE1\xAD[\x85-\x8B] +LETTER_RNG_123 \xE1\xAE[\x83-\xA0] +LETTER_RNG_124 \xE1\xAE[\xAE-\xAF] +LETTER_RNG_125 \xE1(\xAE[\xBA-\xBF]|\xAF[\x80-\xA5]) +LETTER_RNG_126 \xE1\xB0[\x80-\xA3] +LETTER_RNG_127 \xE1\xB1[\x8D-\x8F] +LETTER_RNG_128 \xE1\xB1[\x9A-\xBD] +LETTER_RNG_129 \xE1\xB2[\x80-\xBF] +LETTER_RNG_130 \xE1\xB3[\xA9-\xAC] +LETTER_RNG_131 \xE1\xB3[\xAE-\xB3] +LETTER_RNG_132 \xE1\xB3[\xB5-\xB6] +LETTER_RNG_133 \xE1(\xB3[\xBA-\xBF]|[\xB4-\xB6][\x80-\xBF]) +LETTER_RNG_134 \xE1(\xBE[\x80-\xBC]|[\xB8-\xBD][\x80-\xBF]) 
+LETTER_RNG_135 \xE1\xBE\xBE +LETTER_RNG_136 \xE1\xBF[\x82-\x8C] +LETTER_RNG_137 \xE1\xBF[\x90-\x9B] +LETTER_RNG_138 \xE1\xBF[\xA0-\xAC] +LETTER_RNG_139 \xE1\xBF[\xB2-\xBC] +LETTER_RNG_140 \xE2\x81\xB1 +LETTER_RNG_141 \xE2\x81\xBF +LETTER_RNG_142 \xE2\x82[\x90-\x9C] +LETTER_RNG_143 \xE2\x84\x82 +LETTER_RNG_144 \xE2\x84\x87 +LETTER_RNG_145 \xE2\x84[\x8A-\x93] +LETTER_RNG_146 \xE2\x84\x95 +LETTER_RNG_147 \xE2\x84[\x99-\x9D] +LETTER_RNG_148 \xE2\x84\xA4 +LETTER_RNG_149 \xE2\x84\xA6 +LETTER_RNG_150 \xE2\x84\xA8 +LETTER_RNG_151 \xE2\x84[\xAA-\xAD] +LETTER_RNG_152 \xE2\x84[\xAF-\xB9] +LETTER_RNG_153 \xE2\x84[\xBC-\xBF] +LETTER_RNG_154 \xE2\x85[\x85-\x89] +LETTER_RNG_155 \xE2\x85\x8E +LETTER_RNG_156 \xE2(\x85[\xA0-\xBF]|\x86[\x80-\x88]) +LETTER_RNG_157 \xE2(\xB3[\x80-\xA4]|[\xB0-\xB2][\x80-\xBF]) +LETTER_RNG_158 \xE2\xB3[\xAB-\xAE] +LETTER_RNG_159 \xE2\xB3[\xB2-\xB3] +LETTER_RNG_160 \xE2(\xB5[\x80-\xAF]|\xB4[\x80-\xBF]) +LETTER_RNG_161 \xE2(\xB7[\x80-\x9E]|\xB6[\x80-\xBF]) +LETTER_RNG_162 \xE2\xB8\xAF +LETTER_RNG_163 \xE3\x80[\x85-\x87] +LETTER_RNG_164 \xE3\x80[\xA1-\xA9] +LETTER_RNG_165 \xE3\x80[\xB1-\xB5] +LETTER_RNG_166 \xE3\x80[\xB8-\xBC] +LETTER_RNG_167 \xE3(\x81[\x81-\xBF]|\x82[\x80-\x96]) +LETTER_RNG_168 \xE3\x82[\x9D-\x9F] +LETTER_RNG_169 \xE3(\x82[\xA1-\xBF]|\x83[\x80-\xBA]) +LETTER_RNG_170 \xE3(\x83[\xBC-\xBF]|\x86[\x80-\x8E]|[\x84-\x85][\x80-\xBF]) +LETTER_RNG_171 \xE3\x86[\xA0-\xBF] +LETTER_RNG_172 \xE3\x87[\xB0-\xBF] +LETTER_RNG_173 (\xE3[\x90-\xBF]|\xE4[\x80-\xB6])[\x80-\xBF] +LETTER_RNG_174 \xEA\x92[\x80-\x8C]|(\xE4[\xB8-\xBF]|\xEA[\x80-\x91]|[\xE5-\xE9][\x80-\xBF])[\x80-\xBF] +LETTER_RNG_175 \xEA\x93[\x90-\xBD] +LETTER_RNG_176 \xEA(\x98[\x80-\x8C]|[\x94-\x97][\x80-\xBF]) +LETTER_RNG_177 \xEA\x98[\x90-\x9F] +LETTER_RNG_178 \xEA(\x98[\xAA-\xBF]|\x99[\x80-\xAE]) +LETTER_RNG_179 \xEA(\x99\xBF|\x9A[\x80-\x9D]) +LETTER_RNG_180 \xEA(\x9A[\xA0-\xBF]|\x9B[\x80-\xAF]) +LETTER_RNG_181 \xEA\x9C[\x97-\x9F] +LETTER_RNG_182 
\xEA(\x9C[\xA2-\xBF]|\x9E[\x80-\x88]|\x9D[\x80-\xBF]) +LETTER_RNG_183 \xEA(\x9E[\x8B-\xBF]|\xA0[\x80-\x81]|\x9F[\x80-\xBF]) +LETTER_RNG_184 \xEA\xA0[\x83-\x85] +LETTER_RNG_185 \xEA\xA0[\x87-\x8A] +LETTER_RNG_186 \xEA\xA0[\x8C-\xA2] +LETTER_RNG_187 \xEA\xA1[\x80-\xB3] +LETTER_RNG_188 \xEA\xA2[\x82-\xB3] +LETTER_RNG_189 \xEA\xA3[\xB2-\xB7] +LETTER_RNG_190 \xEA\xA3\xBB +LETTER_RNG_191 \xEA\xA3[\xBD-\xBE] +LETTER_RNG_192 \xEA\xA4[\x8A-\xA5] +LETTER_RNG_193 \xEA(\xA4[\xB0-\xBF]|\xA5[\x80-\x86]) +LETTER_RNG_194 \xEA\xA5[\xA0-\xBC] +LETTER_RNG_195 \xEA\xA6[\x84-\xB2] +LETTER_RNG_196 \xEA\xA7\x8F +LETTER_RNG_197 \xEA\xA7[\xA0-\xA4] +LETTER_RNG_198 \xEA\xA7[\xA6-\xAF] +LETTER_RNG_199 \xEA(\xA7[\xBA-\xBF]|\xA8[\x80-\xA8]) +LETTER_RNG_200 \xEA\xA9[\x80-\x82] +LETTER_RNG_201 \xEA\xA9[\x84-\x8B] +LETTER_RNG_202 \xEA\xA9[\xA0-\xB6] +LETTER_RNG_203 \xEA\xA9\xBA +LETTER_RNG_204 \xEA(\xA9[\xBE-\xBF]|\xAA[\x80-\xAF]) +LETTER_RNG_205 \xEA\xAA\xB1 +LETTER_RNG_206 \xEA\xAA[\xB5-\xB6] +LETTER_RNG_207 \xEA\xAA[\xB9-\xBD] +LETTER_RNG_208 \xEA\xAB\x80 +LETTER_RNG_209 \xEA\xAB[\x82-\x9D] +LETTER_RNG_210 \xEA\xAB[\xA0-\xAA] +LETTER_RNG_211 \xEA\xAB[\xB2-\xB4] +LETTER_RNG_212 \xEA(\xAC[\x81-\xBF]|\xAD[\x80-\x9A]) +LETTER_RNG_213 \xEA\xAD[\x9C-\xA9] +LETTER_RNG_214 \xEA(\xAD[\xB0-\xBF]|\xAF[\x80-\xA2]|\xAE[\x80-\xBF]) +LETTER_RNG_215 \xED\x9F[\x80-\xBB]|(\xEA[\xB0-\xBF]|\xED[\x80-\x9E]|[\xEB-\xEC][\x80-\xBF])[\x80-\xBF] +LETTER_RNG_216 \xEF(\xAC[\x80-\x9D]|[\xA4-\xAB][\x80-\xBF]) +LETTER_RNG_217 \xEF\xAC[\x9F-\xA8] +LETTER_RNG_218 \xEF(\xAC[\xAA-\xBF]|\xAE[\x80-\xB1]|\xAD[\x80-\xBF]) +LETTER_RNG_219 \xEF(\xAF[\x93-\xBF]|\xB4[\x80-\xBD]|[\xB0-\xB3][\x80-\xBF]) +LETTER_RNG_220 \xEF(\xB5[\x90-\xBF]|\xB7[\x80-\xBB]|\xB6[\x80-\xBF]) +LETTER_RNG_221 \xEF(\xB9[\xB0-\xBF]|\xBB[\x80-\xBC]|\xBA[\x80-\xBF]) +LETTER_RNG_222 \xEF\xBC[\xA1-\xBA] +LETTER_RNG_223 \xEF\xBD[\x81-\x9A] +LETTER_RNG_224 \xEF(\xBD[\xA6-\xBF]|\xBF[\x80-\x9C]|\xBE[\x80-\xBF]) +LETTER_RNG_225 
\xF0\x90(\x83[\x80-\xBA]|[\x80-\x82][\x80-\xBF]) +LETTER_RNG_226 \xF0\x90\x85[\x80-\xB4] +LETTER_RNG_227 \xF0\x90(\x8B[\x80-\x90]|\x8A[\x80-\xBF]) +LETTER_RNG_228 \xF0\x90\x8C[\x80-\x9F] +LETTER_RNG_229 \xF0\x90(\x8C[\xAD-\xBF]|\x8D[\x80-\xB5]) +LETTER_RNG_230 \xF0\x90\x8E[\x80-\x9D] +LETTER_RNG_231 \xF0\x90(\x8E[\xA0-\xBF]|\x8F[\x80-\x8F]) +LETTER_RNG_232 \xF0\x90(\x8F[\x91-\xBF]|\x92[\x80-\x9D]|[\x90-\x91][\x80-\xBF]) +LETTER_RNG_233 \xF0\x90(\x92[\xB0-\xBF]|\x95[\x80-\xA3]|[\x93-\x94][\x80-\xBF]) +LETTER_RNG_234 \xF0\x90(\xA1[\x80-\x95]|[\x98-\xA0][\x80-\xBF]) +LETTER_RNG_235 \xF0\x90\xA1[\xA0-\xB6] +LETTER_RNG_236 \xF0\x90\xA2[\x80-\x9E] +LETTER_RNG_237 \xF0\x90\xA3[\xA0-\xB5] +LETTER_RNG_238 \xF0\x90\xA4[\x80-\x95] +LETTER_RNG_239 \xF0\x90\xA4[\xA0-\xB9] +LETTER_RNG_240 \xF0\x90\xA6[\x80-\xB7] +LETTER_RNG_241 \xF0\x90\xA6[\xBE-\xBF] +LETTER_RNG_242 \xF0\x90\xA8\x80 +LETTER_RNG_243 \xF0\x90\xA8[\x90-\xB5] +LETTER_RNG_244 \xF0\x90\xA9[\xA0-\xBC] +LETTER_RNG_245 \xF0\x90\xAA[\x80-\x9C] +LETTER_RNG_246 \xF0\x90\xAB[\x80-\x87] +LETTER_RNG_247 \xF0\x90\xAB[\x89-\xA4] +LETTER_RNG_248 \xF0\x90\xAC[\x80-\xB5] +LETTER_RNG_249 \xF0\x90\xAD[\x80-\x95] +LETTER_RNG_250 \xF0\x90\xAD[\xA0-\xB2] +LETTER_RNG_251 \xF0\x90\xAE[\x80-\x91] +LETTER_RNG_252 \xF0\x90(\xB3[\x80-\xB2]|[\xB0-\xB2][\x80-\xBF]) +LETTER_RNG_253 \xF0\x90\xB4[\x80-\xA3] +LETTER_RNG_254 \xF0\x90\xBA[\x80-\xA9] +LETTER_RNG_255 \xF0\x90(\xBA[\xB0-\xBF]|\xBC[\x80-\x9C]|\xBB[\x80-\xBF]) +LETTER_RNG_256 \xF0\x90(\xBC[\xA7-\xBF]|\xBD[\x80-\x85]) +LETTER_RNG_257 \xF0\x90(\xBE[\xB0-\xBF]|\xBF[\x80-\x84]) +LETTER_RNG_258 \xF0\x90\xBF[\xA0-\xB6] +LETTER_RNG_259 \xF0\x91\x80[\x83-\xB7] +LETTER_RNG_260 \xF0\x91\x82[\x83-\xAF] +LETTER_RNG_261 \xF0\x91\x83[\x90-\xA8] +LETTER_RNG_262 \xF0\x91\x84[\x83-\xA6] +LETTER_RNG_263 \xF0\x91\x85\x84 +LETTER_RNG_264 \xF0\x91\x85[\x87-\xB2] +LETTER_RNG_265 \xF0\x91\x85\xB6 +LETTER_RNG_266 \xF0\x91\x86[\x83-\xB2] +LETTER_RNG_267 \xF0\x91\x87[\x81-\x84] +LETTER_RNG_268 \xF0\x91\x87\x9A 
+LETTER_RNG_269 \xF0\x91\x87\x9C +LETTER_RNG_270 \xF0\x91\x88[\x80-\xAB] +LETTER_RNG_271 \xF0\x91\x8A[\x80-\xA8] +LETTER_RNG_272 \xF0\x91(\x8A[\xB0-\xBF]|\x8B[\x80-\x9E]) +LETTER_RNG_273 \xF0\x91\x8C[\x85-\xB9] +LETTER_RNG_274 \xF0\x91\x8C\xBD +LETTER_RNG_275 \xF0\x91\x8D\x90 +LETTER_RNG_276 \xF0\x91\x8D[\x9D-\xA1] +LETTER_RNG_277 \xF0\x91\x90[\x80-\xB4] +LETTER_RNG_278 \xF0\x91\x91[\x87-\x8A] +LETTER_RNG_279 \xF0\x91(\x91[\x9F-\xBF]|\x92[\x80-\xAF]) +LETTER_RNG_280 \xF0\x91\x93[\x84-\x85] +LETTER_RNG_281 \xF0\x91\x93\x87 +LETTER_RNG_282 \xF0\x91\x96[\x80-\xAE] +LETTER_RNG_283 \xF0\x91\x97[\x98-\x9B] +LETTER_RNG_284 \xF0\x91\x98[\x80-\xAF] +LETTER_RNG_285 \xF0\x91\x99\x84 +LETTER_RNG_286 \xF0\x91\x9A[\x80-\xAA] +LETTER_RNG_287 \xF0\x91\x9A\xB8 +LETTER_RNG_288 \xF0\x91\x9C[\x80-\x9A] +LETTER_RNG_289 \xF0\x91\xA0[\x80-\xAB] +LETTER_RNG_290 \xF0\x91(\xA2[\xA0-\xBF]|\xA3[\x80-\x9F]) +LETTER_RNG_291 \xF0\x91(\xA3\xBF|\xA4[\x80-\xAF]) +LETTER_RNG_292 \xF0\x91\xA4\xBF +LETTER_RNG_293 \xF0\x91\xA5\x81 +LETTER_RNG_294 \xF0\x91(\xA6[\xA0-\xBF]|\xA7[\x80-\x90]) +LETTER_RNG_295 \xF0\x91\xA7\xA1 +LETTER_RNG_296 \xF0\x91\xA7\xA3 +LETTER_RNG_297 \xF0\x91\xA8\x80 +LETTER_RNG_298 \xF0\x91\xA8[\x8B-\xB2] +LETTER_RNG_299 \xF0\x91\xA8\xBA +LETTER_RNG_300 \xF0\x91\xA9\x90 +LETTER_RNG_301 \xF0\x91(\xA9[\x9C-\xBF]|\xAA[\x80-\x89]) +LETTER_RNG_302 \xF0\x91\xAA\x9D +LETTER_RNG_303 \xF0\x91(\xB0[\x80-\xAE]|[\xAB-\xAF][\x80-\xBF]) +LETTER_RNG_304 \xF0\x91\xB1\x80 +LETTER_RNG_305 \xF0\x91(\xB1[\xB2-\xBF]|\xB2[\x80-\x8F]) +LETTER_RNG_306 \xF0\x91\xB4[\x80-\xB0] +LETTER_RNG_307 \xF0\x91\xB5\x86 +LETTER_RNG_308 \xF0\x91(\xB5[\xA0-\xBF]|\xB6[\x80-\x89]) +LETTER_RNG_309 \xF0\x91\xB6\x98 +LETTER_RNG_310 \xF0\x91\xBB[\xA0-\xB2] +LETTER_RNG_311 \xF0\x91\xBE\xB0 +LETTER_RNG_312 \xF0\x92(\x91[\x80-\xAE]|[\x80-\x90][\x80-\xBF]) +LETTER_RNG_313 \xF0(\x93\x90[\x80-\xAE]|(\x92[\x92-\xBF]|\x93[\x80-\x8F])[\x80-\xBF]) +LETTER_RNG_314 
\xF0(\x96\xA9[\x80-\x9E]|(\x94[\x90-\xBF]|\x96[\x80-\xA8]|\x95[\x80-\xBF])[\x80-\xBF]) +LETTER_RNG_315 \xF0\x96\xAB[\x90-\xAD] +LETTER_RNG_316 \xF0\x96\xAC[\x80-\xAF] +LETTER_RNG_317 \xF0\x96\xAD[\x80-\x83] +LETTER_RNG_318 \xF0\x96(\xAD[\xA3-\xBF]|[\xAE-\xB9][\x80-\xBF]) +LETTER_RNG_319 \xF0\x96(\xBD[\x80-\x8A]|\xBC[\x80-\xBF]) +LETTER_RNG_320 \xF0\x96\xBD\x90 +LETTER_RNG_321 \xF0\x96(\xBE[\x93-\xBF]|\xBF[\x80-\xA1]) +LETTER_RNG_322 \xF0\x96\xBF\xA3 +LETTER_RNG_323 \xF0(\x9B\xB2[\x80-\x99]|(\x9B[\x80-\xB1]|[\x97-\x9A][\x80-\xBF])[\x80-\xBF]) +LETTER_RNG_324 \xF0\x9D(\x9B\x80|[\x90-\x9A][\x80-\xBF]) +LETTER_RNG_325 \xF0\x9D\x9B[\x82-\x9A] +LETTER_RNG_326 \xF0\x9D\x9B[\x9C-\xBA] +LETTER_RNG_327 \xF0\x9D(\x9B[\xBC-\xBF]|\x9C[\x80-\x94]) +LETTER_RNG_328 \xF0\x9D\x9C[\x96-\xB4] +LETTER_RNG_329 \xF0\x9D(\x9C[\xB6-\xBF]|\x9D[\x80-\x8E]) +LETTER_RNG_330 \xF0\x9D\x9D[\x90-\xAE] +LETTER_RNG_331 \xF0\x9D(\x9D[\xB0-\xBF]|\x9E[\x80-\x88]) +LETTER_RNG_332 \xF0\x9D\x9E[\x8A-\xA8] +LETTER_RNG_333 \xF0\x9D(\x9E[\xAA-\xBF]|\x9F[\x80-\x82]) +LETTER_RNG_334 \xF0\x9D\x9F[\x84-\x8B] +LETTER_RNG_335 \xF0\x9E\x84[\x80-\xAC] +LETTER_RNG_336 \xF0\x9E\x84[\xB7-\xBD] +LETTER_RNG_337 \xF0\x9E\x85\x8E +LETTER_RNG_338 \xF0\x9E\x8B[\x80-\xAB] +LETTER_RNG_339 \xF0\x9E(\xA3[\x80-\x84]|[\xA0-\xA2][\x80-\xBF]) +LETTER_RNG_340 \xF0\x9E(\xA5[\x80-\x83]|\xA4[\x80-\xBF]) +LETTER_RNG_341 \xF0\x9E\xA5\x8B +LETTER_RNG_342 \xF0\x9E(\xBA[\x80-\xBB]|[\xB8-\xB9][\x80-\xBF]) +LETTER_RNG_343 \xF0(\xB1\x8D[\x80-\x8A]|(\xB1[\x80-\x8C]|[\xA0-\xB0][\x80-\xBF])[\x80-\xBF]) + +LETTER_GROUP_1 {LETTER_RNG_1}|{LETTER_RNG_2}|{LETTER_RNG_3}|{LETTER_RNG_4}|{LETTER_RNG_5}|{LETTER_RNG_6}|{LETTER_RNG_7}|{LETTER_RNG_8}|{LETTER_RNG_9}|{LETTER_RNG_10} +LETTER_GROUP_2 {LETTER_GROUP_1}|{LETTER_RNG_11}|{LETTER_RNG_12}|{LETTER_RNG_13}|{LETTER_RNG_14}|{LETTER_RNG_15}|{LETTER_RNG_16}|{LETTER_RNG_17}|{LETTER_RNG_18}|{LETTER_RNG_19} +LETTER_GROUP_3 
{LETTER_GROUP_2}|{LETTER_RNG_20}|{LETTER_RNG_21}|{LETTER_RNG_22}|{LETTER_RNG_23}|{LETTER_RNG_24}|{LETTER_RNG_25}|{LETTER_RNG_26}|{LETTER_RNG_27}|{LETTER_RNG_28} +LETTER_GROUP_4 {LETTER_GROUP_3}|{LETTER_RNG_29}|{LETTER_RNG_30}|{LETTER_RNG_31}|{LETTER_RNG_32}|{LETTER_RNG_33}|{LETTER_RNG_34}|{LETTER_RNG_35}|{LETTER_RNG_36}|{LETTER_RNG_37} +LETTER_GROUP_5 {LETTER_GROUP_4}|{LETTER_RNG_38}|{LETTER_RNG_39}|{LETTER_RNG_40}|{LETTER_RNG_41}|{LETTER_RNG_42}|{LETTER_RNG_43}|{LETTER_RNG_44}|{LETTER_RNG_45}|{LETTER_RNG_46} +LETTER_GROUP_6 {LETTER_GROUP_5}|{LETTER_RNG_47}|{LETTER_RNG_48}|{LETTER_RNG_49}|{LETTER_RNG_50}|{LETTER_RNG_51}|{LETTER_RNG_52}|{LETTER_RNG_53}|{LETTER_RNG_54}|{LETTER_RNG_55} +LETTER_GROUP_7 {LETTER_GROUP_6}|{LETTER_RNG_56}|{LETTER_RNG_57}|{LETTER_RNG_58}|{LETTER_RNG_59}|{LETTER_RNG_60}|{LETTER_RNG_61}|{LETTER_RNG_62}|{LETTER_RNG_63}|{LETTER_RNG_64} +LETTER_GROUP_8 {LETTER_GROUP_7}|{LETTER_RNG_65}|{LETTER_RNG_66}|{LETTER_RNG_67}|{LETTER_RNG_68}|{LETTER_RNG_69}|{LETTER_RNG_70}|{LETTER_RNG_71}|{LETTER_RNG_72}|{LETTER_RNG_73} +LETTER_GROUP_9 {LETTER_GROUP_8}|{LETTER_RNG_74}|{LETTER_RNG_75}|{LETTER_RNG_76}|{LETTER_RNG_77}|{LETTER_RNG_78}|{LETTER_RNG_79}|{LETTER_RNG_80}|{LETTER_RNG_81}|{LETTER_RNG_82} +LETTER_GROUP_10 {LETTER_GROUP_9}|{LETTER_RNG_83}|{LETTER_RNG_84}|{LETTER_RNG_85}|{LETTER_RNG_86}|{LETTER_RNG_87}|{LETTER_RNG_88}|{LETTER_RNG_89}|{LETTER_RNG_90}|{LETTER_RNG_91} +LETTER_GROUP_11 {LETTER_GROUP_10}|{LETTER_RNG_92}|{LETTER_RNG_93}|{LETTER_RNG_94}|{LETTER_RNG_95}|{LETTER_RNG_96}|{LETTER_RNG_97}|{LETTER_RNG_98}|{LETTER_RNG_99}|{LETTER_RNG_100} +LETTER_GROUP_12 {LETTER_GROUP_11}|{LETTER_RNG_101}|{LETTER_RNG_102}|{LETTER_RNG_103}|{LETTER_RNG_104}|{LETTER_RNG_105}|{LETTER_RNG_106}|{LETTER_RNG_107}|{LETTER_RNG_108}|{LETTER_RNG_109} +LETTER_GROUP_13 {LETTER_GROUP_12}|{LETTER_RNG_110}|{LETTER_RNG_111}|{LETTER_RNG_112}|{LETTER_RNG_113}|{LETTER_RNG_114}|{LETTER_RNG_115}|{LETTER_RNG_116}|{LETTER_RNG_117}|{LETTER_RNG_118} +LETTER_GROUP_14 
{LETTER_GROUP_13}|{LETTER_RNG_119}|{LETTER_RNG_120}|{LETTER_RNG_121}|{LETTER_RNG_122}|{LETTER_RNG_123}|{LETTER_RNG_124}|{LETTER_RNG_125}|{LETTER_RNG_126}|{LETTER_RNG_127} +LETTER_GROUP_15 {LETTER_GROUP_14}|{LETTER_RNG_128}|{LETTER_RNG_129}|{LETTER_RNG_130}|{LETTER_RNG_131}|{LETTER_RNG_132}|{LETTER_RNG_133}|{LETTER_RNG_134}|{LETTER_RNG_135}|{LETTER_RNG_136} +LETTER_GROUP_16 {LETTER_GROUP_15}|{LETTER_RNG_137}|{LETTER_RNG_138}|{LETTER_RNG_139}|{LETTER_RNG_140}|{LETTER_RNG_141}|{LETTER_RNG_142}|{LETTER_RNG_143}|{LETTER_RNG_144}|{LETTER_RNG_145} +LETTER_GROUP_17 {LETTER_GROUP_16}|{LETTER_RNG_146}|{LETTER_RNG_147}|{LETTER_RNG_148}|{LETTER_RNG_149}|{LETTER_RNG_150}|{LETTER_RNG_151}|{LETTER_RNG_152}|{LETTER_RNG_153}|{LETTER_RNG_154} +LETTER_GROUP_18 {LETTER_GROUP_17}|{LETTER_RNG_155}|{LETTER_RNG_156}|{LETTER_RNG_157}|{LETTER_RNG_158}|{LETTER_RNG_159}|{LETTER_RNG_160}|{LETTER_RNG_161}|{LETTER_RNG_162}|{LETTER_RNG_163} +LETTER_GROUP_19 {LETTER_GROUP_18}|{LETTER_RNG_164}|{LETTER_RNG_165}|{LETTER_RNG_166}|{LETTER_RNG_167}|{LETTER_RNG_168}|{LETTER_RNG_169}|{LETTER_RNG_170}|{LETTER_RNG_171}|{LETTER_RNG_172} +LETTER_GROUP_20 {LETTER_GROUP_19}|{LETTER_RNG_173}|{LETTER_RNG_174}|{LETTER_RNG_175}|{LETTER_RNG_176}|{LETTER_RNG_177}|{LETTER_RNG_178}|{LETTER_RNG_179}|{LETTER_RNG_180}|{LETTER_RNG_181} +LETTER_GROUP_21 {LETTER_GROUP_20}|{LETTER_RNG_182}|{LETTER_RNG_183}|{LETTER_RNG_184}|{LETTER_RNG_185}|{LETTER_RNG_186}|{LETTER_RNG_187}|{LETTER_RNG_188}|{LETTER_RNG_189}|{LETTER_RNG_190} +LETTER_GROUP_22 {LETTER_GROUP_21}|{LETTER_RNG_191}|{LETTER_RNG_192}|{LETTER_RNG_193}|{LETTER_RNG_194}|{LETTER_RNG_195}|{LETTER_RNG_196}|{LETTER_RNG_197}|{LETTER_RNG_198}|{LETTER_RNG_199} +LETTER_GROUP_23 {LETTER_GROUP_22}|{LETTER_RNG_200}|{LETTER_RNG_201}|{LETTER_RNG_202}|{LETTER_RNG_203}|{LETTER_RNG_204}|{LETTER_RNG_205}|{LETTER_RNG_206}|{LETTER_RNG_207}|{LETTER_RNG_208} +LETTER_GROUP_24
{LETTER_GROUP_23}|{LETTER_RNG_209}|{LETTER_RNG_210}|{LETTER_RNG_211}|{LETTER_RNG_212}|{LETTER_RNG_213}|{LETTER_RNG_214}|{LETTER_RNG_215}|{LETTER_RNG_216}|{LETTER_RNG_217} +LETTER_GROUP_25 {LETTER_GROUP_24}|{LETTER_RNG_218}|{LETTER_RNG_219}|{LETTER_RNG_220}|{LETTER_RNG_221}|{LETTER_RNG_222}|{LETTER_RNG_223}|{LETTER_RNG_224}|{LETTER_RNG_225}|{LETTER_RNG_226} +LETTER_GROUP_26 {LETTER_GROUP_25}|{LETTER_RNG_227}|{LETTER_RNG_228}|{LETTER_RNG_229}|{LETTER_RNG_230}|{LETTER_RNG_231}|{LETTER_RNG_232}|{LETTER_RNG_233}|{LETTER_RNG_234}|{LETTER_RNG_235} +LETTER_GROUP_27 {LETTER_GROUP_26}|{LETTER_RNG_236}|{LETTER_RNG_237}|{LETTER_RNG_238}|{LETTER_RNG_239}|{LETTER_RNG_240}|{LETTER_RNG_241}|{LETTER_RNG_242}|{LETTER_RNG_243}|{LETTER_RNG_244} +LETTER_GROUP_28 {LETTER_GROUP_27}|{LETTER_RNG_245}|{LETTER_RNG_246}|{LETTER_RNG_247}|{LETTER_RNG_248}|{LETTER_RNG_249}|{LETTER_RNG_250}|{LETTER_RNG_251}|{LETTER_RNG_252}|{LETTER_RNG_253} +LETTER_GROUP_29 {LETTER_GROUP_28}|{LETTER_RNG_254}|{LETTER_RNG_255}|{LETTER_RNG_256}|{LETTER_RNG_257}|{LETTER_RNG_258}|{LETTER_RNG_259}|{LETTER_RNG_260}|{LETTER_RNG_261}|{LETTER_RNG_262} +LETTER_GROUP_30 {LETTER_GROUP_29}|{LETTER_RNG_263}|{LETTER_RNG_264}|{LETTER_RNG_265}|{LETTER_RNG_266}|{LETTER_RNG_267}|{LETTER_RNG_268}|{LETTER_RNG_269}|{LETTER_RNG_270}|{LETTER_RNG_271} +LETTER_GROUP_31 {LETTER_GROUP_30}|{LETTER_RNG_272}|{LETTER_RNG_273}|{LETTER_RNG_274}|{LETTER_RNG_275}|{LETTER_RNG_276}|{LETTER_RNG_277}|{LETTER_RNG_278}|{LETTER_RNG_279}|{LETTER_RNG_280} +LETTER_GROUP_32 {LETTER_GROUP_31}|{LETTER_RNG_281}|{LETTER_RNG_282}|{LETTER_RNG_283}|{LETTER_RNG_284}|{LETTER_RNG_285}|{LETTER_RNG_286}|{LETTER_RNG_287}|{LETTER_RNG_288}|{LETTER_RNG_289} +LETTER_GROUP_33 {LETTER_GROUP_32}|{LETTER_RNG_290}|{LETTER_RNG_291}|{LETTER_RNG_292}|{LETTER_RNG_293}|{LETTER_RNG_294}|{LETTER_RNG_295}|{LETTER_RNG_296}|{LETTER_RNG_297}|{LETTER_RNG_298} +LETTER_GROUP_34 
{LETTER_GROUP_33}|{LETTER_RNG_299}|{LETTER_RNG_300}|{LETTER_RNG_301}|{LETTER_RNG_302}|{LETTER_RNG_303}|{LETTER_RNG_304}|{LETTER_RNG_305}|{LETTER_RNG_306}|{LETTER_RNG_307} +LETTER_GROUP_35 {LETTER_GROUP_34}|{LETTER_RNG_308}|{LETTER_RNG_309}|{LETTER_RNG_310}|{LETTER_RNG_311}|{LETTER_RNG_312}|{LETTER_RNG_313}|{LETTER_RNG_314}|{LETTER_RNG_315}|{LETTER_RNG_316} +LETTER_GROUP_36 {LETTER_GROUP_35}|{LETTER_RNG_317}|{LETTER_RNG_318}|{LETTER_RNG_319}|{LETTER_RNG_320}|{LETTER_RNG_321}|{LETTER_RNG_322}|{LETTER_RNG_323}|{LETTER_RNG_324}|{LETTER_RNG_325} +LETTER_GROUP_37 {LETTER_GROUP_36}|{LETTER_RNG_326}|{LETTER_RNG_327}|{LETTER_RNG_328}|{LETTER_RNG_329}|{LETTER_RNG_330}|{LETTER_RNG_331}|{LETTER_RNG_332}|{LETTER_RNG_333}|{LETTER_RNG_334} +LETTER_GROUP_38 {LETTER_GROUP_37}|{LETTER_RNG_335}|{LETTER_RNG_336}|{LETTER_RNG_337}|{LETTER_RNG_338}|{LETTER_RNG_339}|{LETTER_RNG_340}|{LETTER_RNG_341}|{LETTER_RNG_342}|{LETTER_RNG_343} + +LETTER_G_GROUP_1 {LETTER_GROUP_1}|{LETTER_GROUP_2}|{LETTER_GROUP_3}|{LETTER_GROUP_4}|{LETTER_GROUP_5}|{LETTER_GROUP_6}|{LETTER_GROUP_7}|{LETTER_GROUP_8}|{LETTER_GROUP_9}|{LETTER_GROUP_10} +LETTER_G_GROUP_2 {LETTER_G_GROUP_1}|{LETTER_GROUP_11}|{LETTER_GROUP_12}|{LETTER_GROUP_13}|{LETTER_GROUP_14}|{LETTER_GROUP_15}|{LETTER_GROUP_16}|{LETTER_GROUP_17}|{LETTER_GROUP_18}|{LETTER_GROUP_19} +LETTER_G_GROUP_3 {LETTER_G_GROUP_2}|{LETTER_GROUP_20}|{LETTER_GROUP_21}|{LETTER_GROUP_22}|{LETTER_GROUP_23}|{LETTER_GROUP_24}|{LETTER_GROUP_25}|{LETTER_GROUP_26}|{LETTER_GROUP_27}|{LETTER_GROUP_28} +LETTER_G_GROUP_4 {LETTER_G_GROUP_3}|{LETTER_GROUP_29}|{LETTER_GROUP_30}|{LETTER_GROUP_31}|{LETTER_GROUP_32}|{LETTER_GROUP_33}|{LETTER_GROUP_34}|{LETTER_GROUP_35}|{LETTER_GROUP_36}|{LETTER_GROUP_37} +LETTER_G_GROUP_5 {LETTER_G_GROUP_4}|{LETTER_GROUP_38} + +UNICODE_LETTER {LETTER_G_GROUP_1}|{LETTER_G_GROUP_2}|{LETTER_G_GROUP_3}|{LETTER_G_GROUP_4}|{LETTER_G_GROUP_5} + +/* Unicode digit ranges (category Nd) */ +/* generated with unicode_range_generator.l */ +/* UTF-8 ranges generated 
with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */ +DIGIT_RNG_1 [0-9] +DIGIT_RNG_2 \xD9[\xA0-\xA9] +DIGIT_RNG_3 \xDB[\xB0-\xB9] +DIGIT_RNG_4 \xDF[\x80-\x89] +DIGIT_RNG_5 \xE0\xA5[\xA6-\xAF] +DIGIT_RNG_6 \xE0\xA7[\xA6-\xAF] +DIGIT_RNG_7 \xE0\xA9[\xA6-\xAF] +DIGIT_RNG_8 \xE0\xAB[\xA6-\xAF] +DIGIT_RNG_9 \xE0\xAD[\xA6-\xAF] +DIGIT_RNG_10 \xE0\xAF[\xA6-\xAF] +DIGIT_RNG_11 \xE0\xB1[\xA6-\xAF] +DIGIT_RNG_12 \xE0\xB3[\xA6-\xAF] +DIGIT_RNG_13 \xE0\xB5[\xA6-\xAF] +DIGIT_RNG_14 \xE0\xB7[\xA6-\xAF] +DIGIT_RNG_15 \xE0\xB9[\x90-\x99] +DIGIT_RNG_16 \xE0\xBB[\x90-\x99] +DIGIT_RNG_17 \xE0\xBC[\xA0-\xA9] +DIGIT_RNG_18 \xE1\x81[\x80-\x89] +DIGIT_RNG_19 \xE1\x82[\x90-\x99] +DIGIT_RNG_20 \xE1\x9F[\xA0-\xA9] +DIGIT_RNG_21 \xE1\xA0[\x90-\x99] +DIGIT_RNG_22 \xE1\xA5[\x86-\x8F] +DIGIT_RNG_23 \xE1\xA7[\x90-\x99] +DIGIT_RNG_24 \xE1\xAA[\x80-\x99] +DIGIT_RNG_25 \xE1\xAD[\x90-\x99] +DIGIT_RNG_26 \xE1\xAE[\xB0-\xB9] +DIGIT_RNG_27 \xE1\xB1[\x80-\x89] +DIGIT_RNG_28 \xE1\xB1[\x90-\x99] +DIGIT_RNG_29 \xEA\x98[\xA0-\xA9] +DIGIT_RNG_30 \xEA\xA3[\x90-\x99] +DIGIT_RNG_31 \xEA\xA4[\x80-\x89] +DIGIT_RNG_32 \xEA\xA7[\x90-\x99] +DIGIT_RNG_33 \xEA\xA7[\xB0-\xB9] +DIGIT_RNG_34 \xEA\xA9[\x90-\x99] +DIGIT_RNG_35 \xEA\xAF[\xB0-\xB9] +DIGIT_RNG_36 \xEF\xBC[\x90-\x99] +DIGIT_RNG_37 \xF0\x90\x92[\xA0-\xA9] +DIGIT_RNG_38 \xF0\x90\xB4[\xB0-\xB9] +DIGIT_RNG_39 \xF0\x91\x81[\xA6-\xAF] +DIGIT_RNG_40 \xF0\x91\x83[\xB0-\xB9] +DIGIT_RNG_41 \xF0\x91\x84[\xB6-\xBF] +DIGIT_RNG_42 \xF0\x91\x87[\x90-\x99] +DIGIT_RNG_43 \xF0\x91\x8B[\xB0-\xB9] +DIGIT_RNG_44 \xF0\x91\x91[\x90-\x99] +DIGIT_RNG_45 \xF0\x91\x93[\x90-\x99] +DIGIT_RNG_46 \xF0\x91\x99[\x90-\x99] +DIGIT_RNG_47 \xF0\x91\x9B[\x80-\x89] +DIGIT_RNG_48 \xF0\x91\x9C[\xB0-\xB9] +DIGIT_RNG_49 \xF0\x91\xA3[\xA0-\xA9] +DIGIT_RNG_50 \xF0\x91\xA5[\x90-\x99] +DIGIT_RNG_51 \xF0\x91\xB1[\x90-\x99] +DIGIT_RNG_52 \xF0\x91\xB5[\x90-\x99] +DIGIT_RNG_53 \xF0\x91\xB6[\xA0-\xA9] +DIGIT_RNG_54 \xF0\x96\xA9[\xA0-\xA9] +DIGIT_RNG_55 \xF0\x96\xAD[\x90-\x99] +DIGIT_RNG_56 
\xF0\x9D\x9F[\x8E-\xBF] +DIGIT_RNG_57 \xF0\x9E\x85[\x80-\x89] +DIGIT_RNG_58 \xF0\x9E\x8B[\xB0-\xB9] +DIGIT_RNG_59 \xF0\x9E\xA5[\x90-\x99] +DIGIT_RNG_60 \xF0\x9F\xAF[\xB0-\xB9] + +/* DIGIT_GROUP_N is cumulative: each group extends the previous one, so together they must reference every DIGIT_RNG_1..DIGIT_RNG_60 */ +DIGIT_GROUP_1 {DIGIT_RNG_1}|{DIGIT_RNG_2}|{DIGIT_RNG_3}|{DIGIT_RNG_4}|{DIGIT_RNG_5}|{DIGIT_RNG_6}|{DIGIT_RNG_7}|{DIGIT_RNG_8}|{DIGIT_RNG_9}|{DIGIT_RNG_10} +DIGIT_GROUP_2 {DIGIT_GROUP_1}|{DIGIT_RNG_11}|{DIGIT_RNG_12}|{DIGIT_RNG_13}|{DIGIT_RNG_14}|{DIGIT_RNG_15}|{DIGIT_RNG_16}|{DIGIT_RNG_17}|{DIGIT_RNG_18} +DIGIT_GROUP_3 {DIGIT_GROUP_2}|{DIGIT_RNG_19}|{DIGIT_RNG_20}|{DIGIT_RNG_21}|{DIGIT_RNG_22}|{DIGIT_RNG_23}|{DIGIT_RNG_24}|{DIGIT_RNG_25}|{DIGIT_RNG_26} +DIGIT_GROUP_4 {DIGIT_GROUP_3}|{DIGIT_RNG_27}|{DIGIT_RNG_28}|{DIGIT_RNG_29}|{DIGIT_RNG_30}|{DIGIT_RNG_31}|{DIGIT_RNG_32}|{DIGIT_RNG_33}|{DIGIT_RNG_34} +DIGIT_GROUP_5 {DIGIT_GROUP_4}|{DIGIT_RNG_35}|{DIGIT_RNG_36}|{DIGIT_RNG_37}|{DIGIT_RNG_38}|{DIGIT_RNG_39}|{DIGIT_RNG_40}|{DIGIT_RNG_41}|{DIGIT_RNG_42} +DIGIT_GROUP_6 {DIGIT_GROUP_5}|{DIGIT_RNG_43}|{DIGIT_RNG_44}|{DIGIT_RNG_45}|{DIGIT_RNG_46}|{DIGIT_RNG_47}|{DIGIT_RNG_48}|{DIGIT_RNG_49}|{DIGIT_RNG_50} +DIGIT_GROUP_7 {DIGIT_GROUP_6}|{DIGIT_RNG_51}|{DIGIT_RNG_52}|{DIGIT_RNG_53}|{DIGIT_RNG_54}|{DIGIT_RNG_55}|{DIGIT_RNG_56}|{DIGIT_RNG_57}|{DIGIT_RNG_58} +DIGIT_GROUP_8 {DIGIT_GROUP_7}|{DIGIT_RNG_59}|{DIGIT_RNG_60} + +UNICODE_DIGIT {DIGIT_GROUP_1}|{DIGIT_GROUP_2}|{DIGIT_GROUP_3}|{DIGIT_GROUP_4}|{DIGIT_GROUP_5}|{DIGIT_GROUP_6}|{DIGIT_GROUP_7}|{DIGIT_GROUP_8} + +/* Unicode combining mark ranges (categories Mn and Mc) */ +/* generated with unicode_range_generator.l */ +/* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */ +COMB_MARK_RNG_1 \xCD[\x80-\xAF]|\xCC[\x80-\xBF] +COMB_MARK_RNG_2 \xD2[\x83-\x87] +COMB_MARK_RNG_3 \xD6[\x91-\xBD] +COMB_MARK_RNG_4 \xD6\xBF +COMB_MARK_RNG_5 \xD7[\x81-\x82] +COMB_MARK_RNG_6 \xD7[\x84-\x85] +COMB_MARK_RNG_7 \xD7\x87 +COMB_MARK_RNG_8 \xD8[\x90-\x9A] +COMB_MARK_RNG_9 \xD9[\x8B-\x9F] +COMB_MARK_RNG_10 \xD9\xB0 +COMB_MARK_RNG_11
\xDB[\x96-\x9C] +COMB_MARK_RNG_12 \xDB[\x9F-\xA4] +COMB_MARK_RNG_13 \xDB[\xA7-\xA8] +COMB_MARK_RNG_14 \xDB[\xAA-\xAD] +COMB_MARK_RNG_15 \xDC\x91 +COMB_MARK_RNG_16 \xDC[\xB0-\xBF]|\xDD[\x80-\x8A] +COMB_MARK_RNG_17 \xDE[\xA6-\xB0] +COMB_MARK_RNG_18 \xDF[\xAB-\xB3] +COMB_MARK_RNG_19 \xDF\xBD +COMB_MARK_RNG_20 \xE0\xA0[\x96-\x99] +COMB_MARK_RNG_21 \xE0\xA0[\x9B-\xA3] +COMB_MARK_RNG_22 \xE0\xA0[\xA5-\xA7] +COMB_MARK_RNG_23 \xE0\xA0[\xA9-\xAD] +COMB_MARK_RNG_24 \xE0\xA1[\x99-\x9B] +COMB_MARK_RNG_25 \xE0\xA3[\x93-\xA1] +COMB_MARK_RNG_26 \xE0(\xA3[\xA3-\xBF]|\xA4[\x80-\x83]) +COMB_MARK_RNG_27 \xE0\xA4[\xBA-\xBC] +COMB_MARK_RNG_28 \xE0(\xA4[\xBE-\xBF]|\xA5[\x80-\x8F]) +COMB_MARK_RNG_29 \xE0\xA5[\x91-\x97] +COMB_MARK_RNG_30 \xE0\xA5[\xA2-\xA3] +COMB_MARK_RNG_31 \xE0\xA6[\x81-\x83] +COMB_MARK_RNG_32 \xE0\xA6\xBC +COMB_MARK_RNG_33 \xE0(\xA6[\xBE-\xBF]|\xA7[\x80-\x8D]) +COMB_MARK_RNG_34 \xE0\xA7\x97 +COMB_MARK_RNG_35 \xE0\xA7[\xA2-\xA3] +COMB_MARK_RNG_36 \xE0(\xA7[\xBE-\xBF]|\xA8[\x80-\x83]) +COMB_MARK_RNG_37 \xE0(\xA8[\xBC-\xBF]|\xA9[\x80-\x91]) +COMB_MARK_RNG_38 \xE0\xA9[\xB0-\xB1] +COMB_MARK_RNG_39 \xE0\xA9\xB5 +COMB_MARK_RNG_40 \xE0\xAA[\x81-\x83] +COMB_MARK_RNG_41 \xE0\xAA\xBC +COMB_MARK_RNG_42 \xE0(\xAA[\xBE-\xBF]|\xAB[\x80-\x8D]) +COMB_MARK_RNG_43 \xE0\xAB[\xA2-\xA3] +COMB_MARK_RNG_44 \xE0(\xAB[\xBA-\xBF]|\xAC[\x80-\x83]) +COMB_MARK_RNG_45 \xE0\xAC\xBC +COMB_MARK_RNG_46 \xE0(\xAC[\xBE-\xBF]|\xAD[\x80-\x97]) +COMB_MARK_RNG_47 \xE0\xAD[\xA2-\xA3] +COMB_MARK_RNG_48 \xE0\xAE\x82 +COMB_MARK_RNG_49 \xE0(\xAE[\xBE-\xBF]|\xAF[\x80-\x8D]) +COMB_MARK_RNG_50 \xE0\xAF\x97 +COMB_MARK_RNG_51 \xE0\xB0[\x80-\x84] +COMB_MARK_RNG_52 \xE0(\xB0[\xBE-\xBF]|\xB1[\x80-\x96]) +COMB_MARK_RNG_53 \xE0\xB1[\xA2-\xA3] +COMB_MARK_RNG_54 \xE0\xB2[\x81-\x83] +COMB_MARK_RNG_55 \xE0\xB2\xBC +COMB_MARK_RNG_56 \xE0(\xB2[\xBE-\xBF]|\xB3[\x80-\x96]) +COMB_MARK_RNG_57 \xE0\xB3[\xA2-\xA3] +COMB_MARK_RNG_58 \xE0\xB4[\x80-\x83] +COMB_MARK_RNG_59 \xE0\xB4[\xBB-\xBC] +COMB_MARK_RNG_60 
\xE0(\xB4[\xBE-\xBF]|\xB5[\x80-\x8D]) +COMB_MARK_RNG_61 \xE0\xB5\x97 +COMB_MARK_RNG_62 \xE0\xB5[\xA2-\xA3] +COMB_MARK_RNG_63 \xE0\xB6[\x81-\x83] +COMB_MARK_RNG_64 \xE0\xB7[\x8A-\x9F] +COMB_MARK_RNG_65 \xE0\xB7[\xB2-\xB3] +COMB_MARK_RNG_66 \xE0\xB8\xB1 +COMB_MARK_RNG_67 \xE0\xB8[\xB4-\xBA] +COMB_MARK_RNG_68 \xE0\xB9[\x87-\x8E] +COMB_MARK_RNG_69 \xE0\xBA\xB1 +COMB_MARK_RNG_70 \xE0\xBA[\xB4-\xBC] +COMB_MARK_RNG_71 \xE0\xBB[\x88-\x8D] +COMB_MARK_RNG_72 \xE0\xBC[\x98-\x99] +COMB_MARK_RNG_73 \xE0\xBC\xB5 +COMB_MARK_RNG_74 \xE0\xBC\xB7 +COMB_MARK_RNG_75 \xE0\xBC\xB9 +COMB_MARK_RNG_76 \xE0\xBC[\xBE-\xBF] +COMB_MARK_RNG_77 \xE0(\xBD[\xB1-\xBF]|\xBE[\x80-\x84]) +COMB_MARK_RNG_78 \xE0\xBE[\x86-\x87] +COMB_MARK_RNG_79 \xE0\xBE[\x8D-\xBC] +COMB_MARK_RNG_80 \xE0\xBF\x86 +COMB_MARK_RNG_81 \xE1\x80[\xAB-\xBE] +COMB_MARK_RNG_82 \xE1\x81[\x96-\x99] +COMB_MARK_RNG_83 \xE1\x81[\x9E-\xA0] +COMB_MARK_RNG_84 \xE1\x81[\xA2-\xA4] +COMB_MARK_RNG_85 \xE1\x81[\xA7-\xAD] +COMB_MARK_RNG_86 \xE1\x81[\xB1-\xB4] +COMB_MARK_RNG_87 \xE1\x82[\x82-\x8D] +COMB_MARK_RNG_88 \xE1\x82\x8F +COMB_MARK_RNG_89 \xE1\x82[\x9A-\x9D] +COMB_MARK_RNG_90 \xE1\x8D[\x9D-\x9F] +COMB_MARK_RNG_91 \xE1\x9C[\x92-\x94] +COMB_MARK_RNG_92 \xE1\x9C[\xB2-\xB4] +COMB_MARK_RNG_93 \xE1\x9D[\x92-\x93] +COMB_MARK_RNG_94 \xE1\x9D[\xB2-\xB3] +COMB_MARK_RNG_95 \xE1(\x9E[\xB4-\xBF]|\x9F[\x80-\x93]) +COMB_MARK_RNG_96 \xE1\x9F\x9D +COMB_MARK_RNG_97 \xE1\xA0[\x8B-\x8D] +COMB_MARK_RNG_98 \xE1\xA2[\x85-\x86] +COMB_MARK_RNG_99 \xE1\xA2\xA9 +COMB_MARK_RNG_100 \xE1\xA4[\xA0-\xBB] +COMB_MARK_RNG_101 \xE1\xA8[\x97-\x9B] +COMB_MARK_RNG_102 \xE1\xA9[\x95-\xBF] +COMB_MARK_RNG_103 \xE1\xAA[\xB0-\xBD] +COMB_MARK_RNG_104 \xE1(\xAA\xBF|\xAC[\x80-\x84]|\xAB[\x80-\xBF]) +COMB_MARK_RNG_105 \xE1(\xAC[\xB4-\xBF]|\xAD[\x80-\x84]) +COMB_MARK_RNG_106 \xE1\xAD[\xAB-\xB3] +COMB_MARK_RNG_107 \xE1\xAE[\x80-\x82] +COMB_MARK_RNG_108 \xE1\xAE[\xA1-\xAD] +COMB_MARK_RNG_109 \xE1\xAF[\xA6-\xB3] +COMB_MARK_RNG_110 \xE1\xB0[\xA4-\xB7] +COMB_MARK_RNG_111 \xE1\xB3[\x90-\x92]
+COMB_MARK_RNG_112 \xE1\xB3[\x94-\xA8] +COMB_MARK_RNG_113 \xE1\xB3\xAD +COMB_MARK_RNG_114 \xE1\xB3\xB4 +COMB_MARK_RNG_115 \xE1\xB3[\xB7-\xB9] +COMB_MARK_RNG_116 \xE1\xB7[\x80-\xBF] +COMB_MARK_RNG_117 \xE2\x83[\x90-\x9C] +COMB_MARK_RNG_118 \xE2\x83\xA1 +COMB_MARK_RNG_119 \xE2\x83[\xA5-\xB0] +COMB_MARK_RNG_120 \xE2\xB3[\xAF-\xB1] +COMB_MARK_RNG_121 \xE2\xB5\xBF +COMB_MARK_RNG_122 \xE2\xB7[\xA0-\xBF] +COMB_MARK_RNG_123 \xE3\x80[\xAA-\xAF] +COMB_MARK_RNG_124 \xE3\x82[\x99-\x9A] +COMB_MARK_RNG_125 \xEA\x99\xAF +COMB_MARK_RNG_126 \xEA\x99[\xB4-\xBD] +COMB_MARK_RNG_127 \xEA\x9A[\x9E-\x9F] +COMB_MARK_RNG_128 \xEA\x9B[\xB0-\xB1] +COMB_MARK_RNG_129 \xEA\xA0\x82 +COMB_MARK_RNG_130 \xEA\xA0\x86 +COMB_MARK_RNG_131 \xEA\xA0\x8B +COMB_MARK_RNG_132 \xEA\xA0[\xA3-\xA7] +COMB_MARK_RNG_133 \xEA\xA0\xAC +COMB_MARK_RNG_134 \xEA\xA2[\x80-\x81] +COMB_MARK_RNG_135 \xEA(\xA2[\xB4-\xBF]|\xA3[\x80-\x85]) +COMB_MARK_RNG_136 \xEA\xA3[\xA0-\xB1] +COMB_MARK_RNG_137 \xEA\xA3\xBF +COMB_MARK_RNG_138 \xEA\xA4[\xA6-\xAD] +COMB_MARK_RNG_139 \xEA\xA5[\x87-\x93] +COMB_MARK_RNG_140 \xEA\xA6[\x80-\x83] +COMB_MARK_RNG_141 \xEA(\xA6[\xB3-\xBF]|\xA7\x80) +COMB_MARK_RNG_142 \xEA\xA7\xA5 +COMB_MARK_RNG_143 \xEA\xA8[\xA9-\xB6] +COMB_MARK_RNG_144 \xEA\xA9\x83 +COMB_MARK_RNG_145 \xEA\xA9[\x8C-\x8D] +COMB_MARK_RNG_146 \xEA\xA9[\xBB-\xBD] +COMB_MARK_RNG_147 \xEA\xAA\xB0 +COMB_MARK_RNG_148 \xEA\xAA[\xB2-\xB4] +COMB_MARK_RNG_149 \xEA\xAA[\xB7-\xB8] +COMB_MARK_RNG_150 \xEA\xAA[\xBE-\xBF] +COMB_MARK_RNG_151 \xEA\xAB\x81 +COMB_MARK_RNG_152 \xEA\xAB[\xAB-\xAF] +COMB_MARK_RNG_153 \xEA\xAB[\xB5-\xB6] +COMB_MARK_RNG_154 \xEA\xAF[\xA3-\xAA] +COMB_MARK_RNG_155 \xEA\xAF[\xAC-\xAD] +COMB_MARK_RNG_156 \xEF\xAC\x9E +COMB_MARK_RNG_157 \xEF\xB8[\x80-\x8F] +COMB_MARK_RNG_158 \xEF\xB8[\xA0-\xAF] +COMB_MARK_RNG_159 \xF0\x90\x87\xBD +COMB_MARK_RNG_160 \xF0\x90\x8B\xA0 +COMB_MARK_RNG_161 \xF0\x90\x8D[\xB6-\xBA] +COMB_MARK_RNG_162 \xF0\x90\xA8[\x81-\x8F] +COMB_MARK_RNG_163 \xF0\x90\xA8[\xB8-\xBF] +COMB_MARK_RNG_164 
\xF0\x90\xAB[\xA5-\xA6] +COMB_MARK_RNG_165 \xF0\x90\xB4[\xA4-\xA7] +COMB_MARK_RNG_166 \xF0\x90\xBA[\xAB-\xAC] +COMB_MARK_RNG_167 \xF0\x90\xBD[\x86-\x90] +COMB_MARK_RNG_168 \xF0\x91\x80[\x80-\x82] +COMB_MARK_RNG_169 \xF0\x91(\x80[\xB8-\xBF]|\x81[\x80-\x86]) +COMB_MARK_RNG_170 \xF0\x91(\x81\xBF|\x82[\x80-\x82]) +COMB_MARK_RNG_171 \xF0\x91\x82[\xB0-\xBA] +COMB_MARK_RNG_172 \xF0\x91\x84[\x80-\x82] +COMB_MARK_RNG_173 \xF0\x91\x84[\xA7-\xB4] +COMB_MARK_RNG_174 \xF0\x91\x85[\x85-\x86] +COMB_MARK_RNG_175 \xF0\x91\x85\xB3 +COMB_MARK_RNG_176 \xF0\x91\x86[\x80-\x82] +COMB_MARK_RNG_177 \xF0\x91(\x86[\xB3-\xBF]|\x87\x80) +COMB_MARK_RNG_178 \xF0\x91\x87[\x89-\x8C] +COMB_MARK_RNG_179 \xF0\x91\x87[\x8E-\x8F] +COMB_MARK_RNG_180 \xF0\x91\x88[\xAC-\xB7] +COMB_MARK_RNG_181 \xF0\x91\x88\xBE +COMB_MARK_RNG_182 \xF0\x91\x8B[\x9F-\xAA] +COMB_MARK_RNG_183 \xF0\x91\x8C[\x80-\x83] +COMB_MARK_RNG_184 \xF0\x91\x8C[\xBB-\xBC] +COMB_MARK_RNG_185 \xF0\x91(\x8C[\xBE-\xBF]|\x8D[\x80-\x8D]) +COMB_MARK_RNG_186 \xF0\x91\x8D\x97 +COMB_MARK_RNG_187 \xF0\x91\x8D[\xA2-\xB4] +COMB_MARK_RNG_188 \xF0\x91(\x90[\xB5-\xBF]|\x91[\x80-\x86]) +COMB_MARK_RNG_189 \xF0\x91\x91\x9E +COMB_MARK_RNG_190 \xF0\x91(\x92[\xB0-\xBF]|\x93[\x80-\x83]) +COMB_MARK_RNG_191 \xF0\x91(\x96[\xAF-\xBF]|\x97\x80) +COMB_MARK_RNG_192 \xF0\x91\x97[\x9C-\x9D] +COMB_MARK_RNG_193 \xF0\x91(\x98[\xB0-\xBF]|\x99\x80) +COMB_MARK_RNG_194 \xF0\x91\x9A[\xAB-\xB7] +COMB_MARK_RNG_195 \xF0\x91\x9C[\x9D-\xAB] +COMB_MARK_RNG_196 \xF0\x91\xA0[\xAC-\xBA] +COMB_MARK_RNG_197 \xF0\x91\xA4[\xB0-\xBE] +COMB_MARK_RNG_198 \xF0\x91\xA5\x80 +COMB_MARK_RNG_199 \xF0\x91\xA5[\x82-\x83] +COMB_MARK_RNG_200 \xF0\x91\xA7[\x91-\xA0] +COMB_MARK_RNG_201 \xF0\x91\xA7\xA4 +COMB_MARK_RNG_202 \xF0\x91\xA8[\x81-\x8A] +COMB_MARK_RNG_203 \xF0\x91\xA8[\xB3-\xB9] +COMB_MARK_RNG_204 \xF0\x91\xA8[\xBB-\xBE] +COMB_MARK_RNG_205 \xF0\x91\xA9\x87 +COMB_MARK_RNG_206 \xF0\x91\xA9[\x91-\x9B] +COMB_MARK_RNG_207 \xF0\x91\xAA[\x8A-\x99] +COMB_MARK_RNG_208 \xF0\x91\xB0[\xAF-\xBF] 
+COMB_MARK_RNG_209 \xF0\x91\xB2[\x92-\xB6] +COMB_MARK_RNG_210 \xF0\x91(\xB4[\xB1-\xBF]|\xB5[\x80-\x85]) +COMB_MARK_RNG_211 \xF0\x91\xB5\x87 +COMB_MARK_RNG_212 \xF0\x91\xB6[\x8A-\x97] +COMB_MARK_RNG_213 \xF0\x91\xBB[\xB3-\xB6] +COMB_MARK_RNG_214 \xF0\x96\xAB[\xB0-\xB4] +COMB_MARK_RNG_215 \xF0\x96\xAC[\xB0-\xB6] +COMB_MARK_RNG_216 \xF0\x96\xBD\x8F +COMB_MARK_RNG_217 \xF0\x96(\xBD[\x91-\xBF]|\xBE[\x80-\x92]) +COMB_MARK_RNG_218 \xF0\x96\xBF[\xA4-\xB1] +COMB_MARK_RNG_219 \xF0\x9B\xB2[\x9D-\x9E] +COMB_MARK_RNG_220 \xF0\x9D\x85[\xA5-\xA9] +COMB_MARK_RNG_221 \xF0\x9D\x85[\xAD-\xB2] +COMB_MARK_RNG_222 \xF0\x9D(\x85[\xBB-\xBF]|\x86[\x80-\x82]) +COMB_MARK_RNG_223 \xF0\x9D\x86[\x85-\x8B] +COMB_MARK_RNG_224 \xF0\x9D\x86[\xAA-\xAD] +COMB_MARK_RNG_225 \xF0\x9D\x89[\x82-\x84] +COMB_MARK_RNG_226 \xF0\x9D\xA8[\x80-\xB6] +COMB_MARK_RNG_227 \xF0\x9D(\xA8[\xBB-\xBF]|\xA9[\x80-\xAC]) +COMB_MARK_RNG_228 \xF0\x9D\xA9\xB5 +COMB_MARK_RNG_229 \xF0\x9D\xAA\x84 +COMB_MARK_RNG_230 \xF0(\x9D\xAA[\x9B-\xBF]|\x9E\x80[\x80-\xAA]|\x9D[\xAB-\xBF][\x80-\xBF]) +COMB_MARK_RNG_231 \xF0\x9E\x84[\xB0-\xB6] +COMB_MARK_RNG_232 \xF0\x9E\x8B[\xAC-\xAF] +COMB_MARK_RNG_233 \xF0\x9E\xA3[\x90-\x96] +COMB_MARK_RNG_234 \xF0\x9E\xA5[\x84-\x8A] +COMB_MARK_RNG_235 \xF3\xA0(\x87[\x80-\xAF]|[\x84-\x86][\x80-\xBF]) + +COMB_MARK_GROUP_1 {COMB_MARK_RNG_1}|{COMB_MARK_RNG_2}|{COMB_MARK_RNG_3}|{COMB_MARK_RNG_4}|{COMB_MARK_RNG_5}|{COMB_MARK_RNG_6}|{COMB_MARK_RNG_7}|{COMB_MARK_RNG_8}|{COMB_MARK_RNG_9}|{COMB_MARK_RNG_10} +COMB_MARK_GROUP_2 {COMB_MARK_GROUP_1}|{COMB_MARK_RNG_11}|{COMB_MARK_RNG_12}|{COMB_MARK_RNG_13}|{COMB_MARK_RNG_14}|{COMB_MARK_RNG_15}|{COMB_MARK_RNG_16}|{COMB_MARK_RNG_17}|{COMB_MARK_RNG_18}|{COMB_MARK_RNG_19} +COMB_MARK_GROUP_3 {COMB_MARK_GROUP_2}|{COMB_MARK_RNG_20}|{COMB_MARK_RNG_21}|{COMB_MARK_RNG_22}|{COMB_MARK_RNG_23}|{COMB_MARK_RNG_24}|{COMB_MARK_RNG_25}|{COMB_MARK_RNG_26}|{COMB_MARK_RNG_27}|{COMB_MARK_RNG_28} +COMB_MARK_GROUP_4 
{COMB_MARK_GROUP_3}|{COMB_MARK_RNG_29}|{COMB_MARK_RNG_30}|{COMB_MARK_RNG_31}|{COMB_MARK_RNG_32}|{COMB_MARK_RNG_33}|{COMB_MARK_RNG_34}|{COMB_MARK_RNG_35}|{COMB_MARK_RNG_36}|{COMB_MARK_RNG_37} +COMB_MARK_GROUP_5 {COMB_MARK_GROUP_4}|{COMB_MARK_RNG_38}|{COMB_MARK_RNG_39}|{COMB_MARK_RNG_40}|{COMB_MARK_RNG_41}|{COMB_MARK_RNG_42}|{COMB_MARK_RNG_43}|{COMB_MARK_RNG_44}|{COMB_MARK_RNG_45}|{COMB_MARK_RNG_46} +COMB_MARK_GROUP_6 {COMB_MARK_GROUP_5}|{COMB_MARK_RNG_47}|{COMB_MARK_RNG_48}|{COMB_MARK_RNG_49}|{COMB_MARK_RNG_50}|{COMB_MARK_RNG_51}|{COMB_MARK_RNG_52}|{COMB_MARK_RNG_53}|{COMB_MARK_RNG_54}|{COMB_MARK_RNG_55} +COMB_MARK_GROUP_7 {COMB_MARK_GROUP_6}|{COMB_MARK_RNG_56}|{COMB_MARK_RNG_57}|{COMB_MARK_RNG_58}|{COMB_MARK_RNG_59}|{COMB_MARK_RNG_60}|{COMB_MARK_RNG_61}|{COMB_MARK_RNG_62}|{COMB_MARK_RNG_63}|{COMB_MARK_RNG_64} +COMB_MARK_GROUP_8 {COMB_MARK_GROUP_7}|{COMB_MARK_RNG_65}|{COMB_MARK_RNG_66}|{COMB_MARK_RNG_67}|{COMB_MARK_RNG_68}|{COMB_MARK_RNG_69}|{COMB_MARK_RNG_70}|{COMB_MARK_RNG_71}|{COMB_MARK_RNG_72}|{COMB_MARK_RNG_73} +COMB_MARK_GROUP_9 {COMB_MARK_GROUP_8}|{COMB_MARK_RNG_74}|{COMB_MARK_RNG_75}|{COMB_MARK_RNG_76}|{COMB_MARK_RNG_77}|{COMB_MARK_RNG_78}|{COMB_MARK_RNG_79}|{COMB_MARK_RNG_80}|{COMB_MARK_RNG_81}|{COMB_MARK_RNG_82} +COMB_MARK_GROUP_10 {COMB_MARK_GROUP_9}|{COMB_MARK_RNG_83}|{COMB_MARK_RNG_84}|{COMB_MARK_RNG_85}|{COMB_MARK_RNG_86}|{COMB_MARK_RNG_87}|{COMB_MARK_RNG_88}|{COMB_MARK_RNG_89}|{COMB_MARK_RNG_90}|{COMB_MARK_RNG_91} +COMB_MARK_GROUP_11 {COMB_MARK_GROUP_10}|{COMB_MARK_RNG_92}|{COMB_MARK_RNG_93}|{COMB_MARK_RNG_94}|{COMB_MARK_RNG_95}|{COMB_MARK_RNG_96}|{COMB_MARK_RNG_97}|{COMB_MARK_RNG_98}|{COMB_MARK_RNG_99}|{COMB_MARK_RNG_100} +COMB_MARK_GROUP_12 {COMB_MARK_GROUP_11}|{COMB_MARK_RNG_101}|{COMB_MARK_RNG_102}|{COMB_MARK_RNG_103}|{COMB_MARK_RNG_104}|{COMB_MARK_RNG_105}|{COMB_MARK_RNG_106}|{COMB_MARK_RNG_107}|{COMB_MARK_RNG_108}|{COMB_MARK_RNG_109} +COMB_MARK_GROUP_13 
{COMB_MARK_GROUP_12}|{COMB_MARK_RNG_110}|{COMB_MARK_RNG_111}|{COMB_MARK_RNG_112}|{COMB_MARK_RNG_113}|{COMB_MARK_RNG_114}|{COMB_MARK_RNG_115}|{COMB_MARK_RNG_116}|{COMB_MARK_RNG_117}|{COMB_MARK_RNG_118} +COMB_MARK_GROUP_14 {COMB_MARK_GROUP_13}|{COMB_MARK_RNG_119}|{COMB_MARK_RNG_120}|{COMB_MARK_RNG_121}|{COMB_MARK_RNG_122}|{COMB_MARK_RNG_123}|{COMB_MARK_RNG_124}|{COMB_MARK_RNG_125}|{COMB_MARK_RNG_126}|{COMB_MARK_RNG_127} +COMB_MARK_GROUP_15 {COMB_MARK_GROUP_14}|{COMB_MARK_RNG_128}|{COMB_MARK_RNG_129}|{COMB_MARK_RNG_130}|{COMB_MARK_RNG_131}|{COMB_MARK_RNG_132}|{COMB_MARK_RNG_133}|{COMB_MARK_RNG_134}|{COMB_MARK_RNG_135}|{COMB_MARK_RNG_136} +COMB_MARK_GROUP_16 {COMB_MARK_GROUP_15}|{COMB_MARK_RNG_137}|{COMB_MARK_RNG_138}|{COMB_MARK_RNG_139}|{COMB_MARK_RNG_140}|{COMB_MARK_RNG_141}|{COMB_MARK_RNG_142}|{COMB_MARK_RNG_143}|{COMB_MARK_RNG_144}|{COMB_MARK_RNG_145} +COMB_MARK_GROUP_17 {COMB_MARK_GROUP_16}|{COMB_MARK_RNG_146}|{COMB_MARK_RNG_147}|{COMB_MARK_RNG_148}|{COMB_MARK_RNG_149}|{COMB_MARK_RNG_150}|{COMB_MARK_RNG_151}|{COMB_MARK_RNG_152}|{COMB_MARK_RNG_153}|{COMB_MARK_RNG_154} +COMB_MARK_GROUP_18 {COMB_MARK_GROUP_17}|{COMB_MARK_RNG_155}|{COMB_MARK_RNG_156}|{COMB_MARK_RNG_157}|{COMB_MARK_RNG_158}|{COMB_MARK_RNG_159}|{COMB_MARK_RNG_160}|{COMB_MARK_RNG_161}|{COMB_MARK_RNG_162}|{COMB_MARK_RNG_163} +COMB_MARK_GROUP_19 {COMB_MARK_GROUP_18}|{COMB_MARK_RNG_164}|{COMB_MARK_RNG_165}|{COMB_MARK_RNG_166}|{COMB_MARK_RNG_167}|{COMB_MARK_RNG_168}|{COMB_MARK_RNG_169}|{COMB_MARK_RNG_170}|{COMB_MARK_RNG_171}|{COMB_MARK_RNG_172} +COMB_MARK_GROUP_20 {COMB_MARK_GROUP_19}|{COMB_MARK_RNG_173}|{COMB_MARK_RNG_174}|{COMB_MARK_RNG_175}|{COMB_MARK_RNG_176}|{COMB_MARK_RNG_177}|{COMB_MARK_RNG_178}|{COMB_MARK_RNG_179}|{COMB_MARK_RNG_180}|{COMB_MARK_RNG_181} +COMB_MARK_GROUP_21 {COMB_MARK_GROUP_20}|{COMB_MARK_RNG_182}|{COMB_MARK_RNG_183}|{COMB_MARK_RNG_184}|{COMB_MARK_RNG_185}|{COMB_MARK_RNG_186}|{COMB_MARK_RNG_187}|{COMB_MARK_RNG_188}|{COMB_MARK_RNG_189}|{COMB_MARK_RNG_190} +COMB_MARK_GROUP_22 
{COMB_MARK_GROUP_21}|{COMB_MARK_RNG_191}|{COMB_MARK_RNG_192}|{COMB_MARK_RNG_193}|{COMB_MARK_RNG_194}|{COMB_MARK_RNG_195}|{COMB_MARK_RNG_196}|{COMB_MARK_RNG_197}|{COMB_MARK_RNG_198}|{COMB_MARK_RNG_199} +COMB_MARK_GROUP_23 {COMB_MARK_GROUP_22}|{COMB_MARK_RNG_200}|{COMB_MARK_RNG_201}|{COMB_MARK_RNG_202}|{COMB_MARK_RNG_203}|{COMB_MARK_RNG_204}|{COMB_MARK_RNG_205}|{COMB_MARK_RNG_206}|{COMB_MARK_RNG_207}|{COMB_MARK_RNG_208} +COMB_MARK_GROUP_24 {COMB_MARK_GROUP_23}|{COMB_MARK_RNG_209}|{COMB_MARK_RNG_210}|{COMB_MARK_RNG_211}|{COMB_MARK_RNG_212}|{COMB_MARK_RNG_213}|{COMB_MARK_RNG_214}|{COMB_MARK_RNG_215}|{COMB_MARK_RNG_216}|{COMB_MARK_RNG_217} +COMB_MARK_GROUP_25 {COMB_MARK_GROUP_24}|{COMB_MARK_RNG_218}|{COMB_MARK_RNG_219}|{COMB_MARK_RNG_220}|{COMB_MARK_RNG_221}|{COMB_MARK_RNG_222}|{COMB_MARK_RNG_223}|{COMB_MARK_RNG_224}|{COMB_MARK_RNG_225}|{COMB_MARK_RNG_226} +COMB_MARK_GROUP_26 {COMB_MARK_GROUP_25}|{COMB_MARK_RNG_227}|{COMB_MARK_RNG_228}|{COMB_MARK_RNG_229}|{COMB_MARK_RNG_230}|{COMB_MARK_RNG_231}|{COMB_MARK_RNG_232}|{COMB_MARK_RNG_233}|{COMB_MARK_RNG_234}|{COMB_MARK_RNG_235} + +COMB_MARK_G_GROUP_1 {COMB_MARK_GROUP_1}|{COMB_MARK_GROUP_2}|{COMB_MARK_GROUP_3}|{COMB_MARK_GROUP_4}|{COMB_MARK_GROUP_5}|{COMB_MARK_GROUP_6}|{COMB_MARK_GROUP_7}|{COMB_MARK_GROUP_8}|{COMB_MARK_GROUP_9}|{COMB_MARK_GROUP_10} +COMB_MARK_G_GROUP_2 {COMB_MARK_G_GROUP_1}|{COMB_MARK_GROUP_11}|{COMB_MARK_GROUP_12}|{COMB_MARK_GROUP_13}|{COMB_MARK_GROUP_14}|{COMB_MARK_GROUP_15}|{COMB_MARK_GROUP_16}|{COMB_MARK_GROUP_17}|{COMB_MARK_GROUP_18}|{COMB_MARK_GROUP_19} +COMB_MARK_G_GROUP_3 {COMB_MARK_G_GROUP_2}|{COMB_MARK_GROUP_20}|{COMB_MARK_GROUP_21}|{COMB_MARK_GROUP_22}|{COMB_MARK_GROUP_23}|{COMB_MARK_GROUP_24}|{COMB_MARK_GROUP_25}|{COMB_MARK_GROUP_26} + +UNICODE_COMBINING_MARK {COMB_MARK_G_GROUP_1}|{COMB_MARK_G_GROUP_2}|{COMB_MARK_G_GROUP_3} + +/* Unicode connector punctuation ranges (category Pc) */ +/* generated with unicode_range_generator.l */ +/* UTF-8 ranges generated with 
https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */ +CONNECTOR_PUNCT_RNG_1 _ +CONNECTOR_PUNCT_RNG_2 \xE2(\x80\xBF|\x81\x80) +CONNECTOR_PUNCT_RNG_3 \xE2\x81\x94 +CONNECTOR_PUNCT_RNG_4 \xEF\xB8[\xB3-\xB4] +CONNECTOR_PUNCT_RNG_5 \xEF\xB9[\x8D-\x8F] +CONNECTOR_PUNCT_RNG_6 \xEF\xBC\xBF + +UNICODE_CONNECTOR_PUNCTUATION {CONNECTOR_PUNCT_RNG_1}|{CONNECTOR_PUNCT_RNG_2}|{CONNECTOR_PUNCT_RNG_3}|{CONNECTOR_PUNCT_RNG_4}|{CONNECTOR_PUNCT_RNG_5}|{CONNECTOR_PUNCT_RNG_6} + +UNICODE_ZWNJ \xE2\x80\x8C +UNICODE_ZWJ \xE2\x80\x8D + +/* Unicode escape sequence */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 (escape sequence) */ +UNICODE_ESCAPE_SEQUENCE \\u[0-9a-fA-F]{4} + +/* identifiers */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6 */ +IDENTIFIER_START [_$]|({UNICODE_LETTER})|{UNICODE_ESCAPE_SEQUENCE} +IDENTIFIER_PART (({IDENTIFIER_START})|({UNICODE_COMBINING_MARK})|({UNICODE_DIGIT})|({UNICODE_CONNECTOR_PUNCTUATION})|{UNICODE_ZWNJ}|{UNICODE_ZWJ})* +IDENTIFIER ({IDENTIFIER_START}{IDENTIFIER_PART})* + +/* literals */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8 */ +LITERAL_NULL null +LITERAL_BOOLEAN true|false +LITERAL_DECIMAL [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]* +LITERAL_HEX_INTEGER 0x[0-9a-fA-F]*|0X[0-9a-fA-F]* +LITERAL_DOUBLE_STRING_BEGIN \" +LITERAL_SINGLE_STRING_BEGIN \' +LITERAL_REGULAR_EXPRESSION \/[^*\/] +/* extra literals */ +/* according to https://ecma-international.org/ecma-262/5.1/#sec-4.3 */ +LITERAL_UNDEFINED undefined +LITERAL_INFINITY Infinity|\xE2\x88\x9E +LITERAL_NAN NaN +LITERAL {LITERAL_NULL}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN} + +HTML_COMMENT_OPEN '++' and '-'<-->'--'") + { + NORMALIZE(syntax_cases_buf14, syntax_cases_expected14); + VALIDATE(syntax_cases_buf14, syntax_cases_expected14); + } + SECTION("LS and PS chars within literal") + { + NORMALIZE(syntax_cases_buf15, syntax_cases_expected15); + 
VALIDATE(syntax_cases_buf15, syntax_cases_expected15); + } + SECTION("explicit LF within literal") + { + NORMALIZE(syntax_cases_buf16, syntax_cases_expected16); + VALIDATE(syntax_cases_buf16, syntax_cases_expected16); + } + SECTION("explicit CR within literal") + { + NORMALIZE(syntax_cases_buf17, syntax_cases_expected17); + VALIDATE(syntax_cases_buf17, syntax_cases_expected17); + } + SECTION("escaped LF-CR sequence within literal") + { + NORMALIZE(syntax_cases_buf18, syntax_cases_expected18); + VALIDATE(syntax_cases_buf18, syntax_cases_expected18); + } + SECTION("escaped LF within regex literal") + { + NORMALIZE(syntax_cases_buf19, syntax_cases_expected19); + VALIDATE(syntax_cases_buf19, syntax_cases_expected19); + } + SECTION("escaped CR-LF within regex literal") + { + NORMALIZE(syntax_cases_buf20, syntax_cases_expected20); + VALIDATE(syntax_cases_buf20, syntax_cases_expected20); + } +} + +TEST_CASE("norm_depth is specified", "[JSNormalizer]") +{ + const char srcbuf[] = "var abc = 123;\n\r"; + const char expected[] = "var abc"; + char dstbuf[7]; + int bytes_copied; + const char* ptr = srcbuf; + int norm_depth = 7; + int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr, + &bytes_copied, norm_depth); + + CHECK(ret == 0); + CHECK(bytes_copied == sizeof(expected) - 1); + CHECK(!memcmp(dstbuf, expected, bytes_copied)); +} + +TEST_CASE("tag script end is specified", "[JSNormalizer]") +{ + const char srcbuf[] = + "var a = 1 ;\n" // 12 bytes + "var b = 2 ;\n" // 12 bytes --> ptr_offset = 24 + "\n" + "var c = 3 ;\n"; + const int ptr_offset = 24; + const char expected[] = "var a=1;var b=2;"; + char dstbuf[sizeof(expected)]; + int bytes_copied; + const char* ptr = srcbuf; + int norm_depth = NORM_DEPTH; + int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr, + &bytes_copied, norm_depth); + + CHECK(ret == 0); + CHECK(bytes_copied == sizeof(expected) - 1); + CHECK((ptr - srcbuf) == ptr_offset); + 
CHECK(!memcmp(dstbuf, expected, bytes_copied)); +} + +// Tests for JavaScript parsing errors and anomalies + +TEST_CASE("parsing errors", "[JSNormalizer]") +{ + SECTION("dstlen is too small") + { + const char srcbuf[] = "var abc = 123;\n\r"; + const char expected[] = "var abc"; + char dstbuf[7]; + int bytes_copied; + const char* ptr = srcbuf; + int norm_depth = NORM_DEPTH; + int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr, + &bytes_copied, norm_depth); + + CHECK(ret == 1); + CHECK(bytes_copied == sizeof(expected) - 1); + CHECK(!memcmp(dstbuf, expected, bytes_copied)); + } +} + diff --git a/src/utils/util_jsnorm.h b/src/utils/util_jsnorm.h index 979b40b6c..2179a0ca5 100644 --- a/src/utils/util_jsnorm.h +++ b/src/utils/util_jsnorm.h @@ -40,7 +40,7 @@ struct JSState uint16_t alerts; }; -SO_PUBLIC int JSNormalizeDecode( +int JSNormalizeDecode( const char*, uint16_t, char*, uint16_t destlen, const char**, int*, JSState*, uint8_t*); } #endif