]> git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Merge pull request #2778 in SNORT/snort3 from ~OSERHIIE/snort3:javascript_normalizati...
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Tue, 23 Mar 2021 13:05:23 +0000 (13:05 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Tue, 23 Mar 2021 13:05:23 +0000 (13:05 +0000)
Squashed commit of the following:

commit 5371730d74442a199d46ed862639172f18437193
Author: Oleksandr Serhiienko <oserhiie@cisco.com>
Date:   Mon Feb 1 16:01:38 2021 +0200

    http_inspect: add JavaScript whitespace normalization

        http_inspect: integrate JSNormalizer (whitespace normalization) keeping the old one
        http_inspect: add normalization_depth config option
        utils: add JSNormalizer
        cmake: add flex build dependency
        doc: update http_inspect feature doc

16 files changed:
cmake/include_libraries.cmake
doc/user/http_inspect.txt
src/service_inspectors/http_inspect/http_js_norm.cc
src/service_inspectors/http_inspect/http_js_norm.h
src/service_inspectors/http_inspect/http_module.cc
src/service_inspectors/http_inspect/http_module.h
src/service_inspectors/http_inspect/test/http_module_test.cc
src/service_inspectors/http_inspect/test/http_uri_norm_test.cc
src/utils/CMakeLists.txt
src/utils/js_normalizer.cc [new file with mode: 0644]
src/utils/js_normalizer.h [new file with mode: 0644]
src/utils/js_tokenizer.h [new file with mode: 0644]
src/utils/js_tokenizer.l [new file with mode: 0644]
src/utils/test/CMakeLists.txt
src/utils/test/js_normalizer_test.cc [new file with mode: 0644]
src/utils/util_jsnorm.h

index 811c95ac6709948eb22e1980e72c9d7a8df45511..82ac24529054b5345741028554322edefef88448 100644 (file)
@@ -3,6 +3,7 @@
 find_package(Threads REQUIRED)
 find_package(DAQ REQUIRED)
 find_package(DNET REQUIRED)
+find_package(FLEX REQUIRED)
 find_package(HWLOC REQUIRED)
 find_package(LuaJIT REQUIRED)
 find_package(OpenSSL REQUIRED)
index f0c70686456b756f68db22027d8d8d1c5bf3eddf..00ce605dc54231acd486563ff601f716855a1954 100755 (executable)
@@ -153,6 +153,20 @@ decodeURIComponent are normalized. The different encodings handled within
 the unescape, decodeURI, or decodeURIComponent are %XX, %uXXXX, XX and
 uXXXXi. http_inspect also replaces consecutive whitespaces with a single
 space and normalizes the plus by concatenating the strings.
+Together, these normalizations are referred to as basic JavaScript normalization.
+
+===== normalization_depth
+
+normalization_depth = N {-1 : 65535} sets the number of input JavaScript
+bytes to normalize and, when non-zero, enables the whitespace normalizer
+instead of the basic one. normalize_javascript = true must be configured as
+well. Normalization stops when the depth is reached; the depth is applied
+per script. normalization_depth = -1 selects the maximum depth (65535). The
+default is 0, which keeps the basic normalizer. Configure this option to
+enable more precise whitespace normalization of JavaScript, which removes
+all whitespace and line terminators that are redundant from the JavaScript
+syntax point of view (between identifier and punctuator, between identifier
+and operator, etc.) according to the ECMAScript 5.1 standard.
 
 ===== xff_headers
 
index da46624f3879de33acf44b79e9fce82d8bec853b..405b3afbee9cb8a6bb5c1aacd7e6c9d1d7b5747a 100644 (file)
 
 #include "http_js_norm.h"
 
-#include "utils/util_jsnorm.h"
+#include "utils/js_normalizer.h"
 #include "utils/safec.h"
+#include "utils/util_jsnorm.h"
 
 #include "http_enum.h"
 
 using namespace HttpEnums;
 using namespace snort;
 
-HttpJsNorm::HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_) :
-    max_javascript_whitespaces(max_javascript_whitespaces_), uri_param(uri_param_),
-    javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {}
+class JsNormBase
+{
+public:
+    virtual ~JsNormBase() = default;
+
+    virtual int normalize(const char*, uint16_t, char*, uint16_t, const char**, int*, JSState*,
+    uint8_t*) = 0;
+
+};
+
+class UtilJsNorm : public JsNormBase
+{
+public:
+    UtilJsNorm() : JsNormBase() {}
+
+protected:
+    virtual int normalize(const char* src, uint16_t srclen, char* dst, uint16_t destlen,
+        const char** ptr, int* bytes_copied, JSState* js, uint8_t* iis_unicode_map) override
+    {
+        return JSNormalizeDecode(src, srclen, dst, destlen, ptr, bytes_copied, js, iis_unicode_map);
+    }
+
+};
+
+class JsNorm : public JsNormBase
+{
+public:
+    JsNorm(int normalization_depth)
+        : JsNormBase(),
+          norm_depth(normalization_depth)
+    {}
+
+protected:
+    virtual int normalize(const char* src, uint16_t srclen, char* dst, uint16_t destlen,
+        const char** ptr, int* bytes_copied, JSState*, uint8_t*) override
+    {
+        return JSNormalizer::normalize(src, srclen, dst, destlen, ptr, bytes_copied, norm_depth);
+    }
+
+private:
+    int norm_depth;
+
+};
+
+HttpJsNorm::HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_,
+    int normalization_depth) :
+    normalizer(nullptr), max_javascript_whitespaces(max_javascript_whitespaces_),
+    uri_param(uri_param_), normalization_depth(normalization_depth),
+    javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr)
+{}
 
 HttpJsNorm::~HttpJsNorm()
 {
+    delete normalizer;
     delete javascript_search_mpse;
     delete htmltype_search_mpse;
 }
 
 void HttpJsNorm::configure()
 {
-    if ( javascript_search_mpse || htmltype_search_mpse )
+    if ( configure_once )
         return;
 
+    // The normalization_depth option selects which normalizer is created:
+    // normalization_depth == 0 creates the default (basic) normalizer
+    // normalization_depth != 0 creates the whitespace normalizer with the specified depth
+    if ( normalization_depth != 0 )
+        normalizer = new JsNorm(normalization_depth);
+    else
+        normalizer = new UtilJsNorm;
+
     javascript_search_mpse = new SearchTool;
     htmltype_search_mpse = new SearchTool;
 
@@ -72,6 +129,8 @@ void HttpJsNorm::configure()
         htmltype_search_mpse->add(tmp->name, tmp->name_len, tmp->search_id);
     }
     htmltype_search_mpse->prep();
+
+    configure_once = true;
 }
 
 void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* infractions,
@@ -100,7 +159,7 @@ void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* i
             const char* js_start = ptr + mindex;
             const char* const angle_bracket =
                 (const char*)SnortStrnStr(js_start, end - js_start, ">");
-            if (angle_bracket == nullptr)
+            if (angle_bracket == nullptr || (end - angle_bracket) == 0)
                 break;
 
             bool type_js = false;
@@ -110,7 +169,7 @@ void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* i
                 const int script_found = htmltype_search_mpse->find(
                     js_start, (angle_bracket-js_start), search_html_found, false, &mid);
 
-                js_start = angle_bracket;
+                js_start = angle_bracket + 1;
                 if (script_found > 0)
                 {
                     switch (mid)
@@ -144,7 +203,7 @@ void HttpJsNorm::normalize(const Field& input, Field& output, HttpInfractions* i
             if (!type_js)
                 continue;
 
-            JSNormalizeDecode(js_start, (uint16_t)(end-js_start), (char*)buffer+index,
+            normalizer->normalize(js_start, (uint16_t)(end-js_start), (char*)buffer+index,
                 (uint16_t)(input.length() - index), &ptr, &bytes_copied, &js,
                 uri_param.iis_unicode ? uri_param.unicode_map : nullptr);
 
index 05da0719b08073bcfa5355ef548ec788e33951ec..2e1dbe2df61a558c080d1f312a0634868c9c9dd9 100644 (file)
 // HttpJsNorm class
 //-------------------------------------------------------------------------
 
+class JsNormBase;
+
 class HttpJsNorm
 {
 public:
-    HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_);
+    HttpJsNorm(int max_javascript_whitespaces_, const HttpParaList::UriParam& uri_param_,
+        int normalization_depth);
     ~HttpJsNorm();
     void normalize(const Field& input, Field& output, HttpInfractions* infractions,
         HttpEventGen* events) const;
     void configure();
 private:
+    bool configure_once = false;
+
+    JsNormBase* normalizer;
+
     enum JsSearchId { JS_JAVASCRIPT };
     enum HtmlSearchId { HTML_JS, HTML_EMA, HTML_VB };
 
@@ -49,6 +56,7 @@ private:
 
     const int max_javascript_whitespaces;
     const HttpParaList::UriParam& uri_param;
+    const int normalization_depth;
 
     snort::SearchTool* javascript_search_mpse;
     snort::SearchTool* htmltype_search_mpse;
index d60171a434013c1b9c0a2de253b3de63a57ef458..88d370c4729f602c675f05272a19607fb40811e8 100755 (executable)
@@ -89,6 +89,9 @@ const Parameter HttpModule::http_params[] =
     { "normalize_javascript", Parameter::PT_BOOL, nullptr, "false",
       "normalize JavaScript in response bodies" },
 
+    { "normalization_depth", Parameter::PT_INT, "-1:65535", "0",
+      "number of input JavaScript bytes to normalize" },
+
     { "max_javascript_whitespaces", Parameter::PT_INT, "1:65535", "200",
       "maximum consecutive whitespaces allowed within the JavaScript obfuscated data" },
 
@@ -214,6 +217,11 @@ bool HttpModule::set(const char*, Value& val, SnortConfig*)
     {
         params->js_norm_param.normalize_javascript = val.get_bool();
     }
+    else if (val.is("normalization_depth"))
+    {
+        int v = val.get_int32();
+        params->js_norm_param.normalization_depth = (v == -1) ? 65535 : v;
+    }
     else if (val.is("max_javascript_whitespaces"))
     {
         params->js_norm_param.max_javascript_whitespaces = val.get_uint16();
@@ -393,7 +401,8 @@ bool HttpModule::end(const char*, int, SnortConfig*)
     if (params->js_norm_param.normalize_javascript)
     {
         params->js_norm_param.js_norm =
-            new HttpJsNorm(params->js_norm_param.max_javascript_whitespaces, params->uri_param);
+            new HttpJsNorm(params->js_norm_param.max_javascript_whitespaces, params->uri_param,
+            params->js_norm_param.normalization_depth);
     }
 
     prepare_http_header_list(params);
index 4b0adfe7ddad5def7a38f3a93d23d17ec5fe2d34..f7d76f650ef73570b728cfdeb16d7efbf2c8b199 100755 (executable)
@@ -52,6 +52,7 @@ public:
     public:
         ~JsNormParam();
         bool normalize_javascript = false;
+        int normalization_depth = 0;
         int max_javascript_whitespaces = 200;
         class HttpJsNorm* js_norm = nullptr;
     };
index 48776f6040465977813d3ff23baabed6ac919355..ec5d19029afc6a283245b0f1eb1472fd8a855f30 100755 (executable)
@@ -64,9 +64,9 @@ int32_t substr_to_code(const uint8_t*, const int32_t, const StrCode []) { return
 long HttpTestManager::print_amount {};
 bool HttpTestManager::print_hex {};
 
-HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_) :
-    max_javascript_whitespaces(0), uri_param(uri_param_), javascript_search_mpse(nullptr),
-    htmltype_search_mpse(nullptr) {}
+HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_, int) :
+    normalizer(nullptr), max_javascript_whitespaces(0), uri_param(uri_param_),
+    normalization_depth(0), javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {}
 HttpJsNorm::~HttpJsNorm() = default;
 void HttpJsNorm::configure(){}
 
index 295f7f5dfe244b5122dbb0cd25a2c4aea7de468f..83a2041effb6c2205ae66e38a95fbf78d8537a82 100755 (executable)
@@ -53,9 +53,9 @@ LiteralSearch* LiteralSearch::instantiate(LiteralSearch::Handle*, const uint8_t*
 void show_stats(PegCount*, const PegInfo*, unsigned, const char*) { }
 void show_stats(PegCount*, const PegInfo*, const IndexVec&, const char*, FILE*) { }
 
-HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_) :
-    max_javascript_whitespaces(0), uri_param(uri_param_), javascript_search_mpse(nullptr),
-    htmltype_search_mpse(nullptr) {}
+HttpJsNorm::HttpJsNorm(int, const HttpParaList::UriParam& uri_param_, int) :
+    normalizer(nullptr), max_javascript_whitespaces(0), uri_param(uri_param_),
+    normalization_depth(0), javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {}
 HttpJsNorm::~HttpJsNorm() = default;
 void HttpJsNorm::configure() {}
 
index a8ebc8e6fe8b61ddba936347897db8e39c7f5fce..bbaf23380ecfa1603fb436cfa1f45ca5e7f62c8b 100644 (file)
@@ -15,18 +15,28 @@ set( UTIL_INCLUDES
     util.h
     util_ber.h
     util_cstring.h
-    util_jsnorm.h
     util_unfold.h
     util_utf.h
 )
 
+set (FLEX_EXECUTABLE flex++)
+
+FLEX_TARGET ( js_tokenizer ${CMAKE_CURRENT_SOURCE_DIR}/js_tokenizer.l
+    ${CMAKE_CURRENT_BINARY_DIR}/js_tokenizer.cc
+    COMPILE_FLAGS -Ca
+)
+
 add_library ( utils OBJECT
     ${UTIL_INCLUDES}
     ${SNPRINTF_SOURCES}
+    ${FLEX_js_tokenizer_OUTPUTS}
     boyer_moore.cc
     dnet_header.h
     dyn_array.cc
     dyn_array.h
+    js_normalizer.cc
+    js_normalizer.h
+    js_tokenizer.h
     kmap.cc
     segment_mem.cc
     sflsq.cc
@@ -36,6 +46,7 @@ add_library ( utils OBJECT
     util_ber.cc
     util_cstring.cc
     util_jsnorm.cc
+    util_jsnorm.h
     util_net.cc
     util_net.h
     util_unfold.cc
diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc
new file mode 100644 (file)
index 0000000..f5793b9
--- /dev/null
@@ -0,0 +1,42 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// js_normalizer.cc author Oleksandr Serhiienko <oserhiie@cisco.com>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "js_normalizer.h"
+
+#include <FlexLexer.h>
+
+#include "js_tokenizer.h"
+
+using namespace snort;
+
+int JSNormalizer::normalize(const char* srcbuf, uint16_t srclen, char* dstbuf, uint16_t dstlen,
+        const char** ptr, int* bytes_copied, int norm_depth)
+{
+    std::stringstream in, out;
+
+    in.rdbuf()->pubsetbuf(const_cast<char*>(srcbuf), (norm_depth >= srclen) ? srclen : norm_depth);
+    JSTokenizer tokenizer(in, out, dstbuf, dstlen, ptr, bytes_copied);
+
+    return tokenizer.yylex();
+}
+
diff --git a/src/utils/js_normalizer.h b/src/utils/js_normalizer.h
new file mode 100644 (file)
index 0000000..4688e98
--- /dev/null
@@ -0,0 +1,36 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// js_normalizer.h author Oleksandr Serhiienko <oserhiie@cisco.com>
+
+#ifndef JS_NORMALIZER_H
+#define JS_NORMALIZER_H
+
+#include "main/snort_types.h"
+
+namespace snort
+{
+class JSNormalizer
+{
+public:
+    static int normalize(const char* srcbuf, uint16_t srclen, char* dstbuf, uint16_t dstlen,
+        const char** ptr, int* bytes_copied, int norm_depth);
+};
+}
+
+#endif //JS_NORMALIZER_H
+
diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h
new file mode 100644 (file)
index 0000000..892fdc4
--- /dev/null
@@ -0,0 +1,106 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// js_tokenizer.h author Oleksandr Serhiienko <oserhiie@cisco.com>
+
+#ifndef JS_TOKENIZER_H
+#define JS_TOKENIZER_H
+
+#include <sstream>
+
+#include "log/messages.h"
+
+class JSTokenizer : public yyFlexLexer
+{
+private:
+    enum JSToken
+    {
+        UNDEFINED = 0,
+        IDENTIFIER,
+        KEYWORD,
+        PUNCTUATOR,
+        OPERATOR,
+        LITERAL,
+        DIRECTIVE,
+        TAG_SCRIPT_OPEN
+    };
+
+public:
+    // we need an out stream because yyFlexLexer API strongly requires that
+    JSTokenizer(std::stringstream& in, std::stringstream& out, char* dstbuf,
+        const uint16_t dstlen, const char** ptr, int* bytes_copied);
+    ~JSTokenizer() override;
+
+    // Flex treats this class as yyclass, which requires the yyclass Flex option
+    // (see %option yyclass="JSTokenizer" in js_tokenizer.l)
+    // no definition is needed here, Flex generates the body of this method
+    // returns 0 if OK, 1 otherwise
+    int yylex() override;
+
+protected:
+    [[noreturn]] void LexerError(const char* msg) override
+    { snort::FatalError("%s", msg); }
+
+private:
+    void init();
+
+    // scan buffers control
+    void switch_to_temporal(const std::string& data);
+    void switch_to_initial();
+
+    bool eval_identifier(const char* lexeme);
+    bool eval_string_literal(const char* match_prefix, const char quotes);
+    bool eval_regex_literal(const char* match_prefix);
+    bool eval_eof();
+    void skip_single_line_comment();
+    void skip_multi_line_comment();
+
+    bool parse_literal(const std::string& match_prefix, const char sentinel_ch,
+        std::string& result, bool is_regex = false);
+
+    // main lexeme handler
+    // all scanned tokens must pass here
+    bool eval(const JSToken tok, const char* lexeme);
+
+    bool normalize_identifier(const JSToken prev_tok, const char* lexeme);
+    bool normalize_punctuator(const JSToken prev_tok, const char* lexeme);
+    bool normalize_operator(const JSToken prev_tok, const char* lexeme);
+    bool normalize_directive(const JSToken prev_tok, const char* lexeme);
+    bool normalize_tag_script_open(const JSToken prev_tok, const char* lexeme);
+    bool normalize_undefined(const JSToken prev_tok, const char* lexeme);
+    bool normalize_lexeme(const JSToken prev_tok, const char* lexeme);
+
+    bool write_output(const std::string& str);
+
+    void update_ptr();
+
+private:
+    char* dstbuf;
+    const uint16_t dstlen;
+    const char** ptr;
+    int* bytes_copied;
+
+    struct ScanBuffers;
+    ScanBuffers* buffers = nullptr;
+    std::stringstream temporal;
+
+    JSToken prev_tok = UNDEFINED;
+
+};
+
+#endif // JS_TOKENIZER_H
+
diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l
new file mode 100644 (file)
index 0000000..f4f51fb
--- /dev/null
@@ -0,0 +1,1348 @@
+/*--------------------------------------------------------------------------
+// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// js_tokenizer.l author Oleksandr Serhiienko <oserhiie@cisco.com>
+*/
+
+/* Define JSTokenizer as yyClass */
+%option yyclass="JSTokenizer"
+/* Disable yywrap() generation */
+%option noyywrap
+
+%{
+    #ifdef HAVE_CONFIG_H
+    #include "config.h"
+    #endif
+
+    #include "utils/js_tokenizer.h"
+%}
+
+/* The following grammar was created based on ECMAScript specification */
+/* source https://ecma-international.org/ecma-262/5.1/ */
+
+/* whitespaces */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.2 */
+TAB            \x9
+VT             \xB
+FF             \xC
+SP             \x20
+NBSP           \xA0
+BOM            \xEF\xBB\xBF
+WHITESPACES    {TAB}|{VT}|{FF}|{SP}|{NBSP}|{BOM}
+
+/* single char escape sequences */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 */
+NUL                      \x0
+BS                       \x8
+HT                       \x9
+CHAR_ESCAPE_SEQUENCES    {NUL}|{BS}|{HT}
+
+/* line terminators */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.3 */
+LF                  \xA
+CR                  \xD
+LS                  \xE2\x80\xA8
+PS                  \xE2\x80\xA9
+LINE_TERMINATORS    {LF}|{CR}|{LS}|{PS}
+
+/* comments */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.4 */
+SINGLE_LINE_COMMENT    "//"
+MULTI_LINE_COMMENT     "/\*"
+
+/* directives */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-14.1 */
+USE_STRICT_DIRECTIVE    "\"use strict\"";*|"\'use strict\'";*
+
+/* keywords */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6.1.1 */
+KEYWORD    break|case|debugger|in|import|protected|do|else|function|try|implements|static|instanceof|new|this|class|let|typeof|var|with|enum|private|catch|continue|default|extends|public|finally|for|if|super|yield|return|switch|throw|const|interface|void|while|delete|export|package
+
+/* punctuators */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.7 */
+CLOSING_BRACES             ")"|"]"
+PUNCTUATOR                 "{"|"}"|"("|"["|">="|"=="|"!="|"==="|"!=="|"."|";"|","|"<"|">"|"<="|"<<"|">>"|">>>"|"&"|"|"|"^"|"!"|"&&"|"||"|"?"|":"|"="|"+="|"-="|"*="|"%="|"<<="|">>="|">>>="|"&="|"|="|"^="|"~"
+OPERATOR                   "+"|"-"|"*"|"++"|"--"|"%"
+DIV_OPERATOR               "/"
+DIV_ASSIGNMENT_OPERATOR    "/="
+
+/* Unicode letter ranges (categories Lu, Ll, Lt, Lm, Lo and Nl) */
+/* generated with unicode_range_generator.l */
+/* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
+/* the script above converts Unicode code point ranges into UTF-8 byte-level regex ranges since Flex doesn't support Unicode */
+/* for example, the Unicode range from 0x00D1 to 0x00D6 will look like this: \xC3[\x91-\x96] */
+/* because each character in this range is encoded as two UTF-8 bytes: \xC3 followed by one byte from the range [\x91-\x96] */
+/* using this trick it's possible to handle Unicode character ranges within the Flex regular expressions */
+/* i.e. the idea is to represent Unicode as a UTF-8 byte sequence */
+LETTER_RNG_1      [A-Z]
+LETTER_RNG_2      [a-z]
+LETTER_RNG_3      \xC2\xAA
+LETTER_RNG_4      \xC2\xB5
+LETTER_RNG_5      \xC2\xBA
+LETTER_RNG_6      \xC3[\x80-\x96]
+LETTER_RNG_7      \xC3[\x98-\xB6]
+LETTER_RNG_8      \xC3[\xB8-\xBF]|\xCB[\x80-\x81]|[\xC4-\xCA][\x80-\xBF]
+LETTER_RNG_9      \xCB[\x86-\x91]
+LETTER_RNG_10     \xCB[\xA0-\xA4]
+LETTER_RNG_11     \xCB\xAC
+LETTER_RNG_12     \xCB\xAE
+LETTER_RNG_13     \xCD[\xB0-\xB4]
+LETTER_RNG_14     \xCD[\xB6-\xBD]
+LETTER_RNG_15     \xCD\xBF
+LETTER_RNG_16     \xCE\x86
+LETTER_RNG_17     \xCE[\x88-\xBF]|\xCF[\x80-\xB5]
+LETTER_RNG_18     \xCF[\xB7-\xBF]|\xD2[\x80-\x81]|[\xD0-\xD1][\x80-\xBF]
+LETTER_RNG_19     \xD2[\x8A-\xBF]|\xD5[\x80-\x99]|[\xD3-\xD4][\x80-\xBF]
+LETTER_RNG_20     \xD5[\xA0-\xBF]|\xD6[\x80-\x88]
+LETTER_RNG_21     \xD7[\x90-\xB2]
+LETTER_RNG_22     \xD8[\xA0-\xBF]|\xD9[\x80-\x8A]
+LETTER_RNG_23     \xD9[\xAE-\xAF]
+LETTER_RNG_24     \xD9[\xB1-\xBF]|\xDB[\x80-\x93]|\xDA[\x80-\xBF]
+LETTER_RNG_25     \xDB\x95
+LETTER_RNG_26     \xDB[\xA5-\xA6]
+LETTER_RNG_27     \xDB[\xAE-\xAF]
+LETTER_RNG_28     \xDB[\xBA-\xBC]
+LETTER_RNG_29     \xDB\xBF
+LETTER_RNG_30     \xDC\x90
+LETTER_RNG_31     \xDC[\x92-\xAF]
+LETTER_RNG_32     \xDD[\x8D-\xBF]|\xDE[\x80-\xA5]
+LETTER_RNG_33     \xDE\xB1
+LETTER_RNG_34     \xDF[\x8A-\xAA]
+LETTER_RNG_35     \xDF[\xB4-\xB5]
+LETTER_RNG_36     \xDF\xBA
+LETTER_RNG_37     \xE0\xA0[\x80-\x95]
+LETTER_RNG_38     \xE0\xA0\x9A
+LETTER_RNG_39     \xE0\xA0\xA4
+LETTER_RNG_40     \xE0\xA0\xA8
+LETTER_RNG_41     \xE0\xA1[\x80-\x98]
+LETTER_RNG_42     \xE0(\xA1[\xA0-\xBF]|\xA3[\x80-\x87]|\xA2[\x80-\xBF])
+LETTER_RNG_43     \xE0\xA4[\x84-\xB9]
+LETTER_RNG_44     \xE0\xA4\xBD
+LETTER_RNG_45     \xE0\xA5\x90
+LETTER_RNG_46     \xE0\xA5[\x98-\xA1]
+LETTER_RNG_47     \xE0(\xA5[\xB1-\xBF]|\xA6\x80)
+LETTER_RNG_48     \xE0\xA6[\x85-\xB9]
+LETTER_RNG_49     \xE0\xA6\xBD
+LETTER_RNG_50     \xE0\xA7\x8E
+LETTER_RNG_51     \xE0\xA7[\x9C-\xA1]
+LETTER_RNG_52     \xE0\xA7[\xB0-\xB1]
+LETTER_RNG_53     \xE0\xA7\xBC
+LETTER_RNG_54     \xE0\xA8[\x85-\xB9]
+LETTER_RNG_55     \xE0\xA9[\x99-\x9E]
+LETTER_RNG_56     \xE0\xA9[\xB2-\xB4]
+LETTER_RNG_57     \xE0\xAA[\x85-\xB9]
+LETTER_RNG_58     \xE0\xAA\xBD
+LETTER_RNG_59     \xE0\xAB[\x90-\xA1]
+LETTER_RNG_60     \xE0\xAB\xB9
+LETTER_RNG_61     \xE0\xAC[\x85-\xB9]
+LETTER_RNG_62     \xE0\xAC\xBD
+LETTER_RNG_63     \xE0\xAD[\x9C-\xA1]
+LETTER_RNG_64     \xE0\xAD\xB1
+LETTER_RNG_65     \xE0\xAE[\x83-\xB9]
+LETTER_RNG_66     \xE0\xAF\x90
+LETTER_RNG_67     \xE0\xB0[\x85-\xBD]
+LETTER_RNG_68     \xE0\xB1[\x98-\xA1]
+LETTER_RNG_69     \xE0\xB2\x80
+LETTER_RNG_70     \xE0\xB2[\x85-\xB9]
+LETTER_RNG_71     \xE0\xB2\xBD
+LETTER_RNG_72     \xE0\xB3[\x9E-\xA1]
+LETTER_RNG_73     \xE0\xB3[\xB1-\xB2]
+LETTER_RNG_74     \xE0\xB4[\x84-\xBA]
+LETTER_RNG_75     \xE0\xB4\xBD
+LETTER_RNG_76     \xE0\xB5\x8E
+LETTER_RNG_77     \xE0\xB5[\x94-\x96]
+LETTER_RNG_78     \xE0\xB5[\x9F-\xA1]
+LETTER_RNG_79     \xE0\xB5[\xBA-\xBF]
+LETTER_RNG_80     \xE0(\xB6[\x85-\xBF]|\xB7[\x80-\x86])
+LETTER_RNG_81     \xE0\xB8[\x81-\xB0]
+LETTER_RNG_82     \xE0\xB8[\xB2-\xB3]
+LETTER_RNG_83     \xE0\xB9[\x80-\x86]
+LETTER_RNG_84     \xE0\xBA[\x81-\xB0]
+LETTER_RNG_85     \xE0\xBA[\xB2-\xB3]
+LETTER_RNG_86     \xE0(\xBA[\xBD-\xBF]|\xBB[\x80-\x86])
+LETTER_RNG_87     \xE0(\xBB[\x9C-\xBF]|\xBC\x80)
+LETTER_RNG_88     \xE0\xBD[\x80-\xAC]
+LETTER_RNG_89     \xE0\xBE[\x88-\x8C]
+LETTER_RNG_90     \xE1\x80[\x80-\xAA]
+LETTER_RNG_91     \xE1\x80\xBF
+LETTER_RNG_92     \xE1\x81[\x90-\x95]
+LETTER_RNG_93     \xE1\x81[\x9A-\x9D]
+LETTER_RNG_94     \xE1\x81\xA1
+LETTER_RNG_95     \xE1\x81[\xA5-\xA6]
+LETTER_RNG_96     \xE1\x81[\xAE-\xB0]
+LETTER_RNG_97     \xE1(\x81[\xB5-\xBF]|\x82[\x80-\x81])
+LETTER_RNG_98     \xE1\x82\x8E
+LETTER_RNG_99     \xE1(\x82[\xA0-\xBF]|\x83[\x80-\xBA])
+LETTER_RNG_100    \xE1(\x83[\xBC-\xBF]|\x8D[\x80-\x9A]|[\x84-\x8C][\x80-\xBF])
+LETTER_RNG_101    \xE1\x8E[\x80-\x8F]
+LETTER_RNG_102    \xE1(\x8E[\xA0-\xBF]|\x8F[\x80-\xBD])
+LETTER_RNG_103    \xE1(\x90[\x81-\xBF]|\x99[\x80-\xAC]|[\x91-\x98][\x80-\xBF])
+LETTER_RNG_104    \xE1\x99[\xAF-\xBF]
+LETTER_RNG_105    \xE1\x9A[\x81-\x9A]
+LETTER_RNG_106    \xE1(\x9A[\xA0-\xBF]|\x9B[\x80-\xAA])
+LETTER_RNG_107    \xE1(\x9B[\xAE-\xBF]|\x9C[\x80-\x91])
+LETTER_RNG_108    \xE1\x9C[\xA0-\xB1]
+LETTER_RNG_109    \xE1\x9D[\x80-\x91]
+LETTER_RNG_110    \xE1\x9D[\xA0-\xB0]
+LETTER_RNG_111    \xE1\x9E[\x80-\xB3]
+LETTER_RNG_112    \xE1\x9F\x97
+LETTER_RNG_113    \xE1\x9F\x9C
+LETTER_RNG_114    \xE1(\xA0[\xA0-\xBF]|\xA2[\x80-\x84]|\xA1[\x80-\xBF])
+LETTER_RNG_115    \xE1\xA2[\x87-\xA8]
+LETTER_RNG_116    \xE1(\xA2[\xAA-\xBF]|\xA4[\x80-\x9E]|\xA3[\x80-\xBF])
+LETTER_RNG_117    \xE1(\xA5[\x90-\xBF]|\xA7[\x80-\x89]|\xA6[\x80-\xBF])
+LETTER_RNG_118    \xE1\xA8[\x80-\x96]
+LETTER_RNG_119    \xE1(\xA8[\xA0-\xBF]|\xA9[\x80-\x94])
+LETTER_RNG_120    \xE1\xAA\xA7
+LETTER_RNG_121    \xE1\xAC[\x85-\xB3]
+LETTER_RNG_122    \xE1\xAD[\x85-\x8B]
+LETTER_RNG_123    \xE1\xAE[\x83-\xA0]
+LETTER_RNG_124    \xE1\xAE[\xAE-\xAF]
+LETTER_RNG_125    \xE1(\xAE[\xBA-\xBF]|\xAF[\x80-\xA5])
+LETTER_RNG_126    \xE1\xB0[\x80-\xA3]
+LETTER_RNG_127    \xE1\xB1[\x8D-\x8F]
+LETTER_RNG_128    \xE1\xB1[\x9A-\xBD]
+LETTER_RNG_129    \xE1\xB2[\x80-\xBF]
+LETTER_RNG_130    \xE1\xB3[\xA9-\xAC]
+LETTER_RNG_131    \xE1\xB3[\xAE-\xB3]
+LETTER_RNG_132    \xE1\xB3[\xB5-\xB6]
+LETTER_RNG_133    \xE1(\xB3[\xBA-\xBF]|[\xB4-\xB6][\x80-\xBF])
+LETTER_RNG_134    \xE1(\xBE[\x80-\xBC]|[\xB8-\xBD][\x80-\xBF])
+LETTER_RNG_135    \xE1\xBE\xBE
+LETTER_RNG_136    \xE1\xBF[\x82-\x8C]
+LETTER_RNG_137    \xE1\xBF[\x90-\x9B]
+LETTER_RNG_138    \xE1\xBF[\xA0-\xAC]
+LETTER_RNG_139    \xE1\xBF[\xB2-\xBC]
+LETTER_RNG_140    \xE2\x81\xB1
+LETTER_RNG_141    \xE2\x81\xBF
+LETTER_RNG_142    \xE2\x82[\x90-\x9C]
+LETTER_RNG_143    \xE2\x84\x82
+LETTER_RNG_144    \xE2\x84\x87
+LETTER_RNG_145    \xE2\x84[\x8A-\x93]
+LETTER_RNG_146    \xE2\x84\x95
+LETTER_RNG_147    \xE2\x84[\x99-\x9D]
+LETTER_RNG_148    \xE2\x84\xA4
+LETTER_RNG_149    \xE2\x84\xA6
+LETTER_RNG_150    \xE2\x84\xA8
+LETTER_RNG_151    \xE2\x84[\xAA-\xAD]
+LETTER_RNG_152    \xE2\x84[\xAF-\xB9]
+LETTER_RNG_153    \xE2\x84[\xBC-\xBF]
+LETTER_RNG_154    \xE2\x85[\x85-\x89]
+LETTER_RNG_155    \xE2\x85\x8E
+LETTER_RNG_156    \xE2(\x85[\xA0-\xBF]|\x86[\x80-\x88])
+LETTER_RNG_157    \xE2(\xB3[\x80-\xA4]|[\xB0-\xB2][\x80-\xBF])
+LETTER_RNG_158    \xE2\xB3[\xAB-\xAE]
+LETTER_RNG_159    \xE2\xB3[\xB2-\xB3]
+LETTER_RNG_160    \xE2(\xB5[\x80-\xAF]|\xB4[\x80-\xBF])
+LETTER_RNG_161    \xE2(\xB7[\x80-\x9E]|\xB6[\x80-\xBF])
+LETTER_RNG_162    \xE2\xB8\xAF
+LETTER_RNG_163    \xE3\x80[\x85-\x87]
+LETTER_RNG_164    \xE3\x80[\xA1-\xA9]
+LETTER_RNG_165    \xE3\x80[\xB1-\xB5]
+LETTER_RNG_166    \xE3\x80[\xB8-\xBC]
+LETTER_RNG_167    \xE3(\x81[\x81-\xBF]|\x82[\x80-\x96])
+LETTER_RNG_168    \xE3\x82[\x9D-\x9F]
+LETTER_RNG_169    \xE3(\x82[\xA1-\xBF]|\x83[\x80-\xBA])
+LETTER_RNG_170    \xE3(\x83[\xBC-\xBF]|\x86[\x80-\x8E]|[\x84-\x85][\x80-\xBF])
+LETTER_RNG_171    \xE3\x86[\xA0-\xBF]
+LETTER_RNG_172    \xE3\x87[\xB0-\xBF]
+LETTER_RNG_173    (\xE3[\x90-\xBF]|\xE4[\x80-\xB6])[\x80-\xBF]
+LETTER_RNG_174    \xEA\x92[\x80-\x8C]|(\xE4[\xB8-\xBF]|\xEA[\x80-\x91]|[\xE5-\xE9][\x80-\xBF])[\x80-\xBF]
+LETTER_RNG_175    \xEA\x93[\x90-\xBD]
+LETTER_RNG_176    \xEA(\x98[\x80-\x8C]|[\x94-\x97][\x80-\xBF])
+LETTER_RNG_177    \xEA\x98[\x90-\x9F]
+LETTER_RNG_178    \xEA(\x98[\xAA-\xBF]|\x99[\x80-\xAE])
+LETTER_RNG_179    \xEA(\x99\xBF|\x9A[\x80-\x9D])
+LETTER_RNG_180    \xEA(\x9A[\xA0-\xBF]|\x9B[\x80-\xAF])
+LETTER_RNG_181    \xEA\x9C[\x97-\x9F]
+LETTER_RNG_182    \xEA(\x9C[\xA2-\xBF]|\x9E[\x80-\x88]|\x9D[\x80-\xBF])
+LETTER_RNG_183    \xEA(\x9E[\x8B-\xBF]|\xA0[\x80-\x81]|\x9F[\x80-\xBF])
+LETTER_RNG_184    \xEA\xA0[\x83-\x85]
+LETTER_RNG_185    \xEA\xA0[\x87-\x8A]
+LETTER_RNG_186    \xEA\xA0[\x8C-\xA2]
+LETTER_RNG_187    \xEA\xA1[\x80-\xB3]
+LETTER_RNG_188    \xEA\xA2[\x82-\xB3]
+LETTER_RNG_189    \xEA\xA3[\xB2-\xB7]
+LETTER_RNG_190    \xEA\xA3\xBB
+LETTER_RNG_191    \xEA\xA3[\xBD-\xBE]
+LETTER_RNG_192    \xEA\xA4[\x8A-\xA5]
+LETTER_RNG_193    \xEA(\xA4[\xB0-\xBF]|\xA5[\x80-\x86])
+LETTER_RNG_194    \xEA\xA5[\xA0-\xBC]
+LETTER_RNG_195    \xEA\xA6[\x84-\xB2]
+LETTER_RNG_196    \xEA\xA7\x8F
+LETTER_RNG_197    \xEA\xA7[\xA0-\xA4]
+LETTER_RNG_198    \xEA\xA7[\xA6-\xAF]
+LETTER_RNG_199    \xEA(\xA7[\xBA-\xBF]|\xA8[\x80-\xA8])
+LETTER_RNG_200    \xEA\xA9[\x80-\x82]
+LETTER_RNG_201    \xEA\xA9[\x84-\x8B]
+LETTER_RNG_202    \xEA\xA9[\xA0-\xB6]
+LETTER_RNG_203    \xEA\xA9\xBA
+LETTER_RNG_204    \xEA(\xA9[\xBE-\xBF]|\xAA[\x80-\xAF])
+LETTER_RNG_205    \xEA\xAA\xB1
+LETTER_RNG_206    \xEA\xAA[\xB5-\xB6]
+LETTER_RNG_207    \xEA\xAA[\xB9-\xBD]
+LETTER_RNG_208    \xEA\xAB\x80
+LETTER_RNG_209    \xEA\xAB[\x82-\x9D]
+LETTER_RNG_210    \xEA\xAB[\xA0-\xAA]
+LETTER_RNG_211    \xEA\xAB[\xB2-\xB4]
+LETTER_RNG_212    \xEA(\xAC[\x81-\xBF]|\xAD[\x80-\x9A])
+LETTER_RNG_213    \xEA\xAD[\x9C-\xA9]
+LETTER_RNG_214    \xEA(\xAD[\xB0-\xBF]|\xAF[\x80-\xA2]|\xAE[\x80-\xBF])
+LETTER_RNG_215    \xED\x9F[\x80-\xBB]|(\xEA[\xB0-\xBF]|\xED[\x80-\x9E]|[\xEB-\xEC][\x80-\xBF])[\x80-\xBF]
+LETTER_RNG_216    \xEF(\xAC[\x80-\x9D]|[\xA4-\xAB][\x80-\xBF])
+LETTER_RNG_217    \xEF\xAC[\x9F-\xA8]
+LETTER_RNG_218    \xEF(\xAC[\xAA-\xBF]|\xAE[\x80-\xB1]|\xAD[\x80-\xBF])
+LETTER_RNG_219    \xEF(\xAF[\x93-\xBF]|\xB4[\x80-\xBD]|[\xB0-\xB3][\x80-\xBF])
+LETTER_RNG_220    \xEF(\xB5[\x90-\xBF]|\xB7[\x80-\xBB]|\xB6[\x80-\xBF])
+LETTER_RNG_221    \xEF(\xB9[\xB0-\xBF]|\xBB[\x80-\xBC]|\xBA[\x80-\xBF])
+LETTER_RNG_222    \xEF\xBC[\xA1-\xBA]
+LETTER_RNG_223    \xEF\xBD[\x81-\x9A]
+LETTER_RNG_224    \xEF(\xBD[\xA6-\xBF]|\xBF[\x80-\x9C]|\xBE[\x80-\xBF])
+LETTER_RNG_225    \xF0\x90(\x83[\x80-\xBA]|[\x80-\x82][\x80-\xBF])
+LETTER_RNG_226    \xF0\x90\x85[\x80-\xB4]
+LETTER_RNG_227    \xF0\x90(\x8B[\x80-\x90]|\x8A[\x80-\xBF])
+LETTER_RNG_228    \xF0\x90\x8C[\x80-\x9F]
+LETTER_RNG_229    \xF0\x90(\x8C[\xAD-\xBF]|\x8D[\x80-\xB5])
+LETTER_RNG_230    \xF0\x90\x8E[\x80-\x9D]
+LETTER_RNG_231    \xF0\x90(\x8E[\xA0-\xBF]|\x8F[\x80-\x8F])
+LETTER_RNG_232    \xF0\x90(\x8F[\x91-\xBF]|\x92[\x80-\x9D]|[\x90-\x91][\x80-\xBF])
+LETTER_RNG_233    \xF0\x90(\x92[\xB0-\xBF]|\x95[\x80-\xA3]|[\x93-\x94][\x80-\xBF])
+LETTER_RNG_234    \xF0\x90(\xA1[\x80-\x95]|[\x98-\xA0][\x80-\xBF])
+LETTER_RNG_235    \xF0\x90\xA1[\xA0-\xB6]
+LETTER_RNG_236    \xF0\x90\xA2[\x80-\x9E]
+LETTER_RNG_237    \xF0\x90\xA3[\xA0-\xB5]
+LETTER_RNG_238    \xF0\x90\xA4[\x80-\x95]
+LETTER_RNG_239    \xF0\x90\xA4[\xA0-\xB9]
+LETTER_RNG_240    \xF0\x90\xA6[\x80-\xB7]
+LETTER_RNG_241    \xF0\x90\xA6[\xBE-\xBF]
+LETTER_RNG_242    \xF0\x90\xA8\x80
+LETTER_RNG_243    \xF0\x90\xA8[\x90-\xB5]
+LETTER_RNG_244    \xF0\x90\xA9[\xA0-\xBC]
+LETTER_RNG_245    \xF0\x90\xAA[\x80-\x9C]
+LETTER_RNG_246    \xF0\x90\xAB[\x80-\x87]
+LETTER_RNG_247    \xF0\x90\xAB[\x89-\xA4]
+LETTER_RNG_248    \xF0\x90\xAC[\x80-\xB5]
+LETTER_RNG_249    \xF0\x90\xAD[\x80-\x95]
+LETTER_RNG_250    \xF0\x90\xAD[\xA0-\xB2]
+LETTER_RNG_251    \xF0\x90\xAE[\x80-\x91]
+LETTER_RNG_252    \xF0\x90(\xB3[\x80-\xB2]|[\xB0-\xB2][\x80-\xBF])
+LETTER_RNG_253    \xF0\x90\xB4[\x80-\xA3]
+LETTER_RNG_254    \xF0\x90\xBA[\x80-\xA9]
+LETTER_RNG_255    \xF0\x90(\xBA[\xB0-\xBF]|\xBC[\x80-\x9C]|\xBB[\x80-\xBF])
+LETTER_RNG_256    \xF0\x90(\xBC[\xA7-\xBF]|\xBD[\x80-\x85])
+LETTER_RNG_257    \xF0\x90(\xBE[\xB0-\xBF]|\xBF[\x80-\x84])
+LETTER_RNG_258    \xF0\x90\xBF[\xA0-\xB6]
+LETTER_RNG_259    \xF0\x91\x80[\x83-\xB7]
+LETTER_RNG_260    \xF0\x91\x82[\x83-\xAF]
+LETTER_RNG_261    \xF0\x91\x83[\x90-\xA8]
+LETTER_RNG_262    \xF0\x91\x84[\x83-\xA6]
+LETTER_RNG_263    \xF0\x91\x85\x84
+LETTER_RNG_264    \xF0\x91\x85[\x87-\xB2]
+LETTER_RNG_265    \xF0\x91\x85\xB6
+LETTER_RNG_266    \xF0\x91\x86[\x83-\xB2]
+LETTER_RNG_267    \xF0\x91\x87[\x81-\x84]
+LETTER_RNG_268    \xF0\x91\x87\x9A
+LETTER_RNG_269    \xF0\x91\x87\x9C
+LETTER_RNG_270    \xF0\x91\x88[\x80-\xAB]
+LETTER_RNG_271    \xF0\x91\x8A[\x80-\xA8]
+LETTER_RNG_272    \xF0\x91(\x8A[\xB0-\xBF]|\x8B[\x80-\x9E])
+LETTER_RNG_273    \xF0\x91\x8C[\x85-\xB9]
+LETTER_RNG_274    \xF0\x91\x8C\xBD
+LETTER_RNG_275    \xF0\x91\x8D\x90
+LETTER_RNG_276    \xF0\x91\x8D[\x9D-\xA1]
+LETTER_RNG_277    \xF0\x91\x90[\x80-\xB4]
+LETTER_RNG_278    \xF0\x91\x91[\x87-\x8A]
+LETTER_RNG_279    \xF0\x91(\x91[\x9F-\xBF]|\x92[\x80-\xAF])
+LETTER_RNG_280    \xF0\x91\x93[\x84-\x85]
+LETTER_RNG_281    \xF0\x91\x93\x87
+LETTER_RNG_282    \xF0\x91\x96[\x80-\xAE]
+LETTER_RNG_283    \xF0\x91\x97[\x98-\x9B]
+LETTER_RNG_284    \xF0\x91\x98[\x80-\xAF]
+LETTER_RNG_285    \xF0\x91\x99\x84
+LETTER_RNG_286    \xF0\x91\x9A[\x80-\xAA]
+LETTER_RNG_287    \xF0\x91\x9A\xB8
+LETTER_RNG_288    \xF0\x91\x9C[\x80-\x9A]
+LETTER_RNG_289    \xF0\x91\xA0[\x80-\xAB]
+LETTER_RNG_290    \xF0\x91(\xA2[\xA0-\xBF]|\xA3[\x80-\x9F])
+LETTER_RNG_291    \xF0\x91(\xA3\xBF|\xA4[\x80-\xAF])
+LETTER_RNG_292    \xF0\x91\xA4\xBF
+LETTER_RNG_293    \xF0\x91\xA5\x81
+LETTER_RNG_294    \xF0\x91(\xA6[\xA0-\xBF]|\xA7[\x80-\x90])
+LETTER_RNG_295    \xF0\x91\xA7\xA1
+LETTER_RNG_296    \xF0\x91\xA7\xA3
+LETTER_RNG_297    \xF0\x91\xA8\x80
+LETTER_RNG_298    \xF0\x91\xA8[\x8B-\xB2]
+LETTER_RNG_299    \xF0\x91\xA8\xBA
+LETTER_RNG_300    \xF0\x91\xA9\x90
+LETTER_RNG_301    \xF0\x91(\xA9[\x9C-\xBF]|\xAA[\x80-\x89])
+LETTER_RNG_302    \xF0\x91\xAA\x9D
+LETTER_RNG_303    \xF0\x91(\xB0[\x80-\xAE]|[\xAB-\xAF][\x80-\xBF])
+LETTER_RNG_304    \xF0\x91\xB1\x80
+LETTER_RNG_305    \xF0\x91(\xB1[\xB2-\xBF]|\xB2[\x80-\x8F])
+LETTER_RNG_306    \xF0\x91\xB4[\x80-\xB0]
+LETTER_RNG_307    \xF0\x91\xB5\x86
+LETTER_RNG_308    \xF0\x91(\xB5[\xA0-\xBF]|\xB6[\x80-\x89])
+LETTER_RNG_309    \xF0\x91\xB6\x98
+LETTER_RNG_310    \xF0\x91\xBB[\xA0-\xB2]
+LETTER_RNG_311    \xF0\x91\xBE\xB0
+LETTER_RNG_312    \xF0\x92(\x91[\x80-\xAE]|[\x80-\x90][\x80-\xBF])
+LETTER_RNG_313    \xF0(\x93\x90[\x80-\xAE]|(\x92[\x92-\xBF]|\x93[\x80-\x8F])[\x80-\xBF])
+LETTER_RNG_314    \xF0(\x96\xA9[\x80-\x9E]|(\x94[\x90-\xBF]|\x96[\x80-\xA8]|\x95[\x80-\xBF])[\x80-\xBF])
+LETTER_RNG_315    \xF0\x96\xAB[\x90-\xAD]
+LETTER_RNG_316    \xF0\x96\xAC[\x80-\xAF]
+LETTER_RNG_317    \xF0\x96\xAD[\x80-\x83]
+LETTER_RNG_318    \xF0\x96(\xAD[\xA3-\xBF]|[\xAE-\xB9][\x80-\xBF])
+LETTER_RNG_319    \xF0\x96(\xBD[\x80-\x8A]|\xBC[\x80-\xBF])
+LETTER_RNG_320    \xF0\x96\xBD\x90
+LETTER_RNG_321    \xF0\x96(\xBE[\x93-\xBF]|\xBF[\x80-\xA1])
+LETTER_RNG_322    \xF0\x96\xBF\xA3
+LETTER_RNG_323    \xF0(\x9B\xB2[\x80-\x99]|(\x9B[\x80-\xB1]|[\x97-\x9A][\x80-\xBF])[\x80-\xBF])
+LETTER_RNG_324    \xF0\x9D(\x9B\x80|[\x90-\x9A][\x80-\xBF])
+LETTER_RNG_325    \xF0\x9D\x9B[\x82-\x9A]
+LETTER_RNG_326    \xF0\x9D\x9B[\x9C-\xBA]
+LETTER_RNG_327    \xF0\x9D(\x9B[\xBC-\xBF]|\x9C[\x80-\x94])
+LETTER_RNG_328    \xF0\x9D\x9C[\x96-\xB4]
+LETTER_RNG_329    \xF0\x9D(\x9C[\xB6-\xBF]|\x9D[\x80-\x8E])
+LETTER_RNG_330    \xF0\x9D\x9D[\x90-\xAE]
+LETTER_RNG_331    \xF0\x9D(\x9D[\xB0-\xBF]|\x9E[\x80-\x88])
+LETTER_RNG_332    \xF0\x9D\x9E[\x8A-\xA8]
+LETTER_RNG_333    \xF0\x9D(\x9E[\xAA-\xBF]|\x9F[\x80-\x82])
+LETTER_RNG_334    \xF0\x9D\x9F[\x84-\x8B]
+LETTER_RNG_335    \xF0\x9E\x84[\x80-\xAC]
+LETTER_RNG_336    \xF0\x9E\x84[\xB7-\xBD]
+LETTER_RNG_337    \xF0\x9E\x85\x8E
+LETTER_RNG_338    \xF0\x9E\x8B[\x80-\xAB]
+LETTER_RNG_339    \xF0\x9E(\xA3[\x80-\x84]|[\xA0-\xA2][\x80-\xBF])
+LETTER_RNG_340    \xF0\x9E(\xA5[\x80-\x83]|\xA4[\x80-\xBF])
+LETTER_RNG_341    \xF0\x9E\xA5\x8B
+LETTER_RNG_342    \xF0\x9E(\xBA[\x80-\xBB]|[\xB8-\xB9][\x80-\xBF])
+LETTER_RNG_343    \xF0(\xB1\x8D[\x80-\x8A]|(\xB1[\x80-\x8C]|[\xA0-\xB0][\x80-\xBF])[\x80-\xBF])
+
+LETTER_GROUP_1     {LETTER_RNG_1}|{LETTER_RNG_2}|{LETTER_RNG_3}|{LETTER_RNG_4}|{LETTER_RNG_5}|{LETTER_RNG_6}|{LETTER_RNG_7}|{LETTER_RNG_8}|{LETTER_RNG_9}|{LETTER_RNG_10}
+LETTER_GROUP_2     {LETTER_GROUP_1}|{LETTER_RNG_11}|{LETTER_RNG_12}|{LETTER_RNG_13}|{LETTER_RNG_14}|{LETTER_RNG_15}|{LETTER_RNG_16}|{LETTER_RNG_17}|{LETTER_RNG_18}|{LETTER_RNG_19}
+LETTER_GROUP_3     {LETTER_GROUP_2}|{LETTER_RNG_20}|{LETTER_RNG_21}|{LETTER_RNG_22}|{LETTER_RNG_23}|{LETTER_RNG_24}|{LETTER_RNG_25}|{LETTER_RNG_26}|{LETTER_RNG_27}|{LETTER_RNG_28}
+LETTER_GROUP_4     {LETTER_GROUP_3}|{LETTER_RNG_29}|{LETTER_RNG_30}|{LETTER_RNG_31}|{LETTER_RNG_32}|{LETTER_RNG_33}|{LETTER_RNG_34}|{LETTER_RNG_35}|{LETTER_RNG_36}|{LETTER_RNG_37}
+LETTER_GROUP_5     {LETTER_GROUP_4}|{LETTER_RNG_38}|{LETTER_RNG_39}|{LETTER_RNG_40}|{LETTER_RNG_41}|{LETTER_RNG_42}|{LETTER_RNG_43}|{LETTER_RNG_44}|{LETTER_RNG_45}|{LETTER_RNG_46}
+LETTER_GROUP_6     {LETTER_GROUP_5}|{LETTER_RNG_47}|{LETTER_RNG_48}|{LETTER_RNG_49}|{LETTER_RNG_50}|{LETTER_RNG_51}|{LETTER_RNG_52}|{LETTER_RNG_53}|{LETTER_RNG_54}|{LETTER_RNG_55}
+LETTER_GROUP_7     {LETTER_GROUP_6}|{LETTER_RNG_56}|{LETTER_RNG_57}|{LETTER_RNG_58}|{LETTER_RNG_59}|{LETTER_RNG_60}|{LETTER_RNG_61}|{LETTER_RNG_62}|{LETTER_RNG_63}|{LETTER_RNG_64}
+LETTER_GROUP_8     {LETTER_GROUP_7}|{LETTER_RNG_65}|{LETTER_RNG_66}|{LETTER_RNG_67}|{LETTER_RNG_68}|{LETTER_RNG_69}|{LETTER_RNG_70}|{LETTER_RNG_71}|{LETTER_RNG_72}|{LETTER_RNG_73}
+LETTER_GROUP_9     {LETTER_GROUP_8}|{LETTER_RNG_74}|{LETTER_RNG_75}|{LETTER_RNG_76}|{LETTER_RNG_77}|{LETTER_RNG_78}|{LETTER_RNG_79}|{LETTER_RNG_80}|{LETTER_RNG_81}|{LETTER_RNG_82}
+LETTER_GROUP_10    {LETTER_GROUP_9}|{LETTER_RNG_83}|{LETTER_RNG_84}|{LETTER_RNG_85}|{LETTER_RNG_86}|{LETTER_RNG_87}|{LETTER_RNG_88}|{LETTER_RNG_89}|{LETTER_RNG_90}|{LETTER_RNG_91}
+LETTER_GROUP_11    {LETTER_GROUP_10}|{LETTER_RNG_92}|{LETTER_RNG_93}|{LETTER_RNG_94}|{LETTER_RNG_95}|{LETTER_RNG_96}|{LETTER_RNG_97}|{LETTER_RNG_98}|{LETTER_RNG_99}|{LETTER_RNG_100}
+LETTER_GROUP_12    {LETTER_GROUP_11}|{LETTER_RNG_101}|{LETTER_RNG_102}|{LETTER_RNG_103}|{LETTER_RNG_104}|{LETTER_RNG_105}|{LETTER_RNG_106}|{LETTER_RNG_107}|{LETTER_RNG_108}|{LETTER_RNG_109}
+LETTER_GROUP_13    {LETTER_GROUP_12}|{LETTER_RNG_110}|{LETTER_RNG_111}|{LETTER_RNG_112}|{LETTER_RNG_113}|{LETTER_RNG_114}|{LETTER_RNG_115}|{LETTER_RNG_116}|{LETTER_RNG_117}|{LETTER_RNG_118}
+LETTER_GROUP_14    {LETTER_GROUP_13}|{LETTER_RNG_119}|{LETTER_RNG_120}|{LETTER_RNG_121}|{LETTER_RNG_122}|{LETTER_RNG_123}|{LETTER_RNG_124}|{LETTER_RNG_125}|{LETTER_RNG_126}|{LETTER_RNG_127}
+LETTER_GROUP_15    {LETTER_GROUP_14}|{LETTER_RNG_128}|{LETTER_RNG_129}|{LETTER_RNG_130}|{LETTER_RNG_131}|{LETTER_RNG_132}|{LETTER_RNG_133}|{LETTER_RNG_134}|{LETTER_RNG_135}|{LETTER_RNG_136}
+LETTER_GROUP_16    {LETTER_GROUP_15}|{LETTER_RNG_137}|{LETTER_RNG_138}|{LETTER_RNG_139}|{LETTER_RNG_140}|{LETTER_RNG_141}|{LETTER_RNG_142}|{LETTER_RNG_143}|{LETTER_RNG_144}|{LETTER_RNG_145}
+/* Chains from LETTER_GROUP_16 (not 15) so this group also carries LETTER_RNG_137..145 */
+LETTER_GROUP_17    {LETTER_GROUP_16}|{LETTER_RNG_146}|{LETTER_RNG_147}|{LETTER_RNG_148}|{LETTER_RNG_149}|{LETTER_RNG_150}|{LETTER_RNG_151}|{LETTER_RNG_152}|{LETTER_RNG_153}|{LETTER_RNG_154}
+LETTER_GROUP_18    {LETTER_GROUP_17}|{LETTER_RNG_155}|{LETTER_RNG_156}|{LETTER_RNG_157}|{LETTER_RNG_158}|{LETTER_RNG_159}|{LETTER_RNG_160}|{LETTER_RNG_161}|{LETTER_RNG_162}|{LETTER_RNG_163}
+LETTER_GROUP_19    {LETTER_GROUP_18}|{LETTER_RNG_164}|{LETTER_RNG_165}|{LETTER_RNG_166}|{LETTER_RNG_167}|{LETTER_RNG_168}|{LETTER_RNG_169}|{LETTER_RNG_170}|{LETTER_RNG_171}|{LETTER_RNG_172}
+LETTER_GROUP_20    {LETTER_GROUP_19}|{LETTER_RNG_173}|{LETTER_RNG_174}|{LETTER_RNG_175}|{LETTER_RNG_176}|{LETTER_RNG_177}|{LETTER_RNG_178}|{LETTER_RNG_179}|{LETTER_RNG_180}|{LETTER_RNG_181}
+LETTER_GROUP_21    {LETTER_GROUP_20}|{LETTER_RNG_182}|{LETTER_RNG_183}|{LETTER_RNG_184}|{LETTER_RNG_185}|{LETTER_RNG_186}|{LETTER_RNG_187}|{LETTER_RNG_188}|{LETTER_RNG_189}|{LETTER_RNG_190}
+LETTER_GROUP_22    {LETTER_GROUP_21}|{LETTER_RNG_191}|{LETTER_RNG_192}|{LETTER_RNG_193}|{LETTER_RNG_194}|{LETTER_RNG_195}|{LETTER_RNG_196}|{LETTER_RNG_197}|{LETTER_RNG_198}|{LETTER_RNG_199}
+LETTER_GROUP_23    {LETTER_GROUP_22}|{LETTER_RNG_200}|{LETTER_RNG_201}|{LETTER_RNG_202}|{LETTER_RNG_203}|{LETTER_RNG_204}|{LETTER_RNG_205}|{LETTER_RNG_206}|{LETTER_RNG_207}|{LETTER_RNG_208}
+LETTER_GROUP_24    {LETTER_GROUP_23}|{LETTER_RNG_209}|{LETTER_RNG_210}|{LETTER_RNG_211}|{LETTER_RNG_212}|{LETTER_RNG_213}|{LETTER_RNG_214}|{LETTER_RNG_215}|{LETTER_RNG_216}|{LETTER_RNG_217}
+LETTER_GROUP_25    {LETTER_GROUP_24}|{LETTER_RNG_218}|{LETTER_RNG_219}|{LETTER_RNG_220}|{LETTER_RNG_221}|{LETTER_RNG_222}|{LETTER_RNG_223}|{LETTER_RNG_224}|{LETTER_RNG_225}|{LETTER_RNG_226}
+LETTER_GROUP_26    {LETTER_GROUP_25}|{LETTER_RNG_227}|{LETTER_RNG_228}|{LETTER_RNG_229}|{LETTER_RNG_230}|{LETTER_RNG_231}|{LETTER_RNG_232}|{LETTER_RNG_233}|{LETTER_RNG_234}|{LETTER_RNG_235}
+LETTER_GROUP_27    {LETTER_GROUP_26}|{LETTER_RNG_236}|{LETTER_RNG_237}|{LETTER_RNG_238}|{LETTER_RNG_239}|{LETTER_RNG_240}|{LETTER_RNG_241}|{LETTER_RNG_242}|{LETTER_RNG_243}|{LETTER_RNG_244}
+LETTER_GROUP_28    {LETTER_GROUP_27}|{LETTER_RNG_245}|{LETTER_RNG_246}|{LETTER_RNG_247}|{LETTER_RNG_248}|{LETTER_RNG_249}|{LETTER_RNG_250}|{LETTER_RNG_251}|{LETTER_RNG_252}|{LETTER_RNG_253}
+LETTER_GROUP_29    {LETTER_GROUP_28}|{LETTER_RNG_254}|{LETTER_RNG_255}|{LETTER_RNG_256}|{LETTER_RNG_257}|{LETTER_RNG_258}|{LETTER_RNG_259}|{LETTER_RNG_260}|{LETTER_RNG_261}|{LETTER_RNG_262}
+LETTER_GROUP_30    {LETTER_GROUP_29}|{LETTER_RNG_263}|{LETTER_RNG_264}|{LETTER_RNG_265}|{LETTER_RNG_266}|{LETTER_RNG_267}|{LETTER_RNG_268}|{LETTER_RNG_269}|{LETTER_RNG_270}|{LETTER_RNG_271}
+LETTER_GROUP_31    {LETTER_GROUP_30}|{LETTER_RNG_272}|{LETTER_RNG_273}|{LETTER_RNG_274}|{LETTER_RNG_275}|{LETTER_RNG_276}|{LETTER_RNG_277}|{LETTER_RNG_278}|{LETTER_RNG_279}|{LETTER_RNG_280}
+LETTER_GROUP_32    {LETTER_GROUP_31}|{LETTER_RNG_281}|{LETTER_RNG_282}|{LETTER_RNG_283}|{LETTER_RNG_284}|{LETTER_RNG_285}|{LETTER_RNG_286}|{LETTER_RNG_287}|{LETTER_RNG_288}|{LETTER_RNG_289}
+LETTER_GROUP_33    {LETTER_GROUP_32}|{LETTER_RNG_290}|{LETTER_RNG_291}|{LETTER_RNG_292}|{LETTER_RNG_293}|{LETTER_RNG_294}|{LETTER_RNG_295}|{LETTER_RNG_296}|{LETTER_RNG_297}|{LETTER_RNG_298}
+LETTER_GROUP_34    {LETTER_GROUP_33}|{LETTER_RNG_299}|{LETTER_RNG_300}|{LETTER_RNG_301}|{LETTER_RNG_302}|{LETTER_RNG_303}|{LETTER_RNG_304}|{LETTER_RNG_305}|{LETTER_RNG_306}|{LETTER_RNG_307}
+LETTER_GROUP_35    {LETTER_GROUP_34}|{LETTER_RNG_308}|{LETTER_RNG_309}|{LETTER_RNG_310}|{LETTER_RNG_311}|{LETTER_RNG_312}|{LETTER_RNG_313}|{LETTER_RNG_314}|{LETTER_RNG_315}|{LETTER_RNG_316}
+LETTER_GROUP_36    {LETTER_GROUP_35}|{LETTER_RNG_317}|{LETTER_RNG_318}|{LETTER_RNG_319}|{LETTER_RNG_320}|{LETTER_RNG_321}|{LETTER_RNG_322}|{LETTER_RNG_323}|{LETTER_RNG_324}|{LETTER_RNG_325}
+LETTER_GROUP_37    {LETTER_GROUP_36}|{LETTER_RNG_326}|{LETTER_RNG_327}|{LETTER_RNG_328}|{LETTER_RNG_329}|{LETTER_RNG_330}|{LETTER_RNG_331}|{LETTER_RNG_332}|{LETTER_RNG_333}|{LETTER_RNG_334}
+LETTER_GROUP_38    {LETTER_GROUP_37}|{LETTER_RNG_335}|{LETTER_RNG_336}|{LETTER_RNG_337}|{LETTER_RNG_338}|{LETTER_RNG_339}|{LETTER_RNG_340}|{LETTER_RNG_341}|{LETTER_RNG_342}|{LETTER_RNG_343}
+
+/* Second-level grouping of the LETTER_GROUP_* definitions. */
+/* Each LETTER_G_GROUP_n alternates its predecessor LETTER_G_GROUP_(n-1) with the next batch of LETTER_GROUP_* names. */
+/* NOTE(review): the layering presumably keeps each definition line short for flex - confirm. */
+LETTER_G_GROUP_1    {LETTER_GROUP_1}|{LETTER_GROUP_2}|{LETTER_GROUP_3}|{LETTER_GROUP_4}|{LETTER_GROUP_5}|{LETTER_GROUP_6}|{LETTER_GROUP_7}|{LETTER_GROUP_8}|{LETTER_GROUP_9}|{LETTER_GROUP_10}
+LETTER_G_GROUP_2    {LETTER_G_GROUP_1}|{LETTER_GROUP_11}|{LETTER_GROUP_12}|{LETTER_GROUP_13}|{LETTER_GROUP_14}|{LETTER_GROUP_15}|{LETTER_GROUP_16}|{LETTER_GROUP_17}|{LETTER_GROUP_18}|{LETTER_GROUP_19}
+LETTER_G_GROUP_3    {LETTER_G_GROUP_2}|{LETTER_GROUP_20}|{LETTER_GROUP_21}|{LETTER_GROUP_22}|{LETTER_GROUP_23}|{LETTER_GROUP_24}|{LETTER_GROUP_25}|{LETTER_GROUP_26}|{LETTER_GROUP_27}|{LETTER_GROUP_28}
+LETTER_G_GROUP_4    {LETTER_G_GROUP_3}|{LETTER_GROUP_29}|{LETTER_GROUP_30}|{LETTER_GROUP_31}|{LETTER_GROUP_32}|{LETTER_GROUP_33}|{LETTER_GROUP_34}|{LETTER_GROUP_35}|{LETTER_GROUP_36}|{LETTER_GROUP_37}
+LETTER_G_GROUP_5    {LETTER_G_GROUP_4}|{LETTER_GROUP_38}
+
+/* UNICODE_LETTER is the alternation of all five second-level groups. */
+UNICODE_LETTER    {LETTER_G_GROUP_1}|{LETTER_G_GROUP_2}|{LETTER_G_GROUP_3}|{LETTER_G_GROUP_4}|{LETTER_G_GROUP_5}
+
+/* Unicode digit ranges (category Nd) */
+/* generated with unicode_range_generator.l */
+/* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
+DIGIT_RNG_1     [0-9]
+DIGIT_RNG_2     \xD9[\xA0-\xA9]
+DIGIT_RNG_3     \xDB[\xB0-\xB9]
+DIGIT_RNG_4     \xDF[\x80-\x89]
+DIGIT_RNG_5     \xE0\xA5[\xA6-\xAF]
+DIGIT_RNG_6     \xE0\xA7[\xA6-\xAF]
+DIGIT_RNG_7     \xE0\xA9[\xA6-\xAF]
+DIGIT_RNG_8     \xE0\xAB[\xA6-\xAF]
+DIGIT_RNG_9     \xE0\xAD[\xA6-\xAF]
+DIGIT_RNG_10    \xE0\xAF[\xA6-\xAF]
+DIGIT_RNG_11    \xE0\xB1[\xA6-\xAF]
+DIGIT_RNG_12    \xE0\xB3[\xA6-\xAF]
+DIGIT_RNG_13    \xE0\xB5[\xA6-\xAF]
+DIGIT_RNG_14    \xE0\xB7[\xA6-\xAF]
+DIGIT_RNG_15    \xE0\xB9[\x90-\x99]
+DIGIT_RNG_16    \xE0\xBB[\x90-\x99]
+DIGIT_RNG_17    \xE0\xBC[\xA0-\xA9]
+DIGIT_RNG_18    \xE1\x81[\x80-\x89]
+DIGIT_RNG_19    \xE1\x82[\x90-\x99]
+DIGIT_RNG_20    \xE1\x9F[\xA0-\xA9]
+DIGIT_RNG_21    \xE1\xA0[\x90-\x99]
+DIGIT_RNG_22    \xE1\xA5[\x86-\x8F]
+DIGIT_RNG_23    \xE1\xA7[\x90-\x99]
+DIGIT_RNG_24    \xE1\xAA[\x80-\x99]
+DIGIT_RNG_25    \xE1\xAD[\x90-\x99]
+DIGIT_RNG_26    \xE1\xAE[\xB0-\xB9]
+DIGIT_RNG_27    \xE1\xB1[\x80-\x89]
+DIGIT_RNG_28    \xE1\xB1[\x90-\x99]
+DIGIT_RNG_29    \xEA\x98[\xA0-\xA9]
+DIGIT_RNG_30    \xEA\xA3[\x90-\x99]
+DIGIT_RNG_31    \xEA\xA4[\x80-\x89]
+DIGIT_RNG_32    \xEA\xA7[\x90-\x99]
+DIGIT_RNG_33    \xEA\xA7[\xB0-\xB9]
+DIGIT_RNG_34    \xEA\xA9[\x90-\x99]
+DIGIT_RNG_35    \xEA\xAF[\xB0-\xB9]
+DIGIT_RNG_36    \xEF\xBC[\x90-\x99]
+DIGIT_RNG_37    \xF0\x90\x92[\xA0-\xA9]
+DIGIT_RNG_38    \xF0\x90\xB4[\xB0-\xB9]
+DIGIT_RNG_39    \xF0\x91\x81[\xA6-\xAF]
+DIGIT_RNG_40    \xF0\x91\x83[\xB0-\xB9]
+DIGIT_RNG_41    \xF0\x91\x84[\xB6-\xBF]
+DIGIT_RNG_42    \xF0\x91\x87[\x90-\x99]
+DIGIT_RNG_43    \xF0\x91\x8B[\xB0-\xB9]
+DIGIT_RNG_44    \xF0\x91\x91[\x90-\x99]
+DIGIT_RNG_45    \xF0\x91\x93[\x90-\x99]
+DIGIT_RNG_46    \xF0\x91\x99[\x90-\x99]
+DIGIT_RNG_47    \xF0\x91\x9B[\x80-\x89]
+DIGIT_RNG_48    \xF0\x91\x9C[\xB0-\xB9]
+DIGIT_RNG_49    \xF0\x91\xA3[\xA0-\xA9]
+DIGIT_RNG_50    \xF0\x91\xA5[\x90-\x99]
+DIGIT_RNG_51    \xF0\x91\xB1[\x90-\x99]
+DIGIT_RNG_52    \xF0\x91\xB5[\x90-\x99]
+DIGIT_RNG_53    \xF0\x91\xB6[\xA0-\xA9]
+DIGIT_RNG_54    \xF0\x96\xA9[\xA0-\xA9]
+DIGIT_RNG_55    \xF0\x96\xAD[\x90-\x99]
+DIGIT_RNG_56    \xF0\x9D\x9F[\x8E-\xBF]
+DIGIT_RNG_57    \xF0\x9E\x85[\x80-\x89]
+DIGIT_RNG_58    \xF0\x9E\x8B[\xB0-\xB9]
+DIGIT_RNG_59    \xF0\x9E\xA5[\x90-\x99]
+DIGIT_RNG_60    \xF0\x9F\xAF[\xB0-\xB9]
+
+/* First digit group: DIGIT_RNG_1..10; DIGIT_RNG_9 (U+0B66..U+0B6F) must be listed here, no other group references it */
+DIGIT_GROUP_1    {DIGIT_RNG_1}|{DIGIT_RNG_2}|{DIGIT_RNG_3}|{DIGIT_RNG_4}|{DIGIT_RNG_5}|{DIGIT_RNG_6}|{DIGIT_RNG_7}|{DIGIT_RNG_8}|{DIGIT_RNG_9}|{DIGIT_RNG_10}
+DIGIT_GROUP_2    {DIGIT_GROUP_1}|{DIGIT_RNG_11}|{DIGIT_RNG_12}|{DIGIT_RNG_13}|{DIGIT_RNG_14}|{DIGIT_RNG_15}|{DIGIT_RNG_16}|{DIGIT_RNG_17}|{DIGIT_RNG_18}
+DIGIT_GROUP_3    {DIGIT_GROUP_2}|{DIGIT_RNG_19}|{DIGIT_RNG_20}|{DIGIT_RNG_21}|{DIGIT_RNG_22}|{DIGIT_RNG_23}|{DIGIT_RNG_24}|{DIGIT_RNG_25}|{DIGIT_RNG_26}
+DIGIT_GROUP_4    {DIGIT_GROUP_3}|{DIGIT_RNG_27}|{DIGIT_RNG_28}|{DIGIT_RNG_29}|{DIGIT_RNG_30}|{DIGIT_RNG_31}|{DIGIT_RNG_32}|{DIGIT_RNG_33}|{DIGIT_RNG_34}
+DIGIT_GROUP_5    {DIGIT_GROUP_4}|{DIGIT_RNG_35}|{DIGIT_RNG_36}|{DIGIT_RNG_37}|{DIGIT_RNG_38}|{DIGIT_RNG_39}|{DIGIT_RNG_40}|{DIGIT_RNG_41}|{DIGIT_RNG_42}
+DIGIT_GROUP_6    {DIGIT_GROUP_5}|{DIGIT_RNG_43}|{DIGIT_RNG_44}|{DIGIT_RNG_45}|{DIGIT_RNG_46}|{DIGIT_RNG_47}|{DIGIT_RNG_48}|{DIGIT_RNG_49}|{DIGIT_RNG_50}
+DIGIT_GROUP_7    {DIGIT_GROUP_6}|{DIGIT_RNG_51}|{DIGIT_RNG_52}|{DIGIT_RNG_53}|{DIGIT_RNG_54}|{DIGIT_RNG_55}|{DIGIT_RNG_56}|{DIGIT_RNG_57}|{DIGIT_RNG_58}
+DIGIT_GROUP_8    {DIGIT_GROUP_7}|{DIGIT_RNG_59}|{DIGIT_RNG_60}
+
+UNICODE_DIGIT    {DIGIT_GROUP_1}|{DIGIT_GROUP_2}|{DIGIT_GROUP_3}|{DIGIT_GROUP_4}|{DIGIT_GROUP_5}|{DIGIT_GROUP_6}|{DIGIT_GROUP_7}|{DIGIT_GROUP_8}
+
+/* Unicode combining mark ranges (categories Mn and Mc) */
+/* generated with unicode_range_generator.l */
+/* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
+COMB_MARK_RNG_1      \xCD[\x80-\xAF]|\xCC[\x80-\xBF]
+COMB_MARK_RNG_2      \xD2[\x83-\x87]
+COMB_MARK_RNG_3      \xD6[\x91-\xBD]
+COMB_MARK_RNG_4      \xD6\xBF
+COMB_MARK_RNG_5      \xD7[\x81-\x82]
+COMB_MARK_RNG_6      \xD7[\x84-\x85]
+COMB_MARK_RNG_7      \xD7\x87
+COMB_MARK_RNG_8      \xD8[\x90-\x9A]
+COMB_MARK_RNG_9      \xD9[\x8B-\x9F]
+COMB_MARK_RNG_10     \xD9\xB0
+COMB_MARK_RNG_11     \xDB[\x96-\x9C]
+COMB_MARK_RNG_12     \xDB[\x9F-\xA4]
+COMB_MARK_RNG_13     \xDB[\xA7-\xA8]
+COMB_MARK_RNG_14     \xDB[\xAA-\xAD]
+COMB_MARK_RNG_15     \xDC\x91
+COMB_MARK_RNG_16     \xDC[\xB0-\xBF]|\xDD[\x80-\x8A]
+COMB_MARK_RNG_17     \xDE[\xA6-\xB0]
+COMB_MARK_RNG_18     \xDF[\xAB-\xB3]
+COMB_MARK_RNG_19     \xDF\xBD
+COMB_MARK_RNG_20     \xE0\xA0[\x96-\x99]
+COMB_MARK_RNG_21     \xE0\xA0[\x9B-\xA3]
+COMB_MARK_RNG_22     \xE0\xA0[\xA5-\xA7]
+COMB_MARK_RNG_23     \xE0\xA0[\xA9-\xAD]
+COMB_MARK_RNG_24     \xE0\xA1[\x99-\x9B]
+COMB_MARK_RNG_25     \xE0\xA3[\x93-\xA1]
+COMB_MARK_RNG_26     \xE0(\xA3[\xA3-\xBF]|\xA4[\x80-\x83])
+COMB_MARK_RNG_27     \xE0\xA4[\xBA-\xBC]
+COMB_MARK_RNG_28     \xE0(\xA4[\xBE-\xBF]|\xA5[\x80-\x8F])
+COMB_MARK_RNG_29     \xE0\xA5[\x91-\x97]
+COMB_MARK_RNG_30     \xE0\xA5[\xA2-\xA3]
+COMB_MARK_RNG_31     \xE0\xA6[\x81-\x83]
+COMB_MARK_RNG_32     \xE0\xA6\xBC
+COMB_MARK_RNG_33     \xE0(\xA6[\xBE-\xBF]|\xA7[\x80-\x8D])
+COMB_MARK_RNG_34     \xE0\xA7\x97
+COMB_MARK_RNG_35     \xE0\xA7[\xA2-\xA3]
+COMB_MARK_RNG_36     \xE0(\xA7[\xBE-\xBF]|\xA8[\x80-\x83])
+COMB_MARK_RNG_37     \xE0(\xA8[\xBC-\xBF]|\xA9[\x80-\x91])
+COMB_MARK_RNG_38     \xE0\xA9[\xB0-\xB1]
+COMB_MARK_RNG_39     \xE0\xA9\xB5
+COMB_MARK_RNG_40     \xE0\xAA[\x81-\x83]
+COMB_MARK_RNG_41     \xE0\xAA\xBC
+COMB_MARK_RNG_42     \xE0(\xAA[\xBE-\xBF]|\xAB[\x80-\x8D])
+COMB_MARK_RNG_43     \xE0\xAB[\xA2-\xA3]
+COMB_MARK_RNG_44     \xE0(\xAB[\xBA-\xBF]|\xAC[\x80-\x83])
+COMB_MARK_RNG_45     \xE0\xAC\xBC
+COMB_MARK_RNG_46     \xE0(\xAC[\xBE-\xBF]|\xAD[\x80-\x97])
+COMB_MARK_RNG_47     \xE0\xAD[\xA2-\xA3]
+COMB_MARK_RNG_48     \xE0\xAE\x82
+COMB_MARK_RNG_49     \xE0(\xAE[\xBE-\xBF]|\xAF[\x80-\x8D])
+COMB_MARK_RNG_50     \xE0\xAF\x97
+COMB_MARK_RNG_51     \xE0\xB0[\x80-\x84]
+COMB_MARK_RNG_52     \xE0(\xB0[\xBE-\xBF]|\xB1[\x80-\x96])
+COMB_MARK_RNG_53     \xE0\xB1[\xA2-\xA3]
+COMB_MARK_RNG_54     \xE0\xB2[\x81-\x83]
+COMB_MARK_RNG_55     \xE0\xB2\xBC
+COMB_MARK_RNG_56     \xE0(\xB2[\xBE-\xBF]|\xB3[\x80-\x96])
+COMB_MARK_RNG_57     \xE0\xB3[\xA2-\xA3]
+COMB_MARK_RNG_58     \xE0\xB4[\x80-\x83]
+COMB_MARK_RNG_59     \xE0\xB4[\xBB-\xBC]
+COMB_MARK_RNG_60     \xE0(\xB4[\xBE-\xBF]|\xB5[\x80-\x8D])
+COMB_MARK_RNG_61     \xE0\xB5\x97
+COMB_MARK_RNG_62     \xE0\xB5[\xA2-\xA3]
+COMB_MARK_RNG_63     \xE0\xB6[\x81-\x83]
+COMB_MARK_RNG_64     \xE0\xB7[\x8A-\x9F]
+COMB_MARK_RNG_65     \xE0\xB7[\xB2-\xB3]
+COMB_MARK_RNG_66     \xE0\xB8\xB1
+COMB_MARK_RNG_67     \xE0\xB8[\xB4-\xBA]
+COMB_MARK_RNG_68     \xE0\xB9[\x87-\x8E]
+COMB_MARK_RNG_69     \xE0\xBA\xB1
+COMB_MARK_RNG_70     \xE0\xBA[\xB4-\xBC]
+COMB_MARK_RNG_71     \xE0\xBB[\x88-\x8D]
+COMB_MARK_RNG_72     \xE0\xBC[\x98-\x99]
+COMB_MARK_RNG_73     \xE0\xBC\xB5
+COMB_MARK_RNG_74     \xE0\xBC\xB7
+COMB_MARK_RNG_75     \xE0\xBC\xB9
+COMB_MARK_RNG_76     \xE0\xBC[\xBE-\xBF]
+COMB_MARK_RNG_77     \xE0(\xBD[\xB1-\xBF]|\xBE[\x80-\x84])
+COMB_MARK_RNG_78     \xE0\xBE[\x86-\x87]
+COMB_MARK_RNG_79     \xE0\xBE[\x8D-\xBC]
+COMB_MARK_RNG_80     \xE0\xBF\x86
+COMB_MARK_RNG_81     \xE1\x80[\xAB-\xBE]
+COMB_MARK_RNG_82     \xE1\x81[\x96-\x99]
+COMB_MARK_RNG_83     \xE1\x81[\x9E-\xA0]
+COMB_MARK_RNG_84     \xE1\x81[\xA2-\xA4]
+COMB_MARK_RNG_85     \xE1\x81[\xA7-\xAD]
+COMB_MARK_RNG_86     \xE1\x81[\xB1-\xB4]
+COMB_MARK_RNG_87     \xE1\x82[\x82-\x8D]
+COMB_MARK_RNG_88     \xE1\x82\x8F
+COMB_MARK_RNG_89     \xE1\x82[\x9A-\x9D]
+/* UTF-8 for U+135D..U+135F: lead byte is \xE1 */
+COMB_MARK_RNG_90     \xE1\x8D[\x9D-\x9F]
+COMB_MARK_RNG_91     \xE1\x9C[\x92-\x94]
+COMB_MARK_RNG_92     \xE1\x9C[\xB2-\xB4]
+COMB_MARK_RNG_93     \xE1\x9D[\x92-\x93]
+COMB_MARK_RNG_94     \xE1\x9D[\xB2-\xB3]
+COMB_MARK_RNG_95     \xE1(\x9E[\xB4-\xBF]|\x9F[\x80-\x93])
+COMB_MARK_RNG_96     \xE1\x9F\x9D
+COMB_MARK_RNG_97     \xE1\xA0[\x8B-\x8D]
+COMB_MARK_RNG_98     \xE1\xA2[\x85-\x86]
+COMB_MARK_RNG_99     \xE1\xA2\xA9
+COMB_MARK_RNG_100    \xE1\xA4[\xA0-\xBB]
+COMB_MARK_RNG_101    \xE1\xA8[\x97-\x9B]
+COMB_MARK_RNG_102    \xE1\xA9[\x95-\xBF]
+COMB_MARK_RNG_103    \xE1\xAA[\xB0-\xBD]
+COMB_MARK_RNG_104    \xE1(\xAA\xBF|\xAC[\x80-\x84]|\xAB[\x80-\xBF])
+COMB_MARK_RNG_105    \xE1(\xAC[\xB4-\xBF]|\xAD[\x80-\x84])
+COMB_MARK_RNG_106    \xE1\xAD[\xAB-\xB3]
+COMB_MARK_RNG_107    \xE1\xAE[\x80-\x82]
+COMB_MARK_RNG_108    \xE1\xAE[\xA1-\xAD]
+COMB_MARK_RNG_109    \xE1\xAF[\xA6-\xB3]
+COMB_MARK_RNG_110    \xE1\xB0[\xA4-\xB7]
+COMB_MARK_RNG_111    \xE1\xB3[\x90-\x92]
+COMB_MARK_RNG_112    \xE1\xB3[\x94-\xA8]
+COMB_MARK_RNG_113    \xE1\xB3\xAD
+COMB_MARK_RNG_114    \xE1\xB3\xB4
+COMB_MARK_RNG_115    \xE1\xB3[\xB7-\xB9]
+COMB_MARK_RNG_116    \xE1\xB7[\x80-\xBF]
+COMB_MARK_RNG_117    \xE2\x83[\x90-\x9C]
+COMB_MARK_RNG_118    \xE2\x83\xA1
+COMB_MARK_RNG_119    \xE2\x83[\xA5-\xB0]
+COMB_MARK_RNG_120    \xE2\xB3[\xAF-\xB1]
+COMB_MARK_RNG_121    \xE2\xB5\xBF
+COMB_MARK_RNG_122    \xE2\xB7[\xA0-\xBF]
+COMB_MARK_RNG_123    \xE3\x80[\xAA-\xAF]
+COMB_MARK_RNG_124    \xE3\x82[\x99-\x9A]
+COMB_MARK_RNG_125    \xEA\x99\xAF
+COMB_MARK_RNG_126    \xEA\x99[\xB4-\xBD]
+COMB_MARK_RNG_127    \xEA\x9A[\x9E-\x9F]
+COMB_MARK_RNG_128    \xEA\x9B[\xB0-\xB1]
+COMB_MARK_RNG_129    \xEA\xA0\x82
+COMB_MARK_RNG_130    \xEA\xA0\x86
+COMB_MARK_RNG_131    \xEA\xA0\x8B
+COMB_MARK_RNG_132    \xEA\xA0[\xA3-\xA7]
+COMB_MARK_RNG_133    \xEA\xA0\xAC
+COMB_MARK_RNG_134    \xEA\xA2[\x80-\x81]
+COMB_MARK_RNG_135    \xEA(\xA2[\xB4-\xBF]|\xA3[\x80-\x85])
+COMB_MARK_RNG_136    \xEA\xA3[\xA0-\xB1]
+COMB_MARK_RNG_137    \xEA\xA3\xBF
+COMB_MARK_RNG_138    \xEA\xA4[\xA6-\xAD]
+COMB_MARK_RNG_139    \xEA\xA5[\x87-\x93]
+COMB_MARK_RNG_140    \xEA\xA6[\x80-\x83]
+COMB_MARK_RNG_141    \xEA(\xA6[\xB3-\xBF]|\xA7\x80)
+COMB_MARK_RNG_142    \xEA\xA7\xA5
+COMB_MARK_RNG_143    \xEA\xA8[\xA9-\xB6]
+COMB_MARK_RNG_144    \xEA\xA9\x83
+COMB_MARK_RNG_145    \xEA\xA9[\x8C-\x8D]
+COMB_MARK_RNG_146    \xEA\xA9[\xBB-\xBD]
+COMB_MARK_RNG_147    \xEA\xAA\xB0
+COMB_MARK_RNG_148    \xEA\xAA[\xB2-\xB4]
+COMB_MARK_RNG_149    \xEA\xAA[\xB7-\xB8]
+COMB_MARK_RNG_150    \xEA\xAA[\xBE-\xBF]
+COMB_MARK_RNG_151    \xEA\xAB\x81
+COMB_MARK_RNG_152    \xEA\xAB[\xAB-\xAF]
+COMB_MARK_RNG_153    \xEA\xAB[\xB5-\xB6]
+COMB_MARK_RNG_154    \xEA\xAF[\xA3-\xAA]
+COMB_MARK_RNG_155    \xEA\xAF[\xAC-\xAD]
+COMB_MARK_RNG_156    \xEF\xAC\x9E
+COMB_MARK_RNG_157    \xEF\xB8[\x80-\x8F]
+COMB_MARK_RNG_158    \xEF\xB8[\xA0-\xAF]
+COMB_MARK_RNG_159    \xF0\x90\x87\xBD
+COMB_MARK_RNG_160    \xF0\x90\x8B\xA0
+COMB_MARK_RNG_161    \xF0\x90\x8D[\xB6-\xBA]
+COMB_MARK_RNG_162    \xF0\x90\xA8[\x81-\x8F]
+COMB_MARK_RNG_163    \xF0\x90\xA8[\xB8-\xBF]
+COMB_MARK_RNG_164    \xF0\x90\xAB[\xA5-\xA6]
+COMB_MARK_RNG_165    \xF0\x90\xB4[\xA4-\xA7]
+COMB_MARK_RNG_166    \xF0\x90\xBA[\xAB-\xAC]
+COMB_MARK_RNG_167    \xF0\x90\xBD[\x86-\x90]
+COMB_MARK_RNG_168    \xF0\x91\x80[\x80-\x82]
+COMB_MARK_RNG_169    \xF0\x91(\x80[\xB8-\xBF]|\x81[\x80-\x86])
+COMB_MARK_RNG_170    \xF0\x91(\x81\xBF|\x82[\x80-\x82])
+COMB_MARK_RNG_171    \xF0\x91\x82[\xB0-\xBA]
+COMB_MARK_RNG_172    \xF0\x91\x84[\x80-\x82]
+COMB_MARK_RNG_173    \xF0\x91\x84[\xA7-\xB4]
+COMB_MARK_RNG_174    \xF0\x91\x85[\x85-\x86]
+COMB_MARK_RNG_175    \xF0\x91\x85\xB3
+COMB_MARK_RNG_176    \xF0\x91\x86[\x80-\x82]
+COMB_MARK_RNG_177    \xF0\x91(\x86[\xB3-\xBF]|\x87\x80)
+COMB_MARK_RNG_178    \xF0\x91\x87[\x89-\x8C]
+COMB_MARK_RNG_179    \xF0\x91\x87[\x8E-\x8F]
+COMB_MARK_RNG_180    \xF0\x91\x88[\xAC-\xB7]
+COMB_MARK_RNG_181    \xF0\x91\x88\xBE
+COMB_MARK_RNG_182    \xF0\x91\x8B[\x9F-\xAA]
+COMB_MARK_RNG_183    \xF0\x91\x8C[\x80-\x83]
+COMB_MARK_RNG_184    \xF0\x91\x8C[\xBB-\xBC]
+COMB_MARK_RNG_185    \xF0\x91(\x8C[\xBE-\xBF]|\x8D[\x80-\x8D])
+COMB_MARK_RNG_186    \xF0\x91\x8D\x97
+COMB_MARK_RNG_187    \xF0\x91\x8D[\xA2-\xB4]
+COMB_MARK_RNG_188    \xF0\x91(\x90[\xB5-\xBF]|\x91[\x80-\x86])
+COMB_MARK_RNG_189    \xF0\x91\x91\x9E
+COMB_MARK_RNG_190    \xF0\x91(\x92[\xB0-\xBF]|\x93[\x80-\x83])
+COMB_MARK_RNG_191    \xF0\x91(\x96[\xAF-\xBF]|\x97\x80)
+COMB_MARK_RNG_192    \xF0\x91\x97[\x9C-\x9D]
+COMB_MARK_RNG_193    \xF0\x91(\x98[\xB0-\xBF]|\x99\x80)
+COMB_MARK_RNG_194    \xF0\x91\x9A[\xAB-\xB7]
+COMB_MARK_RNG_195    \xF0\x91\x9C[\x9D-\xAB]
+COMB_MARK_RNG_196    \xF0\x91\xA0[\xAC-\xBA]
+COMB_MARK_RNG_197    \xF0\x91\xA4[\xB0-\xBE]
+COMB_MARK_RNG_198    \xF0\x91\xA5\x80
+COMB_MARK_RNG_199    \xF0\x91\xA5[\x82-\x83]
+COMB_MARK_RNG_200    \xF0\x91\xA7[\x91-\xA0]
+COMB_MARK_RNG_201    \xF0\x91\xA7\xA4
+COMB_MARK_RNG_202    \xF0\x91\xA8[\x81-\x8A]
+COMB_MARK_RNG_203    \xF0\x91\xA8[\xB3-\xB9]
+COMB_MARK_RNG_204    \xF0\x91\xA8[\xBB-\xBE]
+COMB_MARK_RNG_205    \xF0\x91\xA9\x87
+COMB_MARK_RNG_206    \xF0\x91\xA9[\x91-\x9B]
+COMB_MARK_RNG_207    \xF0\x91\xAA[\x8A-\x99]
+COMB_MARK_RNG_208    \xF0\x91\xB0[\xAF-\xBF]
+COMB_MARK_RNG_209    \xF0\x91\xB2[\x92-\xB6]
+COMB_MARK_RNG_210    \xF0\x91(\xB4[\xB1-\xBF]|\xB5[\x80-\x85])
+COMB_MARK_RNG_211    \xF0\x91\xB5\x87
+COMB_MARK_RNG_212    \xF0\x91\xB6[\x8A-\x97]
+COMB_MARK_RNG_213    \xF0\x91\xBB[\xB3-\xB6]
+COMB_MARK_RNG_214    \xF0\x96\xAB[\xB0-\xB4]
+COMB_MARK_RNG_215    \xF0\x96\xAC[\xB0-\xB6]
+COMB_MARK_RNG_216    \xF0\x96\xBD\x8F
+COMB_MARK_RNG_217    \xF0\x96(\xBD[\x91-\xBF]|\xBE[\x80-\x92])
+COMB_MARK_RNG_218    \xF0\x96\xBF[\xA4-\xB1]
+COMB_MARK_RNG_219    \xF0\x9B\xB2[\x9D-\x9E]
+COMB_MARK_RNG_220    \xF0\x9D\x85[\xA5-\xA9]
+COMB_MARK_RNG_221    \xF0\x9D\x85[\xAD-\xB2]
+COMB_MARK_RNG_222    \xF0\x9D(\x85[\xBB-\xBF]|\x86[\x80-\x82])
+COMB_MARK_RNG_223    \xF0\x9D\x86[\x85-\x8B]
+COMB_MARK_RNG_224    \xF0\x9D\x86[\xAA-\xAD]
+COMB_MARK_RNG_225    \xF0\x9D\x89[\x82-\x84]
+COMB_MARK_RNG_226    \xF0\x9D\xA8[\x80-\xB6]
+COMB_MARK_RNG_227    \xF0\x9D(\xA8[\xBB-\xBF]|\xA9[\x80-\xAC])
+COMB_MARK_RNG_228    \xF0\x9D\xA9\xB5
+COMB_MARK_RNG_229    \xF0\x9D\xAA\x84
+COMB_MARK_RNG_230    \xF0(\x9D\xAA[\x9B-\xBF]|\x9E\x80[\x80-\xAA]|\x9D[\xAB-\xBF][\x80-\xBF])
+COMB_MARK_RNG_231    \xF0\x9E\x84[\xB0-\xB6]
+COMB_MARK_RNG_232    \xF0\x9E\x8B[\xAC-\xAF]
+COMB_MARK_RNG_233    \xF0\x9E\xA3[\x90-\x96]
+COMB_MARK_RNG_234    \xF0\x9E\xA5[\x84-\x8A]
+COMB_MARK_RNG_235    \xF3\xA0(\x87[\x80-\xAF]|[\x84-\x86][\x80-\xBF])
+
+COMB_MARK_GROUP_1     {COMB_MARK_RNG_1}|{COMB_MARK_RNG_2}|{COMB_MARK_RNG_3}|{COMB_MARK_RNG_4}|{COMB_MARK_RNG_5}|{COMB_MARK_RNG_6}|{COMB_MARK_RNG_7}|{COMB_MARK_RNG_8}|{COMB_MARK_RNG_9}|{COMB_MARK_RNG_10}
+COMB_MARK_GROUP_2     {COMB_MARK_GROUP_1}|{COMB_MARK_RNG_11}|{COMB_MARK_RNG_12}|{COMB_MARK_RNG_13}|{COMB_MARK_RNG_14}|{COMB_MARK_RNG_15}|{COMB_MARK_RNG_16}|{COMB_MARK_RNG_17}|{COMB_MARK_RNG_18}|{COMB_MARK_RNG_19}
+COMB_MARK_GROUP_3     {COMB_MARK_GROUP_2}|{COMB_MARK_RNG_20}|{COMB_MARK_RNG_21}|{COMB_MARK_RNG_22}|{COMB_MARK_RNG_23}|{COMB_MARK_RNG_24}|{COMB_MARK_RNG_25}|{COMB_MARK_RNG_26}|{COMB_MARK_RNG_27}|{COMB_MARK_RNG_28}
+COMB_MARK_GROUP_4     {COMB_MARK_GROUP_3}|{COMB_MARK_RNG_29}|{COMB_MARK_RNG_30}|{COMB_MARK_RNG_31}|{COMB_MARK_RNG_32}|{COMB_MARK_RNG_33}|{COMB_MARK_RNG_34}|{COMB_MARK_RNG_35}|{COMB_MARK_RNG_36}|{COMB_MARK_RNG_37}
+COMB_MARK_GROUP_5     {COMB_MARK_GROUP_4}|{COMB_MARK_RNG_38}|{COMB_MARK_RNG_39}|{COMB_MARK_RNG_40}|{COMB_MARK_RNG_41}|{COMB_MARK_RNG_42}|{COMB_MARK_RNG_43}|{COMB_MARK_RNG_44}|{COMB_MARK_RNG_45}|{COMB_MARK_RNG_46}
+COMB_MARK_GROUP_6     {COMB_MARK_GROUP_5}|{COMB_MARK_RNG_47}|{COMB_MARK_RNG_48}|{COMB_MARK_RNG_49}|{COMB_MARK_RNG_50}|{COMB_MARK_RNG_51}|{COMB_MARK_RNG_52}|{COMB_MARK_RNG_53}|{COMB_MARK_RNG_54}|{COMB_MARK_RNG_55}
+COMB_MARK_GROUP_7     {COMB_MARK_GROUP_6}|{COMB_MARK_RNG_56}|{COMB_MARK_RNG_57}|{COMB_MARK_RNG_58}|{COMB_MARK_RNG_59}|{COMB_MARK_RNG_60}|{COMB_MARK_RNG_61}|{COMB_MARK_RNG_62}|{COMB_MARK_RNG_63}|{COMB_MARK_RNG_64}
+COMB_MARK_GROUP_8     {COMB_MARK_GROUP_7}|{COMB_MARK_RNG_65}|{COMB_MARK_RNG_66}|{COMB_MARK_RNG_67}|{COMB_MARK_RNG_68}|{COMB_MARK_RNG_69}|{COMB_MARK_RNG_70}|{COMB_MARK_RNG_71}|{COMB_MARK_RNG_72}|{COMB_MARK_RNG_73}
+COMB_MARK_GROUP_9     {COMB_MARK_GROUP_8}|{COMB_MARK_RNG_74}|{COMB_MARK_RNG_75}|{COMB_MARK_RNG_76}|{COMB_MARK_RNG_77}|{COMB_MARK_RNG_78}|{COMB_MARK_RNG_79}|{COMB_MARK_RNG_80}|{COMB_MARK_RNG_81}|{COMB_MARK_RNG_82}
+COMB_MARK_GROUP_10    {COMB_MARK_GROUP_9}|{COMB_MARK_RNG_83}|{COMB_MARK_RNG_84}|{COMB_MARK_RNG_85}|{COMB_MARK_RNG_86}|{COMB_MARK_RNG_87}|{COMB_MARK_RNG_88}|{COMB_MARK_RNG_89}|{COMB_MARK_RNG_90}|{COMB_MARK_RNG_91}
+COMB_MARK_GROUP_11    {COMB_MARK_GROUP_10}|{COMB_MARK_RNG_92}|{COMB_MARK_RNG_93}|{COMB_MARK_RNG_94}|{COMB_MARK_RNG_95}|{COMB_MARK_RNG_96}|{COMB_MARK_RNG_97}|{COMB_MARK_RNG_98}|{COMB_MARK_RNG_99}|{COMB_MARK_RNG_100}
+COMB_MARK_GROUP_12    {COMB_MARK_GROUP_11}|{COMB_MARK_RNG_101}|{COMB_MARK_RNG_102}|{COMB_MARK_RNG_103}|{COMB_MARK_RNG_104}|{COMB_MARK_RNG_105}|{COMB_MARK_RNG_106}|{COMB_MARK_RNG_107}|{COMB_MARK_RNG_108}|{COMB_MARK_RNG_109}
+COMB_MARK_GROUP_13    {COMB_MARK_GROUP_12}|{COMB_MARK_RNG_110}|{COMB_MARK_RNG_111}|{COMB_MARK_RNG_112}|{COMB_MARK_RNG_113}|{COMB_MARK_RNG_114}|{COMB_MARK_RNG_115}|{COMB_MARK_RNG_116}|{COMB_MARK_RNG_117}|{COMB_MARK_RNG_118}
+COMB_MARK_GROUP_14    {COMB_MARK_GROUP_13}|{COMB_MARK_RNG_119}|{COMB_MARK_RNG_120}|{COMB_MARK_RNG_121}|{COMB_MARK_RNG_122}|{COMB_MARK_RNG_123}|{COMB_MARK_RNG_124}|{COMB_MARK_RNG_125}|{COMB_MARK_RNG_126}|{COMB_MARK_RNG_127}
+COMB_MARK_GROUP_15    {COMB_MARK_GROUP_14}|{COMB_MARK_RNG_128}|{COMB_MARK_RNG_129}|{COMB_MARK_RNG_130}|{COMB_MARK_RNG_131}|{COMB_MARK_RNG_132}|{COMB_MARK_RNG_133}|{COMB_MARK_RNG_134}|{COMB_MARK_RNG_135}|{COMB_MARK_RNG_136}
+COMB_MARK_GROUP_16    {COMB_MARK_GROUP_15}|{COMB_MARK_RNG_137}|{COMB_MARK_RNG_138}|{COMB_MARK_RNG_139}|{COMB_MARK_RNG_140}|{COMB_MARK_RNG_141}|{COMB_MARK_RNG_142}|{COMB_MARK_RNG_143}|{COMB_MARK_RNG_144}|{COMB_MARK_RNG_145}
+COMB_MARK_GROUP_17    {COMB_MARK_GROUP_16}|{COMB_MARK_RNG_146}|{COMB_MARK_RNG_147}|{COMB_MARK_RNG_148}|{COMB_MARK_RNG_149}|{COMB_MARK_RNG_150}|{COMB_MARK_RNG_151}|{COMB_MARK_RNG_152}|{COMB_MARK_RNG_153}|{COMB_MARK_RNG_154}
+COMB_MARK_GROUP_18    {COMB_MARK_GROUP_17}|{COMB_MARK_RNG_155}|{COMB_MARK_RNG_156}|{COMB_MARK_RNG_157}|{COMB_MARK_RNG_158}|{COMB_MARK_RNG_159}|{COMB_MARK_RNG_160}|{COMB_MARK_RNG_161}|{COMB_MARK_RNG_162}|{COMB_MARK_RNG_163}
+COMB_MARK_GROUP_19    {COMB_MARK_GROUP_18}|{COMB_MARK_RNG_164}|{COMB_MARK_RNG_165}|{COMB_MARK_RNG_166}|{COMB_MARK_RNG_167}|{COMB_MARK_RNG_168}|{COMB_MARK_RNG_169}|{COMB_MARK_RNG_170}|{COMB_MARK_RNG_171}|{COMB_MARK_RNG_172}
+COMB_MARK_GROUP_20    {COMB_MARK_GROUP_19}|{COMB_MARK_RNG_173}|{COMB_MARK_RNG_174}|{COMB_MARK_RNG_175}|{COMB_MARK_RNG_176}|{COMB_MARK_RNG_177}|{COMB_MARK_RNG_178}|{COMB_MARK_RNG_179}|{COMB_MARK_RNG_180}|{COMB_MARK_RNG_181}
+COMB_MARK_GROUP_21    {COMB_MARK_GROUP_20}|{COMB_MARK_RNG_182}|{COMB_MARK_RNG_183}|{COMB_MARK_RNG_184}|{COMB_MARK_RNG_185}|{COMB_MARK_RNG_186}|{COMB_MARK_RNG_187}|{COMB_MARK_RNG_188}|{COMB_MARK_RNG_189}|{COMB_MARK_RNG_190}
+COMB_MARK_GROUP_22    {COMB_MARK_GROUP_21}|{COMB_MARK_RNG_191}|{COMB_MARK_RNG_192}|{COMB_MARK_RNG_193}|{COMB_MARK_RNG_194}|{COMB_MARK_RNG_195}|{COMB_MARK_RNG_196}|{COMB_MARK_RNG_197}|{COMB_MARK_RNG_198}|{COMB_MARK_RNG_199}
+COMB_MARK_GROUP_23    {COMB_MARK_GROUP_22}|{COMB_MARK_RNG_200}|{COMB_MARK_RNG_201}|{COMB_MARK_RNG_202}|{COMB_MARK_RNG_203}|{COMB_MARK_RNG_204}|{COMB_MARK_RNG_205}|{COMB_MARK_RNG_206}|{COMB_MARK_RNG_207}|{COMB_MARK_RNG_208}
+COMB_MARK_GROUP_24    {COMB_MARK_GROUP_23}|{COMB_MARK_RNG_209}|{COMB_MARK_RNG_210}|{COMB_MARK_RNG_211}|{COMB_MARK_RNG_212}|{COMB_MARK_RNG_213}|{COMB_MARK_RNG_214}|{COMB_MARK_RNG_215}|{COMB_MARK_RNG_216}|{COMB_MARK_RNG_217}
+COMB_MARK_GROUP_25    {COMB_MARK_GROUP_24}|{COMB_MARK_RNG_218}|{COMB_MARK_RNG_219}|{COMB_MARK_RNG_220}|{COMB_MARK_RNG_221}|{COMB_MARK_RNG_222}|{COMB_MARK_RNG_223}|{COMB_MARK_RNG_224}|{COMB_MARK_RNG_225}|{COMB_MARK_RNG_226}
+COMB_MARK_GROUP_26    {COMB_MARK_GROUP_25}|{COMB_MARK_RNG_227}|{COMB_MARK_RNG_228}|{COMB_MARK_RNG_229}|{COMB_MARK_RNG_230}|{COMB_MARK_RNG_231}|{COMB_MARK_RNG_232}|{COMB_MARK_RNG_233}|{COMB_MARK_RNG_234}|{COMB_MARK_RNG_235}
+
+/* COMB_MARK_GROUP_n is cumulative: every group starts with {COMB_MARK_GROUP_(n-1)}, */
+/* so the last group of a span already covers all the earlier ones. Referencing only */
+/* that group matches the same set without repeating each range in the expansion */
+COMB_MARK_G_GROUP_1    {COMB_MARK_GROUP_10}
+COMB_MARK_G_GROUP_2    {COMB_MARK_GROUP_19}
+COMB_MARK_G_GROUP_3    {COMB_MARK_GROUP_26}
+
+UNICODE_COMBINING_MARK    {COMB_MARK_G_GROUP_3}
+
+/* Unicode connector punctuation ranges (category Pc) */
+/* generated with unicode_range_generator.l */
+/* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
+CONNECTOR_PUNCT_RNG_1    _
+CONNECTOR_PUNCT_RNG_2    \xE2(\x80\xBF|\x81\x80)
+CONNECTOR_PUNCT_RNG_3    \xE2\x81\x94
+CONNECTOR_PUNCT_RNG_4    \xEF\xB8[\xB3-\xB4]
+CONNECTOR_PUNCT_RNG_5    \xEF\xB9[\x8D-\x8F]
+CONNECTOR_PUNCT_RNG_6    \xEF\xBC\xBF
+
+UNICODE_CONNECTOR_PUNCTUATION    {CONNECTOR_PUNCT_RNG_1}|{CONNECTOR_PUNCT_RNG_2}|{CONNECTOR_PUNCT_RNG_3}|{CONNECTOR_PUNCT_RNG_4}|{CONNECTOR_PUNCT_RNG_5}|{CONNECTOR_PUNCT_RNG_6}
+
+UNICODE_ZWNJ    \xE2\x80\x8C
+UNICODE_ZWJ     \xE2\x80\x8D
+
+/* Unicode escape sequence */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8.4 (escape sequence) */
+UNICODE_ESCAPE_SEQUENCE    \\u[0-9a-fA-F]{4}
+
+/* identifiers */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6 */
+/* IDENTIFIER_PART is already a (possibly empty) repetition, so an identifier is one */
+/* start character followed by it; repeating the pair again would only add the empty */
+/* string to the set of matches */
+IDENTIFIER_START    [_$]|({UNICODE_LETTER})|{UNICODE_ESCAPE_SEQUENCE}
+IDENTIFIER_PART     (({IDENTIFIER_START})|({UNICODE_COMBINING_MARK})|({UNICODE_DIGIT})|({UNICODE_CONNECTOR_PUNCTUATION})|{UNICODE_ZWNJ}|{UNICODE_ZWJ})*
+IDENTIFIER          {IDENTIFIER_START}{IDENTIFIER_PART}
+
+/* literals */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-7.8 */
+LITERAL_NULL                  null
+LITERAL_BOOLEAN               true|false
+LITERAL_DECIMAL               [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]*
+LITERAL_HEX_INTEGER           0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
+LITERAL_DOUBLE_STRING_BEGIN   \"
+LITERAL_SINGLE_STRING_BEGIN   \'
+LITERAL_REGULAR_EXPRESSION    \/[^*\/]
+/* extra literals */
+/* according to https://ecma-international.org/ecma-262/5.1/#sec-4.3 */
+LITERAL_UNDEFINED             undefined
+LITERAL_INFINITY              Infinity|\xE2\x88\x9E
+LITERAL_NAN                   NaN
+LITERAL                       {LITERAL_NULL}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
+
+HTML_COMMENT_OPEN    <!--
+TAG_SCRIPT_OPEN      (?i:<script)
+TAG_SCRIPT_CLOSE     (?i:<\/script>)
+
+/* from 0x000 to 0x10FFFD to match undefined tokens */
+/* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
+ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x80-\xBF])[\x80-\xBF]|\xF4\x8F\xBF[\x80-\xBD]|(\xF4\x8F[\x80-\xBE]|(\xF0[\x90-\xBF]|\xF4[\x80-\x8E]|[\xF1-\xF3][\x80-\xBF])[\x80-\xBF])[\x80-\xBF]
+
+/* match regex literal only if the previous token was of type PUNCTUATOR_3 or KEYWORD */
+/* this resolves an ambiguity with a division operator: var x = 2/2/1; */
+%x regex
+
+/* do not match division operators as punctuators if the previous token was of type PUNCTUATOR */
+/* this resolves an ambiguity with regular expression in some cases such as (/=abc=/g) */
+%x div_op
+
+%%
+    /* Scanner protocol:                                                          */
+    /*  - an action returns 1 as soon as a token cannot be emitted, after         */
+    /*    update_ptr() has recorded the source position                           */
+    /*  - a closing script tag and the real end of input return 0                 */
+    /*  - the start condition carries one token of left context: <regex> is       */
+    /*    entered after line terminators, keywords and punctuators, <div_op>      */
+    /*    after closing braces, operators, literals and identifiers               */
+<*>{WHITESPACES}                                        { /* skip */ }
+<*>{CHAR_ESCAPE_SEQUENCES}                              { /* skip */ }
+<*>{LINE_TERMINATORS}                                   { BEGIN(regex); }
+<*>{TAG_SCRIPT_OPEN}                                    { if ( !eval(TAG_SCRIPT_OPEN, YYText()) ) { update_ptr(); return 1; } }
+<*>{TAG_SCRIPT_CLOSE}                                   { update_ptr(); *ptr -= YYLeng(); return 0; }
+<*>{HTML_COMMENT_OPEN}                                  { skip_single_line_comment(); }
+<*>{SINGLE_LINE_COMMENT}                                { skip_single_line_comment(); }
+<*>{MULTI_LINE_COMMENT}                                 { skip_multi_line_comment(); }
+<*>{USE_STRICT_DIRECTIVE}                               { if ( !eval(DIRECTIVE, YYText()) ) { update_ptr(); return 1; } }
+<*>{KEYWORD}                                            { if ( !eval(KEYWORD, YYText()) ) { update_ptr(); return 1; } BEGIN(regex); }
+<*>{CLOSING_BRACES}                                     { if ( !eval(PUNCTUATOR, YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
+<div_op>{DIV_OPERATOR}|{DIV_ASSIGNMENT_OPERATOR}        { if ( !eval(PUNCTUATOR, YYText()) ) { update_ptr(); return 1; } }
+<*>{PUNCTUATOR}                                         { if ( !eval(PUNCTUATOR, YYText()) ) { update_ptr(); return 1; } BEGIN(regex); }
+<*>{OPERATOR}                                           { if ( !eval(OPERATOR, YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
+<*>{LITERAL}                                            { if ( !eval(LITERAL, YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
+<*>{LITERAL_DOUBLE_STRING_BEGIN}                        { if ( !eval_string_literal(YYText(), '"') ) { update_ptr(); return 1; } BEGIN(div_op); }
+<*>{LITERAL_SINGLE_STRING_BEGIN}                        { if ( !eval_string_literal(YYText(), '\'') ) { update_ptr(); return 1; } BEGIN(div_op); }
+<regex>{LITERAL_REGULAR_EXPRESSION}                     { if ( !eval_regex_literal(YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
+<*>{IDENTIFIER}                                         { if ( !eval_identifier(YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
+<*>.|{ALL_UNICODE}                                      { if ( !eval(UNDEFINED, YYText()) ) { update_ptr(); return 1; } }
+<<EOF>>                                                 { if ( eval_eof() ) { update_ptr(); return 0; } }
+%%
+
+#include <cassert>
+
+// static helper functions
+
+static std::string unicode_to_utf8(const unsigned int code)
+{
+    std::string res;
+
+    if ( code <= 0x7f )
+        res += (char)code;
+    else if ( code <= 0x7ff )
+    {
+        res += ( 0xc0 | (code >> 6) );
+        res += ( 0x80 | (code & 0x3f) );
+    }
+    else if ( code <= 0xffff )
+    {
+        res += ( 0xe0 | (code >> 12) );
+        res += ( 0x80 | ((code >> 6) & 0x3f) );
+        res += ( 0x80 | (code & 0x3f) );
+    }
+
+    return res;
+}
+
+static std::string unescape_unicode(const char* lexeme)
+{
+    assert(lexeme);
+
+    std::string lex = lexeme;
+    std::string res;
+
+    bool is_unescape = false;
+    bool is_unicode = false;
+    short digits_left = 4;
+    std::string unicode_str;
+
+    for ( const auto& ch : lex )
+    {
+        if ( ch == '\\' )
+        {
+            is_unescape = true;
+            continue;
+        }
+
+        if ( is_unescape )
+        {
+            if ( ch == 'u' )
+            {
+                is_unicode = true;
+                continue;
+            }
+            is_unescape = false;
+        }
+
+        if ( is_unicode )
+        {
+            unicode_str += ch;
+            if ( !(--digits_left) )
+            {
+                const unsigned int unicode = std::stoi(unicode_str, nullptr, 16);
+                res += unicode_to_utf8(unicode);
+
+                unicode_str = "";
+                digits_left = 4;
+                is_unicode = false;
+            }
+            continue;
+        }
+
+        res += ch;
+    }
+
+    return res;
+}
+
+// JSTokenizer members
+
+// Flex buffer handles used while an unescaped identifier is being re-scanned:
+//   initial  - the buffer that was current when switch_to_temporal() was called
+//   temporal - the buffer created over the temporal stream, nullptr when not in use
+struct JSTokenizer::ScanBuffers
+{
+    YY_BUFFER_STATE initial = nullptr;
+    YY_BUFFER_STATE temporal = nullptr;
+};
+
+// Binds the scanner to its in/out streams and to the caller-owned output state:
+//   dstbuf, dstlen - destination buffer and its capacity, see write_output()
+//   ptr            - caller's source position, advanced by update_ptr()
+//   bytes_copied   - running count of bytes written, must not be null;
+//                    it is reset to zero by init()
+JSTokenizer::JSTokenizer(std::stringstream& in, std::stringstream& out, char* dstbuf,
+    uint16_t dstlen, const char** ptr, int* bytes_copied)
+    : yyFlexLexer(in, out),
+      dstbuf(dstbuf),
+      dstlen(dstlen),
+      ptr(ptr),
+      bytes_copied(bytes_copied)
+{
+    assert(bytes_copied);
+    init();
+}
+
+// Releases the scan buffer bookkeeping.
+// While a temporal buffer is active it is the scanner's current buffer and the
+// saved initial buffer is referenced only from `buffers`. Free it here so it does
+// not leak when the tokenizer is destroyed in the middle of a re-scan, e.g. after
+// an action returned early because the output could not be written.
+// The current buffer is left to the yyFlexLexer destructor.
+JSTokenizer::~JSTokenizer()
+{
+    if ( buffers->temporal )
+        yy_delete_buffer(buffers->initial);
+
+    delete buffers;
+}
+
+// One-time setup called from the constructor: allocates the scan buffer
+// bookkeeping, zeroes the caller's byte counter and selects the start condition.
+void JSTokenizer::init()
+{
+    buffers = new ScanBuffers;
+    *bytes_copied = 0;
+
+    // since regular expression may occur at the beginning of the input
+    BEGIN(regex);
+}
+
+// Redirects scanning to `data`: loads it into the temporal stream, remembers the
+// buffer that is current right now and makes a new flex buffer over that stream
+// the current one. eval_eof() undoes this through switch_to_initial().
+// NOTE(review): the saved `initial` handle is overwritten on every call, so this
+// looks safe only while no temporal buffer is already active - confirm at call sites.
+void JSTokenizer::switch_to_temporal(const std::string& data)
+{
+    temporal.str(data);
+    buffers->initial = YY_CURRENT_BUFFER;
+    buffers->temporal = yy_create_buffer(temporal, data.size());
+    yy_switch_to_buffer(buffers->temporal);
+}
+
+// Drops the temporal buffer and resumes scanning from the buffer saved by
+// switch_to_temporal(). Clearing `temporal` marks that no re-scan is in progress.
+void JSTokenizer::switch_to_initial()
+{
+    yy_delete_buffer(buffers->temporal);
+    yy_switch_to_buffer(buffers->initial);
+    buffers->temporal = nullptr;
+}
+
+// Handles an identifier lexeme.
+// An identifier with escaped Unicode is not emitted directly: it is unescaped and
+// matched again in a temporal scan buffer, so the decoded text is tokenized like
+// ordinary input. Returns true when such a re-scan was scheduled, otherwise the
+// result of eval().
+bool JSTokenizer::eval_identifier(const char* lexeme)
+{
+    // Unescape only while scanning the original input. Decoded text can contain
+    // an escape again (an escaped backslash, U+005C, followed by 'u' and hex
+    // digits); switching buffers a second time would overwrite the saved initial
+    // buffer and the rest of the input would never be scanned. Text met during
+    // a re-scan is therefore emitted as is.
+    if ( !buffers->temporal and strstr(lexeme, "\\u") )
+    {
+        const std::string unescaped_lex = unescape_unicode(lexeme);
+        switch_to_temporal(unescaped_lex);
+        return true;
+    }
+
+    return eval(IDENTIFIER, lexeme);
+}
+
+// Reads the rest of a quoted string from the input and emits it: as a LITERAL
+// when parse_literal() accepts it, as UNDEFINED otherwise.
+bool JSTokenizer::eval_string_literal(const char* match_prefix, const char quotes)
+{
+    std::string literal;
+
+    if ( parse_literal(match_prefix, quotes, literal) )
+        return eval(LITERAL, literal.c_str());
+
+    return eval(UNDEFINED, literal.c_str());
+}
+
+bool JSTokenizer::eval_regex_literal(const char* match_prefix)
+{
+    static const std::string regex_flags = "gimsuy";
+
+    std::string s;
+    bool is_ok = parse_literal(match_prefix, '/', s, true);
+
+    // append regex flags
+    char c;
+    while ( (c = yyinput()) != 0 )
+    {
+        if ( regex_flags.find(c) != std::string::npos )
+            s += c;
+        else
+        {
+            unput(c);
+            break;
+        }
+    }
+
+    return eval(is_ok ? LITERAL : UNDEFINED, s.c_str());
+}
+
+// Decides what the <<EOF>> handler should do; call it from that handler only.
+// Returns true when the scanner has to terminate, false when scanning continues.
+// The caller uses the result to decide whether to end the scan.
+bool JSTokenizer::eval_eof()
+{
+    // Normal termination: no temporal scan buffer is in use
+    if ( !buffers->temporal )
+        return true;
+
+    // Only the temporal scan buffer reached EOF: clean it up and
+    // continue with the initial one
+    switch_to_initial();
+    return false;
+}
+
+// Consumes input up to and including the next '\n', or until yyinput() reports 0.
+void JSTokenizer::skip_single_line_comment()
+{
+    for ( char c = yyinput(); c != 0 and c != '\n'; c = yyinput() )
+        ;
+}
+
+void JSTokenizer::skip_multi_line_comment()
+{
+    char c;
+
+    while ( (c = yyinput()) != 0 )
+    {
+        if ( c == '*' )
+        {
+            if ( (c = yyinput()) == '/' )
+                break;
+            else
+                unput(c);
+        }
+    }
+}
+
+// Unicode line terminators
+#define LS "\u2028"
+#define PS "\u2029"
+
+// This method delineates and validates literals from the input stream such as:
+//   1. double quotes string literal
+//   2. single quotes string literal
+//   3. regex literal
+// Call this method when lexer meets those literals
+// match_prefix is a lexeme part already matched by the lexer (with sentinel char)
+bool JSTokenizer::parse_literal(const std::string& match_prefix, const char sentinel_ch,
+    std::string& result, bool is_regex)
+{
+    bool is_ok = true;
+    char c;
+    short n = 0;
+
+    for ( auto it = match_prefix.crbegin(); it != match_prefix.crend(); ++it )
+        unput(*it);
+
+    result += yyinput();
+    while ( (c = yyinput()) != 0 )
+    {
+        result += c;
+
+        if ( c == sentinel_ch and !( n % 2 ) )
+            break;
+        else if ( c == '\\' )
+        {
+            ++n;
+            continue;
+        }
+        else if ( c == '\r' )
+        {
+            if ( is_regex )
+            {
+                is_ok = false;
+                result = result.substr(0, result.size() - n);
+            }
+            else if ( n == 0 )
+                is_ok = false;
+            else if ( ( (c = yyinput()) != 0 ) and c == '\n' )
+            {
+                result = result.substr(0, result.size() - 2);
+                continue;
+            }
+            else
+            {
+                is_ok = false;
+                unput(c);
+            }
+
+            break;
+        }
+        else if ( c == '\n' )
+        {
+            if ( is_regex )
+            {
+                is_ok = false;
+                result = result.substr(0, result.size() - n);
+            }
+            else if ( n == 0 )
+                is_ok = false;
+            else
+            {
+                result = result.substr(0, result.size() - 2);
+                continue;
+            }
+
+            break;
+        }
+
+        n = 0;
+    }
+
+    if ( !is_ok )
+    {
+        result.back() = sentinel_ch;
+        return is_ok;
+    }
+
+    if ( result.find(LS) != std::string::npos or result.find(PS) != std::string::npos )
+        is_ok = false;
+
+    return is_ok;
+}
+
+bool JSTokenizer::eval(const JSToken tok, const char* lexeme)
+{
+    bool ret = false;
+
+    switch( tok )
+    {
+    case IDENTIFIER:
+        ret = normalize_identifier(prev_tok, lexeme);
+    break;
+
+    case KEYWORD:
+        ret = normalize_lexeme(prev_tok, lexeme);
+    break;
+
+    case PUNCTUATOR:
+        ret = normalize_punctuator(prev_tok, lexeme);
+    break;
+
+    case OPERATOR:
+        ret = normalize_operator(prev_tok, lexeme);
+    break;
+
+    case LITERAL:
+        ret = normalize_lexeme(prev_tok, lexeme);
+    break;
+
+    case DIRECTIVE:
+        ret = normalize_directive(prev_tok, lexeme);
+    break;
+
+    case TAG_SCRIPT_OPEN:
+        ret = normalize_tag_script_open(prev_tok, lexeme);
+    break;
+
+    case UNDEFINED:
+        ret = normalize_undefined(prev_tok, lexeme);
+    break;
+    }
+
+    prev_tok = tok;
+
+    // set a default pattern match start condition
+    if ( yy_start != INITIAL )
+        BEGIN(INITIAL);
+
+    return ret;
+}
+
+// Identifiers follow the generic spacing rule of normalize_lexeme().
+bool JSTokenizer::normalize_identifier(const JSToken prev_tok, const char* lexeme)
+{
+    return normalize_lexeme(prev_tok, lexeme);
+}
+
+// Punctuators are written as they are, whatever the previous token was.
+bool JSTokenizer::normalize_punctuator(const JSToken, const char* lexeme)
+{
+    return write_output(lexeme);
+}
+
+bool JSTokenizer::normalize_operator(const JSToken prev_tok, const char* lexeme)
+{
+    switch( prev_tok )
+    {
+    case IDENTIFIER:
+    case KEYWORD:
+    case PUNCTUATOR:
+    case LITERAL:
+    case DIRECTIVE:
+    case TAG_SCRIPT_OPEN:
+    case UNDEFINED:
+        return write_output(lexeme);
+    break;
+
+    case OPERATOR:
+        return write_output(" " + std::string(lexeme));
+    break;
+    }
+
+    return false;
+}
+
+// Writes a directive, appending a semicolon when the lexeme has none.
+bool JSTokenizer::normalize_directive(const JSToken prev_tok, const char* lexeme)
+{
+    std::string directive(lexeme);
+
+    if ( directive.find(';') == std::string::npos )
+        directive.push_back(';');
+
+    return normalize_lexeme(prev_tok, directive.c_str());
+}
+
+// An opening script tag found inside the script is written as it is.
+bool JSTokenizer::normalize_tag_script_open(const JSToken, const char* lexeme)
+{
+    // FIXIT-L add builtin alert here
+    return write_output(lexeme);
+}
+
+// Text that matched no token pattern is written as it is.
+bool JSTokenizer::normalize_undefined(const JSToken, const char* lexeme)
+{ return write_output(lexeme); }
+
+bool JSTokenizer::normalize_lexeme(const JSToken prev_tok, const char* lexeme)
+{
+    switch( prev_tok )
+    {
+    case PUNCTUATOR:
+    case OPERATOR:
+    case DIRECTIVE:
+    case UNDEFINED:
+        return write_output(lexeme);
+    break;
+
+    case IDENTIFIER:
+    case KEYWORD:
+    case LITERAL:
+    case TAG_SCRIPT_OPEN:
+        return write_output(" " + std::string(lexeme));
+    break;
+    }
+
+    return false;
+}
+
+// Appends str to the destination buffer and advances the write position.
+// Returns false, writing nothing, when the new total would be negative or
+// larger than dstlen.
+bool JSTokenizer::write_output(const std::string& str)
+{
+    const size_t len = str.size();
+    const int total = *bytes_copied + len;
+
+    if ( total < 0 or total > dstlen )
+        return false;
+
+    memcpy((char*) dstbuf, str.c_str(), len);
+
+    dstbuf += len;
+    *bytes_copied = total;
+    return true;
+}
+
+// Advances the caller's source pointer by the current read position of yyin.
+// NOTE(review): tellg() reports how far the stream has been read, which may differ
+// from the end of the last matched token if the scanner reads ahead - confirm
+// against how the caller sets up the input stream.
+void JSTokenizer::update_ptr()
+{ *ptr += yyin.tellg(); }
+
index 9a9eb166e6010e493cfdc2abd6128a5a66f300e2..ca5bf363794d0f9aa4ade4696d685c101edb08e4 100644 (file)
@@ -5,3 +5,14 @@ add_cpputest( boyer_moore_test
 
 add_cpputest( memcap_allocator_test )
 
+# Generate the JavaScript tokenizer from the flex grammar one directory up.
+# -Ca makes flex emit aligned scanner tables (larger tables, faster access).
+FLEX_TARGET ( js_tokenizer ${CMAKE_CURRENT_SOURCE_DIR}/../js_tokenizer.l
+    ${CMAKE_CURRENT_BINARY_DIR}/../js_tokenizer.cc
+    COMPILE_FLAGS -Ca
+)
+
+# The test is built from the generated scanner and the normalizer source.
+add_catch_test( js_normalizer_test
+    SOURCES
+        ${FLEX_js_tokenizer_OUTPUTS}
+        ../js_normalizer.cc
+)
+
diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc
new file mode 100644 (file)
index 0000000..117660f
--- /dev/null
@@ -0,0 +1,882 @@
+//--------------------------------------------------------------------------
+// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved.
+//
+// This program is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License Version 2 as published
+// by the Free Software Foundation.  You may not use, modify or distribute
+// this program under any other version of the GNU General Public License.
+//
+// This program is distributed in the hope that it will be useful, but
+// WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+// General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this program; if not, write to the Free Software Foundation, Inc.,
+// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+//--------------------------------------------------------------------------
+// js_normalizer_test.cc author Oleksandr Serhiienko <oserhiie@cisco.com>
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "catch/catch.hpp"
+
+#include <cstring>
+
+#include "utils/js_normalizer.h"
+
+namespace snort
+{
+// Mock for JSTokenizer
+// Stands in for the real snort::FatalError so that the test builds without the
+// rest of snort; it terminates the test process with a failure status.
+// NOTE(review): presumably referenced from the generated scanner - confirm.
+[[noreturn]] void FatalError(const char*, ...)
+{ exit(EXIT_FAILURE); }
+}
+
+using namespace snort;
+
+// Normalization depth passed to JSNormalizer::normalize by NORMALIZE
+#define NORM_DEPTH 65535
+
+// Runs the normalizer over the whole srcbuf array, its terminating NUL included.
+// Declares dstbuf (sized like expected), bytes_copied, ptr, norm_depth and ret in
+// the enclosing scope, so it can be used only once per scope and both arguments
+// must be arrays, not pointers.
+#define NORMALIZE(srcbuf, expected)                                        \
+    char dstbuf[sizeof(expected)];                                         \
+    int bytes_copied;                                                      \
+    const char* ptr = srcbuf;                                              \
+    int norm_depth = NORM_DEPTH;                                           \
+    int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf),              \
+        dstbuf, sizeof(dstbuf), &ptr, &bytes_copied, norm_depth);
+
+// Checks the variables left by NORMALIZE: success status, the source consumed up
+// to its end, and the output equal to expected (without its terminating NUL).
+#define VALIDATE(srcbuf, expected)                    \
+    CHECK(ret == 0);                                  \
+    CHECK((ptr - srcbuf) == sizeof(srcbuf));          \
+    CHECK(bytes_copied == sizeof(expected) - 1);      \
+    CHECK(!memcmp(dstbuf, expected, bytes_copied));
+
+// ClamAV test cases
+static const char clamav_buf0[] =
+    "function foo(a, b) {\n"
+    "var x = 1.9e2*2*a/ 4.;\n"
+    "var y = 'test\\'tst';//var\n"
+    "x=b[5],/* multiline\nvar z=6;\nsome*some/other**/"
+    "z=x/y;/* multiline oneline */var t=z/a;\n"
+    "z=[test,testi];"
+    "document.writeln('something\\n');}";
+
+static const char clamav_expected0[] =
+    "function foo(a,b){var x=1.9e2*2*a/4.;var y='test\\'tst';x=b[5],z=x/y;var t=z/a;"
+    "z=[test,testi];document.writeln('something\\n');}";
+
+static const char clamav_buf1[] =
+    "function () { var id\\u1234tx;}";
+
+static const char clamav_expected1[] =
+    "function(){var id\u1234tx;}";
+
+static const char clamav_buf2[] =
+    "function () { var tst=\"a\"+'bc'+     'd'; }";
+
+static const char clamav_expected2[] =
+    "function(){var tst=\"a\"+'bc'+'d';}";
+
+static const char clamav_buf3[] =
+    "dF('bmfsu%2639%2638x11u%2638%263%3A%264C1');";
+
+static const char clamav_expected3[] =
+    "dF('bmfsu%2639%2638x11u%2638%263%3A%264C1');";
+
+#define B64 "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb"
+
+// Input of clamav test_case_4; read-only like the other fixtures
+static const char clamav_buf4[] =
+    "qbphzrag.jevgr(harfpncr('%3P%73%63%72%69%70%74%20%6P%61%6R%67%75%61%67%65%3Q%22%6N%61%76%61"
+        "%73%63%72%69%70%74%22%3R%66%75%6R%63%74%69%6S%6R%20%64%46%28%73%29%7O%76%61%72%20%73%31"
+        "%3Q%75%6R%65%73%63%61%70%65%28%73%2R%73%75%62%73%74%72%28%30%2P%73%2R%6P%65%6R%67%74%68"
+        "%2Q%31%29%29%3O%20%76%61%72%20%74%3Q%27%27%3O%66%6S%72%28%69%3Q%30%3O%69%3P%73%31%2R%6P"
+        "%65%6R%67%74%68%3O%69%2O%2O%29%74%2O%3Q%53%74%72%69%6R%67%2R%66%72%6S%6Q%43%68%61%72%43"
+        "%6S%64%65%28%73%31%2R%63%68%61%72%43%6S%64%65%41%74%28%69%29%2Q%73%2R%73%75%62%73%74%72"
+        "%28%73%2R%6P%65%6R%67%74%68%2Q%31%2P%31%29%29%3O%64%6S%63%75%6Q%65%6R%74%2R%77%72%69%74"
+        "%65%28%75%6R%65%73%63%61%70%65%28%74%29%29%3O%7Q%3P%2S%73%63%72%69%70%74%3R'));"
+        "riny(qS('tV%285%3O%285%3Nsdwjl%28585%3N7%28586Q%28585%3N7%3P%7P55l%28585%3N7%3P%28585%3N7"
+        "%28586R%28585%3N8T5%285%3N%285%3P%286R3'));";
+
+// Expected output of clamav test_case_4; read-only like the other fixtures
+static const char clamav_expected4[] =
+    "qbphzrag.jevgr(harfpncr('%3P%73%63%72%69%70%74%20%6P%61%6R%67%75%61%67%65%3Q%22%6N%61%76%61"
+        "%73%63%72%69%70%74%22%3R%66%75%6R%63%74%69%6S%6R%20%64%46%28%73%29%7O%76%61%72%20%73%31"
+        "%3Q%75%6R%65%73%63%61%70%65%28%73%2R%73%75%62%73%74%72%28%30%2P%73%2R%6P%65%6R%67%74%68"
+        "%2Q%31%29%29%3O%20%76%61%72%20%74%3Q%27%27%3O%66%6S%72%28%69%3Q%30%3O%69%3P%73%31%2R%6P"
+        "%65%6R%67%74%68%3O%69%2O%2O%29%74%2O%3Q%53%74%72%69%6R%67%2R%66%72%6S%6Q%43%68%61%72%43"
+        "%6S%64%65%28%73%31%2R%63%68%61%72%43%6S%64%65%41%74%28%69%29%2Q%73%2R%73%75%62%73%74%72"
+        "%28%73%2R%6P%65%6R%67%74%68%2Q%31%2P%31%29%29%3O%64%6S%63%75%6Q%65%6R%74%2R%77%72%69%74"
+        "%65%28%75%6R%65%73%63%61%70%65%28%74%29%29%3O%7Q%3P%2S%73%63%72%69%70%74%3R'));"
+        "riny(qS('tV%285%3O%285%3Nsdwjl%28585%3N7%28586Q%28585%3N7%3P%7P55l%28585%3N7%3P%28585%3N7"
+        "%28586R%28585%3N8T5%285%3N%285%3P%286R3'));";
+
+// Input of clamav test_case_5; read-only like the other fixtures
+static const char clamav_buf5[] =
+    "shapgvba (c,n,p,x,r,e){}('0(\\'1\\');',2,2,'nyreg|j00g'.fcyvg('|'),0,{});";
+
+static const char clamav_expected5[] =
+    "shapgvba(c,n,p,x,r,e){}('0(\\'1\\');',2,2,'nyreg|j00g'.fcyvg('|'),0,{});";
+
+static const char clamav_buf6[] =
+    "function $(p,a,c,k,e,d){} something(); $('0(\\'1\\');',2,2,'alert|w00t'.split('|'),0,{});";
+
+static const char clamav_expected6[] =
+    "function $(p,a,c,k,e,d){}something();$('0(\\'1\\');',2,2,'alert|w00t'.split('|'),0,{});";
+
+static const char clamav_buf7[] =
+    "var z=\"tst" B64 "tst\";";
+
+static const char clamav_expected7[] =
+    "var z=\"tst" B64 "tst\";";
+
+static const char clamav_buf8[] =
+    "var z=\'tst" B64 "tst\';";
+
+static const char clamav_expected8[] =
+    "var z=\'tst" B64 "tst\';";
+
+// Input of clamav test_case_9; read-only like the other fixtures
+static const char clamav_buf9[] =
+    "riny(harfpncr('%61%6p%65%72%74%28%27%74%65%73%74%27%29%3o'));";
+
+static const char clamav_expected9[] =
+    "riny(harfpncr('%61%6p%65%72%74%28%27%74%65%73%74%27%29%3o'));";
+
+static const char clamav_buf10[] =
+    "function $ $() dF(x); function (p,a,c,k,e,r){function $(){}";
+
+static const char clamav_expected10[] =
+    "function $ $()dF(x);function(p,a,c,k,e,r){function $(){}";
+
+static const char clamav_buf11[] =
+    "var x=123456789 ;";
+
+static const char clamav_expected11[] =
+    "var x=123456789;";
+
+static const char clamav_buf12[] =
+    "var x='test\\u0000test';";
+
+static const char clamav_expected12[] =
+    "var x='test\\u0000test';";
+
+static const char clamav_buf13[] =
+    "var x\\s12345";
+
+static const char clamav_expected13[] =
+    "var x\\s12345";
+
+static const char clamav_buf14[] =
+    "document.write(unescape('test%20test";
+
+static const char clamav_expected14[] =
+    "document.write(unescape('test%20test";
+
+// Runs every clamav_bufN fixture through the normalizer and compares the result
+// with the matching clamav_expectedN. Each section needs its own scope because
+// NORMALIZE declares local variables.
+TEST_CASE("clamav tests", "[JSNormalizer]")
+{
+    SECTION("test_case_0")
+    {
+        NORMALIZE(clamav_buf0, clamav_expected0);
+        VALIDATE(clamav_buf0, clamav_expected0);
+    }
+    SECTION("test_case_1")
+    {
+        NORMALIZE(clamav_buf1, clamav_expected1);
+        VALIDATE(clamav_buf1, clamav_expected1);
+    }
+    SECTION("test_case_2")
+    {
+        NORMALIZE(clamav_buf2, clamav_expected2);
+        VALIDATE(clamav_buf2, clamav_expected2);
+    }
+    SECTION("test_case_3")
+    {
+        NORMALIZE(clamav_buf3, clamav_expected3);
+        VALIDATE(clamav_buf3, clamav_expected3);
+    }
+    SECTION("test_case_4")
+    {
+        NORMALIZE(clamav_buf4, clamav_expected4);
+        VALIDATE(clamav_buf4, clamav_expected4);
+    }
+    SECTION("test_case_5")
+    {
+        NORMALIZE(clamav_buf5, clamav_expected5);
+        VALIDATE(clamav_buf5, clamav_expected5);
+    }
+    SECTION("test_case_6")
+    {
+        NORMALIZE(clamav_buf6, clamav_expected6);
+        VALIDATE(clamav_buf6, clamav_expected6);
+    }
+    SECTION("test_case_7")
+    {
+        NORMALIZE(clamav_buf7, clamav_expected7);
+        VALIDATE(clamav_buf7, clamav_expected7);
+    }
+    SECTION("test_case_8")
+    {
+        NORMALIZE(clamav_buf8, clamav_expected8);
+        VALIDATE(clamav_buf8, clamav_expected8);
+    }
+    SECTION("test_case_9")
+    {
+        NORMALIZE(clamav_buf9, clamav_expected9);
+        VALIDATE(clamav_buf9, clamav_expected9);
+    }
+    SECTION("test_case_10")
+    {
+        NORMALIZE(clamav_buf10, clamav_expected10);
+        VALIDATE(clamav_buf10, clamav_expected10);
+    }
+    SECTION("test_case_11")
+    {
+        NORMALIZE(clamav_buf11, clamav_expected11);
+        VALIDATE(clamav_buf11, clamav_expected11);
+    }
+    SECTION("test_case_12")
+    {
+        NORMALIZE(clamav_buf12, clamav_expected12);
+        VALIDATE(clamav_buf12, clamav_expected12);
+    }
+    SECTION("test_case_13")
+    {
+        NORMALIZE(clamav_buf13, clamav_expected13);
+        VALIDATE(clamav_buf13, clamav_expected13);
+    }
+    SECTION("test_case_14")
+    {
+        NORMALIZE(clamav_buf14, clamav_expected14);
+        VALIDATE(clamav_buf14, clamav_expected14);
+    }
+}
+
+// Test cases for all match patterns (each all_patterns_bufN input pairs with all_patterns_expectedN)
+static const char all_patterns_buf0[] = // input bytes: TAB VT FF SP 0xA0 BS LF CR, UTF-8 BOM, U+2028, U+2029
+    "var  \x9\xB\xC\x20\xA0\x8\xA\xD\xEF\xBB\xBF\xE2\x80\xA8\xE2\x80\xA9\n"
+    "  \n\t\r\v  a; \0";
+
+static const char all_patterns_expected0[] = // expected: only the bare declaration remains
+    "var a;";
+
+static const char all_patterns_buf1[] = // input: HTML-open, single-line and multi-line comments
+    "<!-- var html_comment = 'comment' ;\n"
+    "var a = 1;// first var\nvar b = 2;  /* second var\nvar foo = 'bar'\n*/"
+    "\nvar c = 3; // third var";
+
+static const char all_patterns_expected1[] = // expected: all comment text is gone
+    "var a=1;var b=2;var c=3;";
+
+static const char all_patterns_buf2[] = // input: punctuators and operators separated by spaces
+    "{ a } ( a ) [ a ] a >= b a == b a != b a === b a !== b a /= b . ; , "
+    "a < b a > b a <= b a + b- c a * b a % b a ++; --b a << 2 a >> 3 a >>> 4 a & b a | b "
+    "a ^ b ! a a && b a || b ?: a = 2 a += 2 a -= 2 a *= 2 a %= 2 a <<= b a >>= b a >>>= b "
+    "a &= b a|= b a ^= b a/b ~ a";
+
+static const char all_patterns_expected2[] = // expected: a space remains only between adjacent identifier or number tokens
+    "{a}(a)[a]a>=b a==b a!=b a===b a!==b a/=b.;,a<b a>b a<=b a+b-c a*b "
+    "a%b a++;--b a<<2 a>>3 a>>>4 a&b a|b a^b!a a&&b a||b?:a=2 a+=2 a-=2 a*=2 a%=2 a<<=b "
+    "a>>=b a>>>=b a&=b a|=b a^=b a/b~a";
+
+static const char all_patterns_buf3[] = // input: keywords separated by single spaces
+    "break case debugger in import protected do else function try "
+    "implements static instanceof new this class let typeof var with enum private catch "
+    "continue default extends public finally for if super yield return switch throw const "
+    "interface void while delete export package";
+
+static const char all_patterns_expected3[] = // expected: same text as the input
+    "break case debugger in import protected do else function try "
+    "implements static instanceof new this class let typeof var with enum private catch "
+    "continue default extends public finally for if super yield return switch throw const "
+    "interface void while delete export package";
+
+static const char all_patterns_buf4[] = // input: regex, keyword, numeric and string literals
+    "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4. +2 -2 "
+    "+3.3 -3.3 +23 -32 2.3E45 3.E34 -2.3E45 -3.E34 +2.3E45 +3.E34 0x1234 0XFFFF Infinity "
+    "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs "
+    "x=2/2/1";
+
+static const char all_patterns_expected4[] = // expected: only the spaces in front of signed numbers are dropped
+    "/regex/g undefined null true false 2 23 2.3 2.23 .2 .02 4.+2-2"
+    "+3.3-3.3+23-32 2.3E45 3.E34-2.3E45-3.E34+2.3E45+3.E34 0x1234 0XFFFF Infinity "
+    "\xE2\x88\x9E NaN \"\" \"double string\" \"d\" '' 'single string' 's' x=/regex/gs "
+    "x=2/2/1";
+
+static const char all_patterns_buf5[] = // input: identifiers, some non-ASCII, some split by U+2028, U+2029, BOM or the infinity sign
+    "$2abc _2abc abc $__$ 肖晗 XÆA12 \\u0041abc \\u00FBdef \\u1234ghi ab\xE2\x80\xA8ww "
+    "ab\xE2\x80\xA9ww ab\xEF\xBB\xBFww ab∞ww 2abc";
+
+static const char all_patterns_expected5[] = // expected: separators shown as a space; infinity sign and leading digit split off
+    "$2abc _2abc abc $__$ 肖晗 XÆA12 \u0041abc \u00FBdef \u1234ghi ab ww "
+    "ab ww ab ww ab ∞ ww 2 abc";
+
+static const char all_patterns_buf6[] = // input: script-open tags between two statements
+    "var a = 1;\n"
+    "<script>\n"
+    "<script var>\n"
+    "var b = 2 ;\n";
+
+static const char all_patterns_expected6[] = // expected: tags kept in place, line breaks removed
+    "var a=1;<script><script var>var b=2;";
+
+TEST_CASE("all patterns", "[JSNormalizer]") // one SECTION per token class; NORMALIZE and VALIDATE are defined outside this excerpt
+{
+    SECTION("whitespaces and special characters")
+    {
+        NORMALIZE(all_patterns_buf0, all_patterns_expected0);
+        VALIDATE(all_patterns_buf0, all_patterns_expected0);
+    }
+    SECTION("comments")
+    {
+        NORMALIZE(all_patterns_buf1, all_patterns_expected1);
+        VALIDATE(all_patterns_buf1, all_patterns_expected1);
+    }
+    SECTION("directives") // calls JSNormalizer::normalize directly for five inputs
+    {
+        const char srcbuf0[] = "'use strict'\nvar a = 1;"; // single-quoted directive ended by a line break
+        const char srcbuf1[] = "\"use strict\"\nvar a = 1;"; // double-quoted directive ended by a line break
+        const char srcbuf2[] = "'use strict';var a = 1;"; // single-quoted directive ended by a semicolon
+        const char srcbuf3[] = "\"use strict\";var a = 1;"; // double-quoted directive ended by a semicolon
+        const char srcbuf4[] = "var a = 1 'use strict';"; // same string, but not at the start of the script
+        const char expected0[] = "'use strict';var a=1;"; // expected for srcbuf0 and srcbuf2
+        const char expected1[] = "\"use strict\";var a=1;"; // expected for srcbuf1 and srcbuf3
+        const char expected2[] = "var a=1 'use strict';"; // expected for srcbuf4
+        char dstbuf0[sizeof(expected0)];
+        char dstbuf1[sizeof(expected1)];
+        char dstbuf2[sizeof(expected0)]; // srcbuf2 shares expected0
+        char dstbuf3[sizeof(expected1)]; // srcbuf3 shares expected1
+        char dstbuf4[sizeof(expected2)];
+        int bytes_copied0, bytes_copied1, bytes_copied2, bytes_copied3, bytes_copied4;
+        const char* ptr0 = srcbuf0; // each ptrN is passed by address and compared against its source start below
+        const char* ptr1 = srcbuf1;
+        const char* ptr2 = srcbuf2;
+        const char* ptr3 = srcbuf3;
+        const char* ptr4 = srcbuf4;
+        int norm_depth = NORM_DEPTH; // NORM_DEPTH is defined outside this excerpt
+
+        int ret0 = JSNormalizer::normalize(srcbuf0, sizeof(srcbuf0), dstbuf0, sizeof(dstbuf0),
+            &ptr0, &bytes_copied0, norm_depth);
+        int ret1 = JSNormalizer::normalize(srcbuf1, sizeof(srcbuf1), dstbuf1, sizeof(dstbuf1),
+            &ptr1, &bytes_copied1, norm_depth);
+        int ret2 = JSNormalizer::normalize(srcbuf2, sizeof(srcbuf2), dstbuf2, sizeof(dstbuf2),
+            &ptr2, &bytes_copied2, norm_depth);
+        int ret3 = JSNormalizer::normalize(srcbuf3, sizeof(srcbuf3), dstbuf3, sizeof(dstbuf3),
+            &ptr3, &bytes_copied3, norm_depth);
+        int ret4 = JSNormalizer::normalize(srcbuf4, sizeof(srcbuf4), dstbuf4, sizeof(dstbuf4),
+            &ptr4, &bytes_copied4, norm_depth);
+
+        CHECK(ret0 == 0);
+        CHECK((ptr0 - srcbuf0) == sizeof(srcbuf0)); // pointer is expected past the whole input, terminating NUL included
+        CHECK(bytes_copied0 == sizeof(expected0) - 1); // output length excludes the NUL of expected0
+        CHECK(!memcmp(dstbuf0, expected0, bytes_copied0));
+
+        CHECK(ret1 == 0);
+        CHECK((ptr1 - srcbuf1) == sizeof(srcbuf1));
+        CHECK(bytes_copied1 == sizeof(expected1) - 1);
+        CHECK(!memcmp(dstbuf1, expected1, bytes_copied1));
+
+        CHECK(ret2 == 0);
+        CHECK((ptr2 - srcbuf2) == sizeof(srcbuf2));
+        CHECK(bytes_copied2 == sizeof(expected0) - 1);
+        CHECK(!memcmp(dstbuf2, expected0, bytes_copied2));
+
+        CHECK(ret3 == 0);
+        CHECK((ptr3 - srcbuf3) == sizeof(srcbuf3));
+        CHECK(bytes_copied3 == sizeof(expected1) - 1);
+        CHECK(!memcmp(dstbuf3, expected1, bytes_copied3));
+
+        CHECK(ret4 == 0);
+        CHECK((ptr4 - srcbuf4) == sizeof(srcbuf4));
+        CHECK(bytes_copied4 == sizeof(expected2) - 1);
+        CHECK(!memcmp(dstbuf4, expected2, bytes_copied4));
+    }
+    SECTION("punctuators")
+    {
+        NORMALIZE(all_patterns_buf2, all_patterns_expected2);
+        VALIDATE(all_patterns_buf2, all_patterns_expected2);
+    }
+    SECTION("keywords")
+    {
+        NORMALIZE(all_patterns_buf3, all_patterns_expected3);
+        VALIDATE(all_patterns_buf3, all_patterns_expected3);
+    }
+    SECTION("literals")
+    {
+        NORMALIZE(all_patterns_buf4, all_patterns_expected4);
+        VALIDATE(all_patterns_buf4, all_patterns_expected4);
+    }
+    SECTION("identifiers")
+    {
+        NORMALIZE(all_patterns_buf5, all_patterns_expected5);
+        VALIDATE(all_patterns_buf5, all_patterns_expected5);
+    }
+    SECTION("tag script open")
+    {
+        NORMALIZE(all_patterns_buf6, all_patterns_expected6);
+        VALIDATE(all_patterns_buf6, all_patterns_expected6);
+    }
+}
+
+// Tests for different syntax cases (each syntax_cases_bufN input pairs with syntax_cases_expectedN)
+static const char syntax_cases_buf0[] = // input: declarations with string, array, regex, function and object initializers
+    "var a;\n"
+    "var b = \"init this    stuff\";\n"
+    "var c = \"Hi\" + \" \" + \"Joe\";\n"
+    "var d = 1 + 2 + \"3\";\n"
+    "var e = [ 2, 3, 5, 8 ];\n"
+    "var f = false;\n"
+    "var g = /( i'm   a  .* regex )/;\n"
+    "var h = function(){};\n"
+    "const PI = 3.14;\n"
+    "var a = 1, b = 2, c = a + b;\n"
+    "let z = 'zzz zz';\n"
+    "var g = null;\n"
+    "var name = { first: \"Jane\", last: \"Doe\" };\n"
+    "var esc = 'I don\\'t \\n know';\n";
+
+static const char syntax_cases_expected0[] = // expected: spaces inside string and regex literals are preserved
+    "var a;var b=\"init this    stuff\";var c=\"Hi\"+\" \"+\"Joe\";"
+    "var d=1+2+\"3\";var e=[2,3,5,8];var f=false;var g=/( i'm   a  .* regex )/;"
+    "var h=function(){};const PI=3.14;var a=1,b=2,c=a+b;let z='zzz zz';var g=null;"
+    "var name={first:\"Jane\",last:\"Doe\"};var esc='I don\\'t \\n know';";
+
+static const char syntax_cases_buf1[] = // input: arithmetic, increment and decrement operators
+    "a = b + c - d;\n"
+    "a = b * (c / d);\n"
+    "x = 100 % 48;\n"
+    "a ++; b -- ; -- a; ++    b;\n";
+
+static const char syntax_cases_expected1[] =
+    "a=b+c-d;a=b*(c/d);x=100%48;a++;b--;--a;++b;";
+
+static const char syntax_cases_buf2[] = // input: comparison, logical, shift and assignment operators
+    "!(a == b);\n"
+    "a != b;\n"
+    "typeof a;\n"
+    "x << 2; x >> 3;\n"
+    "a = b;\n"
+    "a == b;\n"
+    "a != b;\n"
+    "a === b;\n"
+    "a !== b;\n"
+    "a < b; a > b;\n"
+    "a <= b;  a >= b;\n"
+    "a += b;\n"
+    "a && b;\n"
+    "a || b;\n";
+
+static const char syntax_cases_expected2[] =
+    "!(a==b);a!=b;typeof a;x<<2;x>>3;a=b;a==b;a!=b;a===b;a!==b;a<b;a>b;"
+    "a<=b;a>=b;a+=b;a&&b;a||b;";
+
+static const char syntax_cases_buf3[] = // input: object literal with a method
+    "var foo = {\n"
+        "firstFoo: \"FooFirst\",\n"
+        "secondFoo: \"FooSecond\",\n"
+        "thirdFoo: 10,\n"
+        "fourthFoo: 120,\n"
+        "methodFoo : function () {\n"
+            "\treturn this.firstFoo + \" \" + this.secondFoo;\n"
+        "}\n"
+    "};\n";
+
+static const char syntax_cases_expected3[] =
+    "var foo={firstFoo:\"FooFirst\",secondFoo:\"FooSecond\","
+    "thirdFoo:10,fourthFoo:120,methodFoo:function(){return this.firstFoo+\" \"+"
+    "this.secondFoo;}};";
+
+static const char syntax_cases_buf4[] = // input: array literal, constructor call, indexing and a for loop
+    "var dogs = [\"Bulldog\", \"Beagle\", \"Labrador\"];\n"
+    "var dogs = new Array(\"Bulldog\", \"Beagle\", \"Labrador\");\n"
+    "\t\t\t\n"
+    "alert( dogs[ 1 ] );\n"
+    "dogs[0] = \"Bull Terrier\";\n"
+    "\n"
+    "for (var i = 0; i < dogs.length; i++) {\n"
+        "console.log(dogs[i]);\n"
+    "}\n\r";
+
+static const char syntax_cases_expected4[] =
+    "var dogs=[\"Bulldog\",\"Beagle\",\"Labrador\"];"
+    "var dogs=new Array(\"Bulldog\",\"Beagle\",\"Labrador\");alert(dogs[1]);"
+    "dogs[0]=\"Bull Terrier\";for(var i=0;i<dogs.length;i++){console.log(dogs[i]);}";
+
+static const char syntax_cases_buf5[] = // input: while, do-while and for loops with break and continue
+    "var i = 1;\n"
+    "while (i < 100) {\n"
+        "i *= 2;\n"
+        "document.write(i + \", \");\n"
+    "}\n"
+    "\n"
+    "i = 1;\n"
+    "do {\n"
+        "i *= 2;\n"
+        "document.write(i + \", \");\n"
+    "} while (i < 100)\n"
+    "\n"
+    "for (var i = 0; i < 10; i++) {\n"
+        "if (i == 5) { break; }\n"
+        "document.write(i + \", \");\n"
+    "}\n"
+    "\n"
+    "for (var i = 0; i < 10; i++) {\n"
+        "if (i == 5) { continue; }\n"
+        "document.write(i + \", \");\n"
+    "}\n\r";
+
+static const char syntax_cases_expected5[] =
+    "var i=1;while(i<100){i*=2;document.write(i+\", \");}i=1;do{i*=2;"
+    "document.write(i+\", \");}while(i<100)for(var i=0;i<10;i++){if(i==5){break;}"
+    "document.write(i+\", \");}for(var i=0;i<10;i++){if(i==5){continue;}"
+    "document.write(i+\", \");}";
+
+static const char syntax_cases_buf6[] = // input: if-else and switch statements
+    "var n = 1800;\n"
+    "var res;\n"
+    "if ( (n >= 1400) && (n < 1900) ) {\n"
+        "res = \"In range.\";\n"
+    "} else {\n"
+        "res = \"Not in range.\";\n"
+    "}\n"
+    "\n"
+    "var text;\n"
+    "switch ( new Date().getDay() ) {\n"
+        "case 6:\n"
+            "text = \"Saturday\";\n"
+            "break;\n"
+        "case 0:\n"
+            "text = \"Sunday\";\n"
+            "break;\n"
+        "default:\n"
+            "text = \"Whatever\";\n"
+    "}\n\r";
+
+static const char syntax_cases_expected6[] =
+    "var n=1800;var res;if((n>=1400)&&(n<1900)){res=\"In range.\";}"
+    "else{res=\"Not in range.\";}var text;switch(new Date().getDay()){case 6:"
+    "text=\"Saturday\";break;case 0:text=\"Sunday\";break;default:text=\"Whatever\";}";
+
+static const char syntax_cases_buf7[] = // input: try-catch-finally with throw statements
+    "var x = document.getElementById(\"mynum\").value;\n"
+    "try { \n"
+        "if(x == \"\")  throw \"empty\";\n"
+        "if(isNaN(x)) throw \"not a number\";\n"
+        "x = Number(x);\n"
+        "if(x > 10)   throw \"too high\";\n"
+    "}\n"
+    "catch(err) {\n"
+        "document.write(\"Input is \" + err);\n"
+        "console.error(err);\n"
+    "}\n"
+    "finally {\n"
+        "document.write(\"</br />Done\");\n"
+    "}\n\r";
+
+static const char syntax_cases_expected7[] =
+    "var x=document.getElementById(\"mynum\").value;try{if(x==\"\")"
+    "throw \"empty\";if(isNaN(x))throw \"not a number\";x=Number(x);if(x>10)"
+    "throw \"too high\";}catch(err){document.write(\"Input is \"+err);console.error(err);}"
+    "finally{document.write(\"</br />Done\");}";
+
+static const char syntax_cases_buf8[] = // input: nested function expressions and a promise chain
+    "function sum (a, b) {\n"
+    "return new Promise(function (resolve, reject) {\n"
+        "setTimeout(function () {\n"
+        "if (typeof a !== \"number\" || typeof b !== \"number\") {\n"
+            "return reject(new TypeError(\"Inputs must be numbers\"));\n"
+        "}\n"
+        "resolve(a + b);\n"
+        "}, 1000);\n"
+    "});\n"
+    "}\n"
+    "\n"
+    "var myPromise = sum(10, 5);\n"
+    "myPromise.then(function (result) {\n"
+        "document.write(\" 10 + 5: \", result);\n"
+        "return sum(null, \"foo\");\n"
+        "}).then(function () {\n"
+        "}).catch(function (err) {\n"
+        "console.error(err);\n"
+    "});\n\r";
+
+static const char syntax_cases_expected8[] =
+    "function sum(a,b){return new Promise(function(resolve,reject)"
+    "{setTimeout(function(){if(typeof a!==\"number\"||typeof b!==\"number\"){return "
+    "reject(new TypeError(\"Inputs must be numbers\"));}resolve(a+b);},1000);});}"
+    "var myPromise=sum(10,5);myPromise.then(function(result){"
+    "document.write(\" 10 + 5: \",result);return sum(null,\"foo\");}).then(function(){})"
+    ".catch(function(err){console.error(err);});";
+
+static const char syntax_cases_buf9[] = // input: a division and regex literals in the same script
+    "var a = Math.round( (new Date).getTime()/1E3 );\n"
+    "var b = a.match( /^[0-9a-z-_.]{10,1200}$/i );\n"
+    "var c = a.match( /=\\s*{((.|\\s)*?)};/g ) ;\n\r";
+
+static const char syntax_cases_expected9[] =
+    "var a=Math.round((new Date).getTime()/1E3);"
+    "var b=a.match(/^[0-9a-z-_.]{10,1200}$/i);"
+    "var c=a.match(/=\\s*{((.|\\s)*?)};/g);";
+
+static const char syntax_cases_buf10[] = // input: slash-delimited text starting on the line after a number
+    "var a = 2\n/ab -cd/";
+
+static const char syntax_cases_expected10[] = // expected: the line break is replaced by a single space
+    "var a=2 /ab -cd/";
+
+static const char syntax_cases_buf11[] = // input: string and regex literals holding escaped backslash, quote and slash
+    "var d_str1 = \"\\\\ \" ; var d_str2 = \"abc\\\"def\" ;"
+    "var d_str3 = \"\\\"abc \" ;var s_str1 = '\\\\ ' ; var s_str2 = 'abc\\\'def' ; "
+    "var s_str3 = '\\\'abc ' ;var re_1 = /\\\\ / ; var re_2 = /abc\\/def/ ; "
+    "var re_3 = /\\/abc / ;";
+
+static const char syntax_cases_expected11[] =
+    "var d_str1=\"\\\\ \";var d_str2=\"abc\\\"def\";"
+    "var d_str3=\"\\\"abc \";var s_str1='\\\\ ';var s_str2='abc\\\'def';"
+    "var s_str3='\\\'abc ';var re_1=/\\\\ /;var re_2=/abc\\/def/;var re_3=/\\/abc /;";
+
+static const char syntax_cases_buf12[] = // input: escaped LF and escaped CR LF inside string literals
+    "var str1 = \"abc\\\n def\" ;"
+    "var str2 = \"abc\\\r\n def\" ;"
+    "var str3 = 'abc\\\n def' ;"
+    "var str4 = 'abc\\\r\n def' ;";
+
+static const char syntax_cases_expected12[] = // expected: the escaped line break is dropped from the literal
+    "var str1=\"abc def\";"
+    "var str2=\"abc def\";"
+    "var str3='abc def';"
+    "var str4='abc def';";
+
+static const char syntax_cases_buf13[] = // input: regex literal directly after the return keyword
+    "return /regex/i.test( str ) ;";
+
+static const char syntax_cases_expected13[] =
+    "return /regex/i.test(str);";
+
+static const char syntax_cases_buf14[] = // input: plus and minus signs next to increment and decrement operators
+    "var a = b+ ++c ;\n"
+    "var a = b++ +c ;\n"
+    "var a = b++ + ++c ;\n"
+    "var a = b- --c ;\n"
+    "var a = b-- -c ;\n"
+    "var a = b-- - --c ;\n"
+    "var a = b++ - ++c ;\n"
+    "var a = b * -c ;\n"
+    "var a = b % -c ;\n"
+    "var a = b + -c ;";
+
+static const char syntax_cases_expected14[] = // expected: one space is kept before a sign that follows another operator
+    "var a=b+ ++c;"
+    "var a=b++ +c;"
+    "var a=b++ + ++c;"
+    "var a=b- --c;"
+    "var a=b-- -c;"
+    "var a=b-- - --c;"
+    "var a=b++ - ++c;"
+    "var a=b* -c;"
+    "var a=b% -c;"
+    "var a=b+ -c;";
+
+static const char syntax_cases_buf15[] = // input: U+2028 and U+2029 inside single-quoted literals
+    "var str1 = 'abc\u2028 def' ;\n"
+    "var str2 = 'abc\u2029 def' ;\n\r";
+
+static const char syntax_cases_expected15[] = // expected: both characters stay inside the literals
+    "var str1='abc\u2028 def';"
+    "var str2='abc\u2029 def';";
+
+static const char syntax_cases_buf16[] = // input: unescaped LF inside a double-quoted literal
+    "var invalid_str = \"abc\n def\"";
+
+static const char syntax_cases_expected16[] = // expected: a closing quote appears where the line break was
+    "var invalid_str=\"abc\"def \"";
+
+static const char syntax_cases_buf17[] = // input: unescaped CR inside a single-quoted literal
+    "var invalid_str = 'abc\r def'";
+
+static const char syntax_cases_expected17[] = // expected: a closing quote appears where the line break was
+    "var invalid_str='abc'def '";
+
+static const char syntax_cases_buf18[] = // input: escaped LF followed by CR inside a single-quoted literal
+    "var invalid_str = 'abc\\\n\r def'";
+
+static const char syntax_cases_expected18[] = // expected: same output as the unescaped CR case
+    "var invalid_str='abc'def '";
+
+static const char syntax_cases_buf19[] = // input: escaped LF inside a regex literal
+    "var invalid_re = /abc\\\n def/";
+
+static const char syntax_cases_expected19[] = // expected: a closing slash appears where the line break was
+    "var invalid_re=/abc/def/";
+
+static const char syntax_cases_buf20[] = // input: escaped CR LF inside a regex literal
+    "var invalid_re = /abc\\\r\n def/";
+
+static const char syntax_cases_expected20[] = // expected: same output as the escaped LF case
+    "var invalid_re=/abc/def/";
+
+TEST_CASE("syntax cases", "[JSNormalizer]") // one SECTION per syntax_cases fixture pair; NORMALIZE and VALIDATE are defined outside this excerpt
+{
+    SECTION("variables")
+    {
+        NORMALIZE(syntax_cases_buf0, syntax_cases_expected0);
+        VALIDATE(syntax_cases_buf0, syntax_cases_expected0);
+    }
+    SECTION("operators")
+    {
+        NORMALIZE(syntax_cases_buf1, syntax_cases_expected1);
+        VALIDATE(syntax_cases_buf1, syntax_cases_expected1);
+    }
+    SECTION("arithmetic and logical operators")
+    {
+        NORMALIZE(syntax_cases_buf2, syntax_cases_expected2);
+        VALIDATE(syntax_cases_buf2, syntax_cases_expected2);
+    }
+    SECTION("complex object")
+    {
+        NORMALIZE(syntax_cases_buf3, syntax_cases_expected3);
+        VALIDATE(syntax_cases_buf3, syntax_cases_expected3);
+    }
+    SECTION("arrays")
+    {
+        NORMALIZE(syntax_cases_buf4, syntax_cases_expected4);
+        VALIDATE(syntax_cases_buf4, syntax_cases_expected4);
+    }
+    SECTION("loops")
+    {
+        NORMALIZE(syntax_cases_buf5, syntax_cases_expected5);
+        VALIDATE(syntax_cases_buf5, syntax_cases_expected5);
+    }
+    SECTION("if-else and switch statements")
+    {
+        NORMALIZE(syntax_cases_buf6, syntax_cases_expected6);
+        VALIDATE(syntax_cases_buf6, syntax_cases_expected6);
+    }
+    SECTION("try-catch statements")
+    {
+        NORMALIZE(syntax_cases_buf7, syntax_cases_expected7);
+        VALIDATE(syntax_cases_buf7, syntax_cases_expected7);
+    }
+    SECTION("functions and promises")
+    {
+        NORMALIZE(syntax_cases_buf8, syntax_cases_expected8);
+        VALIDATE(syntax_cases_buf8, syntax_cases_expected8);
+    }
+    SECTION("regex-division ambiguity")
+    {
+        NORMALIZE(syntax_cases_buf9, syntax_cases_expected9);
+        VALIDATE(syntax_cases_buf9, syntax_cases_expected9);
+    }
+    SECTION("regex on a new line")
+    {
+        NORMALIZE(syntax_cases_buf10, syntax_cases_expected10);
+        VALIDATE(syntax_cases_buf10, syntax_cases_expected10);
+    }
+    SECTION("string and regex literals ambiguity with escaped sentinel chars")
+    {
+        NORMALIZE(syntax_cases_buf11, syntax_cases_expected11);
+        VALIDATE(syntax_cases_buf11, syntax_cases_expected11);
+    }
+    SECTION("escaped LF and CR chars in literals")
+    {
+        NORMALIZE(syntax_cases_buf12, syntax_cases_expected12);
+        VALIDATE(syntax_cases_buf12, syntax_cases_expected12);
+    }
+    SECTION("regex after keyword")
+    {
+        NORMALIZE(syntax_cases_buf13, syntax_cases_expected13);
+        VALIDATE(syntax_cases_buf13, syntax_cases_expected13);
+    }
+    SECTION("white space between '+'<-->'++' and '-'<-->'--'")
+    {
+        NORMALIZE(syntax_cases_buf14, syntax_cases_expected14);
+        VALIDATE(syntax_cases_buf14, syntax_cases_expected14);
+    }
+    SECTION("LS and PS chars within literal")
+    {
+        NORMALIZE(syntax_cases_buf15, syntax_cases_expected15);
+        VALIDATE(syntax_cases_buf15, syntax_cases_expected15);
+    }
+    SECTION("explicit LF within literal") // fixtures 16 to 20 use malformed literals as input
+    {
+        NORMALIZE(syntax_cases_buf16, syntax_cases_expected16);
+        VALIDATE(syntax_cases_buf16, syntax_cases_expected16);
+    }
+    SECTION("explicit CR within literal")
+    {
+        NORMALIZE(syntax_cases_buf17, syntax_cases_expected17);
+        VALIDATE(syntax_cases_buf17, syntax_cases_expected17);
+    }
+    SECTION("escaped LF-CR sequence within literal")
+    {
+        NORMALIZE(syntax_cases_buf18, syntax_cases_expected18);
+        VALIDATE(syntax_cases_buf18, syntax_cases_expected18);
+    }
+    SECTION("escaped LF within regex literal")
+    {
+        NORMALIZE(syntax_cases_buf19, syntax_cases_expected19);
+        VALIDATE(syntax_cases_buf19, syntax_cases_expected19);
+    }
+    SECTION("escaped CR-LF within regex literal")
+    {
+        NORMALIZE(syntax_cases_buf20, syntax_cases_expected20);
+        VALIDATE(syntax_cases_buf20, syntax_cases_expected20);
+    }
+}
+
+TEST_CASE("norm_depth is specified", "[JSNormalizer]") // passes an explicit depth of 7 instead of NORM_DEPTH
+{
+    const char srcbuf[] = "var abc = 123;\n\r";
+    const char expected[] = "var abc"; // 7 characters, matching norm_depth below
+    char dstbuf[7]; // sized to the expected text without a terminating NUL
+    int bytes_copied;
+    const char* ptr = srcbuf;
+    int norm_depth = 7;
+    int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr,
+        &bytes_copied, norm_depth);
+
+    CHECK(ret == 0); // expects 0 here, whereas the dstlen-too-small section expects 1 for the same buffers
+    CHECK(bytes_copied == sizeof(expected) - 1);
+    CHECK(!memcmp(dstbuf, expected, bytes_copied)); // compares only the bytes reported as copied
+}
+
+TEST_CASE("tag script end is specified", "[JSNormalizer]") // input contains a closing script tag before its last statement
+{
+    const char srcbuf[] =
+        "var a = 1 ;\n" // 12 bytes
+        "var b = 2 ;\n" // 12 bytes --> ptr_offset = 24
+        "</script>\n"
+        "var c = 3 ;\n";
+    const int ptr_offset = 24; // offset in srcbuf at which the closing tag starts
+    const char expected[] = "var a=1;var b=2;"; // the statement after the closing tag is not part of the expected output
+    char dstbuf[sizeof(expected)];
+    int bytes_copied;
+    const char* ptr = srcbuf;
+    int norm_depth = NORM_DEPTH; // NORM_DEPTH is defined outside this excerpt
+    int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr,
+        &bytes_copied, norm_depth);
+
+    CHECK(ret == 0);
+    CHECK(bytes_copied == sizeof(expected) - 1);
+    CHECK((ptr - srcbuf) == ptr_offset); // pointer is expected to sit at the start of the closing tag
+    CHECK(!memcmp(dstbuf, expected, bytes_copied));
+}
+
+// Tests for JavaScript parsing errors and anomalies (cases where a non-zero return code is expected)
+
+TEST_CASE("parsing errors", "[JSNormalizer]")
+{
+    SECTION("dstlen is too small")
+    {
+        const char srcbuf[] = "var abc = 123;\n\r";
+        const char expected[] = "var abc"; // the first 7 output characters, all that fits in dstbuf
+        char dstbuf[7]; // deliberately shorter than the full normalized output
+        int bytes_copied;
+        const char* ptr = srcbuf;
+        int norm_depth = NORM_DEPTH; // NORM_DEPTH is defined outside this excerpt
+        int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr,
+            &bytes_copied, norm_depth);
+
+        CHECK(ret == 1); // a return code of 1 is expected when the destination fills up
+        CHECK(bytes_copied == sizeof(expected) - 1);
+        CHECK(!memcmp(dstbuf, expected, bytes_copied)); // the partial output must still match the expected prefix
+    }
+}
+
index 979b40b6c931c9c4cbfc93c05841db31ed46731a..2179a0ca59afe707064bc55543c616c2c532ef94 100644 (file)
@@ -40,7 +40,7 @@ struct JSState
     uint16_t alerts;
 };
 
-SO_PUBLIC int JSNormalizeDecode(
+int JSNormalizeDecode(
     const char*, uint16_t, char*, uint16_t destlen, const char**, int*, JSState*, uint8_t*);
 }
 #endif