git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Merge pull request #2904 in SNORT/snort3 from ~OSHUMEIK/snort3:js_mpdu to master
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Fri, 28 May 2021 15:25:37 +0000 (15:25 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Fri, 28 May 2021 15:25:37 +0000 (15:25 +0000)
Squashed commit of the following:

commit 33f6bc94d027eb9db5680b3bb3eeba0a2944f8c7
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Mon May 24 11:22:15 2021 +0300

    http_inspect: support partial detect for Javascripts

    Normalizer context is allocated and freed once per PDU inspection.

    As a partial flush happens at the closing script tag,
    it enables context reuse in the following normalizations.
    Chunked data is supported as well (by resetting the context).

commit 89043ad68d74323cfb2d4a64a6558929dae9b534
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Tue May 18 11:15:31 2021 +0300

    utils: refactor JSTokenizer

    Parsing is done mostly by the lexer's rules.
    Temporary buffer (for unicode) reworked.

commit f0952f4f9565e2f61c0bbcd76bf06474147ab90c
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Thu May 13 16:02:47 2021 +0300

    http_inspect: extend built-in alerts for Javascript processing

    Alerts follow:
     - nested opening tag
     - closing tag seen in an unexpected place
     - bad token happened

commit e6d50626331e7bc8d30a07905ef8c8341bc0d1c7
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Thu Apr 29 14:38:38 2021 +0300

    utils: rework JSNormalizer class

    JSNormalizer can be instantiated meaningfully.
    It returns the state via the return codes.

    JSNormalizer context is placed on the flow (if needed).
    Normalization depth is the property of the context.
    Flow memory usage is updated.

    UNIT_TEST_BUILD macro added.

    The inline script count is increased when the opening tag is seen.

commit 6a8cad0fc881f94318b4679396e1364cb82ba012
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Wed Apr 21 14:11:24 2021 +0300

    http_inspect: improve MPSE in HttpJsNorm (script start conditions)

    Naming and code style.
    Rework match callback functions.
    Combine some patterns in the opening tag into a single search.

    Allocate the output buffer only if it is needed.

19 files changed:
cmake/macros.cmake
src/service_inspectors/http_inspect/http_enum.h
src/service_inspectors/http_inspect/http_flow_data.cc
src/service_inspectors/http_inspect/http_flow_data.h
src/service_inspectors/http_inspect/http_js_norm.cc
src/service_inspectors/http_inspect/http_js_norm.h
src/service_inspectors/http_inspect/http_module.cc
src/service_inspectors/http_inspect/http_msg_body.cc
src/service_inspectors/http_inspect/http_msg_body.h
src/service_inspectors/http_inspect/http_tables.cc
src/service_inspectors/http_inspect/test/http_module_test.cc
src/service_inspectors/http_inspect/test/http_uri_norm_test.cc
src/utils/CMakeLists.txt
src/utils/js_norm_state.h [deleted file]
src/utils/js_normalizer.cc
src/utils/js_normalizer.h
src/utils/js_tokenizer.h
src/utils/js_tokenizer.l
src/utils/test/js_normalizer_test.cc

index b21a07907a9121d0387f635e3cc6bdcf353158f6..df6030e710b2b1e788dbfaedff6b4483109eb153 100644 (file)
@@ -34,6 +34,7 @@ function (add_cpputest testname)
         set(multiValueArgs SOURCES LIBS)
         cmake_parse_arguments(CppUTest "" "" "${multiValueArgs}" ${ARGN})
         add_executable(${testname} EXCLUDE_FROM_ALL ${testname}.cc ${CppUTest_SOURCES})
+        target_compile_options(${testname} PRIVATE "-DUNIT_TEST_BUILD")
         target_include_directories(${testname} PRIVATE ${CPPUTEST_INCLUDE_DIR})
         target_link_libraries(${testname} ${CPPUTEST_LIBRARIES} ${CppUTest_LIBS})
         add_test(${testname} ${testname})
index d3cf7817b52d37c1ba86fbf2a1e6177c3f06f4dd..e3af9347ea3cde4406448f5ea55312f47de37826 100755 (executable)
@@ -264,7 +264,9 @@ enum Infraction
     INF_MULTIPLE_HOST_HDRS,
     INF_HTTP2_SETTINGS,
     INF_UPGRADE_HEADER_HTTP2,
-    INF_JS_UNEXPECTED_TAG,
+    INF_JS_BAD_TOKEN,
+    INF_JS_OPENING_TAG,
+    INF_JS_CLOSING_TAG,
     INF__MAX_VALUE
 };
 
@@ -323,7 +325,9 @@ enum EventSid
     EVENT_PDF_UNSUP_COMP_TYPE = 115,
     EVENT_PDF_CASC_COMP = 116,
     EVENT_PDF_PARSE_FAILURE = 117,
-    EVENT_JS_UNEXPECTED_TAG = 118,
+    EVENT_JS_BAD_TOKEN = 118,
+    EVENT_JS_OPENING_TAG = 119,
+    EVENT_JS_CLOSING_TAG = 120,
 
     EVENT_LOSS_OF_SYNC = 201,
     EVENT_CHUNK_ZEROS = 202,
index 61eb833502fb0fef65b991018503a25b79c8b5b8..a986dac88c8cfbf528bf827b3feaebc109668e1c 100644 (file)
@@ -24,6 +24,7 @@
 #include "http_flow_data.h"
 
 #include "decompress/file_decomp.h"
+#include "utils/js_normalizer.h"
 
 #include "http_cutter.h"
 #include "http_common.h"
@@ -79,6 +80,14 @@ HttpFlowData::~HttpFlowData()
     if (HttpModule::get_peg_counts(PEG_CONCURRENT_SESSIONS) > 0)
         HttpModule::decrement_peg_counts(PEG_CONCURRENT_SESSIONS);
 
+#ifndef UNIT_TEST_BUILD
+    if (js_normalizer)
+    {
+        update_deallocations(JSNormalizer::size());
+        delete js_normalizer;
+    }
+#endif
+
     for (int k=0; k <= 1; k++)
     {
         delete infractions[k];
@@ -88,6 +97,8 @@ HttpFlowData::~HttpFlowData()
         update_deallocations(partial_buffer_length[k]);
         delete[] partial_detect_buffer[k];
         update_deallocations(partial_detect_length[k]);
+        delete[] js_detect_buffer[k];
+        update_deallocations(js_detect_length[k]);
         HttpTransaction::delete_transaction(transaction[k], nullptr);
         delete cutter[k];
         if (compress_stream[k] != nullptr)
@@ -204,6 +215,32 @@ void HttpFlowData::garbage_collect()
     }
 }
 
+#ifndef UNIT_TEST_BUILD
+snort::JSNormalizer& HttpFlowData::acquire_js_ctx()
+{
+    if (js_normalizer)
+        return *js_normalizer;
+
+    js_normalizer = new JSNormalizer();
+    update_allocations(JSNormalizer::size());
+
+    return *js_normalizer;
+}
+
+void HttpFlowData::release_js_ctx()
+{
+    if (!js_normalizer)
+        return;
+
+    update_deallocations(JSNormalizer::size());
+    delete js_normalizer;
+    js_normalizer = nullptr;
+}
+#else
+snort::JSNormalizer& HttpFlowData::acquire_js_ctx() { return *js_normalizer; }
+void HttpFlowData::release_js_ctx() {}
+#endif
+
 bool HttpFlowData::add_to_pipeline(HttpTransaction* latest)
 {
     if (pipeline == nullptr)
index 617775c8208ce62fdca216f6d8ca03326eec894f..ffcb7aece8e1b2a9befd52ae4923e80083d6c0db 100644 (file)
@@ -39,6 +39,11 @@ class HttpMsgSection;
 class HttpCutter;
 class HttpQueryParser;
 
+namespace snort
+{
+class JSNormalizer;
+}
+
 class HttpFlowData : public snort::FlowData
 {
 public:
@@ -49,6 +54,7 @@ public:
     size_t size_of() override;
 
     friend class HttpInspect;
+    friend class HttpJsNorm;
     friend class HttpMsgSection;
     friend class HttpMsgStart;
     friend class HttpMsgRequest;
@@ -169,6 +175,8 @@ private:
     uint8_t* partial_detect_buffer[2] = { nullptr, nullptr };
     uint32_t partial_detect_length[2] = { 0, 0 };
     uint32_t partial_js_detect_length[2] = { 0, 0 };
+    uint8_t* js_detect_buffer[2] = { nullptr, nullptr };
+    uint32_t js_detect_length[2] = { 0, 0 };
     int32_t status_code_num = HttpCommon::STAT_NOT_PRESENT;
     HttpEnums::VersionId version_id[2] = { HttpEnums::VERS__NOT_PRESENT,
                                             HttpEnums::VERS__NOT_PRESENT };
@@ -177,6 +185,12 @@ private:
     bool cutover_on_clear = false;
     bool ssl_search_abandoned = false;
 
+    // *** HttpJsNorm
+    snort::JSNormalizer* js_normalizer = nullptr;
+
+    snort::JSNormalizer& acquire_js_ctx();
+    void release_js_ctx();
+
     // *** Transaction management including pipelining
     static const int MAX_PIPELINE = 100;  // requests seen - responses seen <= MAX_PIPELINE
     HttpTransaction* transaction[2] = { nullptr, nullptr };
index 8aad96222337c28cdbeaf48dc3a1b1abd0455b64..90e7666f6c6d08cc0b13eb9f853e23d0e8f7c170 100644 (file)
 
 #include "http_js_norm.h"
 
-#include "utils/js_norm_state.h"
 #include "utils/js_normalizer.h"
 #include "utils/safec.h"
 #include "utils/util_jsnorm.h"
 
+#include "http_common.h"
 #include "http_enum.h"
 
 using namespace HttpEnums;
 using namespace snort;
 
-HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_) :
-    uri_param(uri_param_), javascript_search_mpse(nullptr),
-    htmltype_search_mpse(nullptr)
+HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_, int64_t normalization_depth_) :
+    uri_param(uri_param_),
+    normalization_depth(normalization_depth_),
+    mpse_otag(nullptr),
+    mpse_attr(nullptr),
+    mpse_type(nullptr)
 {}
 
 HttpJsNorm::~HttpJsNorm()
 {
-    delete javascript_search_mpse;
-    delete js_src_attr_search_mpse;
-    delete htmltype_search_mpse;
+    delete mpse_otag;
+    delete mpse_attr;
+    delete mpse_type;
 }
 
 void HttpJsNorm::configure()
 {
-    if ( configure_once )
+    if (configure_once)
         return;
 
-    javascript_search_mpse = new SearchTool;
-    js_src_attr_search_mpse = new SearchTool;
-    htmltype_search_mpse = new SearchTool;
-
-    javascript_search_mpse->add(script_start, script_start_length, JS_JAVASCRIPT);
-    javascript_search_mpse->prep();
-
-    js_src_attr_search_mpse->add(script_src_attr, script_src_attr_length, JS_ATTR_SRC);
-    js_src_attr_search_mpse->prep();
-
-    struct HiSearchToken
-    {
-        const char* name;
-        int name_len;
-        int search_id;
-    };
-
-    const HiSearchToken html_patterns[] =
-    {
-        { "JAVASCRIPT",      10, HTML_JS },
-        { "ECMASCRIPT",      10, HTML_EMA },
-        { "VBSCRIPT",         8, HTML_VB },
-        { nullptr,            0, 0 }
-    };
-
-    for (const HiSearchToken* tmp = &html_patterns[0]; tmp->name != nullptr; tmp++)
-    {
-        htmltype_search_mpse->add(tmp->name, tmp->name_len, tmp->search_id);
-    }
-    htmltype_search_mpse->prep();
+    mpse_otag = new SearchTool;
+    mpse_attr = new SearchTool;
+    mpse_type = new SearchTool;
+
+    static constexpr const char* otag_start = "<SCRIPT";
+    static constexpr const char* attr_gt = ">";
+    static constexpr const char* attr_src = "SRC";
+    static constexpr const char* attr_js1 = "JAVASCRIPT";
+    static constexpr const char* attr_js2 = "ECMASCRIPT";
+    static constexpr const char* attr_vb = "VBSCRIPT";
+
+    mpse_otag->add(otag_start, strlen(otag_start), 0);
+    mpse_attr->add(attr_gt, strlen(attr_gt), AID_GT);
+    mpse_attr->add(attr_src, strlen(attr_src), AID_SRC);
+    mpse_attr->add(attr_js1, strlen(attr_js1), AID_JS);
+    mpse_attr->add(attr_js2, strlen(attr_js2), AID_ECMA);
+    mpse_attr->add(attr_vb, strlen(attr_vb), AID_VB);
+    mpse_type->add(attr_js1, strlen(attr_js1), AID_JS);
+    mpse_type->add(attr_js2, strlen(attr_js2), AID_ECMA);
+    mpse_type->add(attr_vb, strlen(attr_vb), AID_VB);
+
+    mpse_otag->prep();
+    mpse_attr->prep();
+    mpse_type->prep();
 
     configure_once = true;
 }
 
-void HttpJsNorm::enhanced_normalize(const Field& input, Field& output, HttpInfractions* infractions,
-    HttpEventGen* events, int64_t js_normalization_depth) const
+void HttpJsNorm::enhanced_normalize(const Field& input, Field& output,
+    HttpInfractions* infractions, HttpFlowData* ssn) const
 {
-    bool js_present = false;
-    int index = 0;
     const char* ptr = (const char*)input.start();
     const char* const end = ptr + input.length();
 
-    uint8_t* buffer = new uint8_t[input.length()];
+    HttpEventGen* events = ssn->events[HttpCommon::SRC_SERVER];
 
-    JSNormState state;
-    state.norm_depth = js_normalization_depth;
-    state.alerts = 0;
+    char* buffer = nullptr;
+    char* dst = nullptr;
+    const char* dst_end = nullptr;
+
+    bool script_continue = alive_ctx(ssn);
 
     while (ptr < end)
     {
-        int bytes_copied = 0;
-        int mindex;
-
-        // Search for beginning of a javascript
-        if (javascript_search_mpse->find(ptr, end-ptr, search_js_found, false, &mindex) > 0)
+        if (!script_continue)
         {
-            const char* js_start = ptr + mindex;
-            const char* const angle_bracket =
-                (const char*)SnortStrnStr(js_start, end - js_start, ">");
-            if (angle_bracket == nullptr || (end - angle_bracket) == 0)
+            if (!mpse_otag->find(ptr, end - ptr, match_otag, false, &ptr))
+                break;
+            if (ptr >= end)
                 break;
 
-            bool type_js = false;
-            bool external_js = false;
-            if (angle_bracket > js_start)
-            {
-                int mid;
-                const int script_found = htmltype_search_mpse->find(
-                    js_start, (angle_bracket-js_start), search_html_found, false, &mid);
-
-                external_js = is_external_script(js_start, angle_bracket);
+            MatchContext sctx = {ptr, true, false};
 
-                js_start = angle_bracket + 1;
-                if (script_found > 0)
-                {
-                    switch (mid)
-                    {
-                    case HTML_JS:
-                        js_present = true;
-                        type_js = true;
-                        break;
-                    default:
-                        type_js = false;
-                        break;
-                    }
-                }
-                else
-                {
-                    // if no type or language is found we assume it is a javascript
-                    js_present = true;
-                    type_js = true;
-                }
-            }
-            // Save before the <script> begins
-            if (js_start > ptr)
+            if (ptr[0] == '>')
+                ptr++;
+            else
             {
-                if ((js_start - ptr) > (input.length() - index))
-                    break;
+                if (!mpse_attr->find(ptr, end - ptr, match_attr, false, &sctx))
+                    break; // the opening tag never ends
+                ptr = sctx.next;
             }
 
-            ptr = js_start;
-            if (!type_js or external_js)
+            if (!sctx.is_javascript || sctx.is_external)
                 continue;
 
-            JSNormalizer::normalize(js_start, (uint16_t)(end-js_start), (char*)buffer+index,
-                (uint16_t)(input.length() - index), &ptr, &bytes_copied, state);
-
+            // script found
             HttpModule::increment_peg_counts(PEG_JS_INLINE);
+        }
 
-            index += bytes_copied;
+        if (!buffer)
+        {
+            uint8_t* nbuf = ssn->js_detect_buffer[HttpCommon::SRC_SERVER];
+            uint32_t nlen = ssn->js_detect_length[HttpCommon::SRC_SERVER];
+
+            auto len = nlen + (end - ptr); // not more than the remaining raw data
+            buffer = new char[len];
+            if (nbuf)
+                memcpy(buffer, nbuf, nlen);
+            dst = buffer + nlen;
+            dst_end = buffer + len;
         }
-        else
-            break;
-    }
 
-    if (js_present)
-    {
-        if (state.alerts & ALERT_UNEXPECTED_TAG)
+        auto& ctx = ssn->acquire_js_ctx();
+        ctx.set_depth(normalization_depth);
+
+        auto ret = ctx.normalize(ptr, end - ptr, dst, dst_end - dst);
+        ptr = ctx.get_src_next();
+        dst = ctx.get_dst_next();
+
+        switch (ret)
         {
-            *infractions += INF_JS_UNEXPECTED_TAG;
-            events->create_event(EVENT_JS_UNEXPECTED_TAG);
+        case JSTokenizer::EOS:
+            ctx.reset_depth();
+            script_continue = false;
+            break;
+        case JSTokenizer::SCRIPT_ENDED:
+            script_continue = false;
+            break;
+        case JSTokenizer::SCRIPT_CONTINUE:
+            script_continue = true;
+            break;
+        case JSTokenizer::OPENING_TAG:
+            *infractions += INF_JS_OPENING_TAG;
+            events->create_event(EVENT_JS_OPENING_TAG);
+            script_continue = false;
+            break;
+        case JSTokenizer::CLOSING_TAG:
+            *infractions += INF_JS_CLOSING_TAG;
+            events->create_event(EVENT_JS_CLOSING_TAG);
+            script_continue = false;
+            break;
+        case JSTokenizer::BAD_TOKEN:
+            *infractions += INF_JS_BAD_TOKEN;
+            events->create_event(EVENT_JS_BAD_TOKEN);
+            script_continue = false;
+            break;
+        default:
+            assert(false);
+            script_continue = false;
+            break;
         }
-        output.set(index, buffer, true);
     }
-    else
-        delete[] buffer;
+
+    if (!script_continue)
+        ssn->release_js_ctx();
+
+    if (buffer)
+        output.set(dst - buffer, (const uint8_t*)buffer, true);
 }
 
 void HttpJsNorm::legacy_normalize(const Field& input, Field& output, HttpInfractions* infractions,
@@ -199,7 +204,7 @@ void HttpJsNorm::legacy_normalize(const Field& input, Field& output, HttpInfract
         int mindex;
 
         // Search for beginning of a javascript
-        if (javascript_search_mpse->find(ptr, end-ptr, search_js_found, false, &mindex) > 0)
+        if (mpse_otag->find(ptr, end-ptr, search_js_found, false, &mindex) > 0)
         {
             const char* js_start = ptr + mindex;
             const char* const angle_bracket =
@@ -211,7 +216,7 @@ void HttpJsNorm::legacy_normalize(const Field& input, Field& output, HttpInfract
             if (angle_bracket > js_start)
             {
                 int mid;
-                const int script_found = htmltype_search_mpse->find(
+                const int script_found = mpse_type->find(
                     js_start, (angle_bracket-js_start), search_html_found, false, &mid);
 
                 js_start = angle_bracket + 1;
@@ -219,7 +224,7 @@ void HttpJsNorm::legacy_normalize(const Field& input, Field& output, HttpInfract
                 {
                     switch (mid)
                     {
-                    case HTML_JS:
+                    case AID_JS:
                         js_present = true;
                         type_js = true;
                         break;
@@ -292,42 +297,59 @@ void HttpJsNorm::legacy_normalize(const Field& input, Field& output, HttpInfract
     }
 }
 
-/* Returning non-zero stops search, which is okay since we only look for one at a time */
 int HttpJsNorm::search_js_found(void*, void*, int index, void* index_ptr, void*)
 {
+    static constexpr int script_start_length = sizeof("<SCRIPT") - 1;
     *((int*) index_ptr) = index - script_start_length;
     return 1;
 }
-int HttpJsNorm::search_js_src_attr_found(void*, void*, int index, void* index_ptr, void*)
+
+int HttpJsNorm::search_html_found(void* id, void*, int, void* id_ptr, void*)
 {
-    *((int*) index_ptr) = index - script_src_attr_length;
+    *((int*) id_ptr)  = (int)(uintptr_t)id;
     return 1;
 }
-int HttpJsNorm::search_html_found(void* id, void*, int, void* id_ptr, void*)
+
+int HttpJsNorm::match_otag(void*, void*, int index, void* ptr, void*)
 {
-    *((int*) id_ptr)  = (int)(uintptr_t)id;
+    *(char**)ptr += index;
     return 1;
 }
 
-bool HttpJsNorm::is_external_script(const char* it, const char* script_tag_end) const
+int HttpJsNorm::match_attr(void* pid, void*, int index, void* sctx, void*)
 {
-    int src_pos;
+    MatchContext* ctx = (MatchContext*)sctx;
+    AttrId id = (AttrId)(uintptr_t)pid;
+    const char* c;
 
-    while (js_src_attr_search_mpse->find(it, (script_tag_end - it),
-        search_js_src_attr_found, false, &src_pos))
+    switch (id)
     {
-        it += (src_pos + script_src_attr_length - 1);
-        while (++it < script_tag_end)
-        {
-            if (*it == ' ')
-                continue;
-            else if (*it == '=')
-                return true;
-            else
-                break;
-        }
+    case AID_GT:
+        ctx->next += index;
+        return 1;
+
+    case AID_SRC:
+        c = ctx->next + index;
+        while (*c == ' ') c++;
+        ctx->is_external = ctx->is_external || *c == '=';
+        return 0;
+
+    case AID_JS:
+        ctx->is_javascript = true;
+        return 0;
+
+    case AID_ECMA:
+        ctx->is_javascript = true;
+        return 0;
+
+    case AID_VB:
+        ctx->is_javascript = false;
+        return 0;
+
+    default:
+        ctx->next += index;
+        ctx->is_external = false;
+        ctx->is_javascript = false;
+        return 1;
     }
-
-    return false;
 }
-
index f48ec40d4678a97a6dd20a728fa7f48f13dd71ab..385754e169fdd5844853c93d1b4611a0db524567 100644 (file)
@@ -25,6 +25,7 @@
 #include "search_engines/search_tool.h"
 
 #include "http_field.h"
+#include "http_flow_data.h"
 #include "http_event.h"
 #include "http_module.h"
 
 class HttpJsNorm
 {
 public:
-    HttpJsNorm(const HttpParaList::UriParam& uri_param_);
+    HttpJsNorm(const HttpParaList::UriParam&, int64_t normalization_depth);
     ~HttpJsNorm();
-    void legacy_normalize(const Field& input, Field& output, HttpInfractions* infractions,
-        HttpEventGen* events, int max_javascript_whitespaces) const;
-    void enhanced_normalize(const Field& input, Field& output, HttpInfractions* infractions,
-        HttpEventGen* events, int64_t js_normalization_depth) const;
+
+    void legacy_normalize(const Field& input, Field& output, HttpInfractions*, HttpEventGen*,
+        int max_javascript_whitespaces) const;
+    void enhanced_normalize(const Field& input, Field& output, HttpInfractions*, HttpFlowData*) const;
 
     void configure();
-private:
-    bool configure_once = false;
 
-    enum JsSearchId { JS_JAVASCRIPT };
-    enum JsSrcAttrSearchId { JS_ATTR_SRC };
-    enum HtmlSearchId { HTML_JS, HTML_EMA, HTML_VB };
+private:
+    enum AttrId { AID_GT, AID_SRC, AID_JS, AID_ECMA, AID_VB };
 
-    static constexpr const char* script_start = "<SCRIPT";
-    static constexpr int script_start_length = sizeof("<SCRIPT") - 1;
-    static constexpr const char* script_src_attr = "SRC";
-    static constexpr int script_src_attr_length = sizeof("SRC") - 1;
+    struct MatchContext
+    {
+        const char* next;
+        bool is_javascript;
+        bool is_external;
+    };
 
     const HttpParaList::UriParam& uri_param;
+    int64_t normalization_depth;
+    bool configure_once = false;
 
-    snort::SearchTool* javascript_search_mpse;
-    snort::SearchTool* js_src_attr_search_mpse;
-    snort::SearchTool* htmltype_search_mpse;
+    snort::SearchTool* mpse_otag;
+    snort::SearchTool* mpse_attr;
+    snort::SearchTool* mpse_type; // legacy only
 
-    static int search_js_found(void*, void*, int index, void*, void*);
-    static int search_js_src_attr_found(void*, void*, int index, void*, void*);
-    static int search_html_found(void* id, void*, int, void*, void*);
+    static int search_js_found(void*, void*, int index, void*, void*);  // legacy only
+    static int search_html_found(void* id, void*, int, void*, void*); // legacy only
+    static int match_otag(void*, void*, int, void*, void*);
+    static int match_attr(void*, void*, int, void*, void*);
 
-    bool is_external_script(const char* it, const char* script_tag_end) const;
+    bool alive_ctx(const HttpFlowData* ssn) const
+    { return ssn->js_normalizer; }
 };
 
 #endif
index d72377cd30289bbea59e88deda20e23e3e5ebf57..cefc1c920668e1ffa65658ce78f78281568329ea 100755 (executable)
@@ -197,20 +197,16 @@ bool HttpModule::set(const char*, Value& val, SnortConfig*)
     else if (val.is("normalize_javascript"))
     {
         params->js_norm_param.normalize_javascript = val.get_bool();
-
-        if ( !params->js_norm_param.is_javascript_normalization )
-            params->js_norm_param.is_javascript_normalization =
-                params->js_norm_param.normalize_javascript;
+        params->js_norm_param.is_javascript_normalization =
+            params->js_norm_param.is_javascript_normalization
+            or params->js_norm_param.normalize_javascript;
     }
     else if (val.is("js_normalization_depth"))
     {
         int64_t v = val.get_int64();
-        params->js_norm_param.js_normalization_depth = (v == -1) ?
-          Parameter::get_int("max53") : v;
-
-        if ( !params->js_norm_param.is_javascript_normalization )
-            params->js_norm_param.is_javascript_normalization =
-                (params->js_norm_param.js_normalization_depth > 0);
+        params->js_norm_param.js_normalization_depth = v;
+        params->js_norm_param.is_javascript_normalization =
+            params->js_norm_param.is_javascript_normalization or (v != 0);
     }
     else if (val.is("max_javascript_whitespaces"))
     {
@@ -394,7 +390,7 @@ bool HttpModule::end(const char*, int, SnortConfig*)
         ParseError("Cannot use normalize_javascript and js_normalization_depth together.");
 
     if ( params->js_norm_param.is_javascript_normalization )
-        params->js_norm_param.js_norm = new HttpJsNorm(params->uri_param);
+        params->js_norm_param.js_norm = new HttpJsNorm(params->uri_param, params->js_norm_param.js_normalization_depth);
 
     params->script_detection_handle = script_detection_handle;
 
index 3b3a4000f893cc8d681b5077c736b47d409dcf2f..26a18d48dacf56bbd4864cfbe5aaa334c93c1648 100644 (file)
@@ -119,7 +119,9 @@ void HttpMsgBody::analyze()
             memcpy(cumulative_buffer + partial_detect_length, decompressed_file_body.start(),
                 decompressed_file_body.length());
             cumulative_data.set(total_length, cumulative_buffer, true);
-            do_js_normalization(cumulative_data, js_norm_body);
+
+            do_js_normalization(cumulative_data, js_norm_body, true);
+
             if ((int32_t)partial_js_detect_length == js_norm_body.length())
             {
                 clean_partial(partial_inspected_octets, partial_detect_length,
@@ -128,7 +130,7 @@ void HttpMsgBody::analyze()
             }
         }
         else
-            do_js_normalization(decompressed_file_body, js_norm_body);
+            do_js_normalization(decompressed_file_body, js_norm_body, false);
 
         const int32_t detect_length =
             (js_norm_body.length() <= session_data->detect_depth_remaining[source_id]) ?
@@ -277,28 +279,57 @@ void HttpMsgBody::fd_event_callback(void* context, int event)
     }
 }
 
-void HttpMsgBody::do_js_normalization(const Field& input, Field& output)
+void HttpMsgBody::do_js_normalization(const Field& input, Field& output, bool partial_detect)
 {
-    if ( !params->js_norm_param.is_javascript_normalization or source_id == SRC_CLIENT )
+    if (!params->js_norm_param.is_javascript_normalization or source_id == SRC_CLIENT)
         output.set(input);
-    else if ( params->js_norm_param.normalize_javascript )
+    else if (params->js_norm_param.normalize_javascript)
         params->js_norm_param.js_norm->legacy_normalize(input, output,
             transaction->get_infractions(source_id), session_data->events[source_id],
             params->js_norm_param.max_javascript_whitespaces);
-    else if ( params->js_norm_param.js_normalization_depth )
+    else if (params->js_norm_param.js_normalization_depth)
     {
         output.set(input);
 
+        bool js_continuation = session_data->js_normalizer;
+        uint8_t*& buf = session_data->js_detect_buffer[source_id];
+        uint32_t& len = session_data->js_detect_length[source_id];
+
+        if (partial_detect)
+            session_data->release_js_ctx();
+        else
+        {
+            session_data->update_deallocations(len);
+            delete[] buf;
+            buf = nullptr;
+            len = 0;
+        }
+
         params->js_norm_param.js_norm->enhanced_normalize(input, enhanced_js_norm_body,
-            transaction->get_infractions(source_id), session_data->events[source_id],
-            params->js_norm_param.js_normalization_depth);
+            transaction->get_infractions(source_id), session_data);
 
         const int32_t norm_length =
             (enhanced_js_norm_body.length() <= session_data->detect_depth_remaining[source_id]) ?
             enhanced_js_norm_body.length() : session_data->detect_depth_remaining[source_id];
 
         if ( norm_length > 0 )
+        {
             set_script_data(enhanced_js_norm_body.start(), (unsigned int)norm_length);
+
+            if (partial_detect)
+                return;
+
+            if (js_continuation)
+            {
+                auto nscript_len = enhanced_js_norm_body.length();
+                uint8_t* nscript = new uint8_t[nscript_len];
+
+                memcpy(nscript, enhanced_js_norm_body.start(), nscript_len);
+                buf = nscript;
+                len = nscript_len;
+                session_data->update_allocations(len);
+            }
+        }
     }
 }
 
index d4e3f671bdb2941f4a1fdc68881f76b8146a5394..689a9381db33c6cfac5b2155b373e0fd22c8fb62 100644 (file)
@@ -58,7 +58,7 @@ private:
     void do_file_processing(const Field& file_data);
     void do_utf_decoding(const Field& input, Field& output);
     void do_file_decompression(const Field& input, Field& output);
-    void do_js_normalization(const Field& input, Field& output);
+    void do_js_normalization(const Field& input, Field& output, bool partial_detect);
     void clean_partial(uint32_t& partial_inspected_octets, uint32_t& partial_detect_length,
         uint8_t*& partial_detect_buffer,  uint32_t& partial_js_detect_length,
         int32_t detect_length);
index ad7f3c8690b1457f4aea392e59e9545b8fec6f5a..26909e2fa3864a3a4bb6112f3cfcea3fae270745 100755 (executable)
@@ -357,7 +357,9 @@ const RuleMap HttpModule::http_events[] =
     { EVENT_PDF_UNSUP_COMP_TYPE,        "PDF file unsupported compression type" },
     { EVENT_PDF_CASC_COMP,              "PDF file cascaded compression" },
     { EVENT_PDF_PARSE_FAILURE,          "PDF file parse failure" },
-    { EVENT_JS_UNEXPECTED_TAG,          "unexpected script tag within inline javascript" },
+    { EVENT_JS_BAD_TOKEN,               "bad token in JavaScript" },
+    { EVENT_JS_OPENING_TAG,             "unexpected script opening tag in JavaScript" },
+    { EVENT_JS_CLOSING_TAG,             "unexpected script closing tag in JavaScript" },
     { EVENT_LOSS_OF_SYNC,               "not HTTP traffic" },
     { EVENT_CHUNK_ZEROS,                "chunk length has excessive leading zeros" },
     { EVENT_WS_BETWEEN_MSGS,            "white space before or between messages" },
index 709710239f9a2b1092d763c80d174dbd0ae96910..3aab7541ed57549eeac76e3763ae403d9df6e6a1 100755 (executable)
@@ -64,8 +64,9 @@ int32_t substr_to_code(const uint8_t*, const int32_t, const StrCode []) { return
 long HttpTestManager::print_amount {};
 bool HttpTestManager::print_hex {};
 
-HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_) :
-    uri_param(uri_param_), javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {}
+HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_, int64_t normalization_depth_) :
+    uri_param(uri_param_), normalization_depth(normalization_depth_),
+    mpse_otag(nullptr), mpse_attr(nullptr), mpse_type(nullptr) {}
 HttpJsNorm::~HttpJsNorm() = default;
 void HttpJsNorm::configure(){}
 int64_t Parameter::get_int(char const*) { return 0; }
index fb33bd3e19c09b98f73eff03df0a6ae52a14f706..f98e616a83665ae603b63b3df8af950ba1bed17c 100755 (executable)
@@ -53,8 +53,9 @@ LiteralSearch* LiteralSearch::instantiate(LiteralSearch::Handle*, const uint8_t*
 void show_stats(PegCount*, const PegInfo*, unsigned, const char*) { }
 void show_stats(PegCount*, const PegInfo*, const IndexVec&, const char*, FILE*) { }
 
-HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_) :
-    uri_param(uri_param_), javascript_search_mpse(nullptr), htmltype_search_mpse(nullptr) {}
+HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_, int64_t normalization_depth_) :
+    uri_param(uri_param_), normalization_depth(normalization_depth_),
+    mpse_otag(nullptr), mpse_attr(nullptr), mpse_type(nullptr) {}
 HttpJsNorm::~HttpJsNorm() = default;
 void HttpJsNorm::configure() {}
 int64_t Parameter::get_int(char const*) { return 0; }
index d42b1893615341cfff278ce030343286d68af43a..38fc2ddce3816fbacce50b0d89927ad24fe4cd64 100644 (file)
@@ -32,7 +32,6 @@ add_library ( utils OBJECT
     dnet_header.h
     dyn_array.cc
     dyn_array.h
-    js_norm_state.h
     js_normalizer.cc
     js_normalizer.h
     js_tokenizer.h
diff --git a/src/utils/js_norm_state.h b/src/utils/js_norm_state.h
deleted file mode 100644 (file)
index 764edb3..0000000
+++ /dev/null
@@ -1,37 +0,0 @@
-//--------------------------------------------------------------------------
-// Copyright (C) 2021-2021 Cisco and/or its affiliates. All rights reserved.
-//
-// This program is free software; you can redistribute it and/or modify it
-// under the terms of the GNU General Public License Version 2 as published
-// by the Free Software Foundation.  You may not use, modify or distribute
-// this program under any other version of the GNU General Public License.
-//
-// This program is distributed in the hope that it will be useful, but
-// WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-// General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License along
-// with this program; if not, write to the Free Software Foundation, Inc.,
-// 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
-//--------------------------------------------------------------------------
-// js_norm_state.h author Oleksandr Serhiienko <oserhiie@cisco.com>
-
-#ifndef JS_NORM_STATE_H
-#define JS_NORM_STATE_H
-
-#include "main/snort_types.h"
-
-namespace snort
-{
-#define ALERT_UNEXPECTED_TAG 0x1
-
-struct JSNormState
-{
-    int64_t norm_depth;
-    uint16_t alerts;
-};
-}
-
-#endif // JS_NORM_STATE_H
-
index a5868fe05e95b6a27ace43a7dec684d657b29a06..7e4b1d9a24e13e25175f4b5b616e99a64b2b7df6 100644 (file)
 
 #include "js_normalizer.h"
 
-#include <FlexLexer.h>
+using namespace snort;
+
+JSNormalizer::JSNormalizer() // builds a normalizer with no depth limit and no progress pointers yet
+    : depth(-1), // depth is a size_t, so -1 becomes (size_t)-1, the value set_depth() treats as "unlimited"
+      rem_bytes(-1), // remaining budget mirrors depth
+      unlim(true), // no limit until set_depth() is given a different value
+      src_next(nullptr), // set by normalize()
+      dst_next(nullptr), // set by normalize()
+      tokenizer(in, out) // the lexer is bound to the member streams; in/out are declared before tokenizer in the class
+{
+}
 
-#include "js_tokenizer.h"
+void JSNormalizer::set_depth(size_t new_depth) // sets the normalization depth (byte budget) and restarts the remaining-bytes counter
+{
+    if (depth == new_depth) // same depth as before: keep the current remaining budget untouched
+        return;
 
-using namespace snort;
+    depth = new_depth;
+    rem_bytes = depth; // a new depth starts with a full budget
+    unlim = depth == (size_t)-1; // (size_t)-1 is the sentinel for "no limit"
+}
 
-int JSNormalizer::normalize(const char* srcbuf, uint16_t srclen, char* dstbuf, uint16_t dstlen,
-        const char** ptr, int* bytes_copied, JSNormState& state)
+JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len, char* dst, size_t dst_len) // normalizes src into dst within the depth budget; progress is published via src_next/dst_next
 {
-    std::stringstream in, out;
-    in.rdbuf()->pubsetbuf(const_cast<char*>(srcbuf),
-        (state.norm_depth >= srclen) ? srclen : state.norm_depth);
+    if (rem_bytes == 0 && !unlim) // budget already used up: report the whole input as consumed and nothing written
+    {
+        src_next = src + src_len;
+        dst_next = dst;
+        return JSTokenizer::EOS;
+    }
+
+    size_t len = unlim ? src_len : // bytes to scan in this call: all of src, or
+        src_len < rem_bytes ? src_len : rem_bytes; // no more than the remaining budget
+    in.rdbuf()->pubsetbuf(const_cast<char*>(src), len); // point the input stream at the caller's buffer -- NOTE(review): stringbuf setbuf behavior is implementation-defined, confirm on the target standard library
+    out.rdbuf()->pubsetbuf(dst, dst_len); // likewise, output is meant to land directly in dst
+
+    JSTokenizer::JSRet ret = (JSTokenizer::JSRet)tokenizer.yylex(); // yylex() returns an int that carries a JSRet code
+    in.clear(); // reset stream state flags -- presumably so that tellg()/tellp() below report positions instead of failing
+    out.clear();
+    size_t r_bytes = in.tellg(); // read position of the input stream after the lexer ran
+    size_t w_bytes = out.tellp(); // write position of the output stream after the lexer ran
 
-    JSTokenizer tokenizer(in, out, dstbuf, dstlen, ptr, bytes_copied, state);
-    return tokenizer.yylex();
+    if (!unlim)
+        rem_bytes -= r_bytes; // charge the consumed bytes against the budget
+    src_next = src + r_bytes;
+    dst_next = dst + w_bytes;
+
+    return rem_bytes ? ret : JSTokenizer::EOS; // once the budget reaches zero, EOS overrides the lexer's result
 }
 
+// Returns an estimate of the memory taken by one normalizer instance:
+// the object itself plus the input buffer flex allocates for the lexer.
+size_t JSNormalizer::size()
+{
+    // flex's default YY_BUF_SIZE is 16384 (the previous 16834 was a digit transposition)
+    return sizeof(JSNormalizer) + 16384; // the default YY_BUF_SIZE
+}
index 2e562bb1b0438a705840000a6b088e3d204636f1..75bd407685396e21b31aa9e45ec2d04868297759 100644 (file)
 
 #include "main/snort_types.h"
 
-#include "js_norm_state.h"
+#include <FlexLexer.h>
+
+#include "js_tokenizer.h"
 
 namespace snort
 {
+
 class JSNormalizer // stateful wrapper around JSTokenizer: owns the lexer and its streams and tracks a byte budget ("depth")
 {
 public:
-    static int normalize(const char* srcbuf, uint16_t srclen, char* dstbuf, uint16_t dstlen,
-        const char** ptr, int* bytes_copied, JSNormState& state);
+    JSNormalizer(); // starts with no depth limit
+
+    const char* get_src_next() const // position in src right after the bytes consumed by the last normalize() call
+    { return src_next; }
+
+    char* get_dst_next() const // this can go beyond dst length, but no writing happens outside of dst
+    { return dst_next; }
+
+    void reset_depth() // restores the full budget without changing the configured depth
+    { rem_bytes = depth; }
+
+    void set_depth(size_t depth); // (size_t)-1 means unlimited; setting the same value again is a no-op
+
+    JSTokenizer::JSRet normalize(const char* src, size_t src_len, char* dst, size_t dst_len); // returns the lexer's code, or EOS once the budget is exhausted
+
+    static size_t size(); // sizeof(JSNormalizer) plus a fixed allowance for the lexer buffer
+
+private:
+    size_t depth; // configured byte budget
+    size_t rem_bytes; // bytes of budget still available
+    bool unlim; // true when depth is the (size_t)-1 sentinel
+    const char* src_next; // see get_src_next()
+    char* dst_next; // see get_dst_next()
+
+    std::stringstream in; // lexer input; re-pointed at the caller's src on every normalize() call
+    std::stringstream out; // lexer output; re-pointed at the caller's dst on every normalize() call
+    JSTokenizer tokenizer; // declared after in/out because its constructor takes references to them
 };
+
 }
 
 #endif //JS_NORMALIZER_H
index 2e284ef44eab437ee851f6f1331d2c19ef0633d4..0e0fd2a27f8edabd0b2246b717f29f57c76ded1f 100644 (file)
@@ -24,8 +24,6 @@
 
 #include "log/messages.h"
 
-#include "js_norm_state.h"
-
 class JSTokenizer : public yyFlexLexer
 {
 private:
@@ -41,15 +39,20 @@ private:
     };
 
 public:
-    // we need an out stream because yyFlexLexer API strongly requires that
-    JSTokenizer(std::stringstream& in, std::stringstream& out, char* dstbuf,
-        const uint16_t dstlen, const char** ptr, int* bytes_copied, snort::JSNormState& state);
+    enum JSRet // codes returned by yylex() (as int) and by the spacing helpers
+    {
+        EOS = 0, // zero, so the EXEC() macro in the lexer keeps scanning; also what JSNormalizer::normalize() reports once its depth is exhausted
+        SCRIPT_ENDED, // closing script tag matched outside of comments and literals
+        SCRIPT_CONTINUE, // input ran out (EOF rules); more script data may follow
+        OPENING_TAG, // opening script tag matched inside the script
+        CLOSING_TAG, // closing script tag matched inside a comment, string or regex literal
+        BAD_TOKEN // line terminator inside a string or regex literal; also the fall-through result of the spacing helpers
+    };
+
+    JSTokenizer(std::istream& in, std::ostream& out);
     ~JSTokenizer() override;
 
-    // so, Flex will treat this class as yyclass
-    // must come with yyclass Flex option
-    // don't need to define this method, it'll be substituted by Flex
-    // returns 0 if OK, 1 otherwise
+    // returns JSRet
     int yylex() override;
 
 protected:
@@ -57,51 +60,19 @@ protected:
     { snort::FatalError("%s", msg); }
 
 private:
-    void init();
-
-    // scan buffers control
-    void switch_to_temporal(const std::string& data);
     void switch_to_initial();
-
-    bool eval_identifier(const char* lexeme);
-    bool eval_string_literal(const char* match_prefix, const char quotes);
-    bool eval_regex_literal(const char* match_prefix);
-    bool eval_eof();
-    bool eval_single_line_comment();
-    bool eval_multi_line_comment();
-
-    bool parse_literal(const std::string& match_prefix, const char sentinel_ch,
-        std::string& result, bool& is_alert, bool is_regex = false);
-
-    // main lexeme handler
-    // all scanned tokens must pass here
-    bool eval(const JSToken tok, const char* lexeme);
-
-    bool normalize_identifier(const JSToken prev_tok, const char* lexeme);
-    bool normalize_punctuator(const JSToken prev_tok, const char* lexeme);
-    bool normalize_operator(const JSToken prev_tok, const char* lexeme);
-    bool normalize_directive(const JSToken prev_tok, const char* lexeme);
-    bool normalize_undefined(const JSToken prev_tok, const char* lexeme);
-    bool normalize_lexeme(const JSToken prev_tok, const char* lexeme);
-
-    bool write_output(const std::string& str);
-
-    void update_ptr();
+    void switch_to_temporal(const std::string& data);
+    JSRet eval_eof();
+    JSRet do_spacing(JSToken cur_token);
+    JSRet do_operator_spacing(JSToken cur_token);
+    bool unescape(const char* lexeme);
 
 private:
-    char* dstbuf;
-    const uint16_t dstlen;
-    const char** ptr;
-    int* bytes_copied;
-
-    struct ScanBuffers;
-    ScanBuffers* buffers = nullptr;
-    std::stringstream temporal;
-
-    JSToken prev_tok = UNDEFINED;
-
-    snort::JSNormState& state;
+    void* cur_buffer;
+    void* tmp_buffer = nullptr;
+    std::stringstream tmp;
 
+    JSToken token = UNDEFINED;
 };
 
 #endif // JS_TOKENIZER_H
index 84e5ef6ea3c895d0af23fbd157566eced8cf9681..3f9a0c748552ff4b6023bf1abf0d4e013cdea2b7 100644 (file)
@@ -35,6 +35,8 @@
     #include <cassert>
 
     #include "utils/util_cstring.h"
+
+    #define EXEC(f) { auto r = (f); if (r) { BEGIN(regst); return r; } }
 %}
 
 /* The following grammar was created based on ECMAScript specification */
@@ -67,12 +69,22 @@ LINE_TERMINATORS    {LF}|{CR}|{LS}|{PS}
 
 /* comments */
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.4 */
-SINGLE_LINE_COMMENT    "//"
-MULTI_LINE_COMMENT     "/\*"
+LINE_COMMENT_START   "//"
+LINE_COMMENT_END1    [^<\xA\xD]*\xA
+LINE_COMMENT_END2    [^<\xA\xD]*\xD
+LINE_COMMENT_END3    [^<\xA\xD]*"<"+(?i:script)
+LINE_COMMENT_END4    [^<\xA\xD]*"<"+(?i:\/script>)
+LINE_COMMENT_SKIP    [^<\xA\xD]*"<"?
+BLOCK_COMMENT_START  "/*"
+BLOCK_COMMENT_END1   [^<*]*"*"+"/"
+BLOCK_COMMENT_END2   [^<*]*"<"+(?i:script)
+BLOCK_COMMENT_END3   [^<*]*"<"+(?i:\/script>)
+BLOCK_COMMENT_SKIP   [^<*]*[<*]?
 
 /* directives */
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-14.1 */
-USE_STRICT_DIRECTIVE    "\"use strict\"";*|"\'use strict\'";*
+USE_STRICT_DIRECTIVE    "\"use strict\""|"\'use strict\'"
+USE_STRICT_DIRECTIVE_SC "\"use strict\"";*|"\'use strict\'";*
 
 /* keywords */
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-7.6.1.1 */
@@ -863,9 +875,15 @@ LITERAL_NULL                  null
 LITERAL_BOOLEAN               true|false
 LITERAL_DECIMAL               [.]?[0-9]+[\.]?[0-9]*[eE]?[0-9]*
 LITERAL_HEX_INTEGER           0x[0-9a-fA-F]*|0X[0-9a-fA-F]*
-LITERAL_DOUBLE_STRING_BEGIN   \"
-LITERAL_SINGLE_STRING_BEGIN   \'
-LITERAL_REGULAR_EXPRESSION    \/[^*\/]
+LITERAL_DQ_STRING_START       \"
+LITERAL_DQ_STRING_END         \"
+LITERAL_DQ_STRING_SKIP        \\\"
+LITERAL_SQ_STRING_START       \'
+LITERAL_SQ_STRING_END         \'
+LITERAL_SQ_STRING_SKIP        \\\'
+LITERAL_REGEX_START           \/[^*\/]
+LITERAL_REGEX_END             \/[gimsuy]*
+LITERAL_REGEX_SKIP            \\\/
 /* extra literals */
 /* according to https://ecma-international.org/ecma-262/5.1/#sec-4.3 */
 LITERAL_UNDEFINED             undefined
@@ -873,9 +891,9 @@ LITERAL_INFINITY              Infinity|\xE2\x88\x9E
 LITERAL_NAN                   NaN
 LITERAL                       {LITERAL_NULL}|{LITERAL_BOOLEAN}|{LITERAL_DECIMAL}|{LITERAL_HEX_INTEGER}|{LITERAL_UNDEFINED}|{LITERAL_INFINITY}|{LITERAL_NAN}
 
-HTML_COMMENT_OPEN         <!--
-HTML_TAG_SCRIPT_OPEN      (?i:<script)
-HTML_TAG_SCRIPT_CLOSE     (?i:<\/script>)
+HTML_COMMENT_OPEN         "<"+"!--"
+HTML_TAG_SCRIPT_OPEN      "<"+(?i:script)
+HTML_TAG_SCRIPT_CLOSE     "<"+(?i:\/script>)
 
 /* from 0x000 to 0x10FFFD to match undefined tokens */
 /* UTF-8 ranges generated with https://lists.gnu.org/archive/html/help-flex/2005-01/msg00043.html */
@@ -883,34 +901,99 @@ ALL_UNICODE    [\0-\x7F]|[\xC2-\xDF][\x80-\xBF]|(\xE0[\xA0-\xBF]|[\xE1-\xEF][\x8
 
 /* match regex literal only if the previous token was of type PUNCTUATOR_3 or KEYWORD */
 /* this resolves an ambiguity with a division operator: var x = 2/2/1; */
-%x regex
+%s regst
 
 /* do not match division operators as punctuators if the previous token was of type PUNCTUATOR */
 /* this resolves an ambiguity with regular expression in some cases such as (/=abc=/g) */
-%x div_op
+%s divop
+
+/* in a single line comment */
+%x lcomm
+
+/* in a multi line comment */
+%x bcomm
+
+/* in a single-quoted string */
+%x sqstr
+
+/* in a double-quoted string */
+%x dqstr
+
+/* in a regular expression */
+%x regex
 
 %%
-<*>{WHITESPACES}                                        { /* skip */ }
-<*>{CHAR_ESCAPE_SEQUENCES}                              { /* skip */ }
-<*>{LINE_TERMINATORS}                                   { BEGIN(regex); }
-<*>{HTML_TAG_SCRIPT_OPEN}                               { state.alerts |= ALERT_UNEXPECTED_TAG; update_ptr(); return 1; }
-<*>{HTML_TAG_SCRIPT_CLOSE}                              { update_ptr(); *ptr -= YYLeng(); return 0; }
-<*>{HTML_COMMENT_OPEN}                                  { if ( !eval_single_line_comment() ) { update_ptr(); return 1; } }
-<*>{SINGLE_LINE_COMMENT}                                { if ( !eval_single_line_comment() ) { update_ptr(); return 1; } }
-<*>{MULTI_LINE_COMMENT}                                 { if ( !eval_multi_line_comment() ) { update_ptr(); return 1; } }
-<*>{USE_STRICT_DIRECTIVE}                               { if ( !eval(DIRECTIVE, YYText()) ) { update_ptr(); return 1; } }
-<*>{KEYWORD}                                            { if ( !eval(KEYWORD, YYText()) ) { update_ptr(); return 1; } BEGIN(regex); }
-<*>{CLOSING_BRACES}                                     { if ( !eval(PUNCTUATOR, YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
-<div_op>{DIV_OPERATOR}|{DIV_ASSIGNMENT_OPERATOR}        { if ( !eval(PUNCTUATOR, YYText()) ) { update_ptr(); return 1; } }
-<*>{PUNCTUATOR}                                         { if ( !eval(PUNCTUATOR, YYText()) ) { update_ptr(); return 1; } BEGIN(regex); }
-<*>{OPERATOR}                                           { if ( !eval(OPERATOR, YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
-<*>{LITERAL}                                            { if ( !eval(LITERAL, YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
-<*>{LITERAL_DOUBLE_STRING_BEGIN}                        { if ( !eval_string_literal(YYText(), '"') ) { update_ptr(); return 1; } BEGIN(div_op); }
-<*>{LITERAL_SINGLE_STRING_BEGIN}                        { if ( !eval_string_literal(YYText(), '\'') ) { update_ptr(); return 1; } BEGIN(div_op); }
-<regex>{LITERAL_REGULAR_EXPRESSION}                     { if ( !eval_regex_literal(YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
-<*>{IDENTIFIER}                                         { if ( !eval_identifier(YYText()) ) { update_ptr(); return 1; } BEGIN(div_op); }
-<*>.|{ALL_UNICODE}                                      { if ( !eval(UNDEFINED, YYText()) ) { update_ptr(); return 1; } }
-<<EOF>>                                                 { if ( eval_eof() ) { update_ptr(); return 0; } }
+{WHITESPACES}                       { }
+{CHAR_ESCAPE_SEQUENCES}             { }
+{LINE_TERMINATORS}                  { BEGIN(regst); }
+
+<INITIAL,regex,dqstr,regst,sqstr,divop>{HTML_TAG_SCRIPT_OPEN} { BEGIN(regst); return OPENING_TAG; }
+{HTML_TAG_SCRIPT_CLOSE}             { BEGIN(regst); return SCRIPT_ENDED; }
+
+       {HTML_COMMENT_OPEN}          { BEGIN(lcomm); }
+       {LINE_COMMENT_START}         { BEGIN(lcomm); }
+<lcomm>{LINE_COMMENT_END1}          { BEGIN(regst); }
+<lcomm>{LINE_COMMENT_END2}          { BEGIN(regst); }
+<lcomm>{LINE_COMMENT_END3}          { BEGIN(regst); return OPENING_TAG; }
+<lcomm>{LINE_COMMENT_END4}          { BEGIN(regst); return CLOSING_TAG; }
+<lcomm>{LINE_COMMENT_SKIP}          { }
+<lcomm><<EOF>>                      { return SCRIPT_CONTINUE; }
+
+       {BLOCK_COMMENT_START}        { BEGIN(bcomm); }
+<bcomm>{BLOCK_COMMENT_END1}         { BEGIN(regst); }
+<bcomm>{BLOCK_COMMENT_END2}         { BEGIN(regst); return OPENING_TAG; }
+<bcomm>{BLOCK_COMMENT_END3}         { BEGIN(regst); return CLOSING_TAG; }
+<bcomm>{BLOCK_COMMENT_SKIP}         { }
+<bcomm><<EOF>>                      { return SCRIPT_CONTINUE; }
+
+       {LITERAL_DQ_STRING_START}    { EXEC(do_spacing(LITERAL)); ECHO; BEGIN(dqstr); }
+<dqstr>{LITERAL_DQ_STRING_END}      { ECHO; BEGIN(divop); }
+<dqstr>{HTML_TAG_SCRIPT_CLOSE}      { BEGIN(regst); return CLOSING_TAG; }
+<dqstr>\\{CR}{LF}                   { }
+<dqstr>\\{LF}                       { }
+<dqstr>\\{CR}                       { }
+<dqstr>{LINE_TERMINATORS}           { BEGIN(regst); return BAD_TOKEN; }
+<dqstr>{LITERAL_DQ_STRING_SKIP}     { ECHO; }
+<dqstr>.                            { ECHO; }
+<dqstr><<EOF>>                      { return SCRIPT_CONTINUE; }
+
+       {LITERAL_SQ_STRING_START}    { EXEC(do_spacing(LITERAL)); ECHO; BEGIN(sqstr); }
+<sqstr>{LITERAL_SQ_STRING_END}      { ECHO; BEGIN(divop); }
+<sqstr>{HTML_TAG_SCRIPT_CLOSE}      { BEGIN(regst); return CLOSING_TAG; }
+<sqstr>\\{CR}{LF}                   { }
+<sqstr>\\{LF}                       { }
+<sqstr>\\{CR}                       { }
+<sqstr>{LINE_TERMINATORS}           { BEGIN(regst); return BAD_TOKEN; }
+<sqstr>{LITERAL_SQ_STRING_SKIP}     { ECHO; }
+<sqstr>.                            { ECHO; }
+<sqstr><<EOF>>                      { return SCRIPT_CONTINUE; }
+
+<regst>{LITERAL_REGEX_START}        { EXEC(do_spacing(LITERAL)); yyout << '/'; yyless(1); BEGIN(regex); }
+<regex>{LITERAL_REGEX_END}          { ECHO; BEGIN(divop); }
+<regex>{HTML_TAG_SCRIPT_CLOSE}      { BEGIN(regst); return CLOSING_TAG; }
+<regex>{LITERAL_REGEX_SKIP}         { ECHO; }
+<regex>\\{LF}                       |
+<regex>\\{CR}                       |
+<regex>{LINE_TERMINATORS}           { BEGIN(regst); return BAD_TOKEN; }
+<regex>[^<{LF}{CR}{LS}{PS}\\\/]+    { ECHO; }
+<regex><<EOF>>                      { return SCRIPT_CONTINUE; }
+
+<divop>{DIV_OPERATOR}               |
+<divop>{DIV_ASSIGNMENT_OPERATOR}    { ECHO; token = PUNCTUATOR; BEGIN(INITIAL); }
+
+{CLOSING_BRACES}                    { ECHO; token = PUNCTUATOR; BEGIN(divop); }
+{PUNCTUATOR}                        { ECHO; token = PUNCTUATOR; BEGIN(regst); }
+
+{USE_STRICT_DIRECTIVE}              { EXEC(do_spacing(DIRECTIVE)); ECHO; BEGIN(INITIAL); yyout << ';'; }
+{USE_STRICT_DIRECTIVE_SC}           { EXEC(do_spacing(DIRECTIVE)); ECHO; BEGIN(INITIAL); }
+{KEYWORD}                           { EXEC(do_spacing(KEYWORD)); ECHO; BEGIN(regst); }
+{OPERATOR}                          { EXEC(do_operator_spacing(OPERATOR)); ECHO; BEGIN(divop); }
+{LITERAL}                           { EXEC(do_spacing(LITERAL)); ECHO; BEGIN(divop); }
+{IDENTIFIER}                        { if (unescape(YYText())) { EXEC(do_spacing(IDENTIFIER)); ECHO; }; BEGIN(divop); }
+
+.|{ALL_UNICODE}                     { ECHO; token = UNDEFINED; BEGIN(INITIAL); }
+<<EOF>>                             { EXEC(eval_eof()); }
+
 %%
 
 // static helper functions
@@ -987,356 +1070,79 @@ static std::string unescape_unicode(const char* lexeme)
     return res;
 }
 
-static bool contains_script_tags(const std::string& str)
-{
-    static constexpr const char* script = "SCRIPT";
-    static constexpr const int script_len = sizeof("SCRIPT") - 1;
-
-    const char* start = str.c_str();
-    const char* end = start + str.size();
-    const char* it = start;
-
-    while ( it )
-    {
-        it = snort::SnortStrcasestr(it, (end - it), script);
-        if ( it )
-        {
-            int d = it - start;
-            if ( d == 1 )
-            {
-                if ( *(it - 1) == '<' )
-                    return true;
-            }
-            else if ( d >= 2 )
-            {
-                if ( (*(it - 1) == '/' and *(it - 2) == '<') or
-                    (*(it - 1) == '<' and *(it - 2) != '\\') )
-                {
-                    return true;
-                }
-            }
-            it += script_len;
-        }
-    }
-    return false;
-}
-
 // JSTokenizer members
 
-struct JSTokenizer::ScanBuffers
-{
-    YY_BUFFER_STATE initial = nullptr;
-    YY_BUFFER_STATE temporal = nullptr;
-};
-
-JSTokenizer::JSTokenizer(std::stringstream& in, std::stringstream& out, char* dstbuf,
-    uint16_t dstlen, const char** ptr, int* bytes_copied, snort::JSNormState& state)
-    : yyFlexLexer(in, out),
-      dstbuf(dstbuf),
-      dstlen(dstlen),
-      ptr(ptr),
-      bytes_copied(bytes_copied),
-      state(state)
+JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out) // binds the lexer to the given input and output streams
+    : yyFlexLexer(in, out)
 {
-    assert(bytes_copied);
-    init();
+    BEGIN(regst); // start in the state where a regex literal may be matched, since one may occur at the very beginning of the input
 }
 
 JSTokenizer::~JSTokenizer()
-{ delete buffers; }
-
-void JSTokenizer::init()
 {
-    buffers = new ScanBuffers;
-    *bytes_copied = 0;
-
-    // since regular expression may occur at the beginning of the input
-    BEGIN(regex);
+    yy_delete_buffer((YY_BUFFER_STATE)tmp_buffer); // release the temporary scan buffer if one is still active; tmp_buffer is nullptr otherwise, which yy_delete_buffer is expected to ignore
 }
 
 void JSTokenizer::switch_to_temporal(const std::string& data) // makes the lexer scan `data` before it returns to the current input
 {
-    temporal.str(data);
-    buffers->initial = YY_CURRENT_BUFFER;
-    buffers->temporal = yy_create_buffer(temporal, data.size());
-    yy_switch_to_buffer(buffers->temporal);
+    tmp.str(data); // load the replacement text into the member stream backing the temporary buffer
+    cur_buffer = YY_CURRENT_BUFFER; // remember the active buffer so switch_to_initial() can restore it -- NOTE(review): if this runs while tmp_buffer is already active, the saved buffer and the previous tmp_buffer are overwritten; confirm that cannot happen
+    tmp_buffer = yy_create_buffer(tmp, data.size()); // stored as void*, hence the casts to YY_BUFFER_STATE at the use sites
+    yy_switch_to_buffer((YY_BUFFER_STATE)tmp_buffer); // scanning continues from the temporary buffer
 }
 
 void JSTokenizer::switch_to_initial()
 {
-    yy_delete_buffer(buffers->temporal);
-    yy_switch_to_buffer(buffers->initial);
-    buffers->temporal = nullptr;
-}
-
-bool JSTokenizer::eval_identifier(const char* lexeme)
-{
-    // If an identifier has escaped Unicode, unescape and match again
-    // in a temporal scan buffer
-    if ( strstr(lexeme, "\\u") )
-    {
-        const std::string unescaped_lex = unescape_unicode(lexeme);
-        switch_to_temporal(unescaped_lex);
-        return true;
-    }
-
-    return eval(IDENTIFIER, lexeme);
-}
-
-bool JSTokenizer::eval_string_literal(const char* match_prefix, const char quotes)
-{
-    std::string s;
-    bool is_alert = false;
-    bool is_ok = parse_literal(match_prefix, quotes, s, is_alert);
-
-    if ( is_alert )
-        return false;
-
-    return eval(is_ok ? LITERAL : UNDEFINED, s.c_str());
-}
-
-bool JSTokenizer::eval_regex_literal(const char* match_prefix)
-{
-    static const std::string regex_flags = "gimsuy";
-
-    std::string s;
-    bool is_alert = false;
-    bool is_ok = parse_literal(match_prefix, '/', s, is_alert, true);
-
-    if ( is_alert )
-        return false;
-
-    // append regex flags
-    char c;
-    while ( (c = yyinput()) != 0 )
-    {
-        if ( regex_flags.find(c) != std::string::npos )
-            s += c;
-        else
-        {
-            unput(c);
-            break;
-        }
-    }
-
-    return eval(is_ok ? LITERAL : UNDEFINED, s.c_str());
+    yy_switch_to_buffer((YY_BUFFER_STATE)cur_buffer);
+    yy_delete_buffer((YY_BUFFER_STATE)tmp_buffer);
+    tmp_buffer = nullptr;
 }
 
 // A return value of this method uses to terminate the scanner
 // true - terminate, false - continue scanning
 // Use this method only in <<EOF>> handler
 // The return value should be used to make a decision about yyterminate() call
-bool JSTokenizer::eval_eof()
+JSTokenizer::JSRet JSTokenizer::eval_eof()
 {
     // If the temporal scan buffer reaches EOF, cleanup and
     // continue with the initial one
-    if ( buffers->temporal )
+    if ( tmp_buffer )
     {
         switch_to_initial();
-        return false;
+        return EOS;
     }
 
     // Normal termination
-    return true;
-}
-
-bool JSTokenizer::eval_single_line_comment()
-{
-    char c;
-    std::string result;
-
-    while ( (c = yyinput()) != 0 )
-    {
-        result += c;
-        if ( c == '\n' )
-            break;
-    }
-
-    if ( contains_script_tags(result) )
-    {
-        state.alerts |= ALERT_UNEXPECTED_TAG;
-        return false;
-    }
-    else
-        return true;
-}
-
-bool JSTokenizer::eval_multi_line_comment()
-{
-    char c;
-    std::string result;
-
-    while ( (c = yyinput()) != 0 )
-    {
-        result += c;
-        if ( c == '*' )
-        {
-            if ( (c = yyinput()) == '/' )
-                break;
-            else
-                unput(c);
-        }
-    }
-
-    if ( contains_script_tags(result) )
-    {
-        state.alerts |= ALERT_UNEXPECTED_TAG;
-        return false;
-    }
-    else
-        return true;
-}
-
-// Unicode line terminators
-#define LS "\u2028"
-#define PS "\u2029"
-
-// This method delineates and validates literals from the input stream such as:
-//   1. double quotes string literal
-//   2. single quotes string literal
-//   3. regex literal
-// Call this method when lexer meets those literals
-// match_prefix is a lexeme part already matched by the lexer (with sentinel char)
-bool JSTokenizer::parse_literal(const std::string& match_prefix, const char sentinel_ch,
-    std::string& result, bool& is_alert, bool is_regex)
-{
-    bool is_ok = true;
-    char c;
-    short n = 0;
-
-    for ( auto it = match_prefix.crbegin(); it != match_prefix.crend(); ++it )
-        unput(*it);
-
-    result += yyinput();
-    while ( (c = yyinput()) != 0 )
-    {
-        result += c;
-
-        if ( c == sentinel_ch and !( n % 2 ) )
-            break;
-        else if ( c == '\\' )
-        {
-            ++n;
-            continue;
-        }
-        else if ( c == '\r' )
-        {
-            if ( is_regex )
-            {
-                is_ok = false;
-                result = result.substr(0, result.size() - n);
-            }
-            else if ( n == 0 )
-                is_ok = false;
-            else if ( ( (c = yyinput()) != 0 ) and c == '\n' )
-            {
-                result = result.substr(0, result.size() - 2);
-                continue;
-            }
-            else
-            {
-                is_ok = false;
-                unput(c);
-            }
-
-            break;
-        }
-        else if ( c == '\n' )
-        {
-            if ( is_regex )
-            {
-                is_ok = false;
-                result = result.substr(0, result.size() - n);
-            }
-            else if ( n == 0 )
-                is_ok = false;
-            else
-            {
-                result = result.substr(0, result.size() - 2);
-                continue;
-            }
-
-            break;
-        }
-
-        n = 0;
-    }
-
-    if ( !is_ok )
-    {
-        result.back() = sentinel_ch;
-        return is_ok;
-    }
-
-    if ( result.find(LS) != std::string::npos or result.find(PS) != std::string::npos )
-        is_ok = false;
-
-    if ( contains_script_tags(result) )
-    {
-        is_alert = true;
-        state.alerts |= ALERT_UNEXPECTED_TAG;
-    }
-
-    return is_ok;
+    return SCRIPT_CONTINUE;
 }
 
-bool JSTokenizer::eval(const JSToken tok, const char* lexeme)
+JSTokenizer::JSRet JSTokenizer::do_spacing(JSToken cur_token)
 {
-    bool ret = false;
-
-    switch( tok )
+    switch (token)
     {
-    case IDENTIFIER:
-        ret = normalize_identifier(prev_tok, lexeme);
-    break;
-
-    case KEYWORD:
-        ret = normalize_lexeme(prev_tok, lexeme);
-    break;
-
     case PUNCTUATOR:
-        ret = normalize_punctuator(prev_tok, lexeme);
-    break;
-
     case OPERATOR:
-        ret = normalize_operator(prev_tok, lexeme);
-    break;
-
-    case LITERAL:
-        ret = normalize_lexeme(prev_tok, lexeme);
-    break;
-
     case DIRECTIVE:
-        ret = normalize_directive(prev_tok, lexeme);
-    break;
-
     case UNDEFINED:
-        ret = normalize_undefined(prev_tok, lexeme);
-    break;
-    }
+        token = cur_token;
+        return EOS;
 
-    prev_tok = tok;
-
-    // set a default pattern match start condition
-    if ( yy_start != INITIAL )
-        BEGIN(INITIAL);
-
-    return ret;
-}
+    case IDENTIFIER:
+    case KEYWORD:
+    case LITERAL:
+        yyout << ' ';
+        token = cur_token;
+        return EOS;
+    }
 
-bool JSTokenizer::normalize_identifier(const JSToken prev_tok, const char* lexeme)
-{
-    return normalize_lexeme(prev_tok, lexeme);
-}
+    assert(false);
 
-bool JSTokenizer::normalize_punctuator(const JSToken, const char* lexeme)
-{
-    return write_output(lexeme);
+    return BAD_TOKEN;
 }
 
-bool JSTokenizer::normalize_operator(const JSToken prev_tok, const char* lexeme)
+JSTokenizer::JSRet JSTokenizer::do_operator_spacing(JSToken cur_token)
 {
-    switch( prev_tok )
+    switch (token)
     {
     case IDENTIFIER:
     case KEYWORD:
@@ -1344,66 +1150,28 @@ bool JSTokenizer::normalize_operator(const JSToken prev_tok, const char* lexeme)
     case LITERAL:
     case DIRECTIVE:
     case UNDEFINED:
-        return write_output(lexeme);
-    break;
+        token = cur_token;
+        return EOS;
 
     case OPERATOR:
-        return write_output(" " + std::string(lexeme));
-    break;
+        yyout << ' ';
+        token = cur_token;
+        return EOS;
     }
 
-    return false;
-}
-
-bool JSTokenizer::normalize_directive(const JSToken prev_tok, const char* lexeme)
-{
-    std::string str = lexeme;
+    assert(false);
 
-    if ( str.rfind(";") == std::string::npos )
-        str += ";";
-
-    return normalize_lexeme(prev_tok, str.c_str());
+    return BAD_TOKEN;
 }
 
-bool JSTokenizer::normalize_undefined(const JSToken, const char* lexeme)
-{ return write_output(lexeme); }
-
-bool JSTokenizer::normalize_lexeme(const JSToken prev_tok, const char* lexeme)
+bool JSTokenizer::unescape(const char* lexeme)
 {
-    switch( prev_tok )
+    if ( strstr(lexeme, "\\u") )
     {
-    case PUNCTUATOR:
-    case OPERATOR:
-    case DIRECTIVE:
-    case UNDEFINED:
-        return write_output(lexeme);
-    break;
-
-    case IDENTIFIER:
-    case KEYWORD:
-    case LITERAL:
-        return write_output(" " + std::string(lexeme));
-    break;
-    }
-
-    return false;
-}
-
-bool JSTokenizer::write_output(const std::string& str)
-{
-    size_t len = str.size();
-    int new_size = *bytes_copied + len;
-
-    if ( new_size >= 0 and new_size <= dstlen )
-        memcpy((char*) dstbuf, (const char*)str.c_str(), len);
-    else
+        const std::string unescaped_lex = unescape_unicode(lexeme);
+        switch_to_temporal(unescaped_lex);
         return false;
+    }
 
-    dstbuf += len;
-    *bytes_copied = new_size;
     return true;
 }
-
-void JSTokenizer::update_ptr()
-{ *ptr += yyin.tellg(); }
-
index 1100bbf48931d5548dd921c6241d93e544e6b1d0..b66d77766be7d7963950c9780245f8a5a6887739 100644 (file)
@@ -36,32 +36,36 @@ namespace snort
 
 using namespace snort;
 
-#define NORM_DEPTH 65535
-
-#define NORMALIZE(srcbuf, expected)                                        \
-    char dstbuf[sizeof(expected)];                                         \
-    int bytes_copied;                                                      \
-    const char* ptr = srcbuf;                                              \
-    JSNormState state;                                                     \
-    state.norm_depth = NORM_DEPTH;                                         \
-    state.alerts = 0;                                                      \
-    int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf),              \
-        dstbuf, sizeof(dstbuf), &ptr, &bytes_copied, state);
-
-#define VALIDATE(srcbuf, expected)                    \
-    CHECK(ret == 0);                                  \
-    CHECK((ptr - srcbuf) == sizeof(srcbuf));          \
-    CHECK(bytes_copied == sizeof(expected) - 1);      \
-    CHECK(!memcmp(dstbuf, expected, bytes_copied));
-
-#define VALIDATE_FAIL(srcbuf, expected, ret_code, ptr_offset)      \
-    CHECK(ret == ret_code);                                        \
-    CHECK((ptr - srcbuf) == ptr_offset);                           \
-    CHECK(bytes_copied == sizeof(expected) - 1);                   \
-    CHECK(!memcmp(dstbuf, expected, bytes_copied));
-
-#define VALIDATE_ALERT(alert)       \
-    CHECK(state.alerts & alert);
+#define DEPTH 65535
+
+#define NORMALIZE(src, expected)                                    \
+    char dst[sizeof(expected)];                                     \
+    JSNormalizer norm;                                              \
+    norm.set_depth(DEPTH);                                          \
+    auto ret = norm.normalize(src, sizeof(src), dst, sizeof(dst));  \
+    const char* ptr = norm.get_src_next();                          \
+    int act_len = norm.get_dst_next() - dst;                        \
+
+#define VALIDATE(src, expected)                 \
+    CHECK(ret == JSTokenizer::SCRIPT_CONTINUE); \
+    CHECK((ptr - src) == sizeof(src));          \
+    CHECK(act_len == sizeof(expected) - 1);     \
+    CHECK(!memcmp(dst, expected, act_len));
+
+#define VALIDATE_FAIL(src, expected, ret_code, ptr_offset)  \
+    CHECK(ret == ret_code);                                 \
+    CHECK((ptr - src) == ptr_offset);                       \
+    CHECK(act_len == sizeof(expected) - 1);                 \
+    CHECK(!memcmp(dst, expected, act_len));
+
+#define NORMALIZE_L(src, src_len, dst, dst_len, depth, ret, ptr, len)   \
+    {                                                                   \
+        JSNormalizer norm;                                              \
+        norm.set_depth(depth);                                          \
+        ret = norm.normalize(src, src_len, dst, dst_len);               \
+        ptr = norm.get_src_next();                                      \
+        len = norm.get_dst_next() - dst;                                \
+    }                                                                   \
 
 // ClamAV test cases
 static const char clamav_buf0[] =
@@ -256,6 +260,9 @@ TEST_CASE("clamav tests", "[JSNormalizer]")
     SECTION("test_case_14")
     {
         NORMALIZE(clamav_buf14, clamav_expected14);
+        // trailing \0 is included as a part of the string
+        // to utilize available macros we alter the read length
+        act_len -= 1;
         VALIDATE(clamav_buf14, clamav_expected14);
     }
 }
@@ -333,64 +340,56 @@ TEST_CASE("all patterns", "[JSNormalizer]")
     }
     SECTION("directives")
     {
-        const char srcbuf0[] = "'use strict'\nvar a = 1;";
-        const char srcbuf1[] = "\"use strict\"\nvar a = 1;";
-        const char srcbuf2[] = "'use strict';var a = 1;";
-        const char srcbuf3[] = "\"use strict\";var a = 1;";
-        const char srcbuf4[] = "var a = 1 'use strict';";
+        const char src0[] = "'use strict'\nvar a = 1;";
+        const char src1[] = "\"use strict\"\nvar a = 1;";
+        const char src2[] = "'use strict';var a = 1;";
+        const char src3[] = "\"use strict\";var a = 1;";
+        const char src4[] = "var a = 1 'use strict';";
+
         const char expected0[] = "'use strict';var a=1;";
         const char expected1[] = "\"use strict\";var a=1;";
         const char expected2[] = "var a=1 'use strict';";
-        char dstbuf0[sizeof(expected0)];
-        char dstbuf1[sizeof(expected1)];
-        char dstbuf2[sizeof(expected0)];
-        char dstbuf3[sizeof(expected1)];
-        char dstbuf4[sizeof(expected2)];
-        int bytes_copied0, bytes_copied1, bytes_copied2, bytes_copied3, bytes_copied4;
-        const char* ptr0 = srcbuf0;
-        const char* ptr1 = srcbuf1;
-        const char* ptr2 = srcbuf2;
-        const char* ptr3 = srcbuf3;
-        const char* ptr4 = srcbuf4;
-        JSNormState state;
-        state.norm_depth = NORM_DEPTH;
-        state.alerts = 0;
-
-        int ret0 = JSNormalizer::normalize(srcbuf0, sizeof(srcbuf0), dstbuf0, sizeof(dstbuf0),
-            &ptr0, &bytes_copied0, state);
-        int ret1 = JSNormalizer::normalize(srcbuf1, sizeof(srcbuf1), dstbuf1, sizeof(dstbuf1),
-            &ptr1, &bytes_copied1, state);
-        int ret2 = JSNormalizer::normalize(srcbuf2, sizeof(srcbuf2), dstbuf2, sizeof(dstbuf2),
-            &ptr2, &bytes_copied2, state);
-        int ret3 = JSNormalizer::normalize(srcbuf3, sizeof(srcbuf3), dstbuf3, sizeof(dstbuf3),
-            &ptr3, &bytes_copied3, state);
-        int ret4 = JSNormalizer::normalize(srcbuf4, sizeof(srcbuf4), dstbuf4, sizeof(dstbuf4),
-            &ptr4, &bytes_copied4, state);
-
-        CHECK(ret0 == 0);
-        CHECK((ptr0 - srcbuf0) == sizeof(srcbuf0));
-        CHECK(bytes_copied0 == sizeof(expected0) - 1);
-        CHECK(!memcmp(dstbuf0, expected0, bytes_copied0));
-
-        CHECK(ret1 == 0);
-        CHECK((ptr1 - srcbuf1) == sizeof(srcbuf1));
-        CHECK(bytes_copied1 == sizeof(expected1) - 1);
-        CHECK(!memcmp(dstbuf1, expected1, bytes_copied1));
-
-        CHECK(ret2 == 0);
-        CHECK((ptr2 - srcbuf2) == sizeof(srcbuf2));
-        CHECK(bytes_copied2 == sizeof(expected0) - 1);
-        CHECK(!memcmp(dstbuf2, expected0, bytes_copied2));
-
-        CHECK(ret3 == 0);
-        CHECK((ptr3 - srcbuf3) == sizeof(srcbuf3));
-        CHECK(bytes_copied3 == sizeof(expected1) - 1);
-        CHECK(!memcmp(dstbuf3, expected1, bytes_copied3));
-
-        CHECK(ret4 == 0);
-        CHECK((ptr4 - srcbuf4) == sizeof(srcbuf4));
-        CHECK(bytes_copied4 == sizeof(expected2) - 1);
-        CHECK(!memcmp(dstbuf4, expected2, bytes_copied4));
+
+        char dst0[sizeof(expected0)];
+        char dst1[sizeof(expected1)];
+        char dst2[sizeof(expected0)];
+        char dst3[sizeof(expected1)];
+        char dst4[sizeof(expected2)];
+
+        int ret0, ret1, ret2, ret3, ret4;
+        const char *ptr0, *ptr1, *ptr2, *ptr3, *ptr4;
+        int act_len0, act_len1, act_len2, act_len3, act_len4;
+
+        NORMALIZE_L(src0, sizeof(src0), dst0, sizeof(dst0), DEPTH, ret0, ptr0, act_len0);
+        NORMALIZE_L(src1, sizeof(src1), dst1, sizeof(dst1), DEPTH, ret1, ptr1, act_len1);
+        NORMALIZE_L(src2, sizeof(src2), dst2, sizeof(dst2), DEPTH, ret2, ptr2, act_len2);
+        NORMALIZE_L(src3, sizeof(src3), dst3, sizeof(dst3), DEPTH, ret3, ptr3, act_len3);
+        NORMALIZE_L(src4, sizeof(src4), dst4, sizeof(dst4), DEPTH, ret4, ptr4, act_len4);
+
+        CHECK(ret0 == JSTokenizer::SCRIPT_CONTINUE);
+        CHECK((ptr0 - src0) == sizeof(src0));
+        CHECK(act_len0 == sizeof(expected0) - 1);
+        CHECK(!memcmp(dst0, expected0, act_len0));
+
+        CHECK(ret1 == JSTokenizer::SCRIPT_CONTINUE);
+        CHECK((ptr1 - src1) == sizeof(src1));
+        CHECK(act_len1 == sizeof(expected1) - 1);
+        CHECK(!memcmp(dst1, expected1, act_len1));
+
+        CHECK(ret2 == JSTokenizer::SCRIPT_CONTINUE);
+        CHECK((ptr2 - src2) == sizeof(src2));
+        CHECK(act_len2 == sizeof(expected0) - 1);
+        CHECK(!memcmp(dst2, expected0, act_len2));
+
+        CHECK(ret3 == JSTokenizer::SCRIPT_CONTINUE);
+        CHECK((ptr3 - src3) == sizeof(src3));
+        CHECK(act_len3 == sizeof(expected1) - 1);
+        CHECK(!memcmp(dst3, expected1, act_len3));
+
+        CHECK(ret4 == JSTokenizer::SCRIPT_CONTINUE);
+        CHECK((ptr4 - src4) == sizeof(src4));
+        CHECK(act_len4 == sizeof(expected2) - 1);
+        CHECK(!memcmp(dst4, expected2, act_len4));
     }
     SECTION("punctuators")
     {
@@ -673,43 +672,51 @@ static const char syntax_cases_expected14[] =
     "var a=b% -c;"
     "var a=b+ -c;";
 
+// In the following cases:
+//   a reading cursor will be after the literal
+//   a malformed literal is not present in the output
+
 static const char syntax_cases_buf15[] =
-    "var str1 = 'abc\u2028 def' ;\n"
-    "var str2 = 'abc\u2029 def' ;\n\r";
+    "var invalid_str = 'abc\u2028 def' ;\n";
 
 static const char syntax_cases_expected15[] =
-    "var str1='abc\u2028 def';"
-    "var str2='abc\u2029 def';";
+    "var invalid_str='abc";
 
 static const char syntax_cases_buf16[] =
     "var invalid_str = \"abc\n def\"";
 
 static const char syntax_cases_expected16[] =
-    "var invalid_str=\"abc\"def \"";
+    "var invalid_str=\"abc";
 
 static const char syntax_cases_buf17[] =
     "var invalid_str = 'abc\r def'";
 
 static const char syntax_cases_expected17[] =
-    "var invalid_str='abc'def '";
+    "var invalid_str='abc";
 
 static const char syntax_cases_buf18[] =
     "var invalid_str = 'abc\\\n\r def'";
 
 static const char syntax_cases_expected18[] =
-    "var invalid_str='abc'def '";
+    "var invalid_str='abc";
 
 static const char syntax_cases_buf19[] =
     "var invalid_re = /abc\\\n def/";
 
 static const char syntax_cases_expected19[] =
-    "var invalid_re=/abc/def/";
+    "var invalid_re=/abc";
 
 static const char syntax_cases_buf20[] =
     "var invalid_re = /abc\\\r\n def/";
 
 static const char syntax_cases_expected20[] =
-    "var invalid_re=/abc/def/";
+    "var invalid_re=/abc";
+
+static const char syntax_cases_buf21[] =
+    "var invalid_str = 'abc\u2029 def' ;\n\r";
+
+static const char syntax_cases_expected21[] =
+    "var invalid_str='abc";
 
 TEST_CASE("syntax cases", "[JSNormalizer]")
 {
@@ -788,100 +795,115 @@ TEST_CASE("syntax cases", "[JSNormalizer]")
         NORMALIZE(syntax_cases_buf14, syntax_cases_expected14);
         VALIDATE(syntax_cases_buf14, syntax_cases_expected14);
     }
-    SECTION("LS and PS chars within literal")
+}
+
+TEST_CASE("bad tokens", "[JSNormalizer]")
+{
+    SECTION("LS chars within literal")
     {
         NORMALIZE(syntax_cases_buf15, syntax_cases_expected15);
-        VALIDATE(syntax_cases_buf15, syntax_cases_expected15);
+        VALIDATE_FAIL(syntax_cases_buf15, syntax_cases_expected15, JSTokenizer::BAD_TOKEN, 25);
+    }
+    SECTION("PS chars within literal")
+    {
+        NORMALIZE(syntax_cases_buf21, syntax_cases_expected21);
+        VALIDATE_FAIL(syntax_cases_buf21, syntax_cases_expected21, JSTokenizer::BAD_TOKEN, 25);
     }
     SECTION("explicit LF within literal")
     {
         NORMALIZE(syntax_cases_buf16, syntax_cases_expected16);
-        VALIDATE(syntax_cases_buf16, syntax_cases_expected16);
+        VALIDATE_FAIL(syntax_cases_buf16, syntax_cases_expected16, JSTokenizer::BAD_TOKEN, 23);
     }
     SECTION("explicit CR within literal")
     {
         NORMALIZE(syntax_cases_buf17, syntax_cases_expected17);
-        VALIDATE(syntax_cases_buf17, syntax_cases_expected17);
+        VALIDATE_FAIL(syntax_cases_buf17, syntax_cases_expected17, JSTokenizer::BAD_TOKEN, 23);
     }
     SECTION("escaped LF-CR sequence within literal")
     {
         NORMALIZE(syntax_cases_buf18, syntax_cases_expected18);
-        VALIDATE(syntax_cases_buf18, syntax_cases_expected18);
+        VALIDATE_FAIL(syntax_cases_buf18, syntax_cases_expected18, JSTokenizer::BAD_TOKEN, 25);
     }
     SECTION("escaped LF within regex literal")
     {
         NORMALIZE(syntax_cases_buf19, syntax_cases_expected19);
-        VALIDATE(syntax_cases_buf19, syntax_cases_expected19);
+        VALIDATE_FAIL(syntax_cases_buf19, syntax_cases_expected19, JSTokenizer::BAD_TOKEN, 23);
     }
     SECTION("escaped CR-LF within regex literal")
     {
         NORMALIZE(syntax_cases_buf20, syntax_cases_expected20);
-        VALIDATE(syntax_cases_buf20, syntax_cases_expected20);
+        VALIDATE_FAIL(syntax_cases_buf20, syntax_cases_expected20, JSTokenizer::BAD_TOKEN, 23);
     }
 }
 
-TEST_CASE("norm_depth is specified", "[JSNormalizer]")
+TEST_CASE("endings", "[JSNormalizer]")
 {
-    const char srcbuf[] = "var abc = 123;\n\r";
-    const char expected[] = "var abc";
-    char dstbuf[7];
-    int bytes_copied;
-    const char* ptr = srcbuf;
-    JSNormState state;
-    state.norm_depth = 7;
-    state.alerts = 0;
-    int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr,
-        &bytes_copied, state);
-
-    CHECK(ret == 0);
-    CHECK(bytes_copied == sizeof(expected) - 1);
-    CHECK(!memcmp(dstbuf, expected, bytes_copied));
-}
+    SECTION("script closing tag is present", "[JSNormalizer]")
+    {
+        const char src[] =
+            "var a = 1 ;\n" // 12 bytes
+            "var b = 2 ;\n" // 12 bytes
+            "</script>\n"   // ptr_offset is here = 33
+            "var c = 3 ;\n";
+        const int ptr_offset = 33;
+        const char expected[] = "var a=1;var b=2;";
+        char dst[sizeof(expected)];
+        int act_len;
+        const char* ptr;
+        int ret;
+
+        NORMALIZE_L(src, sizeof(src), dst, sizeof(dst), DEPTH, ret, ptr, act_len);
+
+        CHECK(ret == JSTokenizer::SCRIPT_ENDED);
+        CHECK(act_len == sizeof(expected) - 1);
+        CHECK((ptr - src) == ptr_offset);
+        CHECK(!memcmp(dst, expected, act_len));
+    }
+    SECTION("depth reached", "[JSNormalizer]")
+    {
+        const char src[] = "var abc = 123;\n\r";
+        const char src2[] = "var foo = 321;\n\r";
+        const char expected[] = "var abc";
+        char dst[sizeof(src)];
+        int act_len;
+        const char* ptr;
+        int ret;
 
-TEST_CASE("tag script end is specified", "[JSNormalizer]")
-{
-    const char srcbuf[] =
-        "var a = 1 ;\n" // 12 bytes
-        "var b = 2 ;\n" // 12 bytes --> ptr_offset = 24
-        "</script>\n"
-        "var c = 3 ;\n";
-    const int ptr_offset = 24;
-    const char expected[] = "var a=1;var b=2;";
-    char dstbuf[sizeof(expected)];
-    int bytes_copied;
-    const char* ptr = srcbuf;
-    JSNormState state;
-    state.norm_depth = NORM_DEPTH;
-    state.alerts = 0;
-    int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr,
-        &bytes_copied, state);
-
-    CHECK(ret == 0);
-    CHECK(bytes_copied == sizeof(expected) - 1);
-    CHECK((ptr - srcbuf) == ptr_offset);
-    CHECK(!memcmp(dstbuf, expected, bytes_copied));
-}
+        JSNormalizer norm;
 
-// Tests for JavaScript parsing errors and anomalies
+        norm.set_depth(7);
+        ret = norm.normalize(src, sizeof(src), dst, sizeof(dst));
+        ptr = norm.get_src_next();
+        act_len = norm.get_dst_next() - dst;
 
-TEST_CASE("parsing errors", "[JSNormalizer]")
-{
-    SECTION("dstlen is too small")
+        CHECK(ret == JSTokenizer::EOS);
+        CHECK(ptr == src + 7);
+        CHECK(act_len == sizeof(expected) - 1);
+        CHECK(!memcmp(dst, expected, act_len));
+
+        ret = norm.normalize(src2, sizeof(src2), dst, sizeof(dst));
+        ptr = norm.get_src_next();
+        act_len = norm.get_dst_next() - dst;
+
+        CHECK(ret == JSTokenizer::EOS);
+        CHECK(ptr == src2 + sizeof(src2));
+        CHECK(act_len == 0);
+    }
+    SECTION("dst size is less then src size")
     {
-        const char srcbuf[] = "var abc = 123;\n\r";
-        const char expected[] = "var abc";
-        char dstbuf[7];
-        int bytes_copied;
-        const char* ptr = srcbuf;
-        JSNormState state;
-        state.norm_depth = NORM_DEPTH;
-        state.alerts = 0;
-        int ret = JSNormalizer::normalize(srcbuf, sizeof(srcbuf), dstbuf, sizeof(dstbuf), &ptr,
-            &bytes_copied, state);
-
-        CHECK(ret == 1);
-        CHECK(bytes_copied == sizeof(expected) - 1);
-        CHECK(!memcmp(dstbuf, expected, bytes_copied));
+        const char src[] = "var abc = 123;\n\r";
+        const char expected[sizeof(src)] = "var abc";
+        char dst[7];
+        int act_len;
+        const char* ptr;
+        int ret;
+
+        NORMALIZE_L(src, sizeof(src), dst, sizeof(dst), DEPTH, ret, ptr, act_len);
+
+        CHECK(ret == JSTokenizer::SCRIPT_CONTINUE);
+        CHECK(ptr == src + sizeof(src));
+        CHECK(act_len == 12); // size of normalized src
+        CHECK(!memcmp(dst, expected, sizeof(dst)));
     }
 }
 
@@ -896,7 +918,7 @@ static const char unexpected_tag_expected0[] =
 static const char unexpected_tag_buf1[] =
     "var a = 1;\n"
     "<script type=application/javascript>\n"
-    "var b = 2;\r\n";;
+    "var b = 2;\r\n";
 
 static const char unexpected_tag_expected1[] =
     "var a=1;";
@@ -907,7 +929,7 @@ static const char unexpected_tag_buf2[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected2[] =
-    "var a=1;var str=";
+    "var a=1;var str='";
 
 static const char unexpected_tag_buf3[] =
     "var a = 1;\n"
@@ -915,7 +937,7 @@ static const char unexpected_tag_buf3[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected3[] =
-    "var a=1;var str=";
+    "var a=1;var str='something ";
 
 static const char unexpected_tag_buf4[] =
     "var a = 1;\n"
@@ -923,7 +945,7 @@ static const char unexpected_tag_buf4[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected4[] =
-    "var a=1;var str=";
+    "var a=1;var str='something ";
 
 static const char unexpected_tag_buf5[] =
     "var a = 1;\n"
@@ -931,7 +953,7 @@ static const char unexpected_tag_buf5[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected5[] =
-    "var a=1;var str=";
+    "var a=1;var str='";
 
 static const char unexpected_tag_buf6[] =
     "var a = 1;\n"
@@ -939,7 +961,7 @@ static const char unexpected_tag_buf6[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected6[] =
-    "var a=1;var str=";
+    "var a=1;var str='something ";
 
 static const char unexpected_tag_buf7[] =
     "var a = 1;\n"
@@ -947,7 +969,7 @@ static const char unexpected_tag_buf7[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected7[] =
-    "var a=1;var str=";
+    "var a=1;var str='something ";
 
 static const char unexpected_tag_buf8[] =
     "var a = 1;\n"
@@ -955,7 +977,7 @@ static const char unexpected_tag_buf8[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected8[] =
-    "var a=1;var str='something \\<script\\> something';var b=2;";
+    "var a=1;var str='something \\";
 
 static const char unexpected_tag_buf9[] =
     "var a = 1;\n"
@@ -1079,7 +1101,7 @@ static const char unexpected_tag_buf23[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected23[] =
-    "var a=1;var str=";
+    "var a=1;var str='script somescript /script something ";
 
 static const char unexpected_tag_buf24[] =
     "var a = 1;\n"
@@ -1087,63 +1109,54 @@ static const char unexpected_tag_buf24[] =
     "var b = 2;\r\n";
 
 static const char unexpected_tag_expected24[] =
-    "var a=1;var str=";
+    "var a=1;var str='something ";
 
-TEST_CASE("unexpected script tag alert", "[JSNormalizer]")
+TEST_CASE("nested script tags", "[JSNormalizer]")
 {
-    const int ret_code = 1;
     SECTION("explicit open tag - simple")
     {
         NORMALIZE(unexpected_tag_buf0, unexpected_tag_expected0);
-        VALIDATE_FAIL(unexpected_tag_buf0, unexpected_tag_expected0, ret_code, 18);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf0, unexpected_tag_expected0, JSTokenizer::OPENING_TAG, 18);
     }
     SECTION("explicit open tag - complex")
     {
         NORMALIZE(unexpected_tag_buf1, unexpected_tag_expected1);
-        VALIDATE_FAIL(unexpected_tag_buf1, unexpected_tag_expected1, ret_code, 18);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf1, unexpected_tag_expected1, JSTokenizer::OPENING_TAG, 18);
     }
     SECTION("open tag within literal - start")
     {
         NORMALIZE(unexpected_tag_buf2, unexpected_tag_expected2);
-        VALIDATE_FAIL(unexpected_tag_buf2, unexpected_tag_expected2, ret_code, 41);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf2, unexpected_tag_expected2, JSTokenizer::OPENING_TAG, 29);
     }
     SECTION("open tag within literal - mid")
     {
         NORMALIZE(unexpected_tag_buf3, unexpected_tag_expected3);
-        VALIDATE_FAIL(unexpected_tag_buf3, unexpected_tag_expected3, ret_code, 51);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf3, unexpected_tag_expected3, JSTokenizer::OPENING_TAG, 39);
     }
     SECTION("open tag within literal - end")
     {
         NORMALIZE(unexpected_tag_buf4, unexpected_tag_expected4);
-        VALIDATE_FAIL(unexpected_tag_buf4, unexpected_tag_expected4, ret_code, 41);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf4, unexpected_tag_expected4, JSTokenizer::OPENING_TAG, 39);
     }
     SECTION("close tag within literal - start")
     {
         NORMALIZE(unexpected_tag_buf5, unexpected_tag_expected5);
-        VALIDATE_FAIL(unexpected_tag_buf5, unexpected_tag_expected5, ret_code, 42);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf5, unexpected_tag_expected5, JSTokenizer::CLOSING_TAG, 31);
     }
     SECTION("close tag within literal - mid")
     {
         NORMALIZE(unexpected_tag_buf6, unexpected_tag_expected6);
-        VALIDATE_FAIL(unexpected_tag_buf6, unexpected_tag_expected6, ret_code, 52);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf6, unexpected_tag_expected6, JSTokenizer::CLOSING_TAG, 41);
     }
     SECTION("close tag within literal - end")
     {
         NORMALIZE(unexpected_tag_buf7, unexpected_tag_expected7);
-        VALIDATE_FAIL(unexpected_tag_buf7, unexpected_tag_expected7, ret_code, 42);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf7, unexpected_tag_expected7, JSTokenizer::CLOSING_TAG, 41);
     }
     SECTION("open tag within literal - escaped")
     {
         NORMALIZE(unexpected_tag_buf8, unexpected_tag_expected8);
-        VALIDATE(unexpected_tag_buf8, unexpected_tag_expected8);
+        VALIDATE_FAIL(unexpected_tag_buf8, unexpected_tag_expected8, JSTokenizer::OPENING_TAG, 40);
     }
     SECTION("close tag within literal - escaped")
     {
@@ -1153,74 +1166,62 @@ TEST_CASE("unexpected script tag alert", "[JSNormalizer]")
     SECTION("open tag within single-line comment - start")
     {
         NORMALIZE(unexpected_tag_buf10, unexpected_tag_expected10);
-        VALIDATE_FAIL(unexpected_tag_buf10, unexpected_tag_expected10, ret_code, 32);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf10, unexpected_tag_expected10, JSTokenizer::OPENING_TAG, 20);
     }
     SECTION("open tag within single-line comment - mid")
     {
         NORMALIZE(unexpected_tag_buf11, unexpected_tag_expected11);
-        VALIDATE_FAIL(unexpected_tag_buf11, unexpected_tag_expected11, ret_code, 42);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf11, unexpected_tag_expected11, JSTokenizer::OPENING_TAG, 30);
     }
     SECTION("open tag within single-line comment - end")
     {
         NORMALIZE(unexpected_tag_buf12, unexpected_tag_expected12);
-        VALIDATE_FAIL(unexpected_tag_buf12, unexpected_tag_expected12, ret_code, 32);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf12, unexpected_tag_expected12, JSTokenizer::OPENING_TAG, 30);
     }
     SECTION("open tag within multi-line comment - start")
     {
         NORMALIZE(unexpected_tag_buf13, unexpected_tag_expected13);
-        VALIDATE_FAIL(unexpected_tag_buf13, unexpected_tag_expected13, ret_code, 33);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf13, unexpected_tag_expected13, JSTokenizer::OPENING_TAG, 20);
     }
     SECTION("open tag within multi-line comment - mid")
     {
         NORMALIZE(unexpected_tag_buf14, unexpected_tag_expected14);
-        VALIDATE_FAIL(unexpected_tag_buf14, unexpected_tag_expected14, ret_code, 43);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf14, unexpected_tag_expected14, JSTokenizer::OPENING_TAG, 30);
     }
     SECTION("open tag within multi-line comment - end")
     {
         NORMALIZE(unexpected_tag_buf15, unexpected_tag_expected15);
-        VALIDATE_FAIL(unexpected_tag_buf15, unexpected_tag_expected15, ret_code, 33);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf15, unexpected_tag_expected15, JSTokenizer::OPENING_TAG, 30);
     }
     SECTION("close tag within single-line comment - start")
     {
         NORMALIZE(unexpected_tag_buf16, unexpected_tag_expected16);
-        VALIDATE_FAIL(unexpected_tag_buf16, unexpected_tag_expected16, ret_code, 33);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf16, unexpected_tag_expected16, JSTokenizer::CLOSING_TAG, 22);
     }
     SECTION("close tag within single-line comment - mid")
     {
         NORMALIZE(unexpected_tag_buf17, unexpected_tag_expected17);
-        VALIDATE_FAIL(unexpected_tag_buf17, unexpected_tag_expected17, ret_code, 50);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf17, unexpected_tag_expected17, JSTokenizer::CLOSING_TAG, 34);
     }
     SECTION("close tag within single-line comment - end")
     {
         NORMALIZE(unexpected_tag_buf18, unexpected_tag_expected18);
-        VALIDATE_FAIL(unexpected_tag_buf18, unexpected_tag_expected18, ret_code, 33);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf18, unexpected_tag_expected18, JSTokenizer::CLOSING_TAG, 32);
     }
     SECTION("close tag within multi-line comment - start")
     {
         NORMALIZE(unexpected_tag_buf19, unexpected_tag_expected19);
-        VALIDATE_FAIL(unexpected_tag_buf19, unexpected_tag_expected19, ret_code, 34);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf19, unexpected_tag_expected19, JSTokenizer::CLOSING_TAG, 22);
     }
     SECTION("close tag within multi-line comment - mid")
     {
         NORMALIZE(unexpected_tag_buf20, unexpected_tag_expected20);
-        VALIDATE_FAIL(unexpected_tag_buf20, unexpected_tag_expected20, ret_code, 44);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf20, unexpected_tag_expected20, JSTokenizer::CLOSING_TAG, 32);
     }
     SECTION("close tag within multi-line comment - end")
     {
         NORMALIZE(unexpected_tag_buf21, unexpected_tag_expected21);
-        VALIDATE_FAIL(unexpected_tag_buf21, unexpected_tag_expected21, ret_code, 34);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf21, unexpected_tag_expected21, JSTokenizer::CLOSING_TAG, 32);
     }
     SECTION("multiple patterns - not matched")
     {
@@ -1230,14 +1231,11 @@ TEST_CASE("unexpected script tag alert", "[JSNormalizer]")
     SECTION("multiple patterns - matched")
     {
         NORMALIZE(unexpected_tag_buf23, unexpected_tag_expected23);
-        VALIDATE_FAIL(unexpected_tag_buf23, unexpected_tag_expected23, ret_code, 67);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf23, unexpected_tag_expected23, JSTokenizer::OPENING_TAG, 65);
     }
     SECTION("mixed lower and upper case")
     {
         NORMALIZE(unexpected_tag_buf24, unexpected_tag_expected24);
-        VALIDATE_FAIL(unexpected_tag_buf24, unexpected_tag_expected24, ret_code, 41);
-        VALIDATE_ALERT(ALERT_UNEXPECTED_TAG);
+        VALIDATE_FAIL(unexpected_tag_buf24, unexpected_tag_expected24, JSTokenizer::OPENING_TAG, 39);
     }
 }
-