git.ipfire.org Git - thirdparty/snort3.git/commitdiff
Pull request #3128: JS Normalization: single pass processing
author Mike Stepanek (mstepane) <mstepane@cisco.com>
Tue, 2 Nov 2021 14:12:46 +0000 (14:12 +0000)
committer Mike Stepanek (mstepane) <mstepane@cisco.com>
Tue, 2 Nov 2021 14:12:46 +0000 (14:12 +0000)
Merge in SNORT/snort3 from ~OSHUMEIK/snort3:spp to master

Squashed commit of the following:

commit f09974f5dca6d48223f441e61ccd1b7676fd64e2
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Fri Oct 22 15:55:56 2021 +0300

    utils: correct Normalizer's output upon the next scan

    The output stream buffer was updated with special-case code to speed up
    getting the output size.

commit 0f66f7491fcd07c44934a4a473d26354dd39a859
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Mon Oct 18 16:23:35 2021 +0300

    http_inspect: eliminate cumulative js data processing

    Input data is fed in portions (script_detection, chunked HTTP) to JSNormalizer.
    Output data is accumulated in the output stream buffer, which resides in the
    JSNormalizer context. Accumulated output data is deleted at the end of the PDU.

commit 7fe0cc81badb99a2a732c74cddc1aa042e40cbd2
Author: Oleksii Shumeiko <oshumeik@cisco.com>
Date:   Fri Oct 15 16:50:09 2021 +0300

    utils: add get methods to peek in internal buffer

19 files changed:
src/pub_sub/test/pub_sub_http_request_body_event_test.cc
src/service_inspectors/http_inspect/dev_notes.txt
src/service_inspectors/http_inspect/http_flow_data.cc
src/service_inspectors/http_inspect/http_flow_data.h
src/service_inspectors/http_inspect/http_inspect.cc
src/service_inspectors/http_inspect/http_js_norm.cc
src/service_inspectors/http_inspect/http_js_norm.h
src/service_inspectors/http_inspect/http_msg_body.cc
src/service_inspectors/http_inspect/http_msg_body.h
src/service_inspectors/http_inspect/http_msg_header.cc
src/service_inspectors/http_inspect/http_stream_splitter_scan.cc
src/utils/js_normalizer.cc
src/utils/js_normalizer.h
src/utils/js_tokenizer.h
src/utils/js_tokenizer.l
src/utils/streambuf.cc
src/utils/streambuf.h
src/utils/test/js_normalizer_test.cc
src/utils/test/streambuf_test.cc

index 2776a13dbb168d4117e31c4cf1d0fe2f2514133e..80177c0e09d860f86e2110e4cb8ad512f6fceaa5 100644 (file)
@@ -55,7 +55,7 @@ void HttpMsgBody::publish() {}
 void HttpMsgBody::do_file_processing(const Field&) {}
 void HttpMsgBody::do_utf_decoding(const Field&, Field&) {}
 void HttpMsgBody::do_file_decompression(const Field&, Field&) {}
-void HttpMsgBody::do_enhanced_js_normalization(char*&, size_t&) {}
+void HttpMsgBody::do_enhanced_js_normalization(const Field&, Field&) {}
 void HttpMsgBody::clean_partial(uint32_t&, uint32_t&, uint8_t*&, uint32_t&, int32_t) {}
 void HttpMsgBody::bookkeeping_regular_flush(uint32_t&, uint8_t*&, uint32_t&, int32_t) {}
 #ifdef REG_TEST
index 2b95547a810b915da828cfbc0459cbb245c26e33..89cebf294bbc09429cf3a66722b89aa21e850815 100755 (executable)
@@ -273,10 +273,6 @@ The script continuation will be processed with the saved context.
 In order to support Script Detection feature for inline scripts, Normalizer ensures
 that after reaching the script end (legitimate closing tag or bad token),
 it falls back to an initial state, so that the next script can be processed by the same context.
-If PDU starts with a script continuation, it is not possible to restore
-Normalizer to the right state later (because context on the flow is not in an initial state).
-A buffer dedicated to handle this scenario. It contains a normalized data
-from the script continuation, so later it can be prepended to subsequent normalizations.
 
 Algorithm for reassembling chunked message bodies:
 
index 6dbd3fd55f0edcd96603ce0156f1975aedcaaa6e..84d8680210f36f80563b4cadbd4fdcf9a1852fea 100644 (file)
@@ -120,8 +120,6 @@ HttpFlowData::~HttpFlowData()
         update_deallocations(partial_buffer_length[k]);
         delete[] partial_detect_buffer[k];
         update_deallocations(partial_detect_length[k]);
-        delete[] js_detect_buffer[k];
-        update_deallocations(js_detect_length[k]);
         HttpTransaction::delete_transaction(transaction[k], nullptr);
         delete cutter[k];
         if (compress_stream[k] != nullptr)
@@ -274,10 +272,6 @@ snort::JSNormalizer& HttpFlowData::acquire_js_ctx(int32_t ident_depth, size_t no
         max_template_nesting, max_scope_depth);
     update_allocations(JSNormalizer::size());
 
-    auto ptr = js_detect_buffer[HttpCommon::SRC_SERVER];
-    auto len = js_detect_length[HttpCommon::SRC_SERVER];
-    js_normalizer->prepend_script(ptr, len);
-
     debug_logf(4, http_trace, TRACE_JS_PROC, nullptr,
         "js_normalizer created (norm_depth %zd, max_template_nesting %d)\n",
         norm_depth, max_template_nesting);
@@ -287,6 +281,8 @@ snort::JSNormalizer& HttpFlowData::acquire_js_ctx(int32_t ident_depth, size_t no
 
 void HttpFlowData::release_js_ctx()
 {
+    js_continue = false;
+
     if (!js_normalizer)
         return;
 
index 084e615004086e8c73cdf35a3f01771b674a7b78..0f22644396dfa880f3f86eec7d4c07fef80aaf8c 100644 (file)
@@ -85,9 +85,6 @@ public:
     void set_h2_body_state(HttpCommon::SourceId source_id, HttpEnums::H2BodyState state)
     { h2_body_state[source_id] = state; }
 
-    void reset_partial_flush(HttpCommon::SourceId source_id)
-    { partial_flush[source_id] = false; }
-
     uint32_t get_h2_stream_id() const;
 
 private:
@@ -170,8 +167,6 @@ private:
         HttpCommon::STAT_NOT_PRESENT };
     int64_t detect_depth_remaining[2] = { HttpCommon::STAT_NOT_PRESENT,
         HttpCommon::STAT_NOT_PRESENT };
-    int64_t js_norm_depth_remaining[2] = { HttpCommon::STAT_NOT_PRESENT,
-        HttpCommon::STAT_NOT_PRESENT };
     int32_t publish_depth_remaining[2] = { HttpCommon::STAT_NOT_PRESENT,
         HttpCommon::STAT_NOT_PRESENT };
     int32_t file_decomp_buffer_size_remaining[2] = { HttpCommon::STAT_NOT_PRESENT,
@@ -187,8 +182,6 @@ private:
     uint8_t* partial_detect_buffer[2] = { nullptr, nullptr };
     uint32_t partial_detect_length[2] = { 0, 0 };
     uint32_t partial_js_detect_length[2] = { 0, 0 };
-    uint8_t* js_detect_buffer[2] = { nullptr, nullptr };
-    uint32_t js_detect_length[2] = { 0, 0 };
     int32_t status_code_num = HttpCommon::STAT_NOT_PRESENT;
     HttpEnums::VersionId version_id[2] = { HttpEnums::VERS__NOT_PRESENT,
                                             HttpEnums::VERS__NOT_PRESENT };
@@ -218,6 +211,7 @@ private:
     // *** HttpJsNorm
     JSIdentifierCtxBase* js_ident_ctx = nullptr;
     snort::JSNormalizer* js_normalizer = nullptr;
+    bool js_continue = false;
     bool js_built_in_event = false;
 
     void reset_js_ident_ctx();
index 16234710883df42b8f0402f5b20a0678ffab1e64..c5218aee5b2ea14c559a791470e3f859a3939049 100755 (executable)
@@ -590,7 +590,6 @@ bool HttpInspect::process(const uint8_t* data, const uint16_t dsize, Flow* const
     current_section->gen_events();
     if (!session_data->partial_flush[source_id])
         current_section->update_flow();
-    session_data->partial_flush[source_id] = false;
     session_data->section_type[source_id] = SEC__NOT_COMPUTE;
 
 #ifdef REG_TEST
index fbdaeb644145f5d46c1867323271b223666c5cb6..a8c0b1801af412539b0141045e19ac0aa7472fef 100644 (file)
@@ -83,6 +83,7 @@ HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_, int64_t normali
     int32_t identifier_depth_, uint8_t max_template_nesting_, uint32_t max_scope_depth_,
     const std::unordered_set<std::string>& built_in_ident_) :
     uri_param(uri_param_),
+    detection_depth(UINT64_MAX),
     normalization_depth(normalization_depth_),
     identifier_depth(identifier_depth_),
     max_template_nesting(max_template_nesting_),
@@ -135,8 +136,8 @@ void HttpJsNorm::configure()
     configure_once = true;
 }
 
-void HttpJsNorm::enhanced_external_normalize(const Field& input,
-    HttpInfractions* infractions, HttpFlowData* ssn, char*& out_buf, size_t& out_len) const
+void HttpJsNorm::do_external(const Field& input, Field& output,
+    HttpInfractions* infractions, HttpFlowData* ssn, bool final_portion) const
 {
     if (ssn->js_built_in_event)
         return;
@@ -156,7 +157,6 @@ void HttpJsNorm::enhanced_external_normalize(const Field& input,
         trace_logf(2, http_trace, TRACE_JS_PROC, nullptr,
             "script continues\n");
 
-
     auto& js_ctx = ssn->acquire_js_ctx(identifier_depth, normalization_depth, max_template_nesting,
         max_scope_depth, built_in_ident);
 
@@ -211,27 +211,34 @@ void HttpJsNorm::enhanced_external_normalize(const Field& input,
             break;
     }
 
-    auto result = js_ctx.get_script();
-    out_buf = result.first;
+    debug_logf(4, http_trace, TRACE_JS_PROC, nullptr,
+        "input data was %s\n", final_portion ? "last one in PDU" : "a part of PDU");
+
+    uint32_t data_len = std::min(detection_depth, js_ctx.script_size());
 
-    if (out_buf)
+    if (data_len)
     {
-        out_len = result.second;
+        const char* data = final_portion ? js_ctx.take_script() : js_ctx.get_script();
 
-        trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr,
-            "js_data[%zu]: %.*s\n", out_len, static_cast<int>(out_len), out_buf);
+        if (data)
+        {
+            trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr,
+                       "js_data[%u]: %.*s\n", data_len, data_len, data);
+
+            output.set(data_len, (const uint8_t*)data, final_portion);
+        }
     }
 }
 
-void HttpJsNorm::enhanced_inline_normalize(const Field& input,
-    HttpInfractions* infractions, HttpFlowData* ssn, char*& out_buf, size_t& out_len) const
+void HttpJsNorm::do_inline(const Field& input, Field& output,
+    HttpInfractions* infractions, HttpFlowData* ssn, bool final_portion) const
 {
     const char* ptr = (const char*)input.start();
     const char* const end = ptr + input.length();
 
     HttpEventGen* events = ssn->events[HttpCommon::SRC_SERVER];
 
-    bool script_continue = alive_ctx(ssn);
+    bool script_continue = ssn->js_continue;
     bool script_external = false;
 
     while (ptr < end)
@@ -282,7 +289,7 @@ void HttpJsNorm::enhanced_inline_normalize(const Field& input,
 
         auto& js_ctx = ssn->acquire_js_ctx(identifier_depth, normalization_depth,
             max_template_nesting, max_scope_depth, built_in_ident);
-        auto output_size_before = js_ctx.peek_script_size();
+        auto output_size_before = js_ctx.script_size();
 
         auto ret = js_normalize(js_ctx, end, ptr);
 
@@ -324,7 +331,7 @@ void HttpJsNorm::enhanced_inline_normalize(const Field& input,
             break;
         }
 
-        if (script_external && output_size_before != js_ctx.peek_script_size())
+        if (script_external && output_size_before != js_ctx.script_size())
         {
             *infractions += INF_JS_CODE_IN_EXTERNAL;
             events->create_event(EVENT_JS_CODE_IN_EXTERNAL);
@@ -333,26 +340,35 @@ void HttpJsNorm::enhanced_inline_normalize(const Field& input,
         script_continue = ret == JSTokenizer::SCRIPT_CONTINUE;
     }
 
+    ssn->js_continue = script_continue;
+
     if (!alive_ctx(ssn))
         return;
 
+    debug_logf(4, http_trace, TRACE_JS_PROC, nullptr,
+        "input data was %s\n", final_portion ? "last one in PDU" : "a part of PDU");
+
     auto js_ctx = ssn->js_normalizer;
-    auto result = js_ctx->get_script();
-    out_buf = result.first;
+    uint32_t data_len = std::min(detection_depth, js_ctx->script_size());
 
-    if (out_buf)
+    if (data_len)
     {
-        out_len = result.second;
+        const char* data = final_portion ? js_ctx->take_script() : js_ctx->get_script();
 
-        trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr,
-            "js_data[%zu]: %.*s\n", out_len, static_cast<int>(out_len), out_buf);
+        if (data)
+        {
+            trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr,
+                       "js_data[%u]: %.*s\n", data_len, data_len, data);
+
+            output.set(data_len, (const uint8_t*)data, final_portion);
+        }
     }
 
-    if (!script_continue)
+    if (!script_continue && final_portion)
         ssn->release_js_ctx();
 }
 
-void HttpJsNorm::legacy_normalize(const Field& input, Field& output, HttpInfractions* infractions,
+void HttpJsNorm::do_legacy(const Field& input, Field& output, HttpInfractions* infractions,
     HttpEventGen* events, int max_javascript_whitespaces) const
 {
     bool js_present = false;
index ad62257103c8339c1a4d4d31b66ee10f55518bf6..076f1689edf45abb05bd24af05748c24d761ff70 100644 (file)
@@ -41,12 +41,13 @@ public:
         const std::unordered_set<std::string>& built_in_ident);
     ~HttpJsNorm();
 
-    void legacy_normalize(const Field& input, Field& output, HttpInfractions*, HttpEventGen*,
+    void set_detection_depth(size_t depth)
+    { detection_depth = depth; }
+
+    void do_legacy(const Field& input, Field& output, HttpInfractions*, HttpEventGen*,
         int max_javascript_whitespaces) const;
-    void enhanced_inline_normalize(const Field& input, HttpInfractions*, HttpFlowData*,
-        char*& out_buf, size_t& out_len) const;
-    void enhanced_external_normalize(const Field& input, HttpInfractions*, HttpFlowData*,
-        char*& out_buf, size_t& out_len) const;
+    void do_inline(const Field& input, Field& output, HttpInfractions*, HttpFlowData*, bool) const;
+    void do_external(const Field& input, Field& output, HttpInfractions*, HttpFlowData*, bool) const;
 
     void configure();
 
@@ -62,6 +63,7 @@ private:
     };
 
     const HttpParaList::UriParam& uri_param;
+    size_t detection_depth;
     int64_t normalization_depth;
     int32_t identifier_depth;
     uint8_t max_template_nesting;
index 722ec11590f4ac314be2481c0862308f51699522..d342026957078c18d267b9dc27c40d65d96ed2a5 100644 (file)
@@ -80,7 +80,8 @@ void HttpMsgBody::publish()
 void HttpMsgBody::bookkeeping_regular_flush(uint32_t& partial_detect_length,
     uint8_t*& partial_detect_buffer, uint32_t& partial_js_detect_length, int32_t detect_length)
 {
-    session_data->js_norm_depth_remaining[source_id] = session_data->detect_depth_remaining[source_id];
+    params->js_norm_param.js_norm->set_detection_depth(session_data->detect_depth_remaining[source_id]);
+
     session_data->detect_depth_remaining[source_id] -= detect_length;
     partial_detect_buffer = nullptr;
     partial_detect_length = 0;
@@ -332,58 +333,17 @@ void HttpMsgBody::fd_event_callback(void* context, int event)
     }
 }
 
-void HttpMsgBody::do_enhanced_js_normalization(char*& out_buf, size_t& out_buf_len)
+void HttpMsgBody::do_enhanced_js_normalization(const Field& input, Field& output)
 {
-    const bool has_cumulative_data = (cumulative_data.length() > 0);
-    Field& input = has_cumulative_data ? cumulative_data : decompressed_file_body;
-
-    bool js_continuation = session_data->js_normalizer;
-    uint8_t*& buf = session_data->js_detect_buffer[source_id];
-    uint32_t& len = session_data->js_detect_length[source_id];
-
-    if (has_cumulative_data)
-        session_data->release_js_ctx();
-    else
-    {
-        session_data->update_deallocations(len);
-        delete[] buf;
-        buf = nullptr;
-        len = 0;
-    }
-
+    auto back = !session_data->partial_flush[source_id];
     auto http_header = get_header(source_id);
+    auto normalizer = params->js_norm_param.js_norm;
+    auto infractions = transaction->get_infractions(source_id);
 
     if (http_header and http_header->is_external_js())
-        params->js_norm_param.js_norm->enhanced_external_normalize(input,
-            transaction->get_infractions(source_id), session_data, out_buf, out_buf_len);
-    else
-        params->js_norm_param.js_norm->enhanced_inline_normalize(input,
-            transaction->get_infractions(source_id), session_data, out_buf, out_buf_len);
-
-    out_buf_len = static_cast<int64_t>(out_buf_len) <= session_data->js_norm_depth_remaining[source_id] ?
-        out_buf_len : session_data->js_norm_depth_remaining[source_id];
-
-    if (out_buf_len > 0)
-    {
-        if (has_cumulative_data)
-            return;
-
-        if (js_continuation)
-        {
-            uint8_t* nscript = new uint8_t[out_buf_len];
-
-            memcpy(nscript, out_buf, out_buf_len);
-            buf = nscript;
-            len = out_buf_len;
-            session_data->update_allocations(len);
-        }
-    }
+        normalizer->do_external(input, output, infractions, session_data, back);
     else
-    {
-        delete[] out_buf;
-        out_buf = nullptr;
-        out_buf_len = 0;
-    }
+        normalizer->do_inline(input, output, infractions, session_data, back);
 }
 
 void HttpMsgBody::do_legacy_js_normalization(const Field& input, Field& output)
@@ -394,7 +354,7 @@ void HttpMsgBody::do_legacy_js_normalization(const Field& input, Field& output)
         return;
     }
 
-    params->js_norm_param.js_norm->legacy_normalize(input, output,
+    params->js_norm_param.js_norm->do_legacy(input, output,
         transaction->get_infractions(source_id), session_data->events[source_id],
         params->js_norm_param.max_javascript_whitespaces);
 }
@@ -566,20 +526,15 @@ const Field& HttpMsgBody::get_decomp_vba_data()
 
 const Field& HttpMsgBody::get_norm_js_data()
 {
-    if (enhanced_js_norm_body.length() != STAT_NOT_COMPUTE)
-        return enhanced_js_norm_body;
-
-    char* buf = nullptr;
-    size_t buf_len = 0;
+    if (norm_js_data.length() != STAT_NOT_COMPUTE)
+        return norm_js_data;
 
-    do_enhanced_js_normalization(buf, buf_len);
+    do_enhanced_js_normalization(decompressed_file_body, norm_js_data);
 
-    if (buf && buf_len)
-        enhanced_js_norm_body.set(buf_len, reinterpret_cast<const uint8_t*>(buf), true);
-    else
-        enhanced_js_norm_body.set(STAT_NOT_PRESENT);
+    if (norm_js_data.length() == STAT_NOT_COMPUTE)
+        norm_js_data.set(STAT_NOT_PRESENT);
 
-    return enhanced_js_norm_body;
+    return norm_js_data;
 }
 
 int32_t HttpMsgBody::get_publish_length() const
index 2f1a9c3883ee0df55a506cce9fb1980d7f4410d1..e3a0461fb7bc3ff7381c4e112e01d7b7737edfcc 100644 (file)
@@ -63,7 +63,7 @@ private:
     void do_file_processing(const Field& file_data);
     void do_utf_decoding(const Field& input, Field& output);
     void do_file_decompression(const Field& input, Field& output);
-    void do_enhanced_js_normalization(char*& out_buf, size_t& out_len);
+    void do_enhanced_js_normalization(const Field& input, Field& output);
     void do_legacy_js_normalization(const Field& input, Field& output);
     void clean_partial(uint32_t& partial_inspected_octets, uint32_t& partial_detect_length,
         uint8_t*& partial_detect_buffer,  uint32_t& partial_js_detect_length,
@@ -81,7 +81,7 @@ private:
     Field cumulative_data;
     Field js_norm_body;
     Field detect_data;
-    Field enhanced_js_norm_body;
+    Field norm_js_data;
     Field classic_client_body;   // URI normalization applied
     Field decompressed_vba_data;
 
index 5f75894a45cffeaf760577d2ff80aa62c1a73430..e896de26b9d52d8957f2959a9af4d971d3f7d6d0 100755 (executable)
@@ -38,6 +38,7 @@
 #include "http_common.h"
 #include "http_enum.h"
 #include "http_inspect.h"
+#include "http_js_norm.h"
 #include "http_msg_request.h"
 #include "http_msg_body.h"
 #include "http_normalizers.h"
@@ -449,7 +450,8 @@ void HttpMsgHeader::prepare_body()
     const int64_t& depth = (source_id == SRC_CLIENT) ? params->request_depth :
         params->response_depth;
     session_data->detect_depth_remaining[source_id] = (depth != -1) ? depth : INT64_MAX;
-    session_data->js_norm_depth_remaining[source_id] = session_data->detect_depth_remaining[source_id];
+    params->js_norm_param.js_norm->set_detection_depth(session_data->detect_depth_remaining[source_id]);
+
     if ((source_id == SRC_CLIENT) and params->publish_request_body and session_data->for_http2)
     {
         session_data->publish_octets[source_id] = 0;
index 21f6b233aff5879eeae0d769110bcd6f50ae2a84..b5623e6bc8431b1695333822d64740b9d467d4e7 100644 (file)
@@ -157,6 +157,7 @@ StreamSplitter::Status HttpStreamSplitter::scan(Packet* pkt, const uint8_t* data
 #endif
 
     SectionType& type = session_data->type_expected[source_id];
+    session_data->partial_flush[source_id] = false;
 
     if (type == SEC_ABORT)
         return status_value(StreamSplitter::ABORT);
index 4e040b76bc313d0dac3c8a2bc4b626f03cd8fd12..781092fcfcd60594c98f01fb865ee505aa074553 100644 (file)
@@ -71,6 +71,7 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len)
         ->pubsetbuf(const_cast<char*>(src), len);
     out_buf.reserve(src_len * BUFF_EXP_FACTOR);
 
+    tokenizer.pre_yylex();
     JSTokenizer::JSRet ret = static_cast<JSTokenizer::JSRet>(tokenizer.yylex());
     in.clear();
     out.clear();
@@ -82,33 +83,3 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len)
 
     return rem_bytes ? ret : JSTokenizer::EOS;
 }
-
-std::pair<char*,size_t> JSNormalizer::get_script()
-{
-    streamsize len = 0;
-    char* dst = out_buf.release_data(len);
-    return {dst, len};
-}
-
-size_t JSNormalizer::peek_script_size()
-{
-    return out.tellp();
-}
-
-void JSNormalizer::prepend_script(const void* p , size_t n)
-{
-    if (p)
-        out_buf.sputn(reinterpret_cast<const char*>(p), n);
-}
-
-size_t JSNormalizer::size()
-{
-    return sizeof(JSNormalizer) + 16834; // the default YY_BUF_SIZE
-}
-
-#ifdef BENCHMARK_TEST
-void JSNormalizer::rewind_output()
-{
-    out_buf.pubseekoff(0, ios_base::beg, ios_base::out);
-}
-#endif
index f2866d11acc717f07ee5f82d19b17c010a4a5d0c..508380d9dee0ec3a2fee4d3d21bb35806c1fb412 100644 (file)
@@ -38,21 +38,29 @@ public:
         int tmp_cap_size = JSTOKENIZER_BUF_MAX_SIZE);
     ~JSNormalizer();
 
+    JSTokenizer::JSRet normalize(const char* src, size_t src_len);
+
     const char* get_src_next() const
     { return src_next; }
 
     void reset_depth()
     { rem_bytes = depth; }
 
-    JSTokenizer::JSRet normalize(const char* src, size_t src_len);
-    std::pair<char*,size_t> get_script();
-    size_t peek_script_size();
-    void prepend_script(const void*, size_t);
+    const char* take_script()
+    { return out_buf.take_data(); }
+
+    const char* get_script() const
+    { return out_buf.data(); }
+
+    size_t script_size()
+    { return out.tellp(); }
 
-    static size_t size();
+    static size_t size()
+    { return sizeof(JSNormalizer) + 16834; /* YY_BUF_SIZE */ }
 
 #ifdef BENCHMARK_TEST
-    void rewind_output();
+    void rewind_output()
+    { out_buf.pubseekoff(0, std::ios_base::beg, std::ios_base::out); }
 #endif
 
 private:
index 47239648e0a2d206185490faa7bc7741b51b0bd5..5079e23d56528cb8f294d5c2c11b32a4d4321dcc 100644 (file)
@@ -116,6 +116,9 @@ public:
         int cap_size = JSTOKENIZER_BUF_MAX_SIZE);
     ~JSTokenizer() override;
 
+    // internal actions before calling main loop
+    void pre_yylex();
+
     // returns JSRet
     int yylex() override;
 
@@ -164,7 +167,8 @@ private:
     struct
     {
         JSToken token = UNDEFINED;          // the token before
-        int length = 0;                     // current token length
+        int orig_len = 0;                   // current token original length
+        int norm_len = 0;                   // normalized length of previous tokens
         int sc = 0;                         // current Starting Condition
     } states[JSTOKENIZER_MAX_STATES];
     int sp = 0;                             // points to the top of states
@@ -172,6 +176,7 @@ private:
     char*& tmp_buf;
     size_t& tmp_buf_size;
     const int tmp_cap_size;
+    int output_steps_back;
 
     bool newline_found = false;
     constexpr static bool insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX]
index f399dc1bcac0909560cb793d79dd5f7773580247..f2239747b4dbe6705fc85ee718122f3dc19604f3 100644 (file)
@@ -1146,6 +1146,7 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out,
       tmp_buf(buf),
       tmp_buf_size(buf_size),
       tmp_cap_size(cap_size),
+      output_steps_back(0),
       max_scope_depth(max_scope_depth)
 {
     scope_push(GLOBAL);
@@ -1160,6 +1161,12 @@ JSTokenizer::~JSTokenizer()
     tmp_buf_size = 0;
 }
 
+void JSTokenizer::pre_yylex()
+{
+    assert(output_steps_back >= 0);
+    yyout.seekp(-output_steps_back, std::ios_base::cur);
+}
+
 void JSTokenizer::switch_to_temporal(const std::string& data)
 {
     tmp.str(data);
@@ -1351,19 +1358,22 @@ void JSTokenizer::states_push()
     auto& state = states[sp];
 
     state.token = token;
-    state.length = yyleng;
+    state.orig_len = yyleng;
+    state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out);
     state.sc = yy_start;
 }
 
 void JSTokenizer::states_correct(int take_off)
 {
     auto& state = states[sp];
-    state.length -= yyleng - take_off;
+    state.orig_len -= yyleng - take_off;
 }
 
 void JSTokenizer::states_apply()
 {
     int tail_size = 0;
+    int outbuf_pos = yyout.tellp();
+    int outbuf_back = outbuf_pos;
 
     for (int i = JSTOKENIZER_MAX_STATES; i > 0 && tail_size < tmp_cap_size; --i)
     {
@@ -1371,17 +1381,21 @@ void JSTokenizer::states_apply()
         idx %= JSTOKENIZER_MAX_STATES;
         auto& state = states[idx];
 
-        if (state.length == 0)
+        outbuf_back = state.norm_len;
+
+        if (state.orig_len == 0)
             continue;
 
         token = state.token;
         yy_start = state.sc;
-        tail_size += state.length;
+        tail_size += state.orig_len;
         tail_size = tail_size < tmp_cap_size ? tail_size : tmp_cap_size;
     }
 
+    output_steps_back = outbuf_pos - outbuf_back;
+
     for (int i = 0; i < JSTOKENIZER_MAX_STATES; ++i)
-        states[i].length = 0;
+        states[i].orig_len = 0;
 
     char* buf = new char[tail_size];
 
index 1b5e1d1b01a452993f62033fc86925cee810c97d..ff46f9653871d2278e4ae8297b577e589b36031b 100644 (file)
@@ -251,7 +251,19 @@ void ostreambuf_infl::reserve(streamsize n)
         enlarge(n - size);
 }
 
-char* ostreambuf_infl::release_data(streamsize& n)
+const char* ostreambuf_infl::take_data()
+{
+    auto data = pbase();
+
+    setp(nullptr, nullptr);
+
+    gen.s = states[0].s;
+    gen.n = states[0].n;
+
+    return data;
+}
+
+const char* ostreambuf_infl::take_data(streamsize& n)
 {
     auto data = pbase();
 
@@ -278,6 +290,9 @@ streampos ostreambuf_infl::seekoff(streamoff off, ios_base::seekdir way, ios_bas
     if (!(which & ios_base::out))
         return -1;
 
+    if (off == 0 && way == ios_base::cur)
+        return pptr() - pbase();
+
     auto base = pbase();
     auto ptr = pptr();
     auto eptr = epptr();
index 674a5b4987d9529729ad205ec58466baad29f918..b008db2da5145b867e0d127988bf5e515bfd45e0 100644 (file)
@@ -78,7 +78,14 @@ public:
 
     // releases the current buffer,
     // the caller takes ownership over the buffer
-    char* release_data(std::streamsize& n);
+    const char* take_data();
+    const char* take_data(std::streamsize& n);
+
+    const char* data() const
+    { return pbase(); }
+
+    std::streamsize data_len() const
+    { return pptr() - pbase(); }
 
 protected:
     virtual std::streambuf* setbuf(char* s, std::streamsize n) override;
index 9ac56bf7bc4fb3d7e56ee908032cfd1aba272ed9..99d16f19be0992e46ca679d91b043429e05e13b7 100644 (file)
@@ -75,9 +75,8 @@ static const std::unordered_set<std::string> s_ident_built_in { "console", "eval
     JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH);     \
     auto ret = norm.normalize(src, sizeof(src));                   \
     const char* ptr = norm.get_src_next();                         \
-    auto result = norm.get_script();                               \
-    char* dst = result.first;                                      \
-    int act_len = result.second;                                   \
+    int act_len = norm.script_size();                              \
+    const char* dst = norm.take_script();
 
 #define VALIDATE(src, expected)                 \
     CHECK(ret == JSTokenizer::SCRIPT_CONTINUE); \
@@ -100,12 +99,10 @@ static const std::unordered_set<std::string> s_ident_built_in { "console", "eval
         JSNormalizer norm(ident_ctx, depth, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH);    \
         ret = norm.normalize(src, src_len);                           \
         ptr = norm.get_src_next();                                    \
-        auto result = norm.get_script();                              \
-        char* dptr = result.first;                                    \
-        len = result.second;                                          \
+        len = norm.script_size();                                     \
+        const char* dptr = norm.get_script();                         \
         REQUIRE(len == dst_len);                                      \
         memcpy(dst, dptr, dst_len);                                   \
-        delete[] dptr;                                                \
     }
 
 #define DO(src, slen, dst, dlen)                            \
@@ -113,9 +110,8 @@ static const std::unordered_set<std::string> s_ident_built_in { "console", "eval
         auto ret = norm.normalize(src, slen);               \
         CHECK(ret == JSTokenizer::SCRIPT_CONTINUE);         \
         auto nsrc = norm.get_src_next();                    \
-        auto result = norm.get_script();                    \
-        char* ptr = result.first;                           \
-        int act_len = result.second;                        \
+        int act_len = norm.script_size();                   \
+        const char* ptr = norm.take_script();               \
         REQUIRE(nsrc - src == slen);                        \
         REQUIRE(act_len == dlen);                           \
         memcpy(dst, ptr, dlen);                             \
@@ -126,12 +122,10 @@ static const std::unordered_set<std::string> s_ident_built_in { "console", "eval
     {                                                       \
         auto ret = norm.normalize(src, slen);               \
         CHECK(ret == rexp);                                 \
-        auto result = norm.get_script();                    \
-        char* ptr = result.first;                           \
-        int act_len = result.second;                        \
+        int act_len = norm.script_size();                   \
+        const char* ptr = norm.get_script();                \
         REQUIRE(act_len == dlen);                           \
         memcpy(dst, ptr, dlen);                             \
-        delete[] ptr;                                       \
     }
 
 #define CLOSE()                                                         \
@@ -1454,9 +1448,8 @@ TEST_CASE("endings", "[JSNormalizer]")
         JSNormalizer norm(ident_ctx, 7, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH);
         ret = norm.normalize(src, sizeof(src));
         ptr = norm.get_src_next();
-        auto res1 = norm.get_script();
-        char* dst1 = res1.first;
-        int act_len1 = res1.second;
+        int act_len1 = norm.script_size();
+        const char* dst1 = norm.take_script();
 
         CHECK(ret == JSTokenizer::EOS);
         CHECK(ptr == src + 7);
@@ -1466,9 +1459,8 @@ TEST_CASE("endings", "[JSNormalizer]")
 
         ret = norm.normalize(src2, sizeof(src2));
         ptr = norm.get_src_next();
-        auto res2 = norm.get_script();
-        char* dst2 = res2.first;
-        int act_len2 = res2.second;
+        int act_len2 = norm.script_size();
+        const char* dst2 = norm.take_script();
 
         CHECK(ret == JSTokenizer::EOS);
         CHECK(ptr == src2 + sizeof(src2));
index f3246b4fce89990cb41a5ec4145361000dbf2c73..d9285b83ad77d1fa4c2c51437a035334e1b6387e 100644 (file)
@@ -69,8 +69,17 @@ using namespace std;
 
 #define EXP_RES(b, exp, exp_len, exp_mem_size)                          \
     {                                                                   \
+        auto d1_len = (b).data_len();                                   \
+        auto d1 = (b).data();                                           \
         streamsize act_len;                                             \
-        char* act = (b).release_data(act_len);                          \
+        const char* act = (b).take_data(act_len);                       \
+        auto d2_len = (b).data_len();                                   \
+        auto d2 = (b).data();                                           \
+                                                                        \
+        CHECK(d1 == act);                                               \
+        CHECK(d1_len == act_len);                                       \
+        CHECK(d2 == nullptr);                                           \
+        CHECK(d2_len == 0);                                             \
                                                                         \
         CHECK((exp_mem_size) == act_len);                               \
         REQUIRE((exp_len) <= act_len);                                  \
@@ -101,12 +110,11 @@ using namespace std;
         CHECK((exp_len) == (s).tellp());                                \
                                                                         \
         ostreambuf_infl* b = reinterpret_cast<ostreambuf_infl*>((s).rdbuf()); \
-        streamsize act_len;                                             \
-        char* act = b->release_data(act_len);                           \
+        auto act = b->data();                                           \
+        auto act_len = b->data_len();                                   \
                                                                         \
         REQUIRE((exp_len) == act_len);                                  \
         CHECK(!memcmp((exp), act, (exp_len)));                          \
-        delete[] act;                                                   \
     }
 
 #define EOF_OUT(s, exp, exp_len)                                        \
@@ -117,12 +125,11 @@ using namespace std;
         CHECK((exp_len) == (s).tellp());                                \
                                                                         \
         ostreambuf_infl* b = reinterpret_cast<ostreambuf_infl*>((s).rdbuf()); \
-        streamsize act_len;                                             \
-        char* act = b->release_data(act_len);                           \
+        auto act = b->data();                                           \
+        auto act_len = b->data_len();                                   \
                                                                         \
         REQUIRE((exp_len) == act_len);                                  \
         CHECK(!memcmp((exp), act, (exp_len)));                          \
-        delete[] act;                                                   \
     }
 
 TEST_CASE("input buffer - basic one source", "[Stream buffers]")