From: Mike Stepanek (mstepane) Date: Tue, 2 Nov 2021 14:12:46 +0000 (+0000) Subject: Pull request #3128: JS Normalization: single pass processing X-Git-Tag: 3.1.16.0~4 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=6c900f293ee47f2fd1eadfeccaab79d6e1c28dc5;p=thirdparty%2Fsnort3.git Pull request #3128: JS Normalization: single pass processing Merge in SNORT/snort3 from ~OSHUMEIK/snort3:spp to master Squashed commit of the following: commit f09974f5dca6d48223f441e61ccd1b7676fd64e2 Author: Oleksii Shumeiko Date: Fri Oct 22 15:55:56 2021 +0300 utils: correct Normalizer's output upon the next scan The output stream buffer was updated with a special-case code to speed up getting the output size. commit 0f66f7491fcd07c44934a4a473d26354dd39a859 Author: Oleksii Shumeiko Date: Mon Oct 18 16:23:35 2021 +0300 http_inspect: eliminate cumulative js data processing Input data is fed by portions (script_detection, chunked HTTP) to JSNormalizer. Output data is accumulated in output stream buffer, which resides in JSNormalizer context. Accumulated output data is deleted at the end of PDU. commit 7fe0cc81badb99a2a732c74cddc1aa042e40cbd2 Author: Oleksii Shumeiko Date: Fri Oct 15 16:50:09 2021 +0300 utils: add get methods to peek in internal buffer --- diff --git a/src/pub_sub/test/pub_sub_http_request_body_event_test.cc b/src/pub_sub/test/pub_sub_http_request_body_event_test.cc index 2776a13db..80177c0e0 100644 --- a/src/pub_sub/test/pub_sub_http_request_body_event_test.cc +++ b/src/pub_sub/test/pub_sub_http_request_body_event_test.cc @@ -55,7 +55,7 @@ void HttpMsgBody::publish() {} void HttpMsgBody::do_file_processing(const Field&) {} void HttpMsgBody::do_utf_decoding(const Field&, Field&) {} void HttpMsgBody::do_file_decompression(const Field&, Field&) {} -void HttpMsgBody::do_enhanced_js_normalization(char*&, size_t&) {} +void HttpMsgBody::do_enhanced_js_normalization(const Field&, Field&) {} void HttpMsgBody::clean_partial(uint32_t&, uint32_t&, uint8_t*&, uint32_t&, int32_t) {} void HttpMsgBody::bookkeeping_regular_flush(uint32_t&, uint8_t*&, uint32_t&, int32_t) {} #ifdef REG_TEST diff --git a/src/service_inspectors/http_inspect/dev_notes.txt b/src/service_inspectors/http_inspect/dev_notes.txt index 2b95547a8..89cebf294 100755 --- a/src/service_inspectors/http_inspect/dev_notes.txt +++ b/src/service_inspectors/http_inspect/dev_notes.txt @@ -273,10 +273,6 @@ The script continuation will be processed with the saved context. In order to support Script Detection feature for inline scripts, Normalizer ensures that after reaching the script end (legitimate closing tag or bad token), it falls back to an initial state, so that the next script can be processed by the same context. -If PDU starts with a script continuation, it is not possible to restore -Normalizer to the right state later (because context on the flow is not in an initial state). -A buffer dedicated to handle this scenario. It contains a normalized data -from the script continuation, so later it can be prepended to subsequent normalizations. 
Algorithm for reassembling chunked message bodies: diff --git a/src/service_inspectors/http_inspect/http_flow_data.cc b/src/service_inspectors/http_inspect/http_flow_data.cc index 6dbd3fd55..84d868021 100644 --- a/src/service_inspectors/http_inspect/http_flow_data.cc +++ b/src/service_inspectors/http_inspect/http_flow_data.cc @@ -120,8 +120,6 @@ HttpFlowData::~HttpFlowData() update_deallocations(partial_buffer_length[k]); delete[] partial_detect_buffer[k]; update_deallocations(partial_detect_length[k]); - delete[] js_detect_buffer[k]; - update_deallocations(js_detect_length[k]); HttpTransaction::delete_transaction(transaction[k], nullptr); delete cutter[k]; if (compress_stream[k] != nullptr) @@ -274,10 +272,6 @@ snort::JSNormalizer& HttpFlowData::acquire_js_ctx(int32_t ident_depth, size_t no max_template_nesting, max_scope_depth); update_allocations(JSNormalizer::size()); - auto ptr = js_detect_buffer[HttpCommon::SRC_SERVER]; - auto len = js_detect_length[HttpCommon::SRC_SERVER]; - js_normalizer->prepend_script(ptr, len); - debug_logf(4, http_trace, TRACE_JS_PROC, nullptr, "js_normalizer created (norm_depth %zd, max_template_nesting %d)\n", norm_depth, max_template_nesting); @@ -287,6 +281,8 @@ snort::JSNormalizer& HttpFlowData::acquire_js_ctx(int32_t ident_depth, size_t no void HttpFlowData::release_js_ctx() { + js_continue = false; + if (!js_normalizer) return; diff --git a/src/service_inspectors/http_inspect/http_flow_data.h b/src/service_inspectors/http_inspect/http_flow_data.h index 084e61500..0f2264439 100644 --- a/src/service_inspectors/http_inspect/http_flow_data.h +++ b/src/service_inspectors/http_inspect/http_flow_data.h @@ -85,9 +85,6 @@ public: void set_h2_body_state(HttpCommon::SourceId source_id, HttpEnums::H2BodyState state) { h2_body_state[source_id] = state; } - void reset_partial_flush(HttpCommon::SourceId source_id) - { partial_flush[source_id] = false; } - uint32_t get_h2_stream_id() const; private: @@ -170,8 +167,6 @@ private: HttpCommon::STAT_NOT_PRESENT }; int64_t detect_depth_remaining[2] = { HttpCommon::STAT_NOT_PRESENT, HttpCommon::STAT_NOT_PRESENT }; - int64_t js_norm_depth_remaining[2] = { HttpCommon::STAT_NOT_PRESENT, - HttpCommon::STAT_NOT_PRESENT }; int32_t publish_depth_remaining[2] = { HttpCommon::STAT_NOT_PRESENT, HttpCommon::STAT_NOT_PRESENT }; int32_t file_decomp_buffer_size_remaining[2] = { HttpCommon::STAT_NOT_PRESENT, @@ -187,8 +182,6 @@ private: uint8_t* partial_detect_buffer[2] = { nullptr, nullptr }; uint32_t partial_detect_length[2] = { 0, 0 }; uint32_t partial_js_detect_length[2] = { 0, 0 }; - uint8_t* js_detect_buffer[2] = { nullptr, nullptr }; - uint32_t js_detect_length[2] = { 0, 0 }; int32_t status_code_num = HttpCommon::STAT_NOT_PRESENT; HttpEnums::VersionId version_id[2] = { HttpEnums::VERS__NOT_PRESENT, HttpEnums::VERS__NOT_PRESENT }; @@ -218,6 +211,7 @@ private: // *** HttpJsNorm JSIdentifierCtxBase* js_ident_ctx = nullptr; snort::JSNormalizer* js_normalizer = nullptr; + bool js_continue = false; bool js_built_in_event = false; void reset_js_ident_ctx(); diff --git a/src/service_inspectors/http_inspect/http_inspect.cc b/src/service_inspectors/http_inspect/http_inspect.cc index 162347108..c5218aee5 100755 --- a/src/service_inspectors/http_inspect/http_inspect.cc +++ b/src/service_inspectors/http_inspect/http_inspect.cc @@ -590,7 +590,6 @@ bool HttpInspect::process(const uint8_t* data, const uint16_t dsize, Flow* const current_section->gen_events(); if (!session_data->partial_flush[source_id]) current_section->update_flow(); - 
session_data->partial_flush[source_id] = false; session_data->section_type[source_id] = SEC__NOT_COMPUTE; #ifdef REG_TEST diff --git a/src/service_inspectors/http_inspect/http_js_norm.cc b/src/service_inspectors/http_inspect/http_js_norm.cc index fbdaeb644..a8c0b1801 100644 --- a/src/service_inspectors/http_inspect/http_js_norm.cc +++ b/src/service_inspectors/http_inspect/http_js_norm.cc @@ -83,6 +83,7 @@ HttpJsNorm::HttpJsNorm(const HttpParaList::UriParam& uri_param_, int64_t normali int32_t identifier_depth_, uint8_t max_template_nesting_, uint32_t max_scope_depth_, const std::unordered_set& built_in_ident_) : uri_param(uri_param_), + detection_depth(UINT64_MAX), normalization_depth(normalization_depth_), identifier_depth(identifier_depth_), max_template_nesting(max_template_nesting_), @@ -135,8 +136,8 @@ void HttpJsNorm::configure() configure_once = true; } -void HttpJsNorm::enhanced_external_normalize(const Field& input, - HttpInfractions* infractions, HttpFlowData* ssn, char*& out_buf, size_t& out_len) const +void HttpJsNorm::do_external(const Field& input, Field& output, + HttpInfractions* infractions, HttpFlowData* ssn, bool final_portion) const { if (ssn->js_built_in_event) return; @@ -156,7 +157,6 @@ void HttpJsNorm::enhanced_external_normalize(const Field& input, trace_logf(2, http_trace, TRACE_JS_PROC, nullptr, "script continues\n"); - auto& js_ctx = ssn->acquire_js_ctx(identifier_depth, normalization_depth, max_template_nesting, max_scope_depth, built_in_ident); @@ -211,27 +211,34 @@ void HttpJsNorm::enhanced_external_normalize(const Field& input, break; } - auto result = js_ctx.get_script(); - out_buf = result.first; + debug_logf(4, http_trace, TRACE_JS_PROC, nullptr, + "input data was %s\n", final_portion ? "last one in PDU" : "a part of PDU"); + + uint32_t data_len = std::min(detection_depth, js_ctx.script_size()); - if (out_buf) + if (data_len) { - out_len = result.second; + const char* data = final_portion ? 
js_ctx.take_script() : js_ctx.get_script(); - trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr, - "js_data[%zu]: %.*s\n", out_len, static_cast(out_len), out_buf); + if (data) + { + trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr, + "js_data[%u]: %.*s\n", data_len, data_len, data); + + output.set(data_len, (const uint8_t*)data, final_portion); + } } } -void HttpJsNorm::enhanced_inline_normalize(const Field& input, - HttpInfractions* infractions, HttpFlowData* ssn, char*& out_buf, size_t& out_len) const +void HttpJsNorm::do_inline(const Field& input, Field& output, + HttpInfractions* infractions, HttpFlowData* ssn, bool final_portion) const { const char* ptr = (const char*)input.start(); const char* const end = ptr + input.length(); HttpEventGen* events = ssn->events[HttpCommon::SRC_SERVER]; - bool script_continue = alive_ctx(ssn); + bool script_continue = ssn->js_continue; bool script_external = false; while (ptr < end) @@ -282,7 +289,7 @@ void HttpJsNorm::enhanced_inline_normalize(const Field& input, auto& js_ctx = ssn->acquire_js_ctx(identifier_depth, normalization_depth, max_template_nesting, max_scope_depth, built_in_ident); - auto output_size_before = js_ctx.peek_script_size(); + auto output_size_before = js_ctx.script_size(); auto ret = js_normalize(js_ctx, end, ptr); @@ -324,7 +331,7 @@ void HttpJsNorm::enhanced_inline_normalize(const Field& input, break; } - if (script_external && output_size_before != js_ctx.peek_script_size()) + if (script_external && output_size_before != js_ctx.script_size()) { *infractions += INF_JS_CODE_IN_EXTERNAL; events->create_event(EVENT_JS_CODE_IN_EXTERNAL); @@ -333,26 +340,35 @@ void HttpJsNorm::enhanced_inline_normalize(const Field& input, script_continue = ret == JSTokenizer::SCRIPT_CONTINUE; } + ssn->js_continue = script_continue; + if (!alive_ctx(ssn)) return; + debug_logf(4, http_trace, TRACE_JS_PROC, nullptr, + "input data was %s\n", final_portion ? "last one in PDU" : "a part of PDU"); + auto js_ctx = ssn->js_normalizer; - auto result = js_ctx->get_script(); - out_buf = result.first; + uint32_t data_len = std::min(detection_depth, js_ctx->script_size()); - if (out_buf) + if (data_len) { - out_len = result.second; + const char* data = final_portion ? 
js_ctx->take_script() : js_ctx->get_script(); - trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr, - "js_data[%zu]: %.*s\n", out_len, static_cast(out_len), out_buf); + if (data) + { + trace_logf(1, http_trace, TRACE_JS_DUMP, nullptr, + "js_data[%u]: %.*s\n", data_len, data_len, data); + + output.set(data_len, (const uint8_t*)data, final_portion); + } } - if (!script_continue) + if (!script_continue && final_portion) ssn->release_js_ctx(); } -void HttpJsNorm::legacy_normalize(const Field& input, Field& output, HttpInfractions* infractions, +void HttpJsNorm::do_legacy(const Field& input, Field& output, HttpInfractions* infractions, HttpEventGen* events, int max_javascript_whitespaces) const { bool js_present = false; diff --git a/src/service_inspectors/http_inspect/http_js_norm.h b/src/service_inspectors/http_inspect/http_js_norm.h index ad6225710..076f1689e 100644 --- a/src/service_inspectors/http_inspect/http_js_norm.h +++ b/src/service_inspectors/http_inspect/http_js_norm.h @@ -41,12 +41,13 @@ public: const std::unordered_set& built_in_ident); ~HttpJsNorm(); - void legacy_normalize(const Field& input, Field& output, HttpInfractions*, HttpEventGen*, + void set_detection_depth(size_t depth) + { detection_depth = depth; } + + void do_legacy(const Field& input, Field& output, HttpInfractions*, HttpEventGen*, int max_javascript_whitespaces) const; - void enhanced_inline_normalize(const Field& input, HttpInfractions*, HttpFlowData*, - char*& out_buf, size_t& out_len) const; - void enhanced_external_normalize(const Field& input, HttpInfractions*, HttpFlowData*, - char*& out_buf, size_t& out_len) const; + void do_inline(const Field& input, Field& output, HttpInfractions*, HttpFlowData*, bool) const; + void do_external(const Field& input, Field& output, HttpInfractions*, HttpFlowData*, bool) const; void configure(); @@ -62,6 +63,7 @@ private: }; const HttpParaList::UriParam& uri_param; + size_t detection_depth; int64_t normalization_depth; int32_t identifier_depth; uint8_t max_template_nesting; diff --git a/src/service_inspectors/http_inspect/http_msg_body.cc b/src/service_inspectors/http_inspect/http_msg_body.cc index 722ec1159..d34202695 100644 --- a/src/service_inspectors/http_inspect/http_msg_body.cc +++ b/src/service_inspectors/http_inspect/http_msg_body.cc @@ -80,7 +80,8 @@ void HttpMsgBody::publish() void HttpMsgBody::bookkeeping_regular_flush(uint32_t& partial_detect_length, uint8_t*& partial_detect_buffer, uint32_t& partial_js_detect_length, int32_t detect_length) { - session_data->js_norm_depth_remaining[source_id] = session_data->detect_depth_remaining[source_id]; + params->js_norm_param.js_norm->set_detection_depth(session_data->detect_depth_remaining[source_id]); + session_data->detect_depth_remaining[source_id] -= detect_length; partial_detect_buffer = nullptr; partial_detect_length = 0; @@ -332,58 +333,17 @@ void HttpMsgBody::fd_event_callback(void* context, int event) } } -void HttpMsgBody::do_enhanced_js_normalization(char*& out_buf, size_t& out_buf_len) +void HttpMsgBody::do_enhanced_js_normalization(const Field& input, Field& output) { - const bool has_cumulative_data = (cumulative_data.length() > 0); - Field& input = has_cumulative_data ? 
cumulative_data : decompressed_file_body; - - bool js_continuation = session_data->js_normalizer; - uint8_t*& buf = session_data->js_detect_buffer[source_id]; - uint32_t& len = session_data->js_detect_length[source_id]; - - if (has_cumulative_data) - session_data->release_js_ctx(); - else - { - session_data->update_deallocations(len); - delete[] buf; - buf = nullptr; - len = 0; - } - + auto back = !session_data->partial_flush[source_id]; auto http_header = get_header(source_id); + auto normalizer = params->js_norm_param.js_norm; + auto infractions = transaction->get_infractions(source_id); if (http_header and http_header->is_external_js()) - params->js_norm_param.js_norm->enhanced_external_normalize(input, - transaction->get_infractions(source_id), session_data, out_buf, out_buf_len); - else - params->js_norm_param.js_norm->enhanced_inline_normalize(input, - transaction->get_infractions(source_id), session_data, out_buf, out_buf_len); - - out_buf_len = static_cast(out_buf_len) <= session_data->js_norm_depth_remaining[source_id] ? - out_buf_len : session_data->js_norm_depth_remaining[source_id]; - - if (out_buf_len > 0) - { - if (has_cumulative_data) - return; - - if (js_continuation) - { - uint8_t* nscript = new uint8_t[out_buf_len]; - - memcpy(nscript, out_buf, out_buf_len); - buf = nscript; - len = out_buf_len; - session_data->update_allocations(len); - } - } + normalizer->do_external(input, output, infractions, session_data, back); else - { - delete[] out_buf; - out_buf = nullptr; - out_buf_len = 0; - } + normalizer->do_inline(input, output, infractions, session_data, back); } void HttpMsgBody::do_legacy_js_normalization(const Field& input, Field& output) @@ -394,7 +354,7 @@ void HttpMsgBody::do_legacy_js_normalization(const Field& input, Field& output) return; } - params->js_norm_param.js_norm->legacy_normalize(input, output, + params->js_norm_param.js_norm->do_legacy(input, output, transaction->get_infractions(source_id), session_data->events[source_id], params->js_norm_param.max_javascript_whitespaces); } @@ -566,20 +526,15 @@ const Field& HttpMsgBody::get_decomp_vba_data() const Field& HttpMsgBody::get_norm_js_data() { - if (enhanced_js_norm_body.length() != STAT_NOT_COMPUTE) - return enhanced_js_norm_body; - - char* buf = nullptr; - size_t buf_len = 0; + if (norm_js_data.length() != STAT_NOT_COMPUTE) + return norm_js_data; - do_enhanced_js_normalization(buf, buf_len); + do_enhanced_js_normalization(decompressed_file_body, norm_js_data); - if (buf && buf_len) - enhanced_js_norm_body.set(buf_len, reinterpret_cast(buf), true); - else - enhanced_js_norm_body.set(STAT_NOT_PRESENT); + if (norm_js_data.length() == STAT_NOT_COMPUTE) + norm_js_data.set(STAT_NOT_PRESENT); - return enhanced_js_norm_body; + return norm_js_data; } int32_t HttpMsgBody::get_publish_length() const diff --git a/src/service_inspectors/http_inspect/http_msg_body.h b/src/service_inspectors/http_inspect/http_msg_body.h index 2f1a9c388..e3a0461fb 100644 --- a/src/service_inspectors/http_inspect/http_msg_body.h +++ b/src/service_inspectors/http_inspect/http_msg_body.h @@ -63,7 +63,7 @@ private: void do_file_processing(const Field& file_data); void do_utf_decoding(const Field& input, Field& output); void do_file_decompression(const Field& input, Field& output); - void do_enhanced_js_normalization(char*& out_buf, size_t& out_len); + void do_enhanced_js_normalization(const Field& input, Field& output); void do_legacy_js_normalization(const Field& input, Field& output); void clean_partial(uint32_t& partial_inspected_octets, 
uint32_t& partial_detect_length, uint8_t*& partial_detect_buffer, uint32_t& partial_js_detect_length, @@ -81,7 +81,7 @@ private: Field cumulative_data; Field js_norm_body; Field detect_data; - Field enhanced_js_norm_body; + Field norm_js_data; Field classic_client_body; // URI normalization applied Field decompressed_vba_data; diff --git a/src/service_inspectors/http_inspect/http_msg_header.cc b/src/service_inspectors/http_inspect/http_msg_header.cc index 5f75894a4..e896de26b 100755 --- a/src/service_inspectors/http_inspect/http_msg_header.cc +++ b/src/service_inspectors/http_inspect/http_msg_header.cc @@ -38,6 +38,7 @@ #include "http_common.h" #include "http_enum.h" #include "http_inspect.h" +#include "http_js_norm.h" #include "http_msg_request.h" #include "http_msg_body.h" #include "http_normalizers.h" @@ -449,7 +450,8 @@ void HttpMsgHeader::prepare_body() const int64_t& depth = (source_id == SRC_CLIENT) ? params->request_depth : params->response_depth; session_data->detect_depth_remaining[source_id] = (depth != -1) ? depth : INT64_MAX; - session_data->js_norm_depth_remaining[source_id] = session_data->detect_depth_remaining[source_id]; + params->js_norm_param.js_norm->set_detection_depth(session_data->detect_depth_remaining[source_id]); + if ((source_id == SRC_CLIENT) and params->publish_request_body and session_data->for_http2) { session_data->publish_octets[source_id] = 0; diff --git a/src/service_inspectors/http_inspect/http_stream_splitter_scan.cc b/src/service_inspectors/http_inspect/http_stream_splitter_scan.cc index 21f6b233a..b5623e6bc 100644 --- a/src/service_inspectors/http_inspect/http_stream_splitter_scan.cc +++ b/src/service_inspectors/http_inspect/http_stream_splitter_scan.cc @@ -157,6 +157,7 @@ StreamSplitter::Status HttpStreamSplitter::scan(Packet* pkt, const uint8_t* data #endif SectionType& type = session_data->type_expected[source_id]; + session_data->partial_flush[source_id] = false; if (type == SEC_ABORT) return status_value(StreamSplitter::ABORT); diff --git a/src/utils/js_normalizer.cc b/src/utils/js_normalizer.cc index 4e040b76b..781092fcf 100644 --- a/src/utils/js_normalizer.cc +++ b/src/utils/js_normalizer.cc @@ -71,6 +71,7 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len) ->pubsetbuf(const_cast(src), len); out_buf.reserve(src_len * BUFF_EXP_FACTOR); + tokenizer.pre_yylex(); JSTokenizer::JSRet ret = static_cast(tokenizer.yylex()); in.clear(); out.clear(); @@ -82,33 +83,3 @@ JSTokenizer::JSRet JSNormalizer::normalize(const char* src, size_t src_len) return rem_bytes ? 
ret : JSTokenizer::EOS; } - -std::pair JSNormalizer::get_script() -{ - streamsize len = 0; - char* dst = out_buf.release_data(len); - return {dst, len}; -} - -size_t JSNormalizer::peek_script_size() -{ - return out.tellp(); -} - -void JSNormalizer::prepend_script(const void* p , size_t n) -{ - if (p) - out_buf.sputn(reinterpret_cast(p), n); -} - -size_t JSNormalizer::size() -{ - return sizeof(JSNormalizer) + 16834; // the default YY_BUF_SIZE -} - -#ifdef BENCHMARK_TEST -void JSNormalizer::rewind_output() -{ - out_buf.pubseekoff(0, ios_base::beg, ios_base::out); -} -#endif diff --git a/src/utils/js_normalizer.h b/src/utils/js_normalizer.h index f2866d11a..508380d9d 100644 --- a/src/utils/js_normalizer.h +++ b/src/utils/js_normalizer.h @@ -38,21 +38,29 @@ public: int tmp_cap_size = JSTOKENIZER_BUF_MAX_SIZE); ~JSNormalizer(); + JSTokenizer::JSRet normalize(const char* src, size_t src_len); + const char* get_src_next() const { return src_next; } void reset_depth() { rem_bytes = depth; } - JSTokenizer::JSRet normalize(const char* src, size_t src_len); - std::pair get_script(); - size_t peek_script_size(); - void prepend_script(const void*, size_t); + const char* take_script() + { return out_buf.take_data(); } + + const char* get_script() const + { return out_buf.data(); } + + size_t script_size() + { return out.tellp(); } - static size_t size(); + static size_t size() + { return sizeof(JSNormalizer) + 16834; /* YY_BUF_SIZE */ } #ifdef BENCHMARK_TEST - void rewind_output(); + void rewind_output() + { out_buf.pubseekoff(0, std::ios_base::beg, std::ios_base::out); } #endif private: diff --git a/src/utils/js_tokenizer.h b/src/utils/js_tokenizer.h index 47239648e..5079e23d5 100644 --- a/src/utils/js_tokenizer.h +++ b/src/utils/js_tokenizer.h @@ -116,6 +116,9 @@ public: int cap_size = JSTOKENIZER_BUF_MAX_SIZE); ~JSTokenizer() override; + // internal actions before calling main loop + void pre_yylex(); + // returns JSRet int yylex() override; @@ -164,7 +167,8 @@ private: struct { JSToken token = UNDEFINED; // the token before - int length = 0; // current token length + int orig_len = 0; // current token original length + int norm_len = 0; // normalized length of previous tokens int sc = 0; // current Starting Condition } states[JSTOKENIZER_MAX_STATES]; int sp = 0; // points to the top of states @@ -172,6 +176,7 @@ private: char*& tmp_buf; size_t& tmp_buf_size; const int tmp_cap_size; + int output_steps_back; bool newline_found = false; constexpr static bool insert_semicolon[ASI_GROUP_MAX][ASI_GROUP_MAX] diff --git a/src/utils/js_tokenizer.l b/src/utils/js_tokenizer.l index f399dc1bc..f2239747b 100644 --- a/src/utils/js_tokenizer.l +++ b/src/utils/js_tokenizer.l @@ -1146,6 +1146,7 @@ JSTokenizer::JSTokenizer(std::istream& in, std::ostream& out, tmp_buf(buf), tmp_buf_size(buf_size), tmp_cap_size(cap_size), + output_steps_back(0), max_scope_depth(max_scope_depth) { scope_push(GLOBAL); @@ -1160,6 +1161,12 @@ JSTokenizer::~JSTokenizer() tmp_buf_size = 0; } +void JSTokenizer::pre_yylex() +{ + assert(output_steps_back >= 0); + yyout.seekp(-output_steps_back, std::ios_base::cur); +} + void JSTokenizer::switch_to_temporal(const std::string& data) { tmp.str(data); @@ -1351,19 +1358,22 @@ void JSTokenizer::states_push() auto& state = states[sp]; state.token = token; - state.length = yyleng; + state.orig_len = yyleng; + state.norm_len = yyout.rdbuf()->pubseekoff(0, std::ios_base::cur, std::ios_base::out); state.sc = yy_start; } void JSTokenizer::states_correct(int take_off) { auto& state = states[sp]; - 
state.length -= yyleng - take_off; + state.orig_len -= yyleng - take_off; } void JSTokenizer::states_apply() { int tail_size = 0; + int outbuf_pos = yyout.tellp(); + int outbuf_back = outbuf_pos; for (int i = JSTOKENIZER_MAX_STATES; i > 0 && tail_size < tmp_cap_size; --i) { @@ -1371,17 +1381,21 @@ void JSTokenizer::states_apply() idx %= JSTOKENIZER_MAX_STATES; auto& state = states[idx]; - if (state.length == 0) + outbuf_back = state.norm_len; + + if (state.orig_len == 0) continue; token = state.token; yy_start = state.sc; - tail_size += state.length; + tail_size += state.orig_len; tail_size = tail_size < tmp_cap_size ? tail_size : tmp_cap_size; } + output_steps_back = outbuf_pos - outbuf_back; + for (int i = 0; i < JSTOKENIZER_MAX_STATES; ++i) - states[i].length = 0; + states[i].orig_len = 0; char* buf = new char[tail_size]; diff --git a/src/utils/streambuf.cc b/src/utils/streambuf.cc index 1b5e1d1b0..ff46f9653 100644 --- a/src/utils/streambuf.cc +++ b/src/utils/streambuf.cc @@ -251,7 +251,19 @@ void ostreambuf_infl::reserve(streamsize n) enlarge(n - size); } -char* ostreambuf_infl::release_data(streamsize& n) +const char* ostreambuf_infl::take_data() +{ + auto data = pbase(); + + setp(nullptr, nullptr); + + gen.s = states[0].s; + gen.n = states[0].n; + + return data; +} + +const char* ostreambuf_infl::take_data(streamsize& n) { auto data = pbase(); @@ -278,6 +290,9 @@ streampos ostreambuf_infl::seekoff(streamoff off, ios_base::seekdir way, ios_bas if (!(which & ios_base::out)) return -1; + if (off == 0 && way == ios_base::cur) + return pptr() - pbase(); + auto base = pbase(); auto ptr = pptr(); auto eptr = epptr(); diff --git a/src/utils/streambuf.h b/src/utils/streambuf.h index 674a5b498..b008db2da 100644 --- a/src/utils/streambuf.h +++ b/src/utils/streambuf.h @@ -78,7 +78,14 @@ public: // releases the current buffer, // the caller takes ownership over the buffer - char* release_data(std::streamsize& n); + const char* take_data(); + const char* take_data(std::streamsize& n); + + const char* data() const + { return pbase(); } + + std::streamsize data_len() const + { return pptr() - pbase(); } protected: virtual std::streambuf* setbuf(char* s, std::streamsize n) override; diff --git a/src/utils/test/js_normalizer_test.cc b/src/utils/test/js_normalizer_test.cc index 9ac56bf7b..99d16f19b 100644 --- a/src/utils/test/js_normalizer_test.cc +++ b/src/utils/test/js_normalizer_test.cc @@ -75,9 +75,8 @@ static const std::unordered_set s_ident_built_in { "console", "eval JSNormalizer norm(ident_ctx, DEPTH, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); \ auto ret = norm.normalize(src, sizeof(src)); \ const char* ptr = norm.get_src_next(); \ - auto result = norm.get_script(); \ - char* dst = result.first; \ - int act_len = result.second; \ + int act_len = norm.script_size(); \ + const char* dst = norm.take_script(); #define VALIDATE(src, expected) \ CHECK(ret == JSTokenizer::SCRIPT_CONTINUE); \ @@ -100,12 +99,10 @@ static const std::unordered_set s_ident_built_in { "console", "eval JSNormalizer norm(ident_ctx, depth, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); \ ret = norm.normalize(src, src_len); \ ptr = norm.get_src_next(); \ - auto result = norm.get_script(); \ - char* dptr = result.first; \ - len = result.second; \ + len = norm.script_size(); \ + const char* dptr = norm.get_script(); \ REQUIRE(len == dst_len); \ memcpy(dst, dptr, dst_len); \ - delete[] dptr; \ } #define DO(src, slen, dst, dlen) \ @@ -113,9 +110,8 @@ static const std::unordered_set s_ident_built_in { "console", "eval auto ret = 
norm.normalize(src, slen); \ CHECK(ret == JSTokenizer::SCRIPT_CONTINUE); \ auto nsrc = norm.get_src_next(); \ - auto result = norm.get_script(); \ - char* ptr = result.first; \ - int act_len = result.second; \ + int act_len = norm.script_size(); \ + const char* ptr = norm.take_script(); \ REQUIRE(nsrc - src == slen); \ REQUIRE(act_len == dlen); \ memcpy(dst, ptr, dlen); \ @@ -126,12 +122,10 @@ static const std::unordered_set s_ident_built_in { "console", "eval { \ auto ret = norm.normalize(src, slen); \ CHECK(ret == rexp); \ - auto result = norm.get_script(); \ - char* ptr = result.first; \ - int act_len = result.second; \ + int act_len = norm.script_size(); \ + const char* ptr = norm.get_script(); \ REQUIRE(act_len == dlen); \ memcpy(dst, ptr, dlen); \ - delete[] ptr; \ } #define CLOSE() \ @@ -1454,9 +1448,8 @@ TEST_CASE("endings", "[JSNormalizer]") JSNormalizer norm(ident_ctx, 7, MAX_TEMPLATE_NESTING, MAX_SCOPE_DEPTH); ret = norm.normalize(src, sizeof(src)); ptr = norm.get_src_next(); - auto res1 = norm.get_script(); - char* dst1 = res1.first; - int act_len1 = res1.second; + int act_len1 = norm.script_size(); + const char* dst1 = norm.take_script(); CHECK(ret == JSTokenizer::EOS); CHECK(ptr == src + 7); @@ -1466,9 +1459,8 @@ TEST_CASE("endings", "[JSNormalizer]") ret = norm.normalize(src2, sizeof(src2)); ptr = norm.get_src_next(); - auto res2 = norm.get_script(); - char* dst2 = res2.first; - int act_len2 = res2.second; + int act_len2 = norm.script_size(); + const char* dst2 = norm.take_script(); CHECK(ret == JSTokenizer::EOS); CHECK(ptr == src2 + sizeof(src2)); diff --git a/src/utils/test/streambuf_test.cc b/src/utils/test/streambuf_test.cc index f3246b4fc..d9285b83a 100644 --- a/src/utils/test/streambuf_test.cc +++ b/src/utils/test/streambuf_test.cc @@ -69,8 +69,17 @@ using namespace std; #define EXP_RES(b, exp, exp_len, exp_mem_size) \ { \ + auto d1_len = (b).data_len(); \ + auto d1 = (b).data(); \ streamsize act_len; \ - char* act = (b).release_data(act_len); \ + const char* act = (b).take_data(act_len); \ + auto d2_len = (b).data_len(); \ + auto d2 = (b).data(); \ + \ + CHECK(d1 == act); \ + CHECK(d1_len == act_len); \ + CHECK(d2 == nullptr); \ + CHECK(d2_len == 0); \ \ CHECK((exp_mem_size) == act_len); \ REQUIRE((exp_len) <= act_len); \ @@ -101,12 +110,11 @@ using namespace std; CHECK((exp_len) == (s).tellp()); \ \ ostreambuf_infl* b = reinterpret_cast((s).rdbuf()); \ - streamsize act_len; \ - char* act = b->release_data(act_len); \ + auto act = b->data(); \ + auto act_len = b->data_len(); \ \ REQUIRE((exp_len) == act_len); \ CHECK(!memcmp((exp), act, (exp_len))); \ - delete[] act; \ } #define EOF_OUT(s, exp, exp_len) \ @@ -117,12 +125,11 @@ using namespace std; CHECK((exp_len) == (s).tellp()); \ \ ostreambuf_infl* b = reinterpret_cast((s).rdbuf()); \ - streamsize act_len; \ - char* act = b->release_data(act_len); \ + auto act = b->data(); \ + auto act_len = b->data_len(); \ \ REQUIRE((exp_len) == act_len); \ CHECK(!memcmp((exp), act, (exp_len))); \ - delete[] act; \ } TEST_CASE("input buffer - basic one source", "[Stream buffers]")
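---

Editorial note: the single-pass, per-portion flow this PR introduces can be summarized with a short usage sketch. It is modeled on the updated js_normalizer_test.cc macros and on HttpJsNorm::do_external()/do_inline() above; it is illustrative only. The helper name, the printf reporting, the include path, and the assumption that a suitable JSNormalizer (with its identifier context) has already been constructed are mine, not part of the patch.

```cpp
// Minimal sketch of the single-pass API after this change (assumptions noted
// above). Portions of one PDU are fed to the same JSNormalizer; the
// accumulated output is only peeked at for intermediate portions and taken
// (ownership transferred, internal buffer reset) for the final portion.
#include <cstdio>

#include "utils/js_normalizer.h"

static void process_portion(snort::JSNormalizer& norm,
    const char* portion, size_t len, bool final_portion)
{
    // Feed the next chunk; normalized output accumulates in the
    // normalizer's internal output stream buffer.
    auto ret = norm.normalize(portion, len);
    (void)ret;  // e.g. JSTokenizer::SCRIPT_CONTINUE or JSTokenizer::EOS,
                // as checked in the unit tests above

    size_t out_len = norm.script_size();   // peek: output size so far
    const char* out = final_portion
        ? norm.take_script()                // end of PDU: take ownership
        : norm.get_script();                // mid-PDU: non-destructive peek

    if (out and out_len)
        std::printf("js_data[%zu]: %.*s\n", out_len, (int)out_len, out);
}
```

The same split is visible in HttpJsNorm::do_external()/do_inline(): get_script() is used while more portions of the PDU are expected, take_script() plus output.set(..., final_portion) once the last portion has arrived, and release_js_ctx() drops the context only when the script is complete and the PDU has ended.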
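The buffer-level change can be sketched the same way. The peek methods data()/data_len(), the destructive take_data(), and the tellp() fast path come straight from the patch to src/utils/streambuf.h and streambuf.cc; the on-demand growth, the sample payload, and the snort:: qualification are assumptions made for the example.

```cpp
// Minimal sketch of the new peek/take split on the inflatable output buffer
// (see src/utils/streambuf.h in this diff). Assumptions noted above.
#include <cassert>
#include <cstring>
#include <ostream>

#include "utils/streambuf.h"

static void peek_then_take(snort::ostreambuf_infl& buf)
{
    std::ostream out(&buf);
    out << "var a=1;";                      // normalized output lands here

    // Non-destructive peek: the data stays in place, so the next
    // normalization pass keeps appending to the same output buffer.
    assert(buf.data_len() == 8);
    assert(std::memcmp(buf.data(), "var a=1;", 8) == 0);

    // tellp() now resolves through the seekoff(0, cur, out) fast path.
    assert(out.tellp() == 8);

    // Destructive take: the caller owns the returned pointer and the
    // buffer resets (data() becomes null, data_len() becomes 0). In
    // http_inspect the pointer is handed to a Field that takes ownership.
    const char* owned = buf.take_data();
    assert(owned and buf.data() == nullptr and buf.data_len() == 0);
}
```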