From: Adrian Mamolea (admamole) Date: Fri, 6 Oct 2023 15:34:41 +0000 (+0000) Subject: Pull request #4039: http_inspect: run detection on failed utf decoding X-Git-Tag: 3.1.72.0~4 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f45faeac0b2be9ba3b00b4e12ce8bf347479e258;p=thirdparty%2Fsnort3.git Pull request #4039: http_inspect: run detection on failed utf decoding Merge in SNORT/snort3 from ~ADMAMOLE/snort3:utf to master Squashed commit of the following: commit fe4c6b14626890ba2fa116faa4c9b632532e0cf9 Author: Adrian Mamolea Date: Fri Sep 29 17:11:09 2023 -0400 http_inspect: run detection on failed utf decoding --- diff --git a/src/decompress/test/file_olefile_test.cc b/src/decompress/test/file_olefile_test.cc index ffd5026ae..c2a301639 100644 --- a/src/decompress/test/file_olefile_test.cc +++ b/src/decompress/test/file_olefile_test.cc @@ -44,7 +44,7 @@ LiteralSearch::Handle* LiteralSearch::setup() { return nullptr; } void LiteralSearch::cleanup(LiteralSearch::Handle*) { } LiteralSearch* LiteralSearch::instantiate(LiteralSearch::Handle*, const uint8_t*, unsigned, bool, bool) { return nullptr; } -void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode) { } +void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode, CharsetSrc) { } bool UtfDecodeSession::decode_utf(unsigned char const*, unsigned int, unsigned char*, unsigned int, int*) { return true; } UtfDecodeSession::UtfDecodeSession() { } diff --git a/src/service_inspectors/http_inspect/http_msg_body.cc b/src/service_inspectors/http_inspect/http_msg_body.cc index 21e3df583..343cd84ed 100644 --- a/src/service_inspectors/http_inspect/http_msg_body.cc +++ b/src/service_inspectors/http_inspect/http_msg_body.cc @@ -328,40 +328,32 @@ void HttpMsgBody::analyze() void HttpMsgBody::do_utf_decoding(const Field& input, Field& output) { - if ((session_data->utf_state[source_id] == nullptr) || (input.length() == 0)) + auto ctx = session_data->utf_state[source_id]; + + if ((ctx == nullptr) || (input.length() <= 0) || !ctx->is_utf_encoding_present()) { output.set(input); return; } - if (session_data->utf_state[source_id]->is_utf_encoding_present()) - { - int bytes_copied; - bool decoded; - uint8_t* buffer = new uint8_t[input.length()]; - decoded = session_data->utf_state[source_id]->decode_utf( - input.start(), input.length(), buffer, input.length(), &bytes_copied); + int bytes_copied; + uint8_t* buffer = new uint8_t[input.length()]; - if (!decoded) - { - delete[] buffer; - output.set(input); - add_infraction(INF_UTF_NORM_FAIL); - create_event(EVENT_UTF_NORM_FAIL); - } - else if (bytes_copied > 0) - { - output.set(bytes_copied, buffer, true); - } - else - { - delete[] buffer; - output.set(input); - } + if (!ctx->decode_utf(input.start(), input.length(), buffer, input.length(), &bytes_copied)) + { + add_infraction(INF_UTF_NORM_FAIL); + create_event(EVENT_UTF_NORM_FAIL); + if (CHARSET_SET_BY_GUESS == ctx->get_decode_utf_charset_src()) + bytes_copied = 0; } + if (bytes_copied > 0) + output.set(bytes_copied, buffer, true); else + { + delete[] buffer; output.set(input); + } } void HttpMsgBody::get_ole_data() diff --git a/src/utils/util_utf.cc b/src/utils/util_utf.cc index 853f73943..70bbdc062 100644 --- a/src/utils/util_utf.cc +++ b/src/utils/util_utf.cc @@ -49,13 +49,15 @@ void UtfDecodeSession::init_decode_utf_state() { dstate.state = DSTATE_FIRST; dstate.charset = CHARSET_DEFAULT; + dstate.charset_src = CHARSET_SET_BY_GUESS; } /* setters & getters */ -void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode charset) +void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode charset, CharsetSrc src) { dstate.state = DSTATE_FIRST; dstate.charset = charset; + dstate.charset_src = src; } CharsetCode UtfDecodeSession::get_decode_utf_state_charset() @@ -63,6 +65,11 @@ CharsetCode UtfDecodeSession::get_decode_utf_state_charset() return dstate.charset; } +CharsetSrc UtfDecodeSession::get_decode_utf_charset_src() +{ + return dstate.charset_src; +} + bool UtfDecodeSession::is_utf_encoding_present() { if ( get_decode_utf_state_charset() > CHARSET_IRRELEVANT ) @@ -252,76 +259,79 @@ bool UtfDecodeSession::DecodeUTF32BE(const uint8_t* src, unsigned int src_len, u void UtfDecodeSession::determine_charset(const uint8_t** src, unsigned int* src_len) { - CharsetCode charset; - if (dstate.charset == CHARSET_UNKNOWN) - { - /* Got a text content type but no charset. - * Look for potential BOM (Byte Order Mark) */ - if (*src_len >= 4) - { - uint8_t size = 0; - - if (!memcmp(*src, "\x00\x00\xFE\xFF", 4)) - { - charset = CHARSET_UTF32BE; - size = 4; - } - else if (!memcmp(*src, "\xFF\xFE\x00\x00", 4)) - { - charset = CHARSET_UTF32LE; - size = 4; - } - else if (!memcmp(*src, "\xFE\xFF", 2)) - { - charset = CHARSET_UTF16BE; - size = 2; - } - else if (!memcmp(*src, "\xFF\xFE", 2)) - { - charset = CHARSET_UTF16LE; - size = 2; - } - // BOM (Byte Order Mark) was missing. Try to guess the encoding. - else if (((*src)[0] == '\0') && ((*src)[2] == '\0') && ((*src)[3] != '\0')) - { - if ((*src)[1] != '\0') - charset = CHARSET_UTF16BE; // \0C\0C - else - charset = CHARSET_UTF32BE; // \0\0\0C - } - else if (((*src)[0] != '\0') && ((*src)[1] == '\0') && ((*src)[3] == '\0')) - { - if ((*src)[2] != '\0') - charset = CHARSET_UTF16LE; // C\0C\0 - else - charset = CHARSET_UTF32LE; // C\0\0\0 - } - else - { - // NOTE: The UTF-8 BOM (Byte Order Mark) does not match the above cases, so we end - // up here when parsing UTF-8. That works out for the moment because the first 128 - // characters of UTF-8 are identical to ASCII. We may want to handle other UTF-8 - // characters beyond 0x7f in the future. + if (dstate.charset != CHARSET_UNKNOWN) + return; - charset = CHARSET_DEFAULT; // ensure we don't try again - } + CharsetCode charset = CHARSET_DEFAULT; + CharsetSrc charset_src = CHARSET_SET_BY_GUESS; - // FIXIT-M We are not currently handling the case where some characters are not ASCII - // and some are ASCII. This is a problem because some UTF-16 characters have no NUL - // bytes (so won't be identified as UTF-16.) + if (*src_len < 4) + { + set_decode_utf_state_charset(charset, charset_src); + return; + } - // FIXIT-L We also do not handle multiple levels of encoding (where unicode becomes - // %u0020 for example). + /* Got a text content type but no charset. + * Look for potential BOM (Byte Order Mark) */ + uint8_t size = 0; - *src += size; - *src_len -= size; - } + if (!memcmp(*src, "\x00\x00\xFE\xFF", 4)) + { + charset = CHARSET_UTF32BE; + size = 4; + } + else if (!memcmp(*src, "\xFF\xFE\x00\x00", 4)) + { + charset = CHARSET_UTF32LE; + size = 4; + } + else if (!memcmp(*src, "\xFE\xFF", 2)) + { + charset = CHARSET_UTF16BE; + size = 2; + } + else if (!memcmp(*src, "\xFF\xFE", 2)) + { + charset = CHARSET_UTF16LE; + size = 2; + } + + // If BOM (Byte Order Mark) is missing try to guess the encoding. + if (charset != CHARSET_DEFAULT) + charset_src = CHARSET_SET_BY_BOM; + else if (((*src)[0] == '\0') && ((*src)[2] == '\0') && ((*src)[3] != '\0')) + { + if ((*src)[1] != '\0') + charset = CHARSET_UTF16BE; // \0C\0C else - { - charset = CHARSET_DEFAULT; // ensure we don't try again - } - set_decode_utf_state_charset(charset); + charset = CHARSET_UTF32BE; // \0\0\0C + } + else if (((*src)[0] != '\0') && ((*src)[1] == '\0') && ((*src)[3] == '\0')) + { + if ((*src)[2] != '\0') + charset = CHARSET_UTF16LE; // C\0C\0 + else + charset = CHARSET_UTF32LE; // C\0\0\0 } + else + { + // NOTE: The UTF-8 BOM (Byte Order Mark) does not match the above cases, so we end + // up here when parsing UTF-8. That works out for the moment because the first 128 + // characters of UTF-8 are identical to ASCII. We may want to handle other UTF-8 + // characters beyond 0x7f in the future. + } + + // FIXIT-M We are not currently handling the case where some characters are not ASCII + // and some are ASCII. This is a problem because some UTF-16 characters have no NUL + // bytes (so won't be identified as UTF-16.) + + // FIXIT-L We also do not handle multiple levels of encoding (where unicode becomes + // %u0020 for example). + + *src += size; + *src_len -= size; + + set_decode_utf_state_charset(charset, charset_src); } /* Wrapper function for DecodeUTF{16,32}{LE,BE} */ diff --git a/src/utils/util_utf.h b/src/utils/util_utf.h index 322674da4..be30bafcd 100644 --- a/src/utils/util_utf.h +++ b/src/utils/util_utf.h @@ -38,6 +38,14 @@ enum CharsetCode CHARSET_UNKNOWN }; +// How character set was set. +enum CharsetSrc +{ + CHARSET_SET_BY_APP=0, + CHARSET_SET_BY_BOM, + CHARSET_SET_BY_GUESS +}; + // Since payloads don't have to end on 2/4-byte boundaries, callers to // DecodeUTF are responsible for keeping a decode_utf_state_t. This carries // state between subsequent calls. @@ -45,6 +53,7 @@ struct decode_utf_state_t { int state; CharsetCode charset; + CharsetSrc charset_src; }; namespace snort @@ -55,8 +64,9 @@ public: UtfDecodeSession(); virtual ~UtfDecodeSession() = default; void init_decode_utf_state(); - void set_decode_utf_state_charset(CharsetCode charset); + void set_decode_utf_state_charset(CharsetCode charset, CharsetSrc src = CHARSET_SET_BY_APP); CharsetCode get_decode_utf_state_charset(); + CharsetSrc get_decode_utf_charset_src(); bool is_utf_encoding_present(); bool decode_utf(const uint8_t* src, unsigned int src_len, uint8_t* dst, unsigned int dst_len, int* bytes_copied);