From: Adrian Mamolea (admamole) <admamole@cisco.com>
Date: Fri, 6 Oct 2023 15:34:41 +0000 (+0000)
Subject: Pull request #4039: http_inspect: run detection on failed utf decoding
X-Git-Tag: 3.1.72.0~4
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f45faeac0b2be9ba3b00b4e12ce8bf347479e258;p=thirdparty%2Fsnort3.git

Pull request #4039: http_inspect: run detection on failed utf decoding

Merge in SNORT/snort3 from ~ADMAMOLE/snort3:utf to master

Squashed commit of the following:

commit fe4c6b14626890ba2fa116faa4c9b632532e0cf9
Author: Adrian Mamolea <admamole@cisco.com>
Date:   Fri Sep 29 17:11:09 2023 -0400

    http_inspect: run detection on failed utf decoding
---

diff --git a/src/decompress/test/file_olefile_test.cc b/src/decompress/test/file_olefile_test.cc
index ffd5026ae..c2a301639 100644
--- a/src/decompress/test/file_olefile_test.cc
+++ b/src/decompress/test/file_olefile_test.cc
@@ -44,7 +44,7 @@ LiteralSearch::Handle* LiteralSearch::setup() { return nullptr; }
 void LiteralSearch::cleanup(LiteralSearch::Handle*) { }
 LiteralSearch* LiteralSearch::instantiate(LiteralSearch::Handle*, const uint8_t*, unsigned, bool,
     bool) { return nullptr; }
-void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode) { }
+void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode, CharsetSrc) { }
 bool UtfDecodeSession::decode_utf(unsigned char const*, unsigned int, unsigned char*, unsigned int,
     int*) { return true; }
 UtfDecodeSession::UtfDecodeSession() { }
diff --git a/src/service_inspectors/http_inspect/http_msg_body.cc b/src/service_inspectors/http_inspect/http_msg_body.cc
index 21e3df583..343cd84ed 100644
--- a/src/service_inspectors/http_inspect/http_msg_body.cc
+++ b/src/service_inspectors/http_inspect/http_msg_body.cc
@@ -328,40 +328,32 @@ void HttpMsgBody::analyze()
 
 void HttpMsgBody::do_utf_decoding(const Field& input, Field& output)
 {
-    if ((session_data->utf_state[source_id] == nullptr) || (input.length() == 0))
+    auto ctx = session_data->utf_state[source_id];
+
+    if ((ctx == nullptr) || (input.length() <= 0) || !ctx->is_utf_encoding_present())
     {
         output.set(input);
         return;
     }
 
-    if (session_data->utf_state[source_id]->is_utf_encoding_present())
-    {
-        int bytes_copied;
-        bool decoded;
-        uint8_t* buffer = new uint8_t[input.length()];
-        decoded = session_data->utf_state[source_id]->decode_utf(
-            input.start(), input.length(), buffer, input.length(), &bytes_copied);
+    int bytes_copied;
+    uint8_t* buffer = new uint8_t[input.length()];
 
-        if (!decoded)
-        {
-            delete[] buffer;
-            output.set(input);
-            add_infraction(INF_UTF_NORM_FAIL);
-            create_event(EVENT_UTF_NORM_FAIL);
-        }
-        else if (bytes_copied > 0)
-        {
-            output.set(bytes_copied, buffer, true);
-        }
-        else
-        {
-            delete[] buffer;
-            output.set(input);
-        }
+    if (!ctx->decode_utf(input.start(), input.length(), buffer, input.length(), &bytes_copied))
+    {
+        add_infraction(INF_UTF_NORM_FAIL);
+        create_event(EVENT_UTF_NORM_FAIL);
+        if (CHARSET_SET_BY_GUESS == ctx->get_decode_utf_charset_src())
+            bytes_copied = 0;
     }
 
+    if (bytes_copied > 0)
+        output.set(bytes_copied, buffer, true);
     else
+    {
+        delete[] buffer;
         output.set(input);
+    }
 }
 
 void HttpMsgBody::get_ole_data()
diff --git a/src/utils/util_utf.cc b/src/utils/util_utf.cc
index 853f73943..70bbdc062 100644
--- a/src/utils/util_utf.cc
+++ b/src/utils/util_utf.cc
@@ -49,13 +49,15 @@ void UtfDecodeSession::init_decode_utf_state()
 {
     dstate.state = DSTATE_FIRST;
     dstate.charset = CHARSET_DEFAULT;
+    dstate.charset_src = CHARSET_SET_BY_GUESS;
 }
 
 /* setters & getters */
-void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode charset)
+void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode charset, CharsetSrc src)
 {
     dstate.state = DSTATE_FIRST;
     dstate.charset = charset;
+    dstate.charset_src = src;
 }
 
 CharsetCode UtfDecodeSession::get_decode_utf_state_charset()
@@ -63,6 +65,11 @@ CharsetCode UtfDecodeSession::get_decode_utf_state_charset()
     return dstate.charset;
 }
 
+CharsetSrc UtfDecodeSession::get_decode_utf_charset_src()
+{
+    return dstate.charset_src;
+}
+
 bool UtfDecodeSession::is_utf_encoding_present()
 {
     if ( get_decode_utf_state_charset() > CHARSET_IRRELEVANT )
@@ -252,76 +259,79 @@ bool UtfDecodeSession::DecodeUTF32BE(const uint8_t* src, unsigned int src_len, u
 
 void UtfDecodeSession::determine_charset(const uint8_t** src, unsigned int* src_len)
 {
-    CharsetCode charset;
-    if (dstate.charset == CHARSET_UNKNOWN)
-    {
-        /* Got a text content type but no charset.
-         * Look for potential BOM (Byte Order Mark) */
-        if (*src_len >= 4)
-        {
-            uint8_t size = 0;
-
-            if (!memcmp(*src, "\x00\x00\xFE\xFF", 4))
-            {
-                charset = CHARSET_UTF32BE;
-                size = 4;
-            }
-            else if (!memcmp(*src, "\xFF\xFE\x00\x00", 4))
-            {
-                charset = CHARSET_UTF32LE;
-                size = 4;
-            }
-            else if (!memcmp(*src, "\xFE\xFF", 2))
-            {
-                charset = CHARSET_UTF16BE;
-                size = 2;
-            }
-            else if (!memcmp(*src, "\xFF\xFE", 2))
-            {
-                charset = CHARSET_UTF16LE;
-                size = 2;
-            }
-            //  BOM (Byte Order Mark) was missing. Try to guess the encoding.
-            else if (((*src)[0] == '\0') && ((*src)[2] == '\0') && ((*src)[3] != '\0'))
-            {
-                if ((*src)[1] != '\0')
-                    charset = CHARSET_UTF16BE;  // \0C\0C
-                else
-                    charset = CHARSET_UTF32BE;  // \0\0\0C
-            }
-            else if (((*src)[0] != '\0') && ((*src)[1] == '\0') && ((*src)[3] == '\0'))
-            {
-                if ((*src)[2] != '\0')
-                    charset = CHARSET_UTF16LE;  // C\0C\0
-                else
-                    charset = CHARSET_UTF32LE;  // C\0\0\0
-            }
-            else
-            {
-                // NOTE: The UTF-8 BOM (Byte Order Mark) does not match the above cases, so we end
-                // up here when parsing UTF-8. That works out for the moment because the first 128
-                // characters of UTF-8 are identical to ASCII. We may want to handle other UTF-8
-                // characters beyond 0x7f in the future.
+    if (dstate.charset != CHARSET_UNKNOWN)
+        return;
 
-                charset = CHARSET_DEFAULT; // ensure we don't try again
-            }
+    CharsetCode charset = CHARSET_DEFAULT;
+    CharsetSrc charset_src = CHARSET_SET_BY_GUESS;
 
-            // FIXIT-M We are not currently handling the case where some characters are not ASCII
-            // and some are ASCII. This is a problem because some UTF-16 characters have no NUL
-            // bytes (so won't be identified as UTF-16.)
+    if (*src_len < 4)
+    {
+        set_decode_utf_state_charset(charset, charset_src);
+        return;
+    }
 
-            // FIXIT-L We also do not handle multiple levels of encoding (where unicode becomes
-            // %u0020 for example).
+    /* Got a text content type but no charset.
+     * Look for potential BOM (Byte Order Mark) */
+    uint8_t size = 0;
 
-            *src += size;
-            *src_len -= size;
-        }
+    if (!memcmp(*src, "\x00\x00\xFE\xFF", 4))
+    {
+        charset = CHARSET_UTF32BE;
+        size = 4;
+    }
+    else if (!memcmp(*src, "\xFF\xFE\x00\x00", 4))
+    {
+        charset = CHARSET_UTF32LE;
+        size = 4;
+    }
+    else if (!memcmp(*src, "\xFE\xFF", 2))
+    {
+        charset = CHARSET_UTF16BE;
+        size = 2;
+    }
+    else if (!memcmp(*src, "\xFF\xFE", 2))
+    {
+        charset = CHARSET_UTF16LE;
+        size = 2;
+    }
+
+    // A BOM (Byte Order Mark) settles the charset; if it is missing, try to guess the encoding.
+    if (charset != CHARSET_DEFAULT)
+        charset_src = CHARSET_SET_BY_BOM;
+    else if (((*src)[0] == '\0') && ((*src)[2] == '\0') && ((*src)[3] != '\0'))
+    {
+        if ((*src)[1] != '\0')
+            charset = CHARSET_UTF16BE;  // \0C\0C
         else
-        {
-            charset = CHARSET_DEFAULT; // ensure we don't try again
-        }
-        set_decode_utf_state_charset(charset);
+            charset = CHARSET_UTF32BE;  // \0\0\0C
+    }
+    else if (((*src)[0] != '\0') && ((*src)[1] == '\0') && ((*src)[3] == '\0'))
+    {
+        if ((*src)[2] != '\0')
+            charset = CHARSET_UTF16LE;  // C\0C\0
+        else
+            charset = CHARSET_UTF32LE;  // C\0\0\0
     }
+    else
+    {
+        // NOTE: The UTF-8 BOM (Byte Order Mark) does not match the above cases, so we end
+        // up here when parsing UTF-8. That works out for the moment because the first 128
+        // characters of UTF-8 are identical to ASCII. We may want to handle other UTF-8
+        // characters beyond 0x7f in the future.
+    }
+
+    // FIXIT-M We are not currently handling the case where some characters are not ASCII
+    // and some are ASCII. This is a problem because some UTF-16 characters have no NUL
+    // bytes (so won't be identified as UTF-16.)
+
+    // FIXIT-L We also do not handle multiple levels of encoding (where unicode becomes
+    // %u0020 for example).
+
+    *src += size;
+    *src_len -= size;
+
+    set_decode_utf_state_charset(charset, charset_src);
 }
 
 /* Wrapper function for DecodeUTF{16,32}{LE,BE} */
diff --git a/src/utils/util_utf.h b/src/utils/util_utf.h
index 322674da4..be30bafcd 100644
--- a/src/utils/util_utf.h
+++ b/src/utils/util_utf.h
@@ -38,6 +38,14 @@ enum CharsetCode
     CHARSET_UNKNOWN
 };
 
+// How the character set was determined.
+enum CharsetSrc
+{
+    CHARSET_SET_BY_APP=0,
+    CHARSET_SET_BY_BOM,
+    CHARSET_SET_BY_GUESS
+};
+
 // Since payloads don't have to end on 2/4-byte boundaries, callers to
 // DecodeUTF are responsible for keeping a decode_utf_state_t. This carries
 // state between subsequent calls.
@@ -45,6 +53,7 @@ struct decode_utf_state_t
 {
     int state;
     CharsetCode charset;
+    CharsetSrc charset_src;
 };
 
 namespace snort
@@ -55,8 +64,9 @@ public:
     UtfDecodeSession();
     virtual ~UtfDecodeSession() = default;
     void init_decode_utf_state();
-    void set_decode_utf_state_charset(CharsetCode charset);
+    void set_decode_utf_state_charset(CharsetCode charset, CharsetSrc src = CHARSET_SET_BY_APP);
     CharsetCode get_decode_utf_state_charset();
+    CharsetSrc get_decode_utf_charset_src();
     bool is_utf_encoding_present();
     bool decode_utf(const uint8_t* src, unsigned int src_len, uint8_t* dst, unsigned int dst_len,
         int* bytes_copied);