Pull request #4039: http_inspect: run detection on failed utf decoding

author Adrian Mamolea (admamole) <admamole@cisco.com>

Fri, 6 Oct 2023 15:34:41 +0000 (15:34 +0000)

committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>

Fri, 6 Oct 2023 15:34:41 +0000 (15:34 +0000)
author Adrian Mamolea (admamole) <admamole@cisco.com>
Fri, 6 Oct 2023 15:34:41 +0000 (15:34 +0000)
committer Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
Fri, 6 Oct 2023 15:34:41 +0000 (15:34 +0000)
diff --git a/src/decompress/test/file_olefile_test.cc b/src/decompress/test/file_olefile_test.cc

index ffd5026aeb420a3b7826ea24ccfca3c75a83a002..c2a301639df3fbf4ab6343b94956fb1b4677fb62 100644 (file)
--- a/src/decompress/test/file_olefile_test.cc
+++ b/src/decompress/test/file_olefile_test.cc
@@ -44,7 +44,7 @@ LiteralSearch::Handle* LiteralSearch::setup() { return nullptr; }
  void LiteralSearch::cleanup(LiteralSearch::Handle*) { }
  LiteralSearch* LiteralSearch::instantiate(LiteralSearch::Handle*, const uint8_t*, unsigned, bool,
      bool) { return nullptr; }
-void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode) { }
+void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode, CharsetSrc) { }
  bool UtfDecodeSession::decode_utf(unsigned char const*, unsigned int, unsigned char*, unsigned int,
      int*) { return true; }
  UtfDecodeSession::UtfDecodeSession() { }
diff --git a/src/service_inspectors/http_inspect/http_msg_body.cc b/src/service_inspectors/http_inspect/http_msg_body.cc

index 21e3df583675f1e57c7dac6b1c461f52297395c0..343cd84ed3522a3cebe78318e2b7f1ff3d124a1a 100644 (file)
--- a/src/service_inspectors/http_inspect/http_msg_body.cc
+++ b/src/service_inspectors/http_inspect/http_msg_body.cc
@@ -328,40 +328,32 @@ void HttpMsgBody::analyze()
  
  void HttpMsgBody::do_utf_decoding(const Field& input, Field& output)
  {
-    if ((session_data->utf_state[source_id] == nullptr) || (input.length() == 0))
+    auto ctx = session_data->utf_state[source_id];
+
+    if ((ctx == nullptr) || (input.length() <= 0) || !ctx->is_utf_encoding_present())
      {
          output.set(input);
          return;
      }
  
-    if (session_data->utf_state[source_id]->is_utf_encoding_present())
-    {
-        int bytes_copied;
-        bool decoded;
-        uint8_t* buffer = new uint8_t[input.length()];
-        decoded = session_data->utf_state[source_id]->decode_utf(
-            input.start(), input.length(), buffer, input.length(), &bytes_copied);
+    int bytes_copied;
+    uint8_t* buffer = new uint8_t[input.length()];
  
-        if (!decoded)
-        {
-            delete[] buffer;
-            output.set(input);
-            add_infraction(INF_UTF_NORM_FAIL);
-            create_event(EVENT_UTF_NORM_FAIL);
-        }
-        else if (bytes_copied > 0)
-        {
-            output.set(bytes_copied, buffer, true);
-        }
-        else
-        {
-            delete[] buffer;
-            output.set(input);
-        }
+    if (!ctx->decode_utf(input.start(), input.length(), buffer, input.length(), &bytes_copied))
+    {
+        add_infraction(INF_UTF_NORM_FAIL);
+        create_event(EVENT_UTF_NORM_FAIL);
+        if (CHARSET_SET_BY_GUESS == ctx->get_decode_utf_charset_src())
+            bytes_copied = 0;
      }
  
+    if (bytes_copied > 0)
+        output.set(bytes_copied, buffer, true);
      else
+    {
+        delete[] buffer;
          output.set(input);
+    }
  }
  
  void HttpMsgBody::get_ole_data()
diff --git a/src/utils/util_utf.cc b/src/utils/util_utf.cc

index 853f73943c8eafea63f997fcb4703eadd2eba28b..70bbdc06293c4300bcf127e05ee45fcc58fe61e2 100644 (file)
--- a/src/utils/util_utf.cc
+++ b/src/utils/util_utf.cc
@@ -49,13 +49,15 @@ void UtfDecodeSession::init_decode_utf_state()
  {
      dstate.state = DSTATE_FIRST;
      dstate.charset = CHARSET_DEFAULT;
+    dstate.charset_src = CHARSET_SET_BY_GUESS;
  }
  
  /* setters & getters */
-void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode charset)
+void UtfDecodeSession::set_decode_utf_state_charset(CharsetCode charset, CharsetSrc src)
  {
      dstate.state = DSTATE_FIRST;
      dstate.charset = charset;
+    dstate.charset_src = src;
  }
  
  CharsetCode UtfDecodeSession::get_decode_utf_state_charset()
@@ -63,6 +65,11 @@ CharsetCode UtfDecodeSession::get_decode_utf_state_charset()
      return dstate.charset;
  }
  
+CharsetSrc UtfDecodeSession::get_decode_utf_charset_src()
+{
+    return dstate.charset_src;
+}
+
  bool UtfDecodeSession::is_utf_encoding_present()
  {
      if ( get_decode_utf_state_charset() > CHARSET_IRRELEVANT )
@@ -252,76 +259,79 @@ bool UtfDecodeSession::DecodeUTF32BE(const uint8_t* src, unsigned int src_len, u
  
  void UtfDecodeSession::determine_charset(const uint8_t** src, unsigned int* src_len)
  {
-    CharsetCode charset;
-    if (dstate.charset == CHARSET_UNKNOWN)
-    {
-        /* Got a text content type but no charset.
-         * Look for potential BOM (Byte Order Mark) */
-        if (*src_len >= 4)
-        {
-            uint8_t size = 0;
-
-            if (!memcmp(*src, "\x00\x00\xFE\xFF", 4))
-            {
-                charset = CHARSET_UTF32BE;
-                size = 4;
-            }
-            else if (!memcmp(*src, "\xFF\xFE\x00\x00", 4))
-            {
-                charset = CHARSET_UTF32LE;
-                size = 4;
-            }
-            else if (!memcmp(*src, "\xFE\xFF", 2))
-            {
-                charset = CHARSET_UTF16BE;
-                size = 2;
-            }
-            else if (!memcmp(*src, "\xFF\xFE", 2))
-            {
-                charset = CHARSET_UTF16LE;
-                size = 2;
-            }
-            //  BOM (Byte Order Mark) was missing. Try to guess the encoding.
-            else if (((*src)[0] == '\0') && ((*src)[2] == '\0') && ((*src)[3] != '\0'))
-            {
-                if ((*src)[1] != '\0')
-                    charset = CHARSET_UTF16BE;  // \0C\0C
-                else
-                    charset = CHARSET_UTF32BE;  // \0\0\0C
-            }
-            else if (((*src)[0] != '\0') && ((*src)[1] == '\0') && ((*src)[3] == '\0'))
-            {
-                if ((*src)[2] != '\0')
-                    charset = CHARSET_UTF16LE;  // C\0C\0
-                else
-                    charset = CHARSET_UTF32LE;  // C\0\0\0
-            }
-            else
-            {
-                // NOTE: The UTF-8 BOM (Byte Order Mark) does not match the above cases, so we end
-                // up here when parsing UTF-8. That works out for the moment because the first 128
-                // characters of UTF-8 are identical to ASCII. We may want to handle other UTF-8
-                // characters beyond 0x7f in the future.
+    if (dstate.charset != CHARSET_UNKNOWN)
+        return;
  
-                charset = CHARSET_DEFAULT; // ensure we don't try again
-            }
+    CharsetCode charset = CHARSET_DEFAULT;
+    CharsetSrc charset_src = CHARSET_SET_BY_GUESS;
  
-            // FIXIT-M We are not currently handling the case where some characters are not ASCII
-            // and some are ASCII. This is a problem because some UTF-16 characters have no NUL
-            // bytes (so won't be identified as UTF-16.)
+    if (*src_len < 4)
+    {
+        set_decode_utf_state_charset(charset, charset_src);
+        return;
+    }
  
-            // FIXIT-L We also do not handle multiple levels of encoding (where unicode becomes
-            // %u0020 for example).
+    /* Got a text content type but no charset.
+     * Look for potential BOM (Byte Order Mark) */
+    uint8_t size = 0;
  
-            *src += size;
-            *src_len -= size;
-        }
+    if (!memcmp(*src, "\x00\x00\xFE\xFF", 4))
+    {
+        charset = CHARSET_UTF32BE;
+        size = 4;
+    }
+    else if (!memcmp(*src, "\xFF\xFE\x00\x00", 4))
+    {
+        charset = CHARSET_UTF32LE;
+        size = 4;
+    }
+    else if (!memcmp(*src, "\xFE\xFF", 2))
+    {
+        charset = CHARSET_UTF16BE;
+        size = 2;
+    }
+    else if (!memcmp(*src, "\xFF\xFE", 2))
+    {
+        charset = CHARSET_UTF16LE;
+        size = 2;
+    }
+
+    // If BOM (Byte Order Mark) is missing try to guess the encoding.
+    if (charset != CHARSET_DEFAULT)
+        charset_src = CHARSET_SET_BY_BOM;
+    else if (((*src)[0] == '\0') && ((*src)[2] == '\0') && ((*src)[3] != '\0'))
+    {
+        if ((*src)[1] != '\0')
+            charset = CHARSET_UTF16BE;  // \0C\0C
          else
-        {
-            charset = CHARSET_DEFAULT; // ensure we don't try again
-        }
-        set_decode_utf_state_charset(charset);
+            charset = CHARSET_UTF32BE;  // \0\0\0C
+    }
+    else if (((*src)[0] != '\0') && ((*src)[1] == '\0') && ((*src)[3] == '\0'))
+    {
+        if ((*src)[2] != '\0')
+            charset = CHARSET_UTF16LE;  // C\0C\0
+        else
+            charset = CHARSET_UTF32LE;  // C\0\0\0
      }
+    else
+    {
+        // NOTE: The UTF-8 BOM (Byte Order Mark) does not match the above cases, so we end
+        // up here when parsing UTF-8. That works out for the moment because the first 128
+        // characters of UTF-8 are identical to ASCII. We may want to handle other UTF-8
+        // characters beyond 0x7f in the future.
+    }
+
+    // FIXIT-M We are not currently handling the case where some characters are not ASCII
+    // and some are ASCII. This is a problem because some UTF-16 characters have no NUL
+    // bytes (so won't be identified as UTF-16.)
+
+    // FIXIT-L We also do not handle multiple levels of encoding (where unicode becomes
+    // %u0020 for example).
+
+    *src += size;
+    *src_len -= size;
+
+    set_decode_utf_state_charset(charset, charset_src);
  }
  
  /* Wrapper function for DecodeUTF{16,32}{LE,BE} */
diff --git a/src/utils/util_utf.h b/src/utils/util_utf.h

index 322674da47b694a5c4d7a87c2614ddfc37c62e88..be30bafcd59e310dee0974a16dde85fd3e1a5310 100644 (file)
--- a/src/utils/util_utf.h
+++ b/src/utils/util_utf.h
@@ -38,6 +38,14 @@ enum CharsetCode
      CHARSET_UNKNOWN
  };
  
+// How character set was set.
+enum CharsetSrc
+{
+    CHARSET_SET_BY_APP=0,
+    CHARSET_SET_BY_BOM,
+    CHARSET_SET_BY_GUESS
+};
+
  // Since payloads don't have to end on 2/4-byte boundaries, callers to
  // DecodeUTF are responsible for keeping a decode_utf_state_t. This carries
  // state between subsequent calls.
@@ -45,6 +53,7 @@ struct decode_utf_state_t
  {
      int state;
      CharsetCode charset;
+    CharsetSrc charset_src;
  };
  
  namespace snort
@@ -55,8 +64,9 @@ public:
      UtfDecodeSession();
      virtual ~UtfDecodeSession() = default;
      void init_decode_utf_state();
-    void set_decode_utf_state_charset(CharsetCode charset);
+    void set_decode_utf_state_charset(CharsetCode charset, CharsetSrc src = CHARSET_SET_BY_APP);
      CharsetCode get_decode_utf_state_charset();
+    CharsetSrc get_decode_utf_charset_src();
      bool is_utf_encoding_present();
      bool decode_utf(const uint8_t* src, unsigned int src_len, uint8_t* dst, unsigned int dst_len,
          int* bytes_copied);
author	Adrian Mamolea (admamole) <admamole@cisco.com>
	Fri, 6 Oct 2023 15:34:41 +0000 (15:34 +0000)
committer	Oleksii Shumeiko -X (oshumeik - SOFTSERVE INC at Cisco) <oshumeik@cisco.com>
	Fri, 6 Oct 2023 15:34:41 +0000 (15:34 +0000)
src/decompress/test/file_olefile_test.cc		patch | blob | blame | history
src/service_inspectors/http_inspect/http_msg_body.cc		patch | blob | blame | history
src/utils/util_utf.cc		patch | blob | blame | history
src/utils/util_utf.h		patch | blob | blame | history