From: Sascha Steinbiss <satta@debian.org>
Date: Fri, 31 May 2024 09:15:26 +0000 (+0200)
Subject: rust: add JsonBuilder::set_string_limited()
X-Git-Tag: suricata-8.0.0-beta1~1049
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=dd972f72dd5922b35bddbf4557720ac5ed259a3a;p=thirdparty%2Fsuricata.git

rust: add JsonBuilder::set_string_limited()
---

diff --git a/rust/src/jsonbuilder.rs b/rust/src/jsonbuilder.rs
index 8b42966636..94bcaa3cfc 100644
--- a/rust/src/jsonbuilder.rs
+++ b/rust/src/jsonbuilder.rs
@@ -15,7 +15,7 @@
  * 02110-1301, USA.
  */
 
- //! Module for building JSON documents.
+//! Module for building JSON documents.
 
 #![allow(clippy::missing_safety_doc)]
 
@@ -502,6 +502,40 @@ impl JsonBuilder {
         Ok(self)
     }
 
+    /// Set a key and a string value on an object, with a limited size
+    pub fn set_string_limited(
+        &mut self, key: &str, val: &str, limit: usize,
+    ) -> Result<&mut Self, JsonError> {
+        if val.len() > limit {
+            // Gracefully handle splitting UTF-8 strings at arbitrary locations.
+            // Strings in Rust are UTF-8; and a UTF-8 code point is max 4 bytes.
+            // Hence we will find a suitable boundary within 4 bytes of any byte
+            // position, in any direction with sufficiently long strings left.
+            // This is an approach similar to Rust's (currently nightly unstable
+            // only) "floor_char_boundary" str method:
+            // https://doc.rust-lang.org/std/primitive.str.html#method.floor_char_boundary
+            for i in 0..=std::cmp::min(limit, 4) {
+                // We first try the requested boundary. In the expected
+                // ("happy") case the limit is at a code point boundary so the
+                // slice will succeed immediately. If not, we successively try
+                // earlier positions in the string until we find a suitable
+                // position.
+                if let Some(valtrunc) = val.get(0..limit - i) {
+                    let additional_bytes = val.len() - limit;
+                    let outstr = format!(
+                        "{valtrunc}[truncated {additional_bytes} additional byte{}]",
+                        if additional_bytes != 1 { "s" } else { "" }
+                    );
+                    self.set_string(key, &outstr)?;
+                    break;
+                }
+            }
+        } else {
+            self.set_string(key, val)?;
+        }
+        Ok(self)
+    }
+
     pub fn set_formatted(&mut self, formatted: &str) -> Result<&mut Self, JsonError> {
         match self.current_state() {
             State::ObjectNth => {
@@ -531,8 +565,15 @@ impl JsonBuilder {
     pub fn set_string_from_bytes_limited(&mut self, key: &str, val: &[u8], limit: usize) -> Result<&mut Self, JsonError> {
         let mut valtrunc = Vec::new();
         let val = if val.len() > limit {
+            let additional_bytes = val.len() - limit;
             valtrunc.extend_from_slice(&val[..limit]);
-            valtrunc.extend_from_slice(b"[truncated]");
+            valtrunc.extend_from_slice(
+                format!(
+                    "[truncated {additional_bytes} additional byte{}]",
+                    if additional_bytes != 1 { "s" } else { "" }
+                )
+                .as_bytes(),
+            );
             &valtrunc
         } else {
             val
@@ -1285,6 +1326,88 @@ mod test {
         Ok(())
     }
 
+    #[test]
+    fn test_set_string_limited() {
+        let mut jb = JsonBuilder::try_new_object().unwrap();
+        jb.set_string_limited("val", "foobar", 10).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"foobar""#);
+        jb.reset();
+        jb.set_string_limited("val", "foobar", 2).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"fo[truncated 4 additional bytes]""#);
+        jb.reset();
+        jb.set_string_limited("val", "foobar", 0).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"[truncated 6 additional bytes]""#);
+        jb.reset();
+        let unicode_str = "Hello, ä¸ç! ðð";
+        // invalid unicode boundary, naive access should panic
+        let result = std::panic::catch_unwind(|| _ = unicode_str[..9]);
+        assert!(result.is_err());
+        // our code should just skip the incomplete character
+        jb.set_string_limited("val", unicode_str, 9).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"Hello, [truncated 14 additional bytes]""#);
+        jb.reset();
+        // valid unicode boundary, naive access should not panic
+        let result = std::panic::catch_unwind(|| _ = unicode_str[..10]);
+        assert!(result.is_ok());
+        jb.set_string_limited("val", unicode_str, 10).unwrap();
+        assert_eq!(
+            jb.buf,
+            r#"{"val":"Hello, ä¸[truncated 13 additional bytes]""#
+        );
+        jb.reset();
+        let unicode_str2 = "ä¸";
+        // this character has three UTF-8 bytes
+        assert_eq!(
+            unicode_str2,
+            std::str::from_utf8(&[0xE4, 0xB8, 0x96]).unwrap()
+        );
+        let result = std::panic::catch_unwind(|| _ = unicode_str2[..1]);
+        assert!(result.is_err());
+        jb.set_string_limited("val", unicode_str2, 1).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"[truncated 2 additional bytes]""#);
+        jb.reset();
+        jb.set_string_limited("val", unicode_str2, 2).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"[truncated 1 additional byte]""#);
+        jb.reset();
+        // with limit 3 or more we should include it in the log
+        jb.set_string_limited("val", unicode_str2, 3).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"ä¸""#);
+        jb.reset();
+        jb.set_string_limited("val", unicode_str2, 4).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"ä¸""#);
+        jb.reset();
+        jb.set_string_limited("val", unicode_str2, 0).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"[truncated 3 additional bytes]""#);
+        let unicode_str3 = "ð´ó §ó ¢ó ·ó ¬ó ³ó ¿";
+        // this character consists of multiple code points
+        jb.reset();
+        jb.set_string_limited("val", unicode_str3, 7).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"ð´[truncated 21 additional bytes]""#);
+        jb.reset();
+        jb.set_string_limited("val", unicode_str3, 2).unwrap();
+        assert_eq!(jb.buf, r#"{"val":"[truncated 26 additional bytes]""#);
+    }
+
+    #[test]
+    fn test_set_string_from_bytes_limited() {
+        let mut jb = JsonBuilder::try_new_object().unwrap();
+        jb.set_string_from_bytes_limited("first", b"foobar", 10)
+            .unwrap();
+        assert_eq!(jb.buf, r#"{"first":"foobar""#);
+        jb.set_string_from_bytes_limited("second", b"foobar", 2)
+            .unwrap();
+        assert_eq!(
+            jb.buf,
+            r#"{"first":"foobar","second":"fo[truncated 4 additional bytes]""#
+        );
+        jb.set_string_from_bytes_limited("third", b"foobar", 0)
+            .unwrap();
+        assert_eq!(
+            jb.buf,
+            r#"{"first":"foobar","second":"fo[truncated 4 additional bytes]","third":"[truncated 6 additional bytes]""#
+        );
+    }
+
     #[test]
     fn test_invalid_utf8() {
         let mut jb = JsonBuilder::try_new_object().unwrap();