From: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Date: Tue, 30 Apr 2024 00:41:43 +0000 (+1200)
Subject: util:charset: add strncasecmp_ldb()
X-Git-Tag: tdb-1.4.11~564
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=eb91e3437b44c7ad653aac86d481ceaaddb06b01;p=thirdparty%2Fsamba.git

util:charset: add strncasecmp_ldb()

This is a function for comparing strings in a way that suits the
case-insensitive syntaxes in LDB.

We have it here, rather than in LDB itself, because it needs the
upcase table. By default LDB uses ASCII-only comparisons. SSSD and
OpenChange use it in that configuration, but Samba replaces the
comparison and casefold functions with Unicode-aware versions.

Until now Samba has done that in a bad way; this will allow it to do
better.

Signed-off-by: Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Reviewed-by: Andrew Bartlett <abartlet@samba.org>
---

diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h
index 1f90718028c..8452b5766d1 100644
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -158,6 +158,10 @@ char *talloc_alpha_strcpy(TALLOC_CTX *mem_ctx,
 void string_replace_m(char *s, char oldc, char newc);
 bool strcsequal(const char *s1,const char *s2);
 bool strequal_m(const char *s1, const char *s2);
+int strncasecmp_ldb(const char *s1,
+		    size_t len1,
+		    const char *s2,
+		    size_t len2);
 int strncasecmp_m(const char *s1, const char *s2, size_t n);
 int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
 			 const char *s1, const char *s2, size_t n);
@@ -174,6 +178,7 @@ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n);
  char *strlower_talloc_handle(struct smb_iconv_handle *iconv_handle,
 			      TALLOC_CTX *ctx, const char *src);
 char *strlower_talloc(TALLOC_CTX *ctx, const char *src);
+
 bool strhasupper(const char *string);
 bool strhaslower_handle(struct smb_iconv_handle *ic,
 			const char *string);
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c
index 830b4808c54..5e301852c54 100644
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@@ -23,6 +23,7 @@
 #include "charset.h"
 #include "lib/util/byteorder.h"
 #include "lib/util/fault.h"
+#include "lib/util/tsort.h"
 
 /**
  String replace.
@@ -165,6 +166,155 @@ _PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src)
 	return strupper_talloc(ctx, src);
 }
 
+
+/*
+ * strncasecmp_ldb() works a *bit* like strncasecmp, with various
+ * tricks to suit the way LDB compares strings. The differences are:
+ *
+ * 0. each string has its own length.
+ *
+ * 1. consecutive spaces are collapsed down to one space, so that
+ *    "a  b" equals "a b". (this is why each string needs its own
+ *    length). Leading and trailing spaces are removed altogether.
+ *
+ * 2. Comparisons are done in UPPER CASE, as Windows does, not in
+ *    lowercase as POSIX would have it.
+ *
+ * 3. The first invalid byte sequence ends the comparison, as if it
+ *    were '\0', but it compares higher than any real character,
+ *    unlike '\0'. For example, "hello\xc2\xffworld" would end after
+ *    "hello" -- although '\xc2' is a valid utf-8 byte in the right
+ *    circumstances, "\xc2\xff" is an invalid sequence.
+ *
+ *    All invalid sequences compare equal. This means
+ *    "hello\xC2\xFFworld" equals "  hElLo\xFE ".
+ */
+
+/* Skip a run of ' ' at s; ends_in_space is set if the string then ends. */
+#define EAT_SPACE(s, len, ends_in_space)			 \
+	do {							 \
+		while (len) {					 \
+			if (*s != ' ') {			 \
+				break;				 \
+			}					 \
+			s++;					 \
+			len--;					 \
+		}						 \
+		ends_in_space = (len == 0 || *s == '\0');	 \
+	} while(0)
+
+_PUBLIC_ int strncasecmp_ldb(const char *s1,
+			     size_t len1,
+			     const char *s2,
+			     size_t len2)
+{
+	struct smb_iconv_handle *iconv_handle = get_iconv_handle();
+	codepoint_t c1, c2;
+	size_t cs1, cs2;
+	bool ends_in_space1, ends_in_space2;
+	int ret;
+	bool end1, end2;
+
+	EAT_SPACE(s1, len1, ends_in_space1);
+	EAT_SPACE(s2, len2, ends_in_space2);
+	/*
+	 * if ends_in_space was set, the string was empty or only
+	 * spaces (which we treat as equivalent).
+	 */
+	if (ends_in_space1 && ends_in_space2) {
+		return 0;
+	}
+	if (ends_in_space1) {
+		return -1;
+	}
+	if (ends_in_space2) {
+		return 1;
+	}
+
+	while (true) {
+		/*
+		 * If the next byte is a space, we eat all the spaces,
+		 * and say we found a single codepoint. If the spaces
+		 * were at the end of the string, the codepoint is 0,
+		 * as if there were no spaces. Otherwise it is 0x20,
+		 * as if there was one space.
+		 *
+		 * Setting the codepoint to 0 will break the loop, but
+		 * only after codepoints have been found in both strings.
+		 */
+		if (len1 == 0 || *s1 == 0) {
+			c1 = 0;
+		} else if (*s1 == ' ') {
+			EAT_SPACE(s1, len1, ends_in_space1);
+			c1 = ends_in_space1 ? 0 : ' ';
+		} else {
+			c1 = next_codepoint_handle_ext(iconv_handle, s1, len1,
+						       CH_UNIX, &cs1);
+			if (c1 != INVALID_CODEPOINT) {
+				s1 += cs1;
+				len1 -= cs1;
+			}
+		}
+
+		if (len2 == 0 || *s2 == 0) {
+			c2 = 0;
+		} else if (*s2 == ' ') {
+			EAT_SPACE(s2, len2, ends_in_space2);
+			c2 = ends_in_space2 ? 0 : ' ';
+		} else {
+			c2 = next_codepoint_handle_ext(iconv_handle, s2, len2,
+						       CH_UNIX, &cs2);
+			if (c2 != INVALID_CODEPOINT) {
+				s2 += cs2;
+				len2 -= cs2;
+			}
+		}
+
+		if (c1 == 0 || c2 == 0 ||
+		    c1 == INVALID_CODEPOINT || c2 == INVALID_CODEPOINT) {
+			break;
+		}
+
+		if (c1 == c2) {
+			continue;
+		}
+		c1 = toupper_m(c1);
+		c2 = toupper_m(c2);
+		if (c1 != c2) {
+			break;
+		}
+	}
+
+	/*
+	 * Either a difference has been found, or one or both strings have
+	 * ended or hit invalid codepoints.
+	 */
+	ret = NUMERIC_CMP(c1, c2);
+
+	if (ret != 0) {
+		return ret;
+	}
+	/*
+	 * Equal so far; the string that ends first sorts lower.
+	 */
+	end1 = len1 == 0 || *s1 == 0;
+	end2 = len2 == 0 || *s2 == 0;
+
+	if (end1 && end2) {
+		return 0;
+	}
+	if (end1) {
+		return -1;
+	}
+	if (end2) {
+		return 1;
+	}
+	return 0;
+}
+
+#undef EAT_SPACE
+
+
 /**
  Find the number of 'c' chars in a string
 **/