util:charset: add strncasecmp_ldb()

author Douglas Bagnall <douglas.bagnall@catalyst.net.nz>

Tue, 30 Apr 2024 00:41:43 +0000 (12:41 +1200)

committer Andrew Bartlett <abartlet@samba.org>

Wed, 22 May 2024 23:12:32 +0000 (23:12 +0000)
author Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
Tue, 30 Apr 2024 00:41:43 +0000 (12:41 +1200)
committer Andrew Bartlett <abartlet@samba.org>
Wed, 22 May 2024 23:12:32 +0000 (23:12 +0000)
diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h

index 1f90718028caca7a3b0036c903b8572193150d83..8452b5766d1679b12655dbae5bcb1d74f8af6c15 100644 (file)
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -158,6 +158,10 @@ char *talloc_alpha_strcpy(TALLOC_CTX *mem_ctx,
  void string_replace_m(char *s, char oldc, char newc);
  bool strcsequal(const char *s1,const char *s2);
  bool strequal_m(const char *s1, const char *s2);
+int strncasecmp_ldb(const char *s1,
+                   size_t len1,
+                   const char *s2,
+                   size_t len2);
  int strncasecmp_m(const char *s1, const char *s2, size_t n);
  int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle,
                          const char *s1, const char *s2, size_t n);
@@ -174,6 +178,7 @@ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n);
   char *strlower_talloc_handle(struct smb_iconv_handle *iconv_handle,
                               TALLOC_CTX *ctx, const char *src);
  char *strlower_talloc(TALLOC_CTX *ctx, const char *src);
+
  bool strhasupper(const char *string);
  bool strhaslower_handle(struct smb_iconv_handle *ic,
                         const char *string);
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c

index 830b4808c54793c0235a16249c10e68f1bf8d669..5e301852c54d7d37825713057f7dd7aba158d149 100644 (file)
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@@ -23,6 +23,7 @@
  #include "charset.h"
  #include "lib/util/byteorder.h"
  #include "lib/util/fault.h"
+#include "lib/util/tsort.h"
  
  /**
   String replace.
@@ -165,6 +166,155 @@ _PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src)
         return strupper_talloc(ctx, src);
  }
  
+
+/*
+ * strncasecmp_ldb() works like a *bit* like strncasecmp, with various
+ * tricks to suit the way LDB compares strings. The differences are:
+ *
+ * 0. each string has it's own length.
+ *
+ * 1. consecutive spaces are collapsed down to one space, so that
+ *    "a  b" equals "a b". (this is why each string needs its own
+ *    length). Leading and trailing spaces are removed altogether.
+ *
+ * 2. Comparisons are done in UPPER CASE, as Windows does, not in
+ *    lowercase as POSIX would have it.
+ *
+ * 3. The first invalid byte sequence ends the comparison, as if it
+ *    were '\0', but it compares higher than any real character,
+ *    unlike '\0'. For example, "hello\xc2\xffworld" would end after
+ *    "hello" -- although '\xc2' is a valid utf-8 byte in the right
+ *    circumstances, "\xc2\xff" is an invalid sequence.
+ *
+ *    All invalid sequences compare equal. This means
+ *    "hello\xC2\xFFworld" equals "  hElLo\xFE ".
+ */
+
+#define EAT_SPACE(s, len, ends_in_space)                        \
+       do {                                                     \
+               while (len) {                                    \
+                       if (*s != ' ') {                         \
+                               break;                           \
+                       }                                        \
+                       s++;                                     \
+                       len--;                                   \
+               }                                                \
+               ends_in_space = (len == 0 || *s == '\0');        \
+       } while(0)
+
+
+_PUBLIC_ int strncasecmp_ldb(const char *s1,
+                            size_t len1,
+                            const char *s2,
+                            size_t len2)
+{
+       struct smb_iconv_handle *iconv_handle = get_iconv_handle();
+       codepoint_t c1, c2;
+       size_t cs1, cs2;
+       bool ends_in_space1, ends_in_space2;
+       int ret;
+       bool end1, end2;
+
+       EAT_SPACE(s1, len1, ends_in_space1);
+       EAT_SPACE(s2, len2, ends_in_space2);
+       /*
+        * if ends_in_space was set, the string was empty or only
+        * spaces (which we treat as equivalent).
+        */
+       if (ends_in_space1 && ends_in_space2) {
+               return 0;
+       }
+       if (ends_in_space1) {
+               return -1;
+       }
+       if (ends_in_space2) {
+               return 1;
+       }
+
+       while (true) {
+               /*
+                * If the next byte is a space, we eat all the spaces,
+                * and say we found a single codepoint. If the spaces
+                * were at the end of the string, the codepoint is 0,
+                * as if there were no spaces. Otherwise it is 0x20,
+                * as if there was one space.
+                *
+                * Setting the codepoint to 0 will break the loop, but
+                * only after codepoints have been found in both strings.
+                */
+               if (len1 == 0 || *s1 == 0) {
+                       c1 = 0;
+               } else if (*s1 == ' ') {
+                       EAT_SPACE(s1, len1, ends_in_space1);
+                       c1 = ends_in_space1 ? 0 : ' ';
+               } else {
+                       c1 = next_codepoint_handle_ext(iconv_handle, s1, len1,
+                                                      CH_UNIX, &cs1);
+                       if (c1 != INVALID_CODEPOINT) {
+                               s1 += cs1;
+                               len1 -= cs1;
+                       }
+               }
+
+               if (len2 == 0 || *s2 == 0) {
+                       c2 = 0;
+               } else if (*s2 == ' ') {
+                       EAT_SPACE(s2, len2, ends_in_space2);
+                       c2 = ends_in_space2 ? 0 : ' ';
+               } else {
+                       c2 = next_codepoint_handle_ext(iconv_handle, s2, len2,
+                                                      CH_UNIX, &cs2);
+                       if (c2 != INVALID_CODEPOINT) {
+                               s2 += cs2;
+                               len2 -= cs2;
+                       }
+               }
+
+               if (c1 == 0 || c2 == 0 ||
+                   c1 == INVALID_CODEPOINT || c2 == INVALID_CODEPOINT) {
+                       break;
+               }
+
+               if (c1 == c2) {
+                       continue;
+               }
+               c1 = toupper_m(c1);
+               c2 = toupper_m(c2);
+               if (c1 != c2) {
+                       break;
+               }
+       }
+
+       /*
+        * Either a difference has been found, or one or both strings have
+        * ended or hit invalid codepoints.
+        */
+       ret = NUMERIC_CMP(c1, c2);
+
+       if (ret != 0) {
+               return ret;
+       }
+       /*
+        * the strings are equal up to here, but one might be longer.
+        */
+       end1 = len1 == 0 || *s1 == 0;
+       end2 = len2 == 0 || *s2 == 0;
+
+       if (end1 && end2) {
+               return 0;
+       }
+       if (end1) {
+               return -1;
+       }
+       if (end2) {
+               return -1;
+       }
+       return 0;
+}
+
+#undef EAT_SPACE
+
+
  /**
   Find the number of 'c' chars in a string
  **/
author	Douglas Bagnall <douglas.bagnall@catalyst.net.nz>
	Tue, 30 Apr 2024 00:41:43 +0000 (12:41 +1200)
committer	Andrew Bartlett <abartlet@samba.org>
	Wed, 22 May 2024 23:12:32 +0000 (23:12 +0000)
lib/util/charset/charset.h		patch \| blob \| blame \| history
lib/util/charset/util_unistr.c		patch \| blob \| blame \| history