From: Douglas Bagnall Date: Tue, 30 Apr 2024 00:41:43 +0000 (+1200) Subject: util:charset: add strncasecmp_ldb() X-Git-Tag: tdb-1.4.11~564 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=eb91e3437b44c7ad653aac86d481ceaaddb06b01;p=thirdparty%2Fsamba.git util:charset: add strncasecmp_ldb() This is a function for comparing strings in a way that suits a case-insenstive syntaxes in LDB. We have it here, rahter than in LDB itself, because it needs the upcase table. By default uses ASCII-only comparisons. SSSD and OpenChange use it in that configuration, but Samba replaces the comparison and casefold functions with Unicode aware versions. Until now Samba has done that in a bad way; this will allow it to do better. Signed-off-by: Douglas Bagnall Reviewed-by: Andrew Bartlett --- diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h index 1f90718028c..8452b5766d1 100644 --- a/lib/util/charset/charset.h +++ b/lib/util/charset/charset.h @@ -158,6 +158,10 @@ char *talloc_alpha_strcpy(TALLOC_CTX *mem_ctx, void string_replace_m(char *s, char oldc, char newc); bool strcsequal(const char *s1,const char *s2); bool strequal_m(const char *s1, const char *s2); +int strncasecmp_ldb(const char *s1, + size_t len1, + const char *s2, + size_t len2); int strncasecmp_m(const char *s1, const char *s2, size_t n); int strncasecmp_m_handle(struct smb_iconv_handle *iconv_handle, const char *s1, const char *s2, size_t n); @@ -174,6 +178,7 @@ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n); char *strlower_talloc_handle(struct smb_iconv_handle *iconv_handle, TALLOC_CTX *ctx, const char *src); char *strlower_talloc(TALLOC_CTX *ctx, const char *src); + bool strhasupper(const char *string); bool strhaslower_handle(struct smb_iconv_handle *ic, const char *string); diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c index 830b4808c54..5e301852c54 100644 --- a/lib/util/charset/util_unistr.c +++ b/lib/util/charset/util_unistr.c @@ -23,6 +23,7 @@ #include "charset.h" #include "lib/util/byteorder.h" #include "lib/util/fault.h" +#include "lib/util/tsort.h" /** String replace. @@ -165,6 +166,155 @@ _PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src) return strupper_talloc(ctx, src); } + +/* + * strncasecmp_ldb() works like a *bit* like strncasecmp, with various + * tricks to suit the way LDB compares strings. The differences are: + * + * 0. each string has it's own length. + * + * 1. consecutive spaces are collapsed down to one space, so that + * "a b" equals "a b". (this is why each string needs its own + * length). Leading and trailing spaces are removed altogether. + * + * 2. Comparisons are done in UPPER CASE, as Windows does, not in + * lowercase as POSIX would have it. + * + * 3. The first invalid byte sequence ends the comparison, as if it + * were '\0', but it compares higher than any real character, + * unlike '\0'. For example, "hello\xc2\xffworld" would end after + * "hello" -- although '\xc2' is a valid utf-8 byte in the right + * circumstances, "\xc2\xff" is an invalid sequence. + * + * All invalid sequences compare equal. This means + * "hello\xC2\xFFworld" equals " hElLo\xFE ". + */ + +#define EAT_SPACE(s, len, ends_in_space) \ + do { \ + while (len) { \ + if (*s != ' ') { \ + break; \ + } \ + s++; \ + len--; \ + } \ + ends_in_space = (len == 0 || *s == '\0'); \ + } while(0) + + +_PUBLIC_ int strncasecmp_ldb(const char *s1, + size_t len1, + const char *s2, + size_t len2) +{ + struct smb_iconv_handle *iconv_handle = get_iconv_handle(); + codepoint_t c1, c2; + size_t cs1, cs2; + bool ends_in_space1, ends_in_space2; + int ret; + bool end1, end2; + + EAT_SPACE(s1, len1, ends_in_space1); + EAT_SPACE(s2, len2, ends_in_space2); + /* + * if ends_in_space was set, the string was empty or only + * spaces (which we treat as equivalent). + */ + if (ends_in_space1 && ends_in_space2) { + return 0; + } + if (ends_in_space1) { + return -1; + } + if (ends_in_space2) { + return 1; + } + + while (true) { + /* + * If the next byte is a space, we eat all the spaces, + * and say we found a single codepoint. If the spaces + * were at the end of the string, the codepoint is 0, + * as if there were no spaces. Otherwise it is 0x20, + * as if there was one space. + * + * Setting the codepoint to 0 will break the loop, but + * only after codepoints have been found in both strings. + */ + if (len1 == 0 || *s1 == 0) { + c1 = 0; + } else if (*s1 == ' ') { + EAT_SPACE(s1, len1, ends_in_space1); + c1 = ends_in_space1 ? 0 : ' '; + } else { + c1 = next_codepoint_handle_ext(iconv_handle, s1, len1, + CH_UNIX, &cs1); + if (c1 != INVALID_CODEPOINT) { + s1 += cs1; + len1 -= cs1; + } + } + + if (len2 == 0 || *s2 == 0) { + c2 = 0; + } else if (*s2 == ' ') { + EAT_SPACE(s2, len2, ends_in_space2); + c2 = ends_in_space2 ? 0 : ' '; + } else { + c2 = next_codepoint_handle_ext(iconv_handle, s2, len2, + CH_UNIX, &cs2); + if (c2 != INVALID_CODEPOINT) { + s2 += cs2; + len2 -= cs2; + } + } + + if (c1 == 0 || c2 == 0 || + c1 == INVALID_CODEPOINT || c2 == INVALID_CODEPOINT) { + break; + } + + if (c1 == c2) { + continue; + } + c1 = toupper_m(c1); + c2 = toupper_m(c2); + if (c1 != c2) { + break; + } + } + + /* + * Either a difference has been found, or one or both strings have + * ended or hit invalid codepoints. + */ + ret = NUMERIC_CMP(c1, c2); + + if (ret != 0) { + return ret; + } + /* + * the strings are equal up to here, but one might be longer. + */ + end1 = len1 == 0 || *s1 == 0; + end2 = len2 == 0 || *s2 == 0; + + if (end1 && end2) { + return 0; + } + if (end1) { + return -1; + } + if (end2) { + return -1; + } + return 0; +} + +#undef EAT_SPACE + + /** Find the number of 'c' chars in a string **/