misc: add static to various sourcefile-local functions

[thirdparty/xfsprogs-dev.git] / scrub / unicrash.c
diff --git a/scrub/unicrash.c b/scrub/unicrash.c

index 10d7c142450b6c1f9cb8479101fd01999418743e..cb0880c1040a73ae3e4c5bf69b4823c9dfc77898 100644 (file)
--- a/scrub/unicrash.c
+++ b/scrub/unicrash.c
@@ -1,21 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0+
  /*
   * Copyright (C) 2018 Oracle.  All Rights Reserved.
- *
   * Author: Darrick J. Wong <darrick.wong@oracle.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
   */
  #include "xfs.h"
  #include <stdint.h>
@@ -23,60 +9,107 @@
  #include <dirent.h>
  #include <sys/types.h>
  #include <sys/statvfs.h>
-#include <unistr.h>
-#include <uninorm.h>
-#include "path.h"
+#include <strings.h>
+#include <unicode/ustring.h>
+#include <unicode/unorm2.h>
+#include <unicode/uspoof.h>
+#include "libfrog/paths.h"
  #include "xfs_scrub.h"
  #include "common.h"
+#include "descr.h"
+#include "unicrash.h"
  
  /*
- * Detect collisions of Unicode-normalized names.
+ * Detect Unicode confusable names in directories and attributes.
   *
- * Record all the name->ino mappings in a directory/xattr, with a twist!
- * The twist is that we perform unicode normalization on every name we
- * see, so that we can warn about a directory containing more than one
- * directory entries that normalize to the same Unicode string.  These
- * entries are at best a sign of Unicode mishandling, or some sort of
- * weird name substitution attack if the entries do not point to the
- * same inode.  Warn if we see multiple dirents that do not all point to
- * the same inode.
+ * Record all the name->ino mappings in a directory/xattr, with a twist!  The
+ * twist is to record the Unicode skeleton and normalized version of every
+ * name we see so that we can check for a name space (directory, extended
+ * attribute set) holding names that contain malicious characters or that
+ * could be confused for one another.  These entries are at best a sign of
+ * Unicode mishandling, or some sort of weird name substitution attack if the
+ * entries do not point to the same inode.  Warn if we see multiple dirents
+ * that do not all point to the same inode.
   *
   * For extended attributes we perform the same collision checks on the
   * attribute, though any collision is enough to trigger a warning.
   *
- * We flag these collisions as warnings and not errors because XFS
- * treats names as a sequence of arbitrary nonzero bytes.  While a
- * Unicode collision is not technically a filesystem corruption, we
- * ought to say something if there's a possibility for misleading a
- * user.
+ * We avoid flagging these problems as errors because XFS treats names as a
+ * sequence of arbitrary nonzero bytes.  While a Unicode collision is not
+ * technically a filesystem corruption, we ought to say something if there's a
+ * possibility for misleading a user.  Unquestionably bad things (direction
+ * overrides, control characters, names that normalize to the same string)
+ * produce warnings, whereas potentially confusable names produce
+ * informational messages.
   *
- * To normalize, we use Unicode NFKC.  We use the composing
- * normalization mode (e.g. "E WITH ACUTE" instead of "E" then "ACUTE")
- * because that's what W3C (and in general Linux) uses.  This enables us
- * to detect multiple object names that normalize to the same name and
- * could be confusing to users.  Furthermore, we use the compatibility
- * mode to detect names with compatible but different code points to
- * strengthen those checks.
+ * The skeleton algorithm is detailed in section 4 ("Confusable Detection") of
+ * the Unicode technical standard #39.  First we normalize the name, then we
+ * substitute code points according to the confusable code point table, then
+ * normalize again.
+ *
+ * We take the extra step of removing non-identifier code points such as
+ * formatting characters, control characters, zero width characters, etc.
+ * from the skeleton so that we can complain about names that are confusable
+ * due to invisible control characters.
+ *
+ * In other words, skel = remove_invisible(nfd(remap_confusables(nfd(name)))).
   */
  
  struct name_entry {
         struct name_entry       *next;
+
+       /* NFKC normalized name */
+       UChar                   *normstr;
+       size_t                  normstrlen;
+
+       /* Unicode skeletonized name */
+       UChar                   *skelstr;
+       size_t                  skelstrlen;
+
         xfs_ino_t               ino;
-       size_t                  uninamelen;
-       uint8_t                 uniname[0];
+
+       /* Raw dirent name */
+       size_t                  namelen;
+       char                    name[0];
  };
  #define NAME_ENTRY_SZ(nl)      (sizeof(struct name_entry) + 1 + \
                                  (nl * sizeof(uint8_t)))
  
  struct unicrash {
         struct scrub_ctx        *ctx;
+       USpoofChecker           *spoof;
+       const UNormalizer2      *normalizer;
         bool                    compare_ino;
+       bool                    is_only_root_writeable;
         size_t                  nr_buckets;
         struct name_entry       *buckets[0];
  };
  #define UNICRASH_SZ(nr)                (sizeof(struct unicrash) + \
                                  (nr * sizeof(struct name_entry *)))
  
+/* Things to complain about in Unicode naming. */
+
+/*
+ * Multiple names resolve to the same normalized string and therefore render
+ * identically.
+ */
+#define UNICRASH_NOT_UNIQUE    (1 << 0)
+
+/* Name contains directional overrides. */
+#define UNICRASH_BIDI_OVERRIDE (1 << 1)
+
+/* Name mixes left-to-right and right-to-left characters. */
+#define UNICRASH_BIDI_MIXED    (1 << 2)
+
+/* Control characters in name. */
+#define UNICRASH_CONTROL_CHAR  (1 << 3)
+
+/* Invisible characters.  Only a problem if we have collisions. */
+#define UNICRASH_ZERO_WIDTH    (1 << 4)
+
+/* Multiple names resolve to the same skeleton string. */
+#define UNICRASH_CONFUSABLE    (1 << 5)
+
  /*
   * We only care about validating utf8 collisions if the underlying
   * system configuration says we're using utf8.  If the language
@@ -111,19 +144,255 @@ is_utf8_locale(void)
         return answer;
  }
  
-/* Initialize the collision detector. */
+/*
+ * Generate normalized form and skeleton of the name.  If this fails, just
+ * forget everything and return false; this is an advisory checker.
+ */
+static bool
+name_entry_compute_checknames(
+       struct unicrash         *uc,
+       struct name_entry       *entry)
+{
+       UChar                   *normstr;
+       UChar                   *unistr;
+       UChar                   *skelstr;
+       int32_t                 normstrlen;
+       int32_t                 unistrlen;
+       int32_t                 skelstrlen;
+       UChar32                 uchr;
+       int32_t                 i, j;
+
+       UErrorCode              uerr = U_ZERO_ERROR;
+
+       /* Convert bytestr to unistr for normalization */
+       u_strFromUTF8(NULL, 0, &unistrlen, entry->name, entry->namelen, &uerr);
+       if (uerr != U_BUFFER_OVERFLOW_ERROR)
+               return false;
+       uerr = U_ZERO_ERROR;
+       unistr = calloc(unistrlen + 1, sizeof(UChar));
+       if (!unistr)
+               return false;
+       u_strFromUTF8(unistr, unistrlen, NULL, entry->name, entry->namelen,
+                       &uerr);
+       if (U_FAILURE(uerr))
+               goto out_unistr;
+
+       /* Normalize the string. */
+       normstrlen = unorm2_normalize(uc->normalizer, unistr, unistrlen, NULL,
+                       0, &uerr);
+       if (uerr != U_BUFFER_OVERFLOW_ERROR)
+               goto out_unistr;
+       uerr = U_ZERO_ERROR;
+       normstr = calloc(normstrlen + 1, sizeof(UChar));
+       if (!normstr)
+               goto out_unistr;
+       unorm2_normalize(uc->normalizer, unistr, unistrlen, normstr, normstrlen,
+                       &uerr);
+       if (U_FAILURE(uerr))
+               goto out_normstr;
+
+       /* Compute skeleton. */
+       skelstrlen = uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, NULL,
+                       0, &uerr);
+       if (uerr != U_BUFFER_OVERFLOW_ERROR)
+               goto out_normstr;
+       uerr = U_ZERO_ERROR;
+       skelstr = calloc(skelstrlen + 1, sizeof(UChar));
+       if (!skelstr)
+               goto out_normstr;
+       uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, skelstr, skelstrlen,
+                       &uerr);
+       if (U_FAILURE(uerr))
+               goto out_skelstr;
+
+       /* Remove control/formatting characters from skeleton. */
+       for (i = 0, j = 0; i < skelstrlen; j = i) {
+               U16_NEXT_UNSAFE(skelstr, i, uchr);
+               if (!u_isIDIgnorable(uchr))
+                       continue;
+               memmove(&skelstr[j], &skelstr[i],
+                               (skelstrlen - i + 1) * sizeof(UChar));
+               skelstrlen -= (i - j);
+               i = j;
+       }
+
+       entry->skelstr = skelstr;
+       entry->skelstrlen = skelstrlen;
+       entry->normstr = normstr;
+       entry->normstrlen = normstrlen;
+       free(unistr);
+       return true;
+
+out_skelstr:
+       free(skelstr);
+out_normstr:
+       free(normstr);
+out_unistr:
+       free(unistr);
+       return false;
+}
+
+/* Create a new name entry, returns false if we could not succeed. */
  static bool
+name_entry_create(
+       struct unicrash         *uc,
+       const char              *name,
+       xfs_ino_t               ino,
+       struct name_entry       **entry)
+{
+       struct name_entry       *new_entry;
+       size_t                  namelen = strlen(name);
+
+       /* Create new entry */
+       new_entry = calloc(NAME_ENTRY_SZ(namelen), 1);
+       if (!new_entry)
+               return false;
+       new_entry->next = NULL;
+       new_entry->ino = ino;
+       memcpy(new_entry->name, name, namelen);
+       new_entry->name[namelen] = 0;
+       new_entry->namelen = namelen;
+
+       /* Normalize/skeletonize name to find collisions. */
+       if (!name_entry_compute_checknames(uc, new_entry))
+               goto out;
+
+       *entry = new_entry;
+       return true;
+
+out:
+       free(new_entry);
+       return false;
+}
+
+/* Free a name entry */
+static void
+name_entry_free(
+       struct name_entry       *entry)
+{
+       free(entry->normstr);
+       free(entry->skelstr);
+       free(entry);
+}
+
+/* Adapt the dirhash function from libxfs, avoid linking with libxfs. */
+
+#define rol32(x, y)            (((x) << (y)) | ((x) >> (32 - (y))))
+
+/*
+ * Implement a simple hash on a character string.
+ * Rotate the hash value by 7 bits, then XOR each character in.
+ * This is implemented with some source-level loop unrolling.
+ */
+static xfs_dahash_t
+name_entry_hash(
+       struct name_entry       *entry)
+{
+       uint8_t                 *name;
+       size_t                  namelen;
+       xfs_dahash_t            hash;
+
+       name = (uint8_t *)entry->skelstr;
+       namelen = entry->skelstrlen * sizeof(UChar);
+
+       /*
+        * Do four characters at a time as long as we can.
+        */
+       for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
+               hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
+                      (name[3] << 0) ^ rol32(hash, 7 * 4);
+
+       /*
+        * Now do the rest of the characters.
+        */
+       switch (namelen) {
+       case 3:
+               return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
+                      rol32(hash, 7 * 3);
+       case 2:
+               return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
+       case 1:
+               return (name[0] << 0) ^ rol32(hash, 7 * 1);
+       default: /* case 0: */
+               return hash;
+       }
+}
+
+/*
+ * Check a name for suspicious elements that have appeared in filename
+ * spoofing attacks.  This includes names that mix text directions or that
+ * contain direction overrides, control characters, or zero-width
+ * characters.
+ */
+static void
+name_entry_examine(
+       struct name_entry       *entry,
+       unsigned int            *badflags)
+{
+       UChar32                 uchr;
+       int32_t                 i;
+       uint8_t                 mask = 0;
+
+       for (i = 0; i < entry->normstrlen;) {
+               U16_NEXT_UNSAFE(entry->normstr, i, uchr);
+
+               /* zero width character sequences */
+               switch (uchr) {
+               case 0x200B:    /* zero width space */
+               case 0x200C:    /* zero width non-joiner */
+               case 0x200D:    /* zero width joiner */
+               case 0xFEFF:    /* zero width non breaking space */
+               case 0x2060:    /* word joiner */
+               case 0x2061:    /* function application */
+               case 0x2062:    /* invisible times (multiply) */
+               case 0x2063:    /* invisible separator (comma) */
+               case 0x2064:    /* invisible plus (addition) */
+                       *badflags |= UNICRASH_ZERO_WIDTH;
+                       break;
+               }
+
+               /* control characters */
+               if (u_iscntrl(uchr))
+                       *badflags |= UNICRASH_CONTROL_CHAR;
+
+               switch (u_charDirection(uchr)) {
+               case U_LEFT_TO_RIGHT:
+                       mask |= 0x01;
+                       break;
+               case U_RIGHT_TO_LEFT:
+                       mask |= 0x02;
+                       break;
+               case U_RIGHT_TO_LEFT_OVERRIDE:
+                       *badflags |= UNICRASH_BIDI_OVERRIDE;
+                       break;
+               case U_LEFT_TO_RIGHT_OVERRIDE:
+                       *badflags |= UNICRASH_BIDI_OVERRIDE;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       /* mixing left-to-right and right-to-left chars */
+       if (mask == 0x3)
+               *badflags |= UNICRASH_BIDI_MIXED;
+}
+
+/* Initialize the collision detector. */
+static int
  unicrash_init(
         struct unicrash         **ucp,
         struct scrub_ctx        *ctx,
         bool                    compare_ino,
-       size_t                  nr_buckets)
+       size_t                  nr_buckets,
+       bool                    is_only_root_writeable)
  {
         struct unicrash         *p;
+       UErrorCode              uerr = U_ZERO_ERROR;
  
         if (!is_utf8_locale()) {
                 *ucp = NULL;
-               return true;
+               return 0;
         }
  
         if (nr_buckets > 65536)
@@ -133,38 +402,78 @@ unicrash_init(
  
         p = calloc(1, UNICRASH_SZ(nr_buckets));
         if (!p)
-               return false;
+               return errno;
         p->ctx = ctx;
         p->nr_buckets = nr_buckets;
         p->compare_ino = compare_ino;
+       p->normalizer = unorm2_getNFKCInstance(&uerr);
+       if (U_FAILURE(uerr))
+               goto out_free;
+       p->spoof = uspoof_open(&uerr);
+       if (U_FAILURE(uerr))
+               goto out_free;
+       uspoof_setChecks(p->spoof, USPOOF_ALL_CHECKS, &uerr);
+       if (U_FAILURE(uerr))
+               goto out_spoof;
+       p->is_only_root_writeable = is_only_root_writeable;
         *ucp = p;
  
-       return true;
+       return 0;
+out_spoof:
+       uspoof_close(p->spoof);
+out_free:
+       free(p);
+       return ENOMEM;
+}
+
+/*
+ * Is this inode owned by root and not writable by others?  If so, skip
+ * even the informational messages, because this was put in place by the
+ * administrator.
+ */
+static bool
+is_only_root_writable(
+       struct xfs_bulkstat     *bstat)
+{
+       if (bstat->bs_uid != 0 || bstat->bs_gid != 0)
+               return false;
+       return !(bstat->bs_mode & S_IWOTH);
  }
  
  /* Initialize the collision detector for a directory. */
-bool
+int
  unicrash_dir_init(
         struct unicrash         **ucp,
         struct scrub_ctx        *ctx,
-       struct xfs_bstat        *bstat)
+       struct xfs_bulkstat     *bstat)
  {
         /*
          * Assume 64 bytes per dentry, clamp buckets between 16 and 64k.
          * Same general idea as dir_hash_init in xfs_repair.
          */
-       return unicrash_init(ucp, ctx, true, bstat->bs_size / 64);
+       return unicrash_init(ucp, ctx, true, bstat->bs_size / 64,
+                       is_only_root_writable(bstat));
  }
  
  /* Initialize the collision detector for an extended attribute. */
-bool
+int
  unicrash_xattr_init(
         struct unicrash         **ucp,
         struct scrub_ctx        *ctx,
-       struct xfs_bstat        *bstat)
+       struct xfs_bulkstat     *bstat)
  {
         /* Assume 16 attributes per extent for lack of a better idea. */
-       return unicrash_init(ucp, ctx, false, 16 * (1 + bstat->bs_aextents));
+       return unicrash_init(ucp, ctx, false, 16 * (1 + bstat->bs_aextents),
+                       is_only_root_writable(bstat));
+}
+
+/* Initialize the collision detector for a filesystem label. */
+int
+unicrash_fs_label_init(
+       struct unicrash         **ucp,
+       struct scrub_ctx        *ctx)
+{
+       return unicrash_init(ucp, ctx, false, 16, true);
  }
  
  /* Free the crash detector. */
@@ -179,204 +488,268 @@ unicrash_free(
         if (!uc)
                 return;
  
+       uspoof_close(uc->spoof);
         for (i = 0; i < uc->nr_buckets; i++) {
                 for (ne = uc->buckets[i]; ne != NULL; ne = x) {
                         x = ne->next;
-                       free(ne);
+                       name_entry_free(ne);
                 }
         }
         free(uc);
  }
  
-/* Steal the dirhash function from libxfs, avoid linking with libxfs. */
-
-#define rol32(x, y)            (((x) << (y)) | ((x) >> (32 - (y))))
-
-/*
- * Implement a simple hash on a character string.
- * Rotate the hash value by 7 bits, then XOR each character in.
- * This is implemented with some source-level loop unrolling.
- */
-static xfs_dahash_t
-unicrash_hashname(
-       const uint8_t           *name,
-       size_t                  namelen)
+/* Complain about Unicode problems. */
+static void
+unicrash_complain(
+       struct unicrash         *uc,
+       struct descr            *dsc,
+       const char              *what,
+       struct name_entry       *entry,
+       unsigned int            badflags,
+       struct name_entry       *dup_entry)
  {
-       xfs_dahash_t            hash;
+       char                    *bad1 = NULL;
+       char                    *bad2 = NULL;
+
+       bad1 = string_escape(entry->name);
+       if (dup_entry)
+               bad2 = string_escape(dup_entry->name);
  
         /*
-        * Do four characters at a time as long as we can.
+        * Most filechooser UIs do not look for bidirectional overrides when
+        * they render names.  This can result in misleading name presentation
+        * that makes "hig<rtl>gnp.sh" render like "highs.png".
          */
-       for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
-               hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
-                      (name[3] << 0) ^ rol32(hash, 7 * 4);
+       if (badflags & UNICRASH_BIDI_OVERRIDE) {
+               str_warn(uc->ctx, descr_render(dsc),
+_("Unicode name \"%s\" in %s contains suspicious text direction overrides."),
+                               bad1, what);
+               goto out;
+       }
  
         /*
-        * Now do the rest of the characters.
+        * Two names that normalize to the same string will render
+        * identically even though the filesystem considers them unique
+        * names.  "cafe\xcc\x81" and "caf\xc3\xa9" have different byte
+        * sequences, but they both appear as "café".
          */
-       switch (namelen) {
-       case 3:
-               return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
-                      rol32(hash, 7 * 3);
-       case 2:
-               return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
-       case 1:
-               return (name[0] << 0) ^ rol32(hash, 7 * 1);
-       default: /* case 0: */
-               return hash;
+       if (badflags & UNICRASH_NOT_UNIQUE) {
+               str_warn(uc->ctx, descr_render(dsc),
+_("Unicode name \"%s\" in %s renders identically to \"%s\"."),
+                               bad1, what, bad2);
+               goto out;
         }
-}
  
-/*
- * Normalize a name according to Unicode NFKC normalization rules.
- * Returns true if the name was already normalized.
- */
-static bool
-unicrash_normalize(
-       const char              *in,
-       uint8_t                 *out,
-       size_t                  outlen)
-{
-       size_t                  inlen = strlen(in);
-
-       assert(inlen <= outlen);
-       if (!u8_normalize(UNINORM_NFKC, (const uint8_t *)in, inlen,
-                       out, &outlen)) {
-               /* Didn't normalize, just return the same buffer. */
-               memcpy(out, in, inlen + 1);
-               return true;
+       /*
+        * If a name contains invisible/nonprinting characters and can be
+        * confused with another name as a result, we should complain.
+        * "moo<zerowidthspace>cow" and "moocow" are misleading.
+        */
+       if ((badflags & UNICRASH_ZERO_WIDTH) &&
+           (badflags & UNICRASH_CONFUSABLE)) {
+               str_warn(uc->ctx, descr_render(dsc),
+_("Unicode name \"%s\" in %s could be confused with '%s' due to invisible characters."),
+                               bad1, what, bad2);
+               goto out;
         }
-       out[outlen] = 0;
-       return outlen == inlen ? memcmp(in, out, inlen) == 0 : false;
-}
  
-/* Complain about Unicode problems. */
-static void
-unicrash_complain(
-       struct unicrash         *uc,
-       const char              *descr,
-       const char              *what,
-       bool                    unique,
-       const char              *name,
-       uint8_t                 *uniname)
-{
-       char                    *bad1 = NULL;
-       char                    *bad2 = NULL;
+       /*
+        * Unfiltered control characters can mess up your terminal and render
+        * invisibly in filechooser UIs.
+        */
+       if (badflags & UNICRASH_CONTROL_CHAR) {
+               str_warn(uc->ctx, descr_render(dsc),
+_("Unicode name \"%s\" in %s contains control characters."),
+                               bad1, what);
+               goto out;
+       }
  
-       bad1 = string_escape(name);
-       bad2 = string_escape((char *)uniname);
+       /*
+        * Skip the informational messages if the inode owning the name is
+        * only writeable by root, because those files were put there by the
+        * sysadmin.  Also skip names shorter than four bytes because
+        * there's a much higher chance of collisions with short names.
+        */
+       if (!verbose && (uc->is_only_root_writeable || entry->namelen < 4))
+               goto out;
  
-       if (!unique)
-               str_warn(uc->ctx, descr,
-_("Duplicate normalized Unicode name \"%s\" found in %s."),
+       /*
+        * It's not considered good practice (says Unicode) to mix LTR
+        * characters with RTL characters.  The mere presence of different
+        * bidirectional characters isn't enough to trip up software, so don't
+        * warn about this too loudly.
+        */
+       if (badflags & UNICRASH_BIDI_MIXED) {
+               str_info(uc->ctx, descr_render(dsc),
+_("Unicode name \"%s\" in %s mixes bidirectional characters."),
                                 bad1, what);
+               goto out;
+       }
  
+       /*
+        * We'll note if two names could be confusable with each other, but
+        * whether or not the user will actually confuse them is dependent
+        * on the rendering system and the typefaces in use.  Maybe "foo.1"
+        * and "foo.l" look the same, maybe they do not.
+        */
+       if (badflags & UNICRASH_CONFUSABLE) {
+               str_info(uc->ctx, descr_render(dsc),
+_("Unicode name \"%s\" in %s could be confused with \"%s\"."),
+                               bad1, what, bad2);
+       }
+
+out:
         free(bad1);
         free(bad2);
  }
  
  /*
   * Try to add a name -> ino entry to the collision detector.  The name
- * must be normalized according to Unicode NFKC normalization rules to
- * detect byte-unique names that map to the same sequence of Unicode
- * code points.
- *
- * This function returns true either if there was no previous mapping or
- * there was a mapping that matched exactly.  It returns false if
- * there is already a record with that name pointing to a different
- * inode.
+ * must be skeletonized according to Unicode TR39 to detect names that
+ * could be visually confused with each other.
   */
-static bool
+static void
  unicrash_add(
         struct unicrash         *uc,
-       uint8_t                 *uniname,
-       xfs_ino_t               ino,
-       bool                    *unique)
+       struct name_entry       *new_entry,
+       unsigned int            *badflags,
+       struct name_entry       **existing_entry)
  {
-       struct name_entry       *ne;
-       struct name_entry       *x;
-       struct name_entry       **nep;
-       size_t                  uninamelen = u8_strlen(uniname);
+       struct name_entry       *entry;
         size_t                  bucket;
         xfs_dahash_t            hash;
  
-       /* Do we already know about that name? */
-       hash = unicrash_hashname(uniname, uninamelen);
+       /* Store name in hashtable. */
+       hash = name_entry_hash(new_entry);
         bucket = hash % uc->nr_buckets;
-       for (nep = &uc->buckets[bucket], ne = *nep; ne != NULL; ne = x) {
-               if (u8_strcmp(uniname, ne->uniname) == 0) {
-                       *unique = uc->compare_ino ? ne->ino == ino : false;
-                       return true;
+       entry = uc->buckets[bucket];
+       new_entry->next = entry;
+       uc->buckets[bucket] = new_entry;
+
+       while (entry != NULL) {
+               /*
+                * If we see the same byte sequence then someone's modifying
+                * the namespace while we're scanning it.  Update the existing
+                * entry's inode mapping and erase the new entry from existence.
+                */
+               if (new_entry->namelen == entry->namelen &&
+                   !memcmp(new_entry->name, entry->name, entry->namelen)) {
+                       entry->ino = new_entry->ino;
+                       uc->buckets[bucket] = new_entry->next;
+                       name_entry_free(new_entry);
+                       *badflags = 0;
+                       return;
                 }
-               nep = &ne->next;
-               x = ne->next;
-       }
  
-       /* Remember that name. */
-       x = malloc(NAME_ENTRY_SZ(uninamelen));
-       if (!x)
-               return false;
-       x->next = NULL;
-       x->ino = ino;
-       x->uninamelen = uninamelen;
-       memcpy(x->uniname, uniname, uninamelen + 1);
-       *nep = x;
-       *unique = true;
+               /* Same normalization? */
+               if (new_entry->normstrlen == entry->normstrlen &&
+                   !u_strcmp(new_entry->normstr, entry->normstr) &&
+                   (uc->compare_ino ? entry->ino != new_entry->ino : true)) {
+                       *badflags |= UNICRASH_NOT_UNIQUE;
+                       *existing_entry = entry;
+                       return;
+               }
  
-       return true;
+               /* Confusable? */
+               if (new_entry->skelstrlen == entry->skelstrlen &&
+                   !u_strcmp(new_entry->skelstr, entry->skelstr) &&
+                   (uc->compare_ino ? entry->ino != new_entry->ino : true)) {
+                       *badflags |= UNICRASH_CONFUSABLE;
+                       *existing_entry = entry;
+                       return;
+               }
+               entry = entry->next;
+       }
  }
  
  /* Check a name for unicode normalization problems or collisions. */
-static bool
+static int
  __unicrash_check_name(
         struct unicrash         *uc,
-       const char              *descr,
+       struct descr            *dsc,
         const char              *namedescr,
         const char              *name,
         xfs_ino_t               ino)
  {
-       uint8_t                 uniname[(NAME_MAX * 2) + 1];
-       bool                    moveon;
-       bool                    unique;
-
-       memset(uniname, 0, (NAME_MAX * 2) + 1);
-       unicrash_normalize(name, uniname, NAME_MAX * 2);
-       moveon = unicrash_add(uc, uniname, ino, &unique);
-       if (!moveon)
-               return false;
+       struct name_entry       *dup_entry = NULL;
+       struct name_entry       *new_entry = NULL;
+       unsigned int            badflags = 0;
  
-       if (unique)
-               return true;
+       /* If we can't create entry data, just skip it. */
+       if (!name_entry_create(uc, name, ino, &new_entry))
+               return 0;
  
-       unicrash_complain(uc, descr, namedescr, unique, name, uniname);
-       return true;
+       name_entry_examine(new_entry, &badflags);
+       unicrash_add(uc, new_entry, &badflags, &dup_entry);
+       if (badflags)
+               unicrash_complain(uc, dsc, namedescr, new_entry, badflags,
+                               dup_entry);
+
+       return 0;
  }
  
-/* Check a directory entry for unicode normalization problems or collisions. */
-bool
+/*
+ * Check a directory entry for unicode normalization problems or collisions.
+ * If errors occur, this function will log them and return nonzero.
+ */
+int
  unicrash_check_dir_name(
         struct unicrash         *uc,
-       const char              *descr,
+       struct descr            *dsc,
         struct dirent           *dentry)
  {
         if (!uc)
-               return true;
-       return __unicrash_check_name(uc, descr, _("directory"),
+               return 0;
+       return __unicrash_check_name(uc, dsc, _("directory"),
                         dentry->d_name, dentry->d_ino);
  }
  
  /*
   * Check an extended attribute name for unicode normalization problems
- * or collisions.
+ * or collisions.  If errors occur, this function will log them and return
+ * nonzero.
   */
-bool
+int
  unicrash_check_xattr_name(
         struct unicrash         *uc,
-       const char              *descr,
+       struct descr            *dsc,
         const char              *attrname)
  {
         if (!uc)
-               return true;
-       return __unicrash_check_name(uc, descr, _("extended attribute"),
+               return 0;
+       return __unicrash_check_name(uc, dsc, _("extended attribute"),
                         attrname, 0);
  }
+
+/*
+ * Check the fs label for unicode normalization problems or misleading bits.
+ * If errors occur, this function will log them and return nonzero.
+ */
+int
+unicrash_check_fs_label(
+       struct unicrash         *uc,
+       struct descr            *dsc,
+       const char              *label)
+{
+       if (!uc)
+               return 0;
+       return __unicrash_check_name(uc, dsc, _("filesystem label"),
+                       label, 0);
+}
+
+/* Load libicu and initialize it.  Returns true if initialization failed. */
+bool
+unicrash_load(void)
+{
+       UErrorCode              uerr = U_ZERO_ERROR;
+
+       u_init(&uerr);
+       return U_FAILURE(uerr);
+}
+
+/* Unload libicu once we're done with it. */
+void
+unicrash_unload(void)
+{
+       u_cleanup();
+}