]>
git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - scrub/unicrash.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * Copyright (C) 2018-2024 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <djwong@kernel.org>
10 #include <sys/types.h>
11 #include <sys/statvfs.h>
13 #include <unicode/uclean.h>
14 #include <unicode/ustring.h>
15 #include <unicode/unorm2.h>
16 #include <unicode/uspoof.h>
17 #include "libfrog/paths.h"
18 #include "xfs_scrub.h"
24 * Detect Unicode confusable names in directories and attributes.
26 * Record all the name->ino mappings in a directory/xattr, with a twist! The
27 * twist is to record the Unicode skeleton and normalized version of every
28 * name we see so that we can check for a name space (directory, extended
29 * attribute set) containing names containing malicious characters or that
30 * could be confused for one another. These entries are at best a sign of
31 * Unicode mishandling, or some sort of weird name substitution attack if the
32 * entries do not point to the same inode. Warn if we see multiple dirents
33 * that do not all point to the same inode.
35 * For extended attributes we perform the same collision checks on the
36 * attribute names, though any collision is enough to trigger a warning.
38 * We avoid flagging these problems as errors because XFS treats names as a
39 * sequence of arbitrary nonzero bytes. While a Unicode collision is not
40 * technically a filesystem corruption, we ought to say something if there's a
41 * possibility for misleading a user. Unquestionably bad things (direction
42 * overrides, control characters, names that normalize to the same string)
43 * produce warnings, whereas potentially confusable names produce
44 * informational messages.
46 * The skeleton algorithm is detailed in section 4 ("Confusable Detection") of
47 * the Unicode technical standard #39. First we normalize the name, then we
48 * substitute code points according to the confusable code point table, then
51 * We take the extra step of removing non-identifier code points such as
52 * formatting characters, control characters, zero width characters, etc.
53 * from the skeleton so that we can complain about names that are confusable
54 * due to invisible control characters.
56 * In other words, skel = remove_invisible(nfd(remap_confusables(nfd(name)))).
60 struct name_entry
*next
;
62 /* NFKC normalized name */
66 /* Unicode skeletonized name */
76 #define NAME_ENTRY_SZ(nl) (sizeof(struct name_entry) + 1 + \
77 (nl * sizeof(uint8_t)))
80 struct scrub_ctx
*ctx
;
82 const UNormalizer2
*normalizer
;
84 bool is_only_root_writeable
;
86 struct name_entry
*buckets
[0];
88 #define UNICRASH_SZ(nr) (sizeof(struct unicrash) + \
89 (nr * sizeof(struct name_entry *)))
91 /* Things to complain about in Unicode naming. */
94 * Multiple names resolve to the same normalized string and therefore render
97 #define UNICRASH_NOT_UNIQUE (1 << 0)
99 /* Name contains directional overrides. */
100 #define UNICRASH_BIDI_OVERRIDE (1 << 1)
102 /* Name mixes left-to-right and right-to-left characters. */
103 #define UNICRASH_BIDI_MIXED (1 << 2)
105 /* Control characters in name. */
106 #define UNICRASH_CONTROL_CHAR (1 << 3)
108 /* Invisible characters. Only a problem if we have collisions. */
109 #define UNICRASH_ZERO_WIDTH (1 << 4)
111 /* Multiple names resolve to the same skeleton string. */
112 #define UNICRASH_CONFUSABLE (1 << 5)
115 * We only care about validating utf8 collisions if the underlying
116 * system configuration says we're using utf8. If the language
117 * specifier string used to output messages has ".UTF-8" somewhere in
118 * its name, then we conclude utf8 is in use. Otherwise, no checking is
121 * Most modern Linux systems default to utf8, so the only time this
122 * check will return false is if the administrator configured things
123 * this way or if things are so messed up there is no locale data at
126 #define UTF8_STR ".UTF-8"
127 #define UTF8_STRLEN (sizeof(UTF8_STR) - 1)
131 const char *msg_locale
;
132 static int answer
= -1;
137 msg_locale
= setlocale(LC_MESSAGES
, NULL
);
138 if (msg_locale
== NULL
)
141 if (strstr(msg_locale
, UTF8_STR
) != NULL
)
149 * Generate normalized form and skeleton of the name. If this fails, just
150 * forget everything and return false; this is an advisory checker.
153 name_entry_compute_checknames(
155 struct name_entry
*entry
)
166 UErrorCode uerr
= U_ZERO_ERROR
;
168 /* Convert bytestr to unistr for normalization */
169 u_strFromUTF8(NULL
, 0, &unistrlen
, entry
->name
, entry
->namelen
, &uerr
);
170 if (uerr
!= U_BUFFER_OVERFLOW_ERROR
)
173 unistr
= calloc(unistrlen
+ 1, sizeof(UChar
));
176 u_strFromUTF8(unistr
, unistrlen
, NULL
, entry
->name
, entry
->namelen
,
181 /* Normalize the string. */
182 normstrlen
= unorm2_normalize(uc
->normalizer
, unistr
, unistrlen
, NULL
,
184 if (uerr
!= U_BUFFER_OVERFLOW_ERROR
)
187 normstr
= calloc(normstrlen
+ 1, sizeof(UChar
));
190 unorm2_normalize(uc
->normalizer
, unistr
, unistrlen
, normstr
, normstrlen
,
195 /* Compute skeleton. */
196 skelstrlen
= uspoof_getSkeleton(uc
->spoof
, 0, unistr
, unistrlen
, NULL
,
198 if (uerr
!= U_BUFFER_OVERFLOW_ERROR
)
201 skelstr
= calloc(skelstrlen
+ 1, sizeof(UChar
));
204 uspoof_getSkeleton(uc
->spoof
, 0, unistr
, unistrlen
, skelstr
, skelstrlen
,
209 /* Remove control/formatting characters from skeleton. */
210 for (i
= 0, j
= 0; i
< skelstrlen
; j
= i
) {
211 U16_NEXT_UNSAFE(skelstr
, i
, uchr
);
212 if (!u_isIDIgnorable(uchr
))
214 memmove(&skelstr
[j
], &skelstr
[i
],
215 (skelstrlen
- i
+ 1) * sizeof(UChar
));
216 skelstrlen
-= (i
- j
);
220 entry
->skelstr
= skelstr
;
221 entry
->skelstrlen
= skelstrlen
;
222 entry
->normstr
= normstr
;
223 entry
->normstrlen
= normstrlen
;
236 /* Create a new name entry, returns false if we could not succeed. */
242 struct name_entry
**entry
)
244 struct name_entry
*new_entry
;
245 size_t namelen
= strlen(name
);
247 /* Create new entry */
248 new_entry
= calloc(NAME_ENTRY_SZ(namelen
), 1);
251 new_entry
->next
= NULL
;
252 new_entry
->ino
= ino
;
253 memcpy(new_entry
->name
, name
, namelen
);
254 new_entry
->name
[namelen
] = 0;
255 new_entry
->namelen
= namelen
;
257 /* Normalize/skeletonize name to find collisions. */
258 if (!name_entry_compute_checknames(uc
, new_entry
))
269 /* Free a name entry */
272 struct name_entry
*entry
)
274 free(entry
->normstr
);
275 free(entry
->skelstr
);
279 /* Adapt the dirhash function from libxfs, avoid linking with libxfs. */
281 #define rol32(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
284 * Implement a simple hash on a character string.
285 * Rotate the hash value by 7 bits, then XOR each character in.
286 * This is implemented with some source-level loop unrolling.
290 struct name_entry
*entry
)
296 name
= (uint8_t *)entry
->skelstr
;
297 namelen
= entry
->skelstrlen
* sizeof(UChar
);
300 * Do four characters at a time as long as we can.
302 for (hash
= 0; namelen
>= 4; namelen
-= 4, name
+= 4)
303 hash
= (name
[0] << 21) ^ (name
[1] << 14) ^ (name
[2] << 7) ^
304 (name
[3] << 0) ^ rol32(hash
, 7 * 4);
307 * Now do the rest of the characters.
311 return (name
[0] << 14) ^ (name
[1] << 7) ^ (name
[2] << 0) ^
314 return (name
[0] << 7) ^ (name
[1] << 0) ^ rol32(hash
, 7 * 2);
316 return (name
[0] << 0) ^ rol32(hash
, 7 * 1);
317 default: /* case 0: */
323 * Check a name for suspicious elements that have appeared in filename
324 * spoofing attacks. This includes names that mix text directions or that
325 * contain direction overrides, control characters, or zero-width
326 * characters.
330 struct name_entry
*entry
,
331 unsigned int *badflags
)
337 for (i
= 0; i
< entry
->normstrlen
;) {
338 U16_NEXT_UNSAFE(entry
->normstr
, i
, uchr
);
340 /* zero width character sequences */
342 case 0x200B: /* zero width space */
343 case 0x200C: /* zero width non-joiner */
344 case 0x200D: /* zero width joiner */
345 case 0xFEFF: /* zero width non breaking space */
346 case 0x2060: /* word joiner */
347 case 0x2061: /* function application */
348 case 0x2062: /* invisible times (multiply) */
349 case 0x2063: /* invisible separator (comma) */
350 case 0x2064: /* invisible plus (addition) */
351 *badflags
|= UNICRASH_ZERO_WIDTH
;
355 /* control characters */
357 *badflags
|= UNICRASH_CONTROL_CHAR
;
359 switch (u_charDirection(uchr
)) {
360 case U_LEFT_TO_RIGHT
:
363 case U_RIGHT_TO_LEFT
:
366 case U_RIGHT_TO_LEFT_OVERRIDE
:
367 *badflags
|= UNICRASH_BIDI_OVERRIDE
;
369 case U_LEFT_TO_RIGHT_OVERRIDE
:
370 *badflags
|= UNICRASH_BIDI_OVERRIDE
;
377 /* mixing left-to-right and right-to-left chars */
379 *badflags
|= UNICRASH_BIDI_MIXED
;
382 /* Initialize the collision detector. */
385 struct unicrash
**ucp
,
386 struct scrub_ctx
*ctx
,
389 bool is_only_root_writeable
)
392 UErrorCode uerr
= U_ZERO_ERROR
;
394 if (!is_utf8_locale()) {
399 if (nr_buckets
> 65536)
401 else if (nr_buckets
< 16)
404 p
= calloc(1, UNICRASH_SZ(nr_buckets
));
408 p
->nr_buckets
= nr_buckets
;
409 p
->compare_ino
= compare_ino
;
410 p
->normalizer
= unorm2_getNFKCInstance(&uerr
);
413 p
->spoof
= uspoof_open(&uerr
);
416 uspoof_setChecks(p
->spoof
, USPOOF_ALL_CHECKS
, &uerr
);
419 p
->is_only_root_writeable
= is_only_root_writeable
;
424 uspoof_close(p
->spoof
);
431 * Is this inode owned by root and not writable by others? If so, skip
432 * even the informational messages, because this was put in place by the
436 is_only_root_writable(
437 struct xfs_bulkstat
*bstat
)
439 if (bstat
->bs_uid
!= 0 || bstat
->bs_gid
!= 0)
441 return !(bstat
->bs_mode
& S_IWOTH
);
444 /* Initialize the collision detector for a directory. */
447 struct unicrash
**ucp
,
448 struct scrub_ctx
*ctx
,
449 struct xfs_bulkstat
*bstat
)
452 * Assume 64 bytes per dentry, clamp buckets between 16 and 64k.
453 * Same general idea as dir_hash_init in xfs_repair.
455 return unicrash_init(ucp
, ctx
, true, bstat
->bs_size
/ 64,
456 is_only_root_writable(bstat
));
459 /* Initialize the collision detector for an extended attribute. */
462 struct unicrash
**ucp
,
463 struct scrub_ctx
*ctx
,
464 struct xfs_bulkstat
*bstat
)
466 /* Assume 16 attributes per extent for lack of a better idea. */
467 return unicrash_init(ucp
, ctx
, false, 16 * (1 + bstat
->bs_aextents
),
468 is_only_root_writable(bstat
));
471 /* Initialize the collision detector for a filesystem label. */
473 unicrash_fs_label_init(
474 struct unicrash
**ucp
,
475 struct scrub_ctx
*ctx
)
477 return unicrash_init(ucp
, ctx
, false, 16, true);
480 /* Free the crash detector. */
485 struct name_entry
*ne
;
486 struct name_entry
*x
;
492 uspoof_close(uc
->spoof
);
493 for (i
= 0; i
< uc
->nr_buckets
; i
++) {
494 for (ne
= uc
->buckets
[i
]; ne
!= NULL
; ne
= x
) {
502 /* Complain about Unicode problems. */
508 struct name_entry
*entry
,
509 unsigned int badflags
,
510 struct name_entry
*dup_entry
)
515 bad1
= string_escape(entry
->name
);
517 bad2
= string_escape(dup_entry
->name
);
520 * Most filechooser UIs do not look for bidirectional overrides when
521 * they render names. This can result in misleading name presentation
522 * that makes "hig<rtl>gnp.sh" render like "highs.png".
524 if (badflags
& UNICRASH_BIDI_OVERRIDE
) {
525 str_warn(uc
->ctx
, descr_render(dsc
),
526 _("Unicode name \"%s\" in %s contains suspicious text direction overrides."),
532 * Two names that normalize to the same string will render
533 * identically even though the filesystem considers them unique
534 * names. "cafe\xcc\x81" and "caf\xc3\xa9" have different byte
535 * sequences, but they both appear as "café".
537 if (badflags
& UNICRASH_NOT_UNIQUE
) {
538 str_warn(uc
->ctx
, descr_render(dsc
),
539 _("Unicode name \"%s\" in %s renders identically to \"%s\"."),
545 * If a name contains invisible/nonprinting characters and can be
546 * confused with another name as a result, we should complain.
547 * "moo<zerowidthspace>cow" and "moocow" are misleading.
549 if ((badflags
& UNICRASH_ZERO_WIDTH
) &&
550 (badflags
& UNICRASH_CONFUSABLE
)) {
551 str_warn(uc
->ctx
, descr_render(dsc
),
552 _("Unicode name \"%s\" in %s could be confused with '%s' due to invisible characters."),
558 * Unfiltered control characters can mess up your terminal and render
559 * invisibly in filechooser UIs.
561 if (badflags
& UNICRASH_CONTROL_CHAR
) {
562 str_warn(uc
->ctx
, descr_render(dsc
),
563 _("Unicode name \"%s\" in %s contains control characters."),
569 * Unless running verbosely, skip the informational messages if the
570 * inode owning the name is only writable by root, because those files
571 * were put there by the sysadmin. Also skip names shorter than four
572 * bytes because short names are much more likely to collide.
574 if (!verbose
&& (uc
->is_only_root_writeable
|| entry
->namelen
< 4))
578 * It's not considered good practice (says Unicode) to mix LTR
579 * characters with RTL characters. The mere presence of different
580 * bidirectional characters isn't enough to trip up software, so don't
581 * warn about this too loudly.
583 if (badflags
& UNICRASH_BIDI_MIXED
) {
584 str_info(uc
->ctx
, descr_render(dsc
),
585 _("Unicode name \"%s\" in %s mixes bidirectional characters."),
591 * We'll note if two names could be confusable with each other, but
592 * whether or not the user will actually confuse them is dependent
593 * on the rendering system and the typefaces in use. Maybe "foo.1"
594 * and "moo.l" look the same, maybe they do not.
596 if (badflags
& UNICRASH_CONFUSABLE
) {
597 str_info(uc
->ctx
, descr_render(dsc
),
598 _("Unicode name \"%s\" in %s could be confused with \"%s\"."),
608 * Try to add a name -> ino entry to the collision detector. The name
609 * must be skeletonized according to Unicode TR39 to detect names that
610 * could be visually confused with each other.
615 struct name_entry
*new_entry
,
616 unsigned int *badflags
,
617 struct name_entry
**existing_entry
)
619 struct name_entry
*entry
;
623 /* Store name in hashtable. */
624 hash
= name_entry_hash(new_entry
);
625 bucket
= hash
% uc
->nr_buckets
;
626 entry
= uc
->buckets
[bucket
];
627 new_entry
->next
= entry
;
628 uc
->buckets
[bucket
] = new_entry
;
630 while (entry
!= NULL
) {
632 * If we see the same byte sequence then someone's modifying
633 * the namespace while we're scanning it. Update the existing
634 * entry's inode mapping and erase the new entry from existence.
636 if (new_entry
->namelen
== entry
->namelen
&&
637 !memcmp(new_entry
->name
, entry
->name
, entry
->namelen
)) {
638 entry
->ino
= new_entry
->ino
;
639 uc
->buckets
[bucket
] = new_entry
->next
;
640 name_entry_free(new_entry
);
645 /* Same normalization? */
646 if (new_entry
->normstrlen
== entry
->normstrlen
&&
647 !u_strcmp(new_entry
->normstr
, entry
->normstr
) &&
648 (uc
->compare_ino
? entry
->ino
!= new_entry
->ino
: true)) {
649 *badflags
|= UNICRASH_NOT_UNIQUE
;
650 *existing_entry
= entry
;
655 if (new_entry
->skelstrlen
== entry
->skelstrlen
&&
656 !u_strcmp(new_entry
->skelstr
, entry
->skelstr
) &&
657 (uc
->compare_ino
? entry
->ino
!= new_entry
->ino
: true)) {
658 *badflags
|= UNICRASH_CONFUSABLE
;
659 *existing_entry
= entry
;
666 /* Check a name for unicode normalization problems or collisions. */
668 __unicrash_check_name(
671 const char *namedescr
,
675 struct name_entry
*dup_entry
= NULL
;
676 struct name_entry
*new_entry
= NULL
;
677 unsigned int badflags
= 0;
679 /* If we can't create entry data, just skip it. */
680 if (!name_entry_create(uc
, name
, ino
, &new_entry
))
683 name_entry_examine(new_entry
, &badflags
);
684 unicrash_add(uc
, new_entry
, &badflags
, &dup_entry
);
686 unicrash_complain(uc
, dsc
, namedescr
, new_entry
, badflags
,
693 * Check a directory entry for unicode normalization problems or collisions.
694 * If errors occur, this function will log them and return nonzero.
697 unicrash_check_dir_name(
700 struct dirent
*dentry
)
704 return __unicrash_check_name(uc
, dsc
, _("directory"),
705 dentry
->d_name
, dentry
->d_ino
);
709 * Check an extended attribute name for unicode normalization problems
710 * or collisions. If errors occur, this function will log them and return
714 unicrash_check_xattr_name(
717 const char *attrname
)
721 return __unicrash_check_name(uc
, dsc
, _("extended attribute"),
726 * Check the fs label for unicode normalization problems or misleading bits.
727 * If errors occur, this function will log them and return nonzero.
730 unicrash_check_fs_label(
737 return __unicrash_check_name(uc
, dsc
, _("filesystem label"),
741 /* Load libicu and initialize it. */
745 UErrorCode uerr
= U_ZERO_ERROR
;
748 return U_FAILURE(uerr
);
751 /* Unload libicu once we're done with it. */
753 unicrash_unload(void)