]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - scrub/unicrash.c
b02c5658e9dcb6ef2c7225b9088ae9bcb183e0a2
[thirdparty/xfsprogs-dev.git] / scrub / unicrash.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3 * Copyright (C) 2018 Oracle. All Rights Reserved.
4 * Author: Darrick J. Wong <darrick.wong@oracle.com>
5 */
6 #include "xfs.h"
7 #include <stdint.h>
8 #include <stdlib.h>
9 #include <dirent.h>
10 #include <sys/types.h>
11 #include <sys/statvfs.h>
12 #include <strings.h>
13 #include <unicode/ustring.h>
14 #include <unicode/unorm2.h>
15 #include <unicode/uspoof.h>
16 #include "libfrog/paths.h"
17 #include "xfs_scrub.h"
18 #include "common.h"
19 #include "unicrash.h"
20
21 /*
22 * Detect Unicode confusable names in directories and attributes.
23 *
24 * Record all the name->ino mappings in a directory/xattr, with a twist! The
25 * twist is to record the Unicode skeleton and normalized version of every
26 * name we see so that we can check for a name space (directory, extended
27 * attribute set) containing names containing malicious characters or that
28 * could be confused for one another. These entries are at best a sign of
29 * Unicode mishandling, or some sort of weird name substitution attack if the
30 * entries do not point to the same inode. Warn if we see multiple dirents
31 * that do not all point to the same inode.
32 *
33 * For extended attributes we perform the same collision checks on the
34 * attribute, though any collision is enough to trigger a warning.
35 *
36 * We avoid flagging these problems as errors because XFS treats names as a
37 * sequence of arbitrary nonzero bytes. While a Unicode collision is not
38 * technically a filesystem corruption, we ought to say something if there's a
39 * possibility for misleading a user. Unquestionably bad things (direction
40 * overrides, control characters, names that normalize to the same string)
41 * produce warnings, whereas potentially confusable names produce
42 * informational messages.
43 *
44 * The skeleton algorithm is detailed in section 4 ("Confusable Detection") of
45 * the Unicode technical standard #39. First we normalize the name, then we
46 * substitute code points according to the confusable code point table, then
47 * normalize again.
48 *
49 * We take the extra step of removing non-identifier code points such as
50 * formatting characters, control characters, zero width characters, etc.
51 * from the skeleton so that we can complain about names that are confusable
52 * due to invisible control characters.
53 *
54 * In other words, skel = remove_invisible(nfd(remap_confusables(nfd(name)))).
55 */
56
57 struct name_entry {
58 struct name_entry *next;
59
60 /* NFKC normalized name */
61 UChar *normstr;
62 size_t normstrlen;
63
64 /* Unicode skeletonized name */
65 UChar *skelstr;
66 size_t skelstrlen;
67
68 xfs_ino_t ino;
69
70 /* Raw UTF8 name */
71 size_t namelen;
72 char name[0];
73 };
74 #define NAME_ENTRY_SZ(nl) (sizeof(struct name_entry) + 1 + \
75 (nl * sizeof(uint8_t)))
76
77 struct unicrash {
78 struct scrub_ctx *ctx;
79 USpoofChecker *spoof;
80 const UNormalizer2 *normalizer;
81 bool compare_ino;
82 bool is_only_root_writeable;
83 size_t nr_buckets;
84 struct name_entry *buckets[0];
85 };
86 #define UNICRASH_SZ(nr) (sizeof(struct unicrash) + \
87 (nr * sizeof(struct name_entry *)))
88
/*
 * Things to complain about in Unicode naming.  These are bit flags that are
 * OR'd together into a single "badflags" word for each name examined.
 */

/*
 * Multiple names resolve to the same normalized string and therefore render
 * identically.
 */
#define UNICRASH_NOT_UNIQUE (1 << 0)

/* Name contains directional overrides. */
#define UNICRASH_BIDI_OVERRIDE (1 << 1)

/* Name mixes left-to-right and right-to-left characters. */
#define UNICRASH_BIDI_MIXED (1 << 2)

/* Control characters in name. */
#define UNICRASH_CONTROL_CHAR (1 << 3)

/* Invisible characters. Only a problem if we have collisions. */
#define UNICRASH_ZERO_WIDTH (1 << 4)

/* Multiple names resolve to the same skeleton string. */
#define UNICRASH_CONFUSABLE (1 << 5)
111
/*
 * Unicode collision checking only makes sense when the system is configured
 * for utf8.  We decide that by looking at the locale used for messages: if
 * its name contains ".UTF-8" we assume utf8 is in use, otherwise no checking
 * is performed.
 *
 * Most modern Linux systems default to utf8, so this should only return
 * false when the administrator configured things that way, or when things
 * are so broken that no locale data is available at all.
 *
 * The answer is computed once and remembered; a failed locale query is not
 * remembered and will be retried on the next call.
 */
#define UTF8_STR	".UTF-8"
#define UTF8_STRLEN	(sizeof(UTF8_STR) - 1)
static bool
is_utf8_locale(void)
{
	static int	cached = -1;
	const char	*locale_name;

	if (cached == -1) {
		locale_name = setlocale(LC_MESSAGES, NULL);
		if (!locale_name)
			return false;

		cached = strstr(locale_name, UTF8_STR) ? 1 : 0;
	}

	return cached;
}
145
/*
 * Generate normalized form and skeleton of the name.
 * If this fails, just forget everything; this is an advisory checker.
 *
 * On success the entry's normstr/normstrlen and skelstr/skelstrlen fields
 * are filled in with newly allocated buffers and true is returned.  On any
 * failure every buffer allocated here is freed, the entry is not modified,
 * and false is returned.
 */
static bool
name_entry_compute_checknames(
	struct unicrash *uc,
	struct name_entry *entry)
{
	UChar *normstr;
	UChar *unistr;
	UChar *skelstr;
	int32_t normstrlen;
	int32_t unistrlen;
	int32_t skelstrlen;
	UChar32 uchr;
	int32_t i, j;

	UErrorCode uerr = U_ZERO_ERROR;

	/*
	 * Each conversion below is done in two passes: a preflight call with
	 * no destination buffer to learn the required length (which is
	 * expected to report U_BUFFER_OVERFLOW_ERROR), then the real call
	 * into a zeroed buffer with one spare element at the end.
	 */

	/* Convert bytestr to unistr for normalization */
	u_strFromUTF8(NULL, 0, &unistrlen, entry->name, entry->namelen, &uerr);
	if (uerr != U_BUFFER_OVERFLOW_ERROR)
		return false;
	uerr = U_ZERO_ERROR;
	unistr = calloc(unistrlen + 1, sizeof(UChar));
	if (!unistr)
		return false;
	u_strFromUTF8(unistr, unistrlen, NULL, entry->name, entry->namelen,
			&uerr);
	if (U_FAILURE(uerr))
		goto out_unistr;

	/* Normalize the string. */
	normstrlen = unorm2_normalize(uc->normalizer, unistr, unistrlen, NULL,
			0, &uerr);
	if (uerr != U_BUFFER_OVERFLOW_ERROR)
		goto out_unistr;
	uerr = U_ZERO_ERROR;
	normstr = calloc(normstrlen + 1, sizeof(UChar));
	if (!normstr)
		goto out_unistr;
	unorm2_normalize(uc->normalizer, unistr, unistrlen, normstr, normstrlen,
			&uerr);
	if (U_FAILURE(uerr))
		goto out_normstr;

	/* Compute skeleton (from the converted name, not from normstr). */
	skelstrlen = uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, NULL,
			0, &uerr);
	if (uerr != U_BUFFER_OVERFLOW_ERROR)
		goto out_normstr;
	uerr = U_ZERO_ERROR;
	skelstr = calloc(skelstrlen + 1, sizeof(UChar));
	if (!skelstr)
		goto out_normstr;
	uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, skelstr, skelstrlen,
			&uerr);
	if (U_FAILURE(uerr))
		goto out_skelstr;

	/*
	 * Remove control/formatting characters from skeleton.
	 *
	 * j marks where the current code point starts and i is advanced past
	 * it by U16_NEXT_UNSAFE.  An ignorable code point is deleted by
	 * sliding the rest of the buffer (including the spare zeroed element
	 * at the end) down over it, shrinking skelstrlen, and rescanning
	 * from j.  Otherwise the loop step moves j up to i.
	 */
	for (i = 0, j = 0; i < skelstrlen; j = i) {
		U16_NEXT_UNSAFE(skelstr, i, uchr);
		if (!u_isIDIgnorable(uchr))
			continue;
		memmove(&skelstr[j], &skelstr[i],
				(skelstrlen - i + 1) * sizeof(UChar));
		skelstrlen -= (i - j);
		i = j;
	}

	/* Hand the buffers to the entry; only the scratch copy is freed. */
	entry->skelstr = skelstr;
	entry->skelstrlen = skelstrlen;
	entry->normstr = normstr;
	entry->normstrlen = normstrlen;
	free(unistr);
	return true;

	/* Error unwinding: release buffers in reverse order of allocation. */
out_skelstr:
	free(skelstr);
out_normstr:
	free(normstr);
out_unistr:
	free(unistr);
	return false;
}
233
234 /* Create a new name entry, returns false if we could not succeed. */
235 static bool
236 name_entry_create(
237 struct unicrash *uc,
238 const char *name,
239 xfs_ino_t ino,
240 struct name_entry **entry)
241 {
242 struct name_entry *new_entry;
243 size_t namelen = strlen(name);
244
245 /* Create new entry */
246 new_entry = calloc(NAME_ENTRY_SZ(namelen), 1);
247 if (!new_entry)
248 return false;
249 new_entry->next = NULL;
250 new_entry->ino = ino;
251 memcpy(new_entry->name, name, namelen);
252 new_entry->name[namelen] = 0;
253 new_entry->namelen = namelen;
254
255 /* Normalize/skeletonize name to find collisions. */
256 if (!name_entry_compute_checknames(uc, new_entry))
257 goto out;
258
259 *entry = new_entry;
260 return true;
261
262 out:
263 free(new_entry);
264 return false;
265 }
266
267 /* Free a name entry */
268 static void
269 name_entry_free(
270 struct name_entry *entry)
271 {
272 free(entry->normstr);
273 free(entry->skelstr);
274 free(entry);
275 }
276
277 /* Adapt the dirhash function from libxfs, avoid linking with libxfs. */
278
279 #define rol32(x, y) (((x) << (y)) | ((x) >> (32 - (y))))
280
281 /*
282 * Implement a simple hash on a character string.
283 * Rotate the hash value by 7 bits, then XOR each character in.
284 * This is implemented with some source-level loop unrolling.
285 */
286 static xfs_dahash_t
287 name_entry_hash(
288 struct name_entry *entry)
289 {
290 uint8_t *name;
291 size_t namelen;
292 xfs_dahash_t hash;
293
294 name = (uint8_t *)entry->skelstr;
295 namelen = entry->skelstrlen * sizeof(UChar);
296
297 /*
298 * Do four characters at a time as long as we can.
299 */
300 for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
301 hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
302 (name[3] << 0) ^ rol32(hash, 7 * 4);
303
304 /*
305 * Now do the rest of the characters.
306 */
307 switch (namelen) {
308 case 3:
309 return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
310 rol32(hash, 7 * 3);
311 case 2:
312 return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
313 case 1:
314 return (name[0] << 0) ^ rol32(hash, 7 * 1);
315 default: /* case 0: */
316 return hash;
317 }
318 }
319
320 /*
321 * Check a name for suspicious elements that have appeared in filename
322 * spoofing attacks. This includes names that mixed directions or contain
323 * direction overrides control characters, both of which have appeared in
324 * filename spoofing attacks.
325 */
326 static void
327 name_entry_examine(
328 struct name_entry *entry,
329 unsigned int *badflags)
330 {
331 UChar32 uchr;
332 int32_t i;
333 uint8_t mask = 0;
334
335 for (i = 0; i < entry->normstrlen;) {
336 U16_NEXT_UNSAFE(entry->normstr, i, uchr);
337
338 /* zero width character sequences */
339 switch (uchr) {
340 case 0x200B: /* zero width space */
341 case 0x200C: /* zero width non-joiner */
342 case 0x200D: /* zero width joiner */
343 case 0xFEFF: /* zero width non breaking space */
344 case 0x2060: /* word joiner */
345 case 0x2061: /* function application */
346 case 0x2062: /* invisible times (multiply) */
347 case 0x2063: /* invisible separator (comma) */
348 case 0x2064: /* invisible plus (addition) */
349 *badflags |= UNICRASH_ZERO_WIDTH;
350 break;
351 }
352
353 /* control characters */
354 if (u_iscntrl(uchr))
355 *badflags |= UNICRASH_CONTROL_CHAR;
356
357 switch (u_charDirection(uchr)) {
358 case U_LEFT_TO_RIGHT:
359 mask |= 0x01;
360 break;
361 case U_RIGHT_TO_LEFT:
362 mask |= 0x02;
363 break;
364 case U_RIGHT_TO_LEFT_OVERRIDE:
365 *badflags |= UNICRASH_BIDI_OVERRIDE;
366 break;
367 case U_LEFT_TO_RIGHT_OVERRIDE:
368 *badflags |= UNICRASH_BIDI_OVERRIDE;
369 break;
370 default:
371 break;
372 }
373 }
374
375 /* mixing left-to-right and right-to-left chars */
376 if (mask == 0x3)
377 *badflags |= UNICRASH_BIDI_MIXED;
378 }
379
380 /* Initialize the collision detector. */
381 static bool
382 unicrash_init(
383 struct unicrash **ucp,
384 struct scrub_ctx *ctx,
385 bool compare_ino,
386 size_t nr_buckets,
387 bool is_only_root_writeable)
388 {
389 struct unicrash *p;
390 UErrorCode uerr = U_ZERO_ERROR;
391
392 if (!is_utf8_locale()) {
393 *ucp = NULL;
394 return true;
395 }
396
397 if (nr_buckets > 65536)
398 nr_buckets = 65536;
399 else if (nr_buckets < 16)
400 nr_buckets = 16;
401
402 p = calloc(1, UNICRASH_SZ(nr_buckets));
403 if (!p)
404 return false;
405 p->ctx = ctx;
406 p->nr_buckets = nr_buckets;
407 p->compare_ino = compare_ino;
408 p->normalizer = unorm2_getNFKCInstance(&uerr);
409 if (U_FAILURE(uerr))
410 goto out_free;
411 p->spoof = uspoof_open(&uerr);
412 if (U_FAILURE(uerr))
413 goto out_free;
414 uspoof_setChecks(p->spoof, USPOOF_ALL_CHECKS, &uerr);
415 if (U_FAILURE(uerr))
416 goto out_spoof;
417 p->is_only_root_writeable = is_only_root_writeable;
418 *ucp = p;
419
420 return true;
421 out_spoof:
422 uspoof_close(p->spoof);
423 out_free:
424 free(p);
425 return false;
426 }
427
428 /*
429 * Is this inode owned by root and not writable by others? If so, skip
430 * even the informational messages, because this was put in place by the
431 * administrator.
432 */
433 static bool
434 is_only_root_writable(
435 struct xfs_bulkstat *bstat)
436 {
437 if (bstat->bs_uid != 0 || bstat->bs_gid != 0)
438 return false;
439 return !(bstat->bs_mode & S_IWOTH);
440 }
441
442 /* Initialize the collision detector for a directory. */
443 bool
444 unicrash_dir_init(
445 struct unicrash **ucp,
446 struct scrub_ctx *ctx,
447 struct xfs_bulkstat *bstat)
448 {
449 /*
450 * Assume 64 bytes per dentry, clamp buckets between 16 and 64k.
451 * Same general idea as dir_hash_init in xfs_repair.
452 */
453 return unicrash_init(ucp, ctx, true, bstat->bs_size / 64,
454 is_only_root_writable(bstat));
455 }
456
457 /* Initialize the collision detector for an extended attribute. */
458 bool
459 unicrash_xattr_init(
460 struct unicrash **ucp,
461 struct scrub_ctx *ctx,
462 struct xfs_bulkstat *bstat)
463 {
464 /* Assume 16 attributes per extent for lack of a better idea. */
465 return unicrash_init(ucp, ctx, false, 16 * (1 + bstat->bs_aextents),
466 is_only_root_writable(bstat));
467 }
468
/*
 * Initialize the collision detector for a filesystem label.  The label is
 * treated as only root writable, inode numbers are not compared, and the
 * minimum of 16 hash buckets is requested.
 */
bool
unicrash_fs_label_init(
	struct unicrash		**ucp,
	struct scrub_ctx	*ctx)
{
	const size_t		nr_buckets = 16;

	return unicrash_init(ucp, ctx, false, nr_buckets, true);
}
477
478 /* Free the crash detector. */
479 void
480 unicrash_free(
481 struct unicrash *uc)
482 {
483 struct name_entry *ne;
484 struct name_entry *x;
485 size_t i;
486
487 if (!uc)
488 return;
489
490 uspoof_close(uc->spoof);
491 for (i = 0; i < uc->nr_buckets; i++) {
492 for (ne = uc->buckets[i]; ne != NULL; ne = x) {
493 x = ne->next;
494 name_entry_free(ne);
495 }
496 }
497 free(uc);
498 }
499
500 /* Complain about Unicode problems. */
501 static void
502 unicrash_complain(
503 struct unicrash *uc,
504 const char *descr,
505 const char *what,
506 struct name_entry *entry,
507 unsigned int badflags,
508 struct name_entry *dup_entry)
509 {
510 char *bad1 = NULL;
511 char *bad2 = NULL;
512
513 bad1 = string_escape(entry->name);
514 if (dup_entry)
515 bad2 = string_escape(dup_entry->name);
516
517 /*
518 * Most filechooser UIs do not look for bidirectional overrides when
519 * they render names. This can result in misleading name presentation
520 * that makes "hig<rtl>gnp.sh" render like "highs.png".
521 */
522 if (badflags & UNICRASH_BIDI_OVERRIDE) {
523 str_warn(uc->ctx, descr,
524 _("Unicode name \"%s\" in %s contains suspicious text direction overrides."),
525 bad1, what);
526 goto out;
527 }
528
529 /*
530 * Two names that normalize to the same string will render
531 * identically even though the filesystem considers them unique
532 * names. "cafe\xcc\x81" and "caf\xc3\xa9" have different byte
533 * sequences, but they both appear as "café".
534 */
535 if (badflags & UNICRASH_NOT_UNIQUE) {
536 str_warn(uc->ctx, descr,
537 _("Unicode name \"%s\" in %s renders identically to \"%s\"."),
538 bad1, what, bad2);
539 goto out;
540 }
541
542 /*
543 * If a name contains invisible/nonprinting characters and can be
544 * confused with another name as a result, we should complain.
545 * "moo<zerowidthspace>cow" and "moocow" are misleading.
546 */
547 if ((badflags & UNICRASH_ZERO_WIDTH) &&
548 (badflags & UNICRASH_CONFUSABLE)) {
549 str_warn(uc->ctx, descr,
550 _("Unicode name \"%s\" in %s could be confused with '%s' due to invisible characters."),
551 bad1, what, bad2);
552 goto out;
553 }
554
555 /*
556 * Unfiltered control characters can mess up your terminal and render
557 * invisibly in filechooser UIs.
558 */
559 if (badflags & UNICRASH_CONTROL_CHAR) {
560 str_warn(uc->ctx, descr,
561 _("Unicode name \"%s\" in %s contains control characters."),
562 bad1, what);
563 goto out;
564 }
565
566 /*
567 * Skip the informational messages if the inode owning the name is
568 * only writeable by root, because those files were put there by the
569 * sysadmin. Also skip names less than four letters long because
570 * there's a much higher chance of collisions with short names.
571 */
572 if (!verbose && (uc->is_only_root_writeable || entry->namelen < 4))
573 goto out;
574
575 /*
576 * It's not considered good practice (says Unicode) to mix LTR
577 * characters with RTL characters. The mere presence of different
578 * bidirectional characters isn't enough to trip up software, so don't
579 * warn about this too loudly.
580 */
581 if (badflags & UNICRASH_BIDI_MIXED) {
582 str_info(uc->ctx, descr,
583 _("Unicode name \"%s\" in %s mixes bidirectional characters."),
584 bad1, what);
585 goto out;
586 }
587
588 /*
589 * We'll note if two names could be confusable with each other, but
590 * whether or not the user will actually confuse them is dependent
591 * on the rendering system and the typefaces in use. Maybe "foo.1"
592 * and "moo.l" look the same, maybe they do not.
593 */
594 if (badflags & UNICRASH_CONFUSABLE) {
595 str_info(uc->ctx, descr,
596 _("Unicode name \"%s\" in %s could be confused with \"%s\"."),
597 bad1, what, bad2);
598 }
599
600 out:
601 free(bad1);
602 free(bad2);
603 }
604
605 /*
606 * Try to add a name -> ino entry to the collision detector. The name
607 * must be skeletonized according to Unicode TR39 to detect names that
608 * could be visually confused with each other.
609 */
610 static bool
611 unicrash_add(
612 struct unicrash *uc,
613 struct name_entry *new_entry,
614 unsigned int *badflags,
615 struct name_entry **existing_entry)
616 {
617 struct name_entry *entry;
618 size_t bucket;
619 xfs_dahash_t hash;
620
621 /* Store name in hashtable. */
622 hash = name_entry_hash(new_entry);
623 bucket = hash % uc->nr_buckets;
624 entry = uc->buckets[bucket];
625 new_entry->next = entry;
626 uc->buckets[bucket] = new_entry;
627
628 while (entry != NULL) {
629 /* Same normalization? */
630 if (new_entry->normstrlen == entry->normstrlen &&
631 !u_strcmp(new_entry->normstr, entry->normstr) &&
632 (uc->compare_ino ? entry->ino != new_entry->ino : true)) {
633 *badflags |= UNICRASH_NOT_UNIQUE;
634 *existing_entry = entry;
635 return true;
636 }
637
638 /* Confusable? */
639 if (new_entry->skelstrlen == entry->skelstrlen &&
640 !u_strcmp(new_entry->skelstr, entry->skelstr) &&
641 (uc->compare_ino ? entry->ino != new_entry->ino : true)) {
642 *badflags |= UNICRASH_CONFUSABLE;
643 *existing_entry = entry;
644 return true;
645 }
646 entry = entry->next;
647 }
648
649 return true;
650 }
651
652 /* Check a name for unicode normalization problems or collisions. */
653 static bool
654 __unicrash_check_name(
655 struct unicrash *uc,
656 const char *descr,
657 const char *namedescr,
658 const char *name,
659 xfs_ino_t ino)
660 {
661 struct name_entry *dup_entry = NULL;
662 struct name_entry *new_entry;
663 unsigned int badflags = 0;
664 bool moveon;
665
666 /* If we can't create entry data, just skip it. */
667 if (!name_entry_create(uc, name, ino, &new_entry))
668 return true;
669
670 name_entry_examine(new_entry, &badflags);
671
672 moveon = unicrash_add(uc, new_entry, &badflags, &dup_entry);
673 if (!moveon)
674 return false;
675
676 if (badflags)
677 unicrash_complain(uc, descr, namedescr, new_entry, badflags,
678 dup_entry);
679
680 return true;
681 }
682
/*
 * Check a directory entry for unicode normalization problems or collisions.
 * Does nothing and returns true when there is no detector.
 */
bool
unicrash_check_dir_name(
	struct unicrash		*uc,
	const char		*descr,
	struct dirent		*dentry)
{
	if (uc == NULL)
		return true;

	return __unicrash_check_name(uc, descr, _("directory"),
			dentry->d_name, dentry->d_ino);
}
695
/*
 * Check an extended attribute name for unicode normalization problems or
 * collisions.  Attribute names are recorded with inode number zero.  Does
 * nothing and returns true when there is no detector.
 */
bool
unicrash_check_xattr_name(
	struct unicrash		*uc,
	const char		*descr,
	const char		*attrname)
{
	if (uc == NULL)
		return true;

	return __unicrash_check_name(uc, descr, _("extended attribute"),
			attrname, 0);
}
711
/*
 * Check the fs label for unicode normalization problems or misleading bits.
 * The label is recorded with inode number zero.  Does nothing and returns
 * true when there is no detector.
 */
bool
unicrash_check_fs_label(
	struct unicrash		*uc,
	const char		*descr,
	const char		*label)
{
	if (uc == NULL)
		return true;

	return __unicrash_check_name(uc, descr, _("filesystem label"),
			label, 0);
}