[thirdparty/xfsprogs-dev.git] / scrub / unicrash.c

/*
 * Copyright (C) 2018 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
 */
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/statvfs.h>
#include <strings.h>
#include <unicode/ustring.h>
#include <unicode/unorm2.h>
#include <unicode/uspoof.h>
#include "path.h"
#include "xfs_scrub.h"
#include "common.h"

/*
 * Detect Unicode confusable names in directories and attributes.
 *
 * Record all the name->ino mappings in a directory/xattr, with a twist!  The
 * twist is to record the Unicode skeleton and normalized version of every
 * name we see so that we can check for a name space (directory, extended
 * attribute set) containing names containing malicious characters or that
 * could be confused for one another.  These entries are at best a sign of
 * Unicode mishandling, or some sort of weird name substitution attack if the
 * entries do not point to the same inode.  Warn if we see multiple dirents
 * that do not all point to the same inode.
 *
 * For extended attributes we perform the same collision checks on the
 * attribute, though any collision is enough to trigger a warning.
 *
 * We avoid flagging these problems as errors because XFS treats names as a
 * sequence of arbitrary nonzero bytes.  While a Unicode collision is not
 * technically a filesystem corruption, we ought to say something if there's a
 * possibility for misleading a user.  Unquestionably bad things (direction
 * overrides, control characters, names that normalize to the same string)
 * produce warnings, whereas potentially confusable names produce
 * informational messages.
 *
 * The skeleton algorithm is detailed in section 4 ("Confusable Detection") of
 * the Unicode technical standard #39.  First we normalize the name, then we
 * substitute code points according to the confusable code point table, then
 * normalize again.
 *
 * We take the extra step of removing non-identifier code points such as
 * formatting characters, control characters, zero width characters, etc.
 * from the skeleton so that we can complain about names that are confusable
 * due to invisible control characters.
 *
 * In other words, skel = remove_invisible(nfd(remap_confusables(nfd(name)))).
 */

/*
 * One name seen in the namespace being checked.  Entries are chained
 * together through ->next inside a hash bucket.  The raw bytes of the name
 * live inline after the fixed fields, so an entry must be allocated with
 * NAME_ENTRY_SZ() bytes.
 */
struct name_entry {
	/* Next entry in the same hash bucket, or NULL. */
	struct name_entry	*next;

	/* NFKC normalized name; length is counted in UChar units */
	UChar			*normstr;
	size_t			normstrlen;

	/* Unicode skeletonized name; length is counted in UChar units */
	UChar			*skelstr;
	size_t			skelstrlen;

	/* Inode number that the name points to. */
	xfs_ino_t		ino;

	/* Raw UTF8 name, NUL terminated; namelen excludes the NUL */
	size_t			namelen;
	char			name[];
};

/*
 * Bytes needed for a name_entry that stores an nl-byte name plus its
 * terminating NUL.  The argument is parenthesized so that expressions such
 * as NAME_ENTRY_SZ(a + b) expand correctly.
 */
#define NAME_ENTRY_SZ(nl)	(sizeof(struct name_entry) + 1 + \
				 ((nl) * sizeof(uint8_t)))

/*
 * Collision detector state for one namespace (a directory or an extended
 * attribute set).  The hash buckets are stored inline after the fixed
 * fields, so the structure must be allocated with UNICRASH_SZ() bytes.
 */
struct unicrash {
	/* Scrub context handed to the message reporting helpers. */
	struct scrub_ctx	*ctx;

	/* ICU spoof checker used to compute name skeletons. */
	USpoofChecker		*spoof;

	/* ICU normalizer used to compute the normalized names. */
	const UNormalizer2	*normalizer;

	/* If set, two names only collide when their inode numbers differ. */
	bool			compare_ino;

	/* Number of elements in buckets[]. */
	size_t			nr_buckets;
	struct name_entry	*buckets[];
};

/*
 * Bytes needed for a unicrash with nr hash buckets.  The argument is
 * parenthesized so that arbitrary expressions expand correctly.
 */
#define UNICRASH_SZ(nr)		(sizeof(struct unicrash) + \
				 ((nr) * sizeof(struct name_entry *)))

/*
 * Problem flags.  These are OR'd together into the "badflags" bitmask that
 * is built up while a name is examined and consumed when we complain.
 */

/* Several names normalize to one string, so they render identically. */
#define UNICRASH_NOT_UNIQUE	(1U << 0)

/* A directional override code point is present in the name. */
#define UNICRASH_BIDI_OVERRIDE	(1U << 1)

/* Left-to-right and right-to-left characters are mixed in the name. */
#define UNICRASH_BIDI_MIXED	(1U << 2)

/* The name contains control characters. */
#define UNICRASH_CONTROL_CHAR	(1U << 3)

/* The name contains invisible characters; only bad if names collide. */
#define UNICRASH_ZERO_WIDTH	(1U << 4)

/* Several names share one skeleton string. */
#define UNICRASH_CONFUSABLE	(1U << 5)

/*
 * Decide whether Unicode collision checking makes sense on this system.
 *
 * We look at the locale name in effect for LC_MESSAGES; if the substring
 * ".UTF-8" (exact case) appears anywhere in it we conclude that names are
 * being presented as utf8 and return true.  Otherwise we return false and
 * no checking is performed.
 *
 * The verdict is computed once and remembered in a function-local static.
 * If the locale name cannot be queried at all we return false without
 * remembering anything, so the next call will query again.
 *
 * Most modern Linux systems default to utf8, so this should only return
 * false if the administrator configured things that way or if there is no
 * locale data at all.
 */
#define UTF8_STR		".UTF-8"
#define UTF8_STRLEN		(sizeof(UTF8_STR) - 1)
static bool
is_utf8_locale(void)
{
	static int		cached = -1;
	const char		*locale_name;

	if (cached == -1) {
		locale_name = setlocale(LC_MESSAGES, NULL);
		if (!locale_name)
			return false;

		cached = strstr(locale_name, UTF8_STR) ? 1 : 0;
	}

	return cached;
}

/*
 * Generate normalized form and skeleton of the name.
 * If this fails, just forget everything; this is an advisory checker.
 *
 * Each ICU conversion is run twice: once with no output buffer to learn the
 * required length (ICU reports this as U_BUFFER_OVERFLOW_ERROR), and once
 * more into a buffer of that length.  The buffers are calloc'd with one
 * extra zeroed UChar so that the strings stay NUL terminated, which the
 * u_strcmp() comparisons elsewhere in this file rely on.
 *
 * On success, ownership of the normalized string and the skeleton is
 * transferred to @entry and true is returned.  On failure nothing is
 * attached to @entry, all temporary memory is released, and false is
 * returned.
 */
static bool
name_entry_compute_checknames(
	struct unicrash		*uc,
	struct name_entry	*entry)
{
	UChar			*normstr;
	UChar			*unistr;
	UChar			*skelstr;
	int32_t			normstrlen;
	int32_t			unistrlen;
	int32_t			skelstrlen;
	UChar32			uchr;
	int32_t			i, j;

	UErrorCode		uerr = U_ZERO_ERROR;

	/*
	 * Convert bytestr to unistr for normalization.  The lengths that ICU
	 * hands back are signed; refuse negative values before they are used
	 * to size an allocation.  unistrlen is only inspected once we know
	 * that the preflight call filled it in.
	 */
	u_strFromUTF8(NULL, 0, &unistrlen, entry->name, entry->namelen, &uerr);
	if (uerr != U_BUFFER_OVERFLOW_ERROR || unistrlen < 0)
		return false;
	uerr = U_ZERO_ERROR;
	unistr = calloc(unistrlen + 1, sizeof(UChar));
	if (!unistr)
		return false;
	u_strFromUTF8(unistr, unistrlen, NULL, entry->name, entry->namelen,
			&uerr);
	if (U_FAILURE(uerr))
		goto out_unistr;

	/* Normalize the string. */
	normstrlen = unorm2_normalize(uc->normalizer, unistr, unistrlen, NULL,
			0, &uerr);
	if (uerr != U_BUFFER_OVERFLOW_ERROR || normstrlen < 0)
		goto out_unistr;
	uerr = U_ZERO_ERROR;
	normstr = calloc(normstrlen + 1, sizeof(UChar));
	if (!normstr)
		goto out_unistr;
	unorm2_normalize(uc->normalizer, unistr, unistrlen, normstr, normstrlen,
			&uerr);
	if (U_FAILURE(uerr))
		goto out_normstr;

	/* Compute skeleton. */
	skelstrlen = uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, NULL,
			0, &uerr);
	if (uerr != U_BUFFER_OVERFLOW_ERROR || skelstrlen < 0)
		goto out_normstr;
	uerr = U_ZERO_ERROR;
	skelstr = calloc(skelstrlen + 1, sizeof(UChar));
	if (!skelstr)
		goto out_normstr;
	uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, skelstr, skelstrlen,
			&uerr);
	if (U_FAILURE(uerr))
		goto out_skelstr;

	/*
	 * Remove control/formatting characters from skeleton.
	 *
	 * j marks the start of the code point being looked at and i the
	 * position just past it.  When the code point is ignorable, the tail
	 * of the string (including the trailing NUL, hence the "+ 1") is
	 * slid down over it and scanning resumes at j.
	 */
	for (i = 0, j = 0; i < skelstrlen; j = i) {
		U16_NEXT_UNSAFE(skelstr, i, uchr);
		if (!u_isIDIgnorable(uchr))
			continue;
		memmove(&skelstr[j], &skelstr[i],
				(skelstrlen - i + 1) * sizeof(UChar));
		skelstrlen -= (i - j);
		i = j;
	}

	entry->skelstr = skelstr;
	entry->skelstrlen = skelstrlen;
	entry->normstr = normstr;
	entry->normstrlen = normstrlen;
	free(unistr);
	return true;

out_skelstr:
	free(skelstr);
out_normstr:
	free(normstr);
out_unistr:
	free(unistr);
	return false;
}

/*
 * Allocate a name entry for @name/@ino and compute its check strings.
 *
 * On success the new entry is stored in *@entry and true is returned.  If
 * memory cannot be allocated or the check strings cannot be computed,
 * nothing is stored in *@entry and false is returned.
 */
static bool
name_entry_create(
	struct unicrash		*uc,
	const char		*name,
	xfs_ino_t		ino,
	struct name_entry	**entry)
{
	const size_t		len = strlen(name);
	struct name_entry	*ne;

	/* Room for the fixed fields, the name bytes and a trailing NUL. */
	ne = calloc(NAME_ENTRY_SZ(len), 1);
	if (ne == NULL)
		return false;

	ne->next = NULL;
	ne->ino = ino;
	ne->namelen = len;
	memcpy(ne->name, name, len);
	ne->name[len] = 0;

	/* Normalize/skeletonize name to find collisions. */
	if (!name_entry_compute_checknames(uc, ne)) {
		free(ne);
		return false;
	}

	*entry = ne;
	return true;
}

/*
 * Release a name entry along with the skeleton and normalized strings
 * that hang off of it.  @entry must not be NULL.
 */
static void
name_entry_free(
	struct name_entry	*entry)
{
	free(entry->skelstr);
	free(entry->normstr);
	free(entry);
}

/* Adapt the dirhash function from libxfs, avoid linking with libxfs. */

#define rol32(x, y)		(((x) << (y)) | ((x) >> (32 - (y))))

/*
 * Hash the skeleton of a name entry.
 *
 * The skeleton's UChar array is walked as a plain sequence of bytes.  For
 * every byte the running hash is rotated left by 7 bits and the byte is
 * XORed in; the main loop does this for four bytes per iteration and the
 * leftover (at most three) bytes are folded in afterwards.
 */
static xfs_dahash_t
name_entry_hash(
	struct name_entry	*entry)
{
	const uint8_t		*p = (const uint8_t *)entry->skelstr;
	size_t			left = entry->skelstrlen * sizeof(UChar);
	xfs_dahash_t		hash = 0;

	/* Consume the input in groups of four bytes. */
	while (left >= 4) {
		hash = (p[0] << 21) ^ (p[1] << 14) ^ (p[2] << 7) ^
		       (p[3] << 0) ^ rol32(hash, 7 * 4);
		p += 4;
		left -= 4;
	}

	/* Fold in whatever is left over. */
	if (left == 3)
		return (p[0] << 14) ^ (p[1] << 7) ^ (p[2] << 0) ^
		       rol32(hash, 7 * 3);
	if (left == 2)
		return (p[0] << 7) ^ (p[1] << 0) ^ rol32(hash, 7 * 2);
	if (left == 1)
		return (p[0] << 0) ^ rol32(hash, 7 * 1);

	return hash;
}

/*
 * Scan the normalized form of a name for elements that have appeared in
 * filename spoofing attacks: zero width characters, control characters,
 * direction overrides, and a mix of left-to-right and right-to-left
 * characters.  A flag is OR'd into *@badflags for each kind that is found;
 * flags that are already set are left alone.
 */
static void
name_entry_examine(
	struct name_entry	*entry,
	unsigned int		*badflags)
{
	UChar32			cp;
	int32_t			pos = 0;
	bool			saw_ltr = false;
	bool			saw_rtl = false;

	while (pos < entry->normstrlen) {
		U16_NEXT_UNSAFE(entry->normstr, pos, cp);

		/*
		 * Zero width character sequences:
		 * 0xFEFF		zero width non breaking space
		 * 0x200B - 0x200D	zero width space, non-joiner, joiner
		 * 0x2060 - 0x2064	word joiner, function application,
		 *			invisible times, separator and plus
		 */
		if (cp == 0xFEFF ||
		    (cp >= 0x200B && cp <= 0x200D) ||
		    (cp >= 0x2060 && cp <= 0x2064))
			*badflags |= UNICRASH_ZERO_WIDTH;

		/* control characters */
		if (u_iscntrl(cp))
			*badflags |= UNICRASH_CONTROL_CHAR;

		/* Track text direction and look for explicit overrides. */
		switch (u_charDirection(cp)) {
		case U_LEFT_TO_RIGHT:
			saw_ltr = true;
			break;
		case U_RIGHT_TO_LEFT:
			saw_rtl = true;
			break;
		case U_RIGHT_TO_LEFT_OVERRIDE:
		case U_LEFT_TO_RIGHT_OVERRIDE:
			*badflags |= UNICRASH_BIDI_OVERRIDE;
			break;
		default:
			break;
		}
	}

	/* mixing left-to-right and right-to-left chars */
	if (saw_ltr && saw_rtl)
		*badflags |= UNICRASH_BIDI_MIXED;
}

/*
 * Initialize the collision detector.
 *
 * @ucp receives the new detector.  It is set to NULL up front, so it holds
 * a defined value on every return path: NULL when the locale is not utf8
 * (checking is disabled, and true is returned), NULL when setup fails
 * (false is returned), and the new detector on success (true is returned).
 *
 * @compare_ino selects whether colliding names must also point to different
 * inodes before they are reported.  @nr_buckets is a sizing hint that is
 * clamped to the range 16..65536.
 */
static bool
unicrash_init(
	struct unicrash		**ucp,
	struct scrub_ctx	*ctx,
	bool			compare_ino,
	size_t			nr_buckets)
{
	struct unicrash		*p;
	UErrorCode		uerr = U_ZERO_ERROR;

	/* Never hand an indeterminate pointer back to the caller. */
	*ucp = NULL;

	if (!is_utf8_locale())
		return true;

	if (nr_buckets > 65536)
		nr_buckets = 65536;
	else if (nr_buckets < 16)
		nr_buckets = 16;

	/* calloc leaves every bucket head set to NULL. */
	p = calloc(1, UNICRASH_SZ(nr_buckets));
	if (!p)
		return false;
	p->ctx = ctx;
	p->nr_buckets = nr_buckets;
	p->compare_ino = compare_ino;
	p->normalizer = unorm2_getNFKCInstance(&uerr);
	if (U_FAILURE(uerr))
		goto out_free;
	p->spoof = uspoof_open(&uerr);
	if (U_FAILURE(uerr))
		goto out_free;
	uspoof_setChecks(p->spoof, USPOOF_ALL_CHECKS, &uerr);
	if (U_FAILURE(uerr))
		goto out_spoof;
	*ucp = p;

	return true;
out_spoof:
	uspoof_close(p->spoof);
out_free:
	free(p);
	return false;
}

/*
 * Set up a collision detector for a directory.  Colliding names are only
 * reported when they point to different inodes.
 */
bool
unicrash_dir_init(
	struct unicrash		**ucp,
	struct scrub_ctx	*ctx,
	struct xfs_bstat	*bstat)
{
	/*
	 * Assume 64 bytes per dentry, clamp buckets between 16 and 64k.
	 * Same general idea as dir_hash_init in xfs_repair.
	 */
	size_t			nr_buckets = bstat->bs_size / 64;

	return unicrash_init(ucp, ctx, true, nr_buckets);
}

/*
 * Set up a collision detector for the extended attributes of a file.
 * Inode numbers are not compared, so any collision is reported.
 */
bool
unicrash_xattr_init(
	struct unicrash		**ucp,
	struct scrub_ctx	*ctx,
	struct xfs_bstat	*bstat)
{
	/* Assume 16 attributes per extent for lack of a better idea. */
	size_t			nr_buckets = 16 * (1 + bstat->bs_aextents);

	return unicrash_init(ucp, ctx, false, nr_buckets);
}

/*
 * Tear down a collision detector: close the spoof checker, free every
 * name entry in every bucket, then free the detector itself.  Passing
 * NULL is allowed and does nothing.
 */
void
unicrash_free(
	struct unicrash		*uc)
{
	struct name_entry	*cur;
	struct name_entry	*following;
	size_t			b;

	if (uc == NULL)
		return;

	uspoof_close(uc->spoof);

	for (b = 0; b < uc->nr_buckets; b++) {
		cur = uc->buckets[b];
		while (cur != NULL) {
			/* Grab the link before the entry goes away. */
			following = cur->next;
			name_entry_free(cur);
			cur = following;
		}
	}

	free(uc);
}

/*
 * Complain about Unicode problems.
 *
 * At most one message is emitted per call, for the first of the following
 * conditions that @badflags satisfies: direction override, identical
 * rendering, confusable due to invisible characters, control characters,
 * mixed direction, confusable.  @entry is the name being complained about;
 * @dup_entry is the name it collides with and may be NULL when there is no
 * collision.
 *
 * The escaped copies of the names come from string_escape(), which is
 * defined outside this file.  Its results are released with free() here,
 * so they are treated as allocations that may fail; if a needed copy is
 * NULL we say nothing rather than hand a NULL pointer to a "%s"
 * conversion.
 */
static void
unicrash_complain(
	struct unicrash		*uc,
	const char		*descr,
	const char		*what,
	struct name_entry	*entry,
	unsigned int		badflags,
	struct name_entry	*dup_entry)
{
	char			*bad1 = NULL;
	char			*bad2 = NULL;

	bad1 = string_escape(entry->name);
	if (!bad1)
		return;
	if (dup_entry) {
		bad2 = string_escape(dup_entry->name);
		if (!bad2)
			goto out;
	}

	/*
	 * Most filechooser UIs do not look for bidirectional overrides when
	 * they render names.  This can result in misleading name presentation
	 * that makes "hig<rtl>gnp.sh" render like "highs.png".
	 */
	if (badflags & UNICRASH_BIDI_OVERRIDE) {
		str_warn(uc->ctx, descr,
_("Unicode name \"%s\" in %s contains suspicious text direction overrides."),
				bad1, what);
		goto out;
	}

	/*
	 * Two names that normalize to the same string will render
	 * identically even though the filesystem considers them unique
	 * names.  "cafe\xcc\x81" and "caf\xc3\xa9" have different byte
	 * sequences, but they both appear as "café".
	 */
	if (badflags & UNICRASH_NOT_UNIQUE) {
		str_warn(uc->ctx, descr,
_("Unicode name \"%s\" in %s renders identically to \"%s\"."),
				bad1, what, bad2);
		goto out;
	}

	/*
	 * If a name contains invisible/nonprinting characters and can be
	 * confused with another name as a result, we should complain.
	 * "moo<zerowidthspace>cow" and "moocow" are misleading.
	 */
	if ((badflags & UNICRASH_ZERO_WIDTH) &&
	    (badflags & UNICRASH_CONFUSABLE)) {
		str_warn(uc->ctx, descr,
_("Unicode name \"%s\" in %s could be confused with '%s' due to invisible characters."),
				bad1, what, bad2);
		goto out;
	}

	/*
	 * Unfiltered control characters can mess up your terminal and render
	 * invisibly in filechooser UIs.
	 */
	if (badflags & UNICRASH_CONTROL_CHAR) {
		str_warn(uc->ctx, descr,
_("Unicode name \"%s\" in %s contains control characters."),
				bad1, what);
		goto out;
	}

	/*
	 * It's not considered good practice (says Unicode) to mix LTR
	 * characters with RTL characters.  The mere presence of different
	 * bidirectional characters isn't enough to trip up software, so don't
	 * warn about this too loudly.
	 */
	if (badflags & UNICRASH_BIDI_MIXED) {
		str_info(uc->ctx, descr,
_("Unicode name \"%s\" in %s mixes bidirectional characters."),
				bad1, what);
		goto out;
	}

	/*
	 * We'll note if two names could be confusable with each other, but
	 * whether or not the user will actually confuse them is dependent
	 * on the rendering system and the typefaces in use.  Maybe "foo.1"
	 * and "moo.l" look the same, maybe they do not.
	 */
	if (badflags & UNICRASH_CONFUSABLE) {
		str_info(uc->ctx, descr,
_("Unicode name \"%s\" in %s could be confused with \"%s\"."),
				bad1, what, bad2);
	}

out:
	free(bad1);
	free(bad2);
}

/*
 * Add a name -> ino entry to the collision detector and look for earlier
 * names that collide with it.
 *
 * The entry is pushed onto the head of the bucket selected by the hash of
 * its skeleton, and the entries that were already in that bucket are then
 * walked.  For the first older entry with the same normalized string,
 * UNICRASH_NOT_UNIQUE is set in *@badflags; failing that, for the first
 * older entry with the same skeleton, UNICRASH_CONFUSABLE is set.  In both
 * cases *@existing_entry is pointed at the older entry and the walk stops.
 * When the detector compares inode numbers, older entries that point to
 * the same inode as the new entry never count as a collision.
 *
 * This function always returns true.
 */
static bool
unicrash_add(
	struct unicrash		*uc,
	struct name_entry	*new_entry,
	unsigned int		*badflags,
	struct name_entry	**existing_entry)
{
	struct name_entry	**head;
	struct name_entry	*old;

	/* Store name in hashtable. */
	head = &uc->buckets[name_entry_hash(new_entry) % uc->nr_buckets];
	new_entry->next = *head;
	*head = new_entry;

	for (old = new_entry->next; old != NULL; old = old->next) {
		/* Names for the same inode are not interesting. */
		if (uc->compare_ino && old->ino == new_entry->ino)
			continue;

		/* Same normalization? */
		if (new_entry->normstrlen == old->normstrlen &&
		    !u_strcmp(new_entry->normstr, old->normstr)) {
			*badflags |= UNICRASH_NOT_UNIQUE;
			*existing_entry = old;
			return true;
		}

		/* Confusable? */
		if (new_entry->skelstrlen == old->skelstrlen &&
		    !u_strcmp(new_entry->skelstr, old->skelstr)) {
			*badflags |= UNICRASH_CONFUSABLE;
			*existing_entry = old;
			return true;
		}
	}

	return true;
}

/*
 * Run every check on one name: build its entry, examine its characters,
 * add it to the collision detector, and report anything that was flagged.
 *
 * Returns false only if adding the entry to the detector fails.  A name
 * whose entry cannot be created is silently skipped and true is returned.
 */
static bool
__unicrash_check_name(
	struct unicrash		*uc,
	const char		*descr,
	const char		*namedescr,
	const char		*name,
	xfs_ino_t		ino)
{
	struct name_entry	*new_entry;
	struct name_entry	*dup_entry = NULL;
	unsigned int		badflags = 0;

	/* If we can't create entry data, just skip it. */
	if (!name_entry_create(uc, name, ino, &new_entry))
		return true;

	name_entry_examine(new_entry, &badflags);

	if (!unicrash_add(uc, new_entry, &badflags, &dup_entry))
		return false;

	if (badflags != 0)
		unicrash_complain(uc, descr, namedescr, new_entry, badflags,
				dup_entry);

	return true;
}

/*
 * Check the name and inode number of a directory entry for unicode
 * normalization problems or collisions.  A NULL detector means checking
 * is disabled, in which case nothing is done and true is returned.
 */
bool
unicrash_check_dir_name(
	struct unicrash		*uc,
	const char		*descr,
	struct dirent		*dentry)
{
	if (uc != NULL)
		return __unicrash_check_name(uc, descr, _("directory"),
				dentry->d_name, dentry->d_ino);

	return true;
}

/*
 * Check the name of an extended attribute for unicode normalization
 * problems or collisions.  Attribute names carry no inode number, so zero
 * is recorded for each of them.  A NULL detector means checking is
 * disabled, in which case nothing is done and true is returned.
 */
bool
unicrash_check_xattr_name(
	struct unicrash		*uc,
	const char		*descr,
	const char		*attrname)
{
	if (uc != NULL)
		return __unicrash_check_name(uc, descr,
				_("extended attribute"), attrname, 0);

	return true;
}
Commit	Line	Data
4bbed4ec DW	1	/*
	2	* Copyright (C) 2018 Oracle. All Rights Reserved.
	3	*
	4	* Author: Darrick J. Wong <darrick.wong@oracle.com>
	5	*
	6	* This program is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU General Public License
	8	* as published by the Free Software Foundation; either version 2
	9	* of the License, or (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it would be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write the Free Software Foundation,
	18	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
	19	*/
a440f877	20	#include "xfs.h"
4bbed4ec	21	#include <stdint.h>
4bbed4ec DW	22	#include <stdlib.h>
	23	#include <dirent.h>
	24	#include <sys/types.h>
4bbed4ec	25	#include <sys/statvfs.h>
bff5d1a4 DW	26	#include <strings.h>
	27	#include <unicode/ustring.h>
	28	#include <unicode/unorm2.h>
367db2f5	29	#include <unicode/uspoof.h>
4bbed4ec DW	30	#include "path.h"
	31	#include "xfs_scrub.h"
	32	#include "common.h"
	33
	34	/*
367db2f5	35	* Detect Unicode confusable names in directories and attributes.
4bbed4ec	36	*
367db2f5 DW	37	* Record all the name->ino mappings in a directory/xattr, with a twist! The
	38	* twist is to record the Unicode skeleton and normalized version of every
	39	* name we see so that we can check for a name space (directory, extended
	40	* attribute set) containing names containing malicious characters or that
	41	* could be confused for one another. These entries are at best a sign of
	42	* Unicode mishandling, or some sort of weird name substitution attack if the
	43	* entries do not point to the same inode. Warn if we see multiple dirents
	44	* that do not all point to the same inode.
4bbed4ec DW	45	*
	46	* For extended attributes we perform the same collision checks on the
	47	* attribute, though any collision is enough to trigger a warning.
	48	*
367db2f5 DW	49	* We avoid flagging these problems as errors because XFS treats names as a
	50	* sequence of arbitrary nonzero bytes. While a Unicode collision is not
	51	* technically a filesystem corruption, we ought to say something if there's a
	52	* possibility for misleading a user. Unquestionably bad things (direction
	53	* overrides, control characters, names that normalize to the same string)
	54	* produce warnings, whereas potentially confusable names produce
	55	* informational messages.
4bbed4ec	56	*
367db2f5 DW	57	* The skeleton algorithm is detailed in section 4 ("Confusable Detection") of
	58	* the Unicode technical standard #39. First we normalize the name, then we
	59	* substitute code points according to the confusable code point table, then
	60	* normalize again.
	61	*
	62	* We take the extra step of removing non-identifier code points such as
	63	* formatting characters, control characters, zero width characters, etc.
	64	* from the skeleton so that we can complain about names that are confusable
	65	* due to invisible control characters.
	66	*
	67	* In other words, skel = remove_invisible(nfd(remap_confusables(nfd(name)))).
4bbed4ec DW	68	*/
	69
	70	struct name_entry {
	71	struct name_entry *next;
3029a02c DW	72
3029a02c DW	73	/* NFKC normalized name */
bff5d1a4	74	UChar *normstr;
3029a02c DW	75	size_t normstrlen;
3029a02c DW	76
367db2f5 DW	77	/* Unicode skeletonized name */
	78	UChar *skelstr;
	79	size_t skelstrlen;
	80
4bbed4ec	81	xfs_ino_t ino;
3029a02c DW	82
	83	/* Raw UTF8 name */
	84	size_t namelen;
	85	char name[0];
4bbed4ec DW	86	};
	87	#define NAME_ENTRY_SZ(nl) (sizeof(struct name_entry) + 1 + \
	88	(nl * sizeof(uint8_t)))
	89
	90	struct unicrash {
	91	struct scrub_ctx *ctx;
367db2f5	92	USpoofChecker *spoof;
bff5d1a4	93	const UNormalizer2 *normalizer;
4bbed4ec DW	94	bool compare_ino;
	95	size_t nr_buckets;
	96	struct name_entry *buckets[0];
	97	};
	98	#define UNICRASH_SZ(nr) (sizeof(struct unicrash) + \
	99	(nr * sizeof(struct name_entry *)))
	100
95c2f78b DW	101	/* Things to complain about in Unicode naming. */
	102
	103	/*
	104	* Multiple names resolve to the same normalized string and therefore render
	105	* identically.
	106	*/
	107	#define UNICRASH_NOT_UNIQUE (1 << 0)
	108
baa9ed8d DW	109	/* Name contains directional overrides. */
	110	#define UNICRASH_BIDI_OVERRIDE (1 << 1)
	111
	112	/* Name mixes left-to-right and right-to-left characters. */
	113	#define UNICRASH_BIDI_MIXED (1 << 2)
	114
	115	/* Control characters in name. */
	116	#define UNICRASH_CONTROL_CHAR (1 << 3)
	117
	118	/* Invisible characters. Only a problem if we have collisions. */
	119	#define UNICRASH_ZERO_WIDTH (1 << 4)
	120
367db2f5 DW	121	/* Multiple names resolve to the same skeleton string. */
	122	#define UNICRASH_CONFUSABLE (1 << 5)
	123
4bbed4ec DW	124	/*
	125	* We only care about validating utf8 collisions if the underlying
	126	* system configuration says we're using utf8. If the language
	127	* specifier string used to output messages has ".UTF-8" somewhere in
	128	* its name, then we conclude utf8 is in use. Otherwise, no checking is
	129	* performed.
	130	*
	131	* Most modern Linux systems default to utf8, so the only time this
	132	* check will return false is if the administrator configured things
	133	* this way or if things are so messed up there is no locale data at
	134	* all.
	135	*/
	136	#define UTF8_STR ".UTF-8"
	137	#define UTF8_STRLEN (sizeof(UTF8_STR) - 1)
	138	static bool
	139	is_utf8_locale(void)
	140	{
	141	const char *msg_locale;
	142	static int answer = -1;
	143
	144	if (answer != -1)
	145	return answer;
	146
	147	msg_locale = setlocale(LC_MESSAGES, NULL);
	148	if (msg_locale == NULL)
	149	return false;
	150
	151	if (strstr(msg_locale, UTF8_STR) != NULL)
	152	answer = 1;
	153	else
	154	answer = 0;
	155	return answer;
	156	}
	157
3029a02c	158	/*
367db2f5	159	* Generate normalized form and skeleton of the name.
3029a02c DW	160	* If this fails, just forget everything; this is an advisory checker.
	161	*/
	162	static bool
	163	name_entry_compute_checknames(
	164	struct unicrash *uc,
	165	struct name_entry *entry)
	166	{
bff5d1a4 DW	167	UChar *normstr;
bff5d1a4 DW	168	UChar *unistr;
367db2f5	169	UChar *skelstr;
bff5d1a4 DW	170	int32_t normstrlen;
bff5d1a4 DW	171	int32_t unistrlen;
367db2f5 DW	172	int32_t skelstrlen;
	173	UChar32 uchr;
	174	int32_t i, j;
	175
bff5d1a4 DW	176	UErrorCode uerr = U_ZERO_ERROR;
	177
	178	/* Convert bytestr to unistr for normalization */
	179	u_strFromUTF8(NULL, 0, &unistrlen, entry->name, entry->namelen, &uerr);
	180	if (uerr != U_BUFFER_OVERFLOW_ERROR)
3029a02c	181	return false;
bff5d1a4 DW	182	uerr = U_ZERO_ERROR;
	183	unistr = calloc(unistrlen + 1, sizeof(UChar));
	184	if (!unistr)
	185	return false;
	186	u_strFromUTF8(unistr, unistrlen, NULL, entry->name, entry->namelen,
	187	&uerr);
	188	if (U_FAILURE(uerr))
	189	goto out_unistr;
	190
	191	/* Normalize the string. */
	192	normstrlen = unorm2_normalize(uc->normalizer, unistr, unistrlen, NULL,
	193	0, &uerr);
	194	if (uerr != U_BUFFER_OVERFLOW_ERROR)
	195	goto out_unistr;
	196	uerr = U_ZERO_ERROR;
	197	normstr = calloc(normstrlen + 1, sizeof(UChar));
	198	if (!normstr)
	199	goto out_unistr;
	200	unorm2_normalize(uc->normalizer, unistr, unistrlen, normstr, normstrlen,
	201	&uerr);
	202	if (U_FAILURE(uerr))
3029a02c DW	203	goto out_normstr;
3029a02c DW	204
367db2f5 DW	205	/* Compute skeleton. */
	206	skelstrlen = uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, NULL,
	207	0, &uerr);
	208	if (uerr != U_BUFFER_OVERFLOW_ERROR)
	209	goto out_normstr;
	210	uerr = U_ZERO_ERROR;
	211	skelstr = calloc(skelstrlen + 1, sizeof(UChar));
	212	if (!skelstr)
	213	goto out_normstr;
	214	uspoof_getSkeleton(uc->spoof, 0, unistr, unistrlen, skelstr, skelstrlen,
	215	&uerr);
	216	if (U_FAILURE(uerr))
	217	goto out_skelstr;
	218
	219	/* Remove control/formatting characters from skeleton. */
	220	for (i = 0, j = 0; i < skelstrlen; j = i) {
	221	U16_NEXT_UNSAFE(skelstr, i, uchr);
	222	if (!u_isIDIgnorable(uchr))
	223	continue;
	224	memmove(&skelstr[j], &skelstr[i],
	225	(skelstrlen - i + 1) * sizeof(UChar));
	226	skelstrlen -= (i - j);
	227	i = j;
	228	}
	229
	230	entry->skelstr = skelstr;
	231	entry->skelstrlen = skelstrlen;
3029a02c DW	232	entry->normstr = normstr;
3029a02c DW	233	entry->normstrlen = normstrlen;
bff5d1a4	234	free(unistr);
3029a02c	235	return true;
bff5d1a4	236
367db2f5 DW	237	out_skelstr:
367db2f5 DW	238	free(skelstr);
3029a02c DW	239	out_normstr:
3029a02c DW	240	free(normstr);
bff5d1a4 DW	241	out_unistr:
bff5d1a4 DW	242	free(unistr);
3029a02c DW	243	return false;
	244	}
	245
	246	/* Create a new name entry, returns false if we could not succeed. */
	247	static bool
	248	name_entry_create(
	249	struct unicrash *uc,
	250	const char *name,
	251	xfs_ino_t ino,
	252	struct name_entry **entry)
	253	{
	254	struct name_entry *new_entry;
	255	size_t namelen = strlen(name);
	256
	257	/* Create new entry */
	258	new_entry = calloc(NAME_ENTRY_SZ(namelen), 1);
	259	if (!new_entry)
	260	return false;
	261	new_entry->next = NULL;
	262	new_entry->ino = ino;
	263	memcpy(new_entry->name, name, namelen);
	264	new_entry->name[namelen] = 0;
	265	new_entry->namelen = namelen;
	266
367db2f5	267	/* Normalize/skeletonize name to find collisions. */
3029a02c DW	268	if (!name_entry_compute_checknames(uc, new_entry))
	269	goto out;
	270
	271	*entry = new_entry;
	272	return true;
	273
	274	out:
	275	free(new_entry);
	276	return false;
	277	}
	278
	279	/* Free a name entry */
	280	static void
	281	name_entry_free(
	282	struct name_entry *entry)
	283	{
	284	free(entry->normstr);
367db2f5	285	free(entry->skelstr);
3029a02c DW	286	free(entry);
	287	}
	288
	289	/* Adapt the dirhash function from libxfs, avoid linking with libxfs. */
	290
	291	#define rol32(x, y) (((x) << (y)) \| ((x) >> (32 - (y))))
	292
	293	/*
	294	* Implement a simple hash on a character string.
	295	* Rotate the hash value by 7 bits, then XOR each character in.
	296	* This is implemented with some source-level loop unrolling.
	297	*/
	298	static xfs_dahash_t
	299	name_entry_hash(
	300	struct name_entry *entry)
	301	{
	302	uint8_t *name;
	303	size_t namelen;
	304	xfs_dahash_t hash;
	305
367db2f5 DW	306	name = (uint8_t *)entry->skelstr;
367db2f5 DW	307	namelen = entry->skelstrlen * sizeof(UChar);
3029a02c DW	308
	309	/*
	310	* Do four characters at a time as long as we can.
	311	*/
	312	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
	313	hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
	314	(name[3] << 0) ^ rol32(hash, 7 * 4);
	315
	316	/*
	317	* Now do the rest of the characters.
	318	*/
	319	switch (namelen) {
	320	case 3:
	321	return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
	322	rol32(hash, 7 * 3);
	323	case 2:
	324	return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
	325	case 1:
	326	return (name[0] << 0) ^ rol32(hash, 7 * 1);
	327	default: /* case 0: */
	328	return hash;
	329	}
	330	}
	331
baa9ed8d DW	332	/*
	333	* Check a name for suspicious elements that have appeared in filename
	334	* spoofing attacks. This includes names that mixed directions or contain
	335	* direction overrides control characters, both of which have appeared in
	336	* filename spoofing attacks.
	337	*/
	338	static void
	339	name_entry_examine(
	340	struct name_entry *entry,
	341	unsigned int *badflags)
	342	{
	343	UChar32 uchr;
	344	int32_t i;
	345	uint8_t mask = 0;
	346
	347	for (i = 0; i < entry->normstrlen;) {
	348	U16_NEXT_UNSAFE(entry->normstr, i, uchr);
	349
	350	/* zero width character sequences */
	351	switch (uchr) {
	352	case 0x200B: /* zero width space */
	353	case 0x200C: /* zero width non-joiner */
	354	case 0x200D: /* zero width joiner */
	355	case 0xFEFF: /* zero width non breaking space */
	356	case 0x2060: /* word joiner */
	357	case 0x2061: /* function application */
	358	case 0x2062: /* invisible times (multiply) */
	359	case 0x2063: /* invisible separator (comma) */
	360	case 0x2064: /* invisible plus (addition) */
	361	*badflags \|= UNICRASH_ZERO_WIDTH;
	362	break;
	363	}
	364
	365	/* control characters */
	366	if (u_iscntrl(uchr))
	367	*badflags \|= UNICRASH_CONTROL_CHAR;
	368
	369	switch (u_charDirection(uchr)) {
	370	case U_LEFT_TO_RIGHT:
	371	mask \|= 0x01;
	372	break;
	373	case U_RIGHT_TO_LEFT:
	374	mask \|= 0x02;
	375	break;
	376	case U_RIGHT_TO_LEFT_OVERRIDE:
	377	*badflags \|= UNICRASH_BIDI_OVERRIDE;
	378	break;
	379	case U_LEFT_TO_RIGHT_OVERRIDE:
	380	*badflags \|= UNICRASH_BIDI_OVERRIDE;
	381	break;
	382	default:
	383	break;
	384	}
	385	}
	386
	387	/* mixing left-to-right and right-to-left chars */
	388	if (mask == 0x3)
	389	*badflags \|= UNICRASH_BIDI_MIXED;
	390	}
	391
4bbed4ec DW	392	/* Initialize the collision detector. */
	393	static bool
	394	unicrash_init(
	395	struct unicrash **ucp,
	396	struct scrub_ctx *ctx,
	397	bool compare_ino,
	398	size_t nr_buckets)
	399	{
	400	struct unicrash *p;
bff5d1a4	401	UErrorCode uerr = U_ZERO_ERROR;
4bbed4ec DW	402
	403	if (!is_utf8_locale()) {
	404	*ucp = NULL;
	405	return true;
	406	}
	407
	408	if (nr_buckets > 65536)
	409	nr_buckets = 65536;
	410	else if (nr_buckets < 16)
	411	nr_buckets = 16;
	412
	413	p = calloc(1, UNICRASH_SZ(nr_buckets));
	414	if (!p)
	415	return false;
	416	p->ctx = ctx;
	417	p->nr_buckets = nr_buckets;
	418	p->compare_ino = compare_ino;
bff5d1a4 DW	419	p->normalizer = unorm2_getNFKCInstance(&uerr);
	420	if (U_FAILURE(uerr))
	421	goto out_free;
367db2f5 DW	422	p->spoof = uspoof_open(&uerr);
	423	if (U_FAILURE(uerr))
	424	goto out_free;
	425	uspoof_setChecks(p->spoof, USPOOF_ALL_CHECKS, &uerr);
	426	if (U_FAILURE(uerr))
	427	goto out_spoof;
4bbed4ec DW	428	*ucp = p;
	429
	430	return true;
367db2f5 DW	431	out_spoof:
367db2f5 DW	432	uspoof_close(p->spoof);
bff5d1a4 DW	433	out_free:
	434	free(p);
	435	return false;
4bbed4ec DW	436	}
	437
	438	/* Initialize the collision detector for a directory. */
	439	bool
	440	unicrash_dir_init(
	441	struct unicrash **ucp,
	442	struct scrub_ctx *ctx,
	443	struct xfs_bstat *bstat)
	444	{
	445	/*
	446	* Assume 64 bytes per dentry, clamp buckets between 16 and 64k.
	447	* Same general idea as dir_hash_init in xfs_repair.
	448	*/
	449	return unicrash_init(ucp, ctx, true, bstat->bs_size / 64);
	450	}
	451
	452	/* Initialize the collision detector for an extended attribute. */
	453	bool
	454	unicrash_xattr_init(
	455	struct unicrash **ucp,
	456	struct scrub_ctx *ctx,
	457	struct xfs_bstat *bstat)
	458	{
	459	/* Assume 16 attributes per extent for lack of a better idea. */
	460	return unicrash_init(ucp, ctx, false, 16 * (1 + bstat->bs_aextents));
	461	}
	462
	463	/* Free the crash detector. */
	464	void
	465	unicrash_free(
	466	struct unicrash *uc)
	467	{
	468	struct name_entry *ne;
	469	struct name_entry *x;
	470	size_t i;
	471
	472	if (!uc)
	473	return;
	474
367db2f5	475	uspoof_close(uc->spoof);
4bbed4ec DW	476	for (i = 0; i < uc->nr_buckets; i++) {
	477	for (ne = uc->buckets[i]; ne != NULL; ne = x) {
	478	x = ne->next;
3029a02c	479	name_entry_free(ne);
4bbed4ec DW	480	}
	481	}
	482	free(uc);
	483	}
	484
4bbed4ec DW	485	/* Complain about Unicode problems. */
	486	static void
	487	unicrash_complain(
	488	struct unicrash *uc,
	489	const char *descr,
	490	const char *what,
3029a02c	491	struct name_entry *entry,
95c2f78b	492	unsigned int badflags,
3029a02c	493	struct name_entry *dup_entry)
4bbed4ec DW	494	{
	495	char *bad1 = NULL;
	496	char *bad2 = NULL;
	497
3029a02c DW	498	bad1 = string_escape(entry->name);
	499	if (dup_entry)
	500	bad2 = string_escape(dup_entry->name);
4bbed4ec	501
baa9ed8d DW	502	/*
	503	* Most filechooser UIs do not look for bidirectional overrides when
	504	* they render names. This can result in misleading name presentation
	505	* that makes "hig<rtl>gnp.sh" render like "highs.png".
	506	*/
	507	if (badflags & UNICRASH_BIDI_OVERRIDE) {
	508	str_warn(uc->ctx, descr,
	509	_("Unicode name \"%s\" in %s contains suspicious text direction overrides."),
	510	bad1, what);
	511	goto out;
	512	}
	513
95c2f78b DW	514	/*
	515	* Two names that normalize to the same string will render
	516	* identically even though the filesystem considers them unique
	517	* names. "cafe\xcc\x81" and "caf\xc3\xa9" have different byte
	518	* sequences, but they both appear as "café".
	519	*/
	520	if (badflags & UNICRASH_NOT_UNIQUE) {
4bbed4ec	521	str_warn(uc->ctx, descr,
95c2f78b DW	522	_("Unicode name \"%s\" in %s renders identically to \"%s\"."),
	523	bad1, what, bad2);
	524	goto out;
	525	}
4bbed4ec	526
367db2f5 DW	527	/*
	528	* If a name contains invisible/nonprinting characters and can be
	529	* confused with another name as a result, we should complain.
	530	* "moo<zerowidthspace>cow" and "moocow" are misleading.
	531	*/
	532	if ((badflags & UNICRASH_ZERO_WIDTH) &&
	533	(badflags & UNICRASH_CONFUSABLE)) {
	534	str_warn(uc->ctx, descr,
	535	_("Unicode name \"%s\" in %s could be confused with '%s' due to invisible characters."),
	536	bad1, what, bad2);
	537	goto out;
	538	}
	539
baa9ed8d DW	540	/*
	541	* Unfiltered control characters can mess up your terminal and render
	542	* invisibly in filechooser UIs.
	543	*/
	544	if (badflags & UNICRASH_CONTROL_CHAR) {
	545	str_warn(uc->ctx, descr,
	546	_("Unicode name \"%s\" in %s contains control characters."),
	547	bad1, what);
	548	goto out;
	549	}
	550
	551	/*
	552	* It's not considered good practice (says Unicode) to mix LTR
	553	* characters with RTL characters. The mere presence of different
	554	* bidirectional characters isn't enough to trip up software, so don't
	555	* warn about this too loudly.
	556	*/
	557	if (badflags & UNICRASH_BIDI_MIXED) {
	558	str_info(uc->ctx, descr,
	559	_("Unicode name \"%s\" in %s mixes bidirectional characters."),
	560	bad1, what);
	561	goto out;
	562	}
	563
367db2f5 DW	564	/*
	565	* We'll note if two names could be confusable with each other, but
	566	* whether or not the user will actually confuse them is dependent
	567	* on the rendering system and the typefaces in use. Maybe "foo.1"
	568	* and "moo.l" look the same, maybe they do not.
	569	*/
	570	if (badflags & UNICRASH_CONFUSABLE) {
	571	str_info(uc->ctx, descr,
	572	_("Unicode name \"%s\" in %s could be confused with \"%s\"."),
	573	bad1, what, bad2);
	574	}
	575
95c2f78b	576	out:
4bbed4ec DW	577	free(bad1);
	578	free(bad2);
	579	}
	580
	581	/*
	582	* Try to add a name -> ino entry to the collision detector. The name
367db2f5 DW	583	* must be skeletonized according to Unicode TR39 to detect names that
367db2f5 DW	584	* could be visually confused with each other.
4bbed4ec DW	585	*/
	586	static bool
	587	unicrash_add(
	588	struct unicrash *uc,
3029a02c DW	589	struct name_entry *new_entry,
	590	unsigned int *badflags,
	591	struct name_entry **existing_entry)
4bbed4ec	592	{
3029a02c	593	struct name_entry *entry;
4bbed4ec DW	594	size_t bucket;
	595	xfs_dahash_t hash;
	596
3029a02c DW	597	/* Store name in hashtable. */
3029a02c DW	598	hash = name_entry_hash(new_entry);
4bbed4ec	599	bucket = hash % uc->nr_buckets;
3029a02c DW	600	entry = uc->buckets[bucket];
	601	new_entry->next = entry;
	602	uc->buckets[bucket] = new_entry;
	603
	604	while (entry != NULL) {
	605	/* Same normalization? */
	606	if (new_entry->normstrlen == entry->normstrlen &&
bff5d1a4	607	!u_strcmp(new_entry->normstr, entry->normstr) &&
3029a02c	608	(uc->compare_ino ? entry->ino != new_entry->ino : true)) {
95c2f78b	609	*badflags \|= UNICRASH_NOT_UNIQUE;
3029a02c	610	*existing_entry = entry;
4bbed4ec DW	611	return true;
4bbed4ec DW	612	}
367db2f5 DW	613
	614	/* Confusable? */
	615	if (new_entry->skelstrlen == entry->skelstrlen &&
	616	!u_strcmp(new_entry->skelstr, entry->skelstr) &&
	617	(uc->compare_ino ? entry->ino != new_entry->ino : true)) {
	618	*badflags \|= UNICRASH_CONFUSABLE;
	619	*existing_entry = entry;
	620	return true;
	621	}
3029a02c	622	entry = entry->next;
4bbed4ec DW	623	}
4bbed4ec DW	624
4bbed4ec DW	625	return true;
	626	}
	627
	628	/* Check a name for unicode normalization problems or collisions. */
	629	static bool
	630	__unicrash_check_name(
	631	struct unicrash *uc,
	632	const char *descr,
	633	const char *namedescr,
	634	const char *name,
	635	xfs_ino_t ino)
	636	{
3029a02c DW	637	struct name_entry *dup_entry = NULL;
3029a02c DW	638	struct name_entry *new_entry;
95c2f78b	639	unsigned int badflags = 0;
4bbed4ec	640	bool moveon;
4bbed4ec	641
3029a02c DW	642	/* If we can't create entry data, just skip it. */
	643	if (!name_entry_create(uc, name, ino, &new_entry))
	644	return true;
	645
baa9ed8d DW	646	name_entry_examine(new_entry, &badflags);
baa9ed8d DW	647
3029a02c	648	moveon = unicrash_add(uc, new_entry, &badflags, &dup_entry);
4bbed4ec DW	649	if (!moveon)
	650	return false;
	651
95c2f78b	652	if (badflags)
3029a02c DW	653	unicrash_complain(uc, descr, namedescr, new_entry, badflags,
3029a02c DW	654	dup_entry);
4bbed4ec	655
4bbed4ec DW	656	return true;
	657	}
	658
	659	/* Check a directory entry for unicode normalization problems or collisions. */
	660	bool
	661	unicrash_check_dir_name(
	662	struct unicrash *uc,
	663	const char *descr,
	664	struct dirent *dentry)
	665	{
	666	if (!uc)
	667	return true;
	668	return __unicrash_check_name(uc, descr, _("directory"),
	669	dentry->d_name, dentry->d_ino);
	670	}
	671
	672	/*
	673	* Check an extended attribute name for unicode normalization problems
	674	* or collisions.
	675	*/
	676	bool
	677	unicrash_check_xattr_name(
	678	struct unicrash *uc,
	679	const char *descr,
	680	const char *attrname)
	681	{
	682	if (!uc)
	683	return true;
	684	return __unicrash_check_name(uc, descr, _("extended attribute"),
	685	attrname, 0);
	686	}