[thirdparty/xfsprogs-dev.git] / scrub / unicrash.c

/*
 * Copyright (C) 2018 Oracle.  All Rights Reserved.
 *
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
 */
#include "xfs.h"
#include <stdint.h>
#include <stdlib.h>
#include <dirent.h>
#include <sys/types.h>
#include <sys/statvfs.h>
#include <unistr.h>
#include <uninorm.h>
#include "path.h"
#include "xfs_scrub.h"
#include "common.h"

/*
 * Detect collisions of Unicode-normalized names.
 *
 * Record all the name->ino mappings in a directory/xattr, with a twist!
 * The twist is that we perform unicode normalization on every name we
 * see, so that we can warn about a directory containing more than one
 * directory entry that normalizes to the same Unicode string.  These
 * entries are at best a sign of Unicode mishandling, or some sort of
 * weird name substitution attack if the entries do not point to the
 * same inode.  Warn if we see multiple dirents that do not all point to
 * the same inode.
 *
 * For extended attributes we perform the same collision checks on the
 * attribute, though any collision is enough to trigger a warning.
 *
 * We flag these collisions as warnings and not errors because XFS
 * treats names as a sequence of arbitrary nonzero bytes.  While a
 * Unicode collision is not technically a filesystem corruption, we
 * ought to say something if there's a possibility for misleading a
 * user.
 *
 * To normalize, we use Unicode NFKC.  We use the composing
 * normalization mode (e.g. "E WITH ACUTE" instead of "E" then "ACUTE")
 * because that's what W3C (and in general Linux) uses.  This enables us
 * to detect multiple object names that normalize to the same name and
 * could be confusing to users.  Furthermore, we use the compatibility
 * mode to detect names with compatible but different code points to
 * strengthen those checks.
 */

/*
 * One remembered name in a hash chain.  The normalized name bytes are
 * stored inline after the fixed fields and are followed by a NUL
 * terminator, so entries are allocated with NAME_ENTRY_SZ().
 */
struct name_entry {
	struct name_entry	*next;		/* next entry in this bucket */
	xfs_ino_t		ino;		/* inode recorded with the name */
	size_t			uninamelen;	/* bytes in uniname, sans NUL */
	uint8_t			uniname[];	/* normalized name plus NUL */
};
/* Bytes needed for an entry holding an nl-byte name and its terminator. */
#define NAME_ENTRY_SZ(nl)	(sizeof(struct name_entry) + 1 + \
				 ((nl) * sizeof(uint8_t)))

/*
 * Collision detector state: a chained hash table of normalized names.
 * The bucket array is stored inline, so the structure is allocated with
 * UNICRASH_SZ().
 */
struct unicrash {
	struct scrub_ctx	*ctx;		/* context handed to reporting */
	bool			compare_ino;	/* same-inode repeats are fine */
	size_t			nr_buckets;	/* number of slots in buckets */
	struct name_entry	*buckets[];	/* heads of the hash chains */
};
/* Bytes needed for a detector with nr hash buckets. */
#define UNICRASH_SZ(nr)		(sizeof(struct unicrash) + \
				 ((nr) * sizeof(struct name_entry *)))

/*
 * Unicode collision checking only makes sense when the system is set up
 * for utf8.  We decide that by looking at the locale name in effect for
 * messages: if ".UTF-8" appears anywhere in it, utf8 is assumed to be
 * in use; otherwise no checking is done.
 *
 * Most modern Linux systems default to utf8, so this should only come
 * back false when the administrator chose a different configuration or
 * when there is no usable locale data at all.
 *
 * The verdict is computed once and remembered for later calls.  A
 * failed locale query reports false without remembering anything.
 */
#define UTF8_STR		".UTF-8"
#define UTF8_STRLEN		(sizeof(UTF8_STR) - 1)
static bool
is_utf8_locale(void)
{
	static int		cached = -1;
	const char		*locale_name;

	if (cached == -1) {
		locale_name = setlocale(LC_MESSAGES, NULL);
		if (!locale_name)
			return false;

		cached = strstr(locale_name, UTF8_STR) ? 1 : 0;
	}
	return cached;
}

/*
 * Set up a collision detector with roughly nr_buckets hash buckets.
 *
 * If the locale is not utf8, *ucp is set to NULL and true is returned.
 * Otherwise the bucket count is clamped to the range 16..65536 and a
 * zeroed detector is allocated.  Returns false only if that allocation
 * fails.
 */
static bool
unicrash_init(
	struct unicrash		**ucp,
	struct scrub_ctx	*ctx,
	bool			compare_ino,
	size_t			nr_buckets)
{
	struct unicrash		*uc;

	if (!is_utf8_locale()) {
		*ucp = NULL;
		return true;
	}

	if (nr_buckets < 16)
		nr_buckets = 16;
	if (nr_buckets > 65536)
		nr_buckets = 65536;

	uc = calloc(1, UNICRASH_SZ(nr_buckets));
	if (uc == NULL)
		return false;

	uc->compare_ino = compare_ino;
	uc->nr_buckets = nr_buckets;
	uc->ctx = ctx;

	*ucp = uc;
	return true;
}

/*
 * Set up a collision detector for the entries of one directory.
 * Entries that repeat a name are compared by inode number.
 */
bool
unicrash_dir_init(
	struct unicrash		**ucp,
	struct scrub_ctx	*ctx,
	struct xfs_bstat	*bstat)
{
	size_t			nr_buckets;

	/*
	 * Guess at one dentry per 64 bytes of directory size; the
	 * common init code clamps the bucket count between 16 and 64k.
	 * Same general idea as dir_hash_init in xfs_repair.
	 */
	nr_buckets = bstat->bs_size / 64;

	return unicrash_init(ucp, ctx, true, nr_buckets);
}

/*
 * Set up a collision detector for the extended attribute names of one
 * file.  Inode numbers are not compared, so any repeated name counts as
 * a collision.
 */
bool
unicrash_xattr_init(
	struct unicrash		**ucp,
	struct scrub_ctx	*ctx,
	struct xfs_bstat	*bstat)
{
	size_t			nr_buckets;

	/* Guess at 16 attributes per extent for lack of a better idea. */
	nr_buckets = 16 * (1 + bstat->bs_aextents);

	return unicrash_init(ucp, ctx, false, nr_buckets);
}

/*
 * Tear down a collision detector: release every remembered name in
 * every bucket, then the detector itself.  A NULL detector is ignored.
 */
void
unicrash_free(
	struct unicrash		*uc)
{
	struct name_entry	*cur;
	struct name_entry	*next;
	size_t			b;

	if (uc == NULL)
		return;

	for (b = 0; b < uc->nr_buckets; b++) {
		cur = uc->buckets[b];
		while (cur != NULL) {
			/* Grab the link before the entry goes away. */
			next = cur->next;
			free(cur);
			cur = next;
		}
	}
	free(uc);
}

/* Steal the dirhash function from libxfs, avoid linking with libxfs. */

#define rol32(x, y)		(((x) << (y)) | ((x) >> (32 - (y))))

/*
 * Hash a byte string.  Conceptually the hash is rotated left by 7 bits
 * and the next byte XORed in, once per byte; the code below handles
 * four bytes per step and then folds in the remaining zero to three.
 */
static xfs_dahash_t
unicrash_hashname(
	const uint8_t		*name,
	size_t			namelen)
{
	xfs_dahash_t		h = 0;

	/* Consume the name four bytes at a time for as long as we can. */
	while (namelen >= 4) {
		h = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
		    (name[3] << 0) ^ rol32(h, 7 * 4);
		name += 4;
		namelen -= 4;
	}

	/* Fold in the leftover bytes, if any. */
	if (namelen == 3)
		h = (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
		    rol32(h, 7 * 3);
	else if (namelen == 2)
		h = (name[0] << 7) ^ (name[1] << 0) ^ rol32(h, 7 * 2);
	else if (namelen == 1)
		h = (name[0] << 0) ^ rol32(h, 7 * 1);

	return h;
}

/*
 * Normalize a name according to Unicode NFKC normalization rules.
 *
 * @in is the NUL-terminated name to normalize.  @out receives the
 * NUL-terminated result and must have room for @outlen bytes plus the
 * terminator; @outlen must be at least strlen(@in).
 *
 * Returns true if the name was already normalized.  If the name cannot
 * be normalized, or if its normalized form does not fit in @out, the
 * original bytes are copied to @out and true is returned.
 */
static bool
unicrash_normalize(
	const char		*in,
	uint8_t			*out,
	size_t			outlen)
{
	size_t			inlen = strlen(in);
	size_t			normlen = outlen;
	uint8_t			*norm;

	assert(inlen <= outlen);
	norm = u8_normalize(UNINORM_NFKC, (const uint8_t *)in, inlen,
			out, &normlen);
	if (norm != out) {
		/*
		 * NULL means normalization failed.  Any other pointer
		 * is a heap buffer that u8_normalize allocated because
		 * the result was too big for out; normlen then exceeds
		 * the capacity of out, so it must not be used to index
		 * out.  Either way, drop the buffer (free(NULL) is a
		 * no-op) and just return the same name.
		 */
		free(norm);
		memcpy(out, in, inlen + 1);
		return true;
	}
	out[normlen] = 0;
	return normlen == inlen && memcmp(in, out, inlen) == 0;
}

/*
 * Report Unicode problems found with a name.
 *
 * An informational message is issued when the name is not normalized
 * and should_warn_about_name() agrees; a warning is issued when the
 * name is not unique.  Both the name and its normalized form are run
 * through string_escape() for printing, and the escaped copies are
 * freed before returning.
 */
static void
unicrash_complain(
	struct unicrash		*uc,
	const char		*descr,
	const char		*what,
	bool			normal,
	bool			unique,
	const char		*name,
	uint8_t			*uniname)
{
	char			*esc_name;
	char			*esc_uniname;

	esc_name = string_escape(name);
	esc_uniname = string_escape((char *)uniname);

	if (!normal && should_warn_about_name(uc->ctx)) {
		str_info(uc->ctx, descr,
_("Unicode name \"%s\" in %s should be normalized as \"%s\"."),
				esc_name, what, esc_uniname);
	}

	if (!unique) {
		str_warn(uc->ctx, descr,
_("Duplicate normalized Unicode name \"%s\" found in %s."),
				esc_name, what);
	}

	free(esc_uniname);
	free(esc_name);
}

/*
 * Try to add a name -> ino entry to the collision detector.  The name
 * must already be normalized according to Unicode NFKC rules so that
 * byte-unique names mapping to the same sequence of code points land
 * on the same entry.
 *
 * If the name has not been seen before, it is remembered and *unique
 * is set to true.  If it has been seen, nothing is added; *unique is
 * set to true only when inode comparison is enabled and the earlier
 * entry has the same inode number, and to false otherwise.
 *
 * Returns false only if memory for a new entry cannot be allocated, in
 * which case *unique is left untouched.
 */
static bool
unicrash_add(
	struct unicrash		*uc,
	uint8_t			*uniname,
	xfs_ino_t		ino,
	bool			*unique)
{
	struct name_entry	**linkp;
	struct name_entry	*cur;
	struct name_entry	*fresh;
	size_t			len = u8_strlen(uniname);
	xfs_dahash_t		hash = unicrash_hashname(uniname, len);

	/* Walk the chain looking for the name; stop at the tail link. */
	linkp = &uc->buckets[hash % uc->nr_buckets];
	while ((cur = *linkp) != NULL) {
		if (u8_strcmp(uniname, cur->uniname) == 0) {
			*unique = uc->compare_ino && cur->ino == ino;
			return true;
		}
		linkp = &cur->next;
	}

	/* Not found, so append a new entry at the end of the chain. */
	fresh = malloc(NAME_ENTRY_SZ(len));
	if (fresh == NULL)
		return false;

	memcpy(fresh->uniname, uniname, len + 1);
	fresh->uninamelen = len;
	fresh->ino = ino;
	fresh->next = NULL;

	*linkp = fresh;
	*unique = true;
	return true;
}

/*
 * Check one name for Unicode normalization problems or collisions.
 *
 * The name is normalized into a local buffer and recorded in the
 * detector; anything that is not both normalized and unique is handed
 * to unicrash_complain().  Returns false only if the name could not be
 * recorded.
 */
static bool
__unicrash_check_name(
	struct unicrash		*uc,
	const char		*descr,
	const char		*namedescr,
	const char		*name,
	xfs_ino_t		ino)
{
	uint8_t			normname[(NAME_MAX * 2) + 1];
	bool			is_normal;
	bool			is_unique;

	/* The last byte of the buffer is reserved for the terminator. */
	memset(normname, 0, sizeof(normname));
	is_normal = unicrash_normalize(name, normname, sizeof(normname) - 1);

	if (!unicrash_add(uc, normname, ino, &is_unique))
		return false;

	if (!is_normal || !is_unique)
		unicrash_complain(uc, descr, namedescr, is_normal, is_unique,
				name, normname);
	return true;
}

/*
 * Check a directory entry for unicode normalization problems or
 * collisions.  Without a detector there is nothing to check and the
 * result is true.
 */
bool
unicrash_check_dir_name(
	struct unicrash		*uc,
	const char		*descr,
	struct dirent		*dentry)
{
	if (uc != NULL)
		return __unicrash_check_name(uc, descr, _("directory"),
				dentry->d_name, dentry->d_ino);
	return true;
}

/*
 * Check an extended attribute name for unicode normalization problems
 * or collisions.  Without a detector there is nothing to check and the
 * result is true.  Attribute names are recorded with inode number zero.
 */
bool
unicrash_check_xattr_name(
	struct unicrash		*uc,
	const char		*descr,
	const char		*attrname)
{
	if (uc != NULL)
		return __unicrash_check_name(uc, descr,
				_("extended attribute"), attrname, 0);
	return true;
}
Commit	Line	Data
4bbed4ec DW	1	/*
	2	* Copyright (C) 2018 Oracle. All Rights Reserved.
	3	*
	4	* Author: Darrick J. Wong <darrick.wong@oracle.com>
	5	*
	6	* This program is free software; you can redistribute it and/or
	7	* modify it under the terms of the GNU General Public License
	8	* as published by the Free Software Foundation; either version 2
	9	* of the License, or (at your option) any later version.
	10	*
	11	* This program is distributed in the hope that it would be useful,
	12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	14	* GNU General Public License for more details.
	15	*
	16	* You should have received a copy of the GNU General Public License
	17	* along with this program; if not, write the Free Software Foundation,
	18	* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
	19	*/
a440f877	20	#include "xfs.h"
4bbed4ec	21	#include <stdint.h>
4bbed4ec DW	22	#include <stdlib.h>
	23	#include <dirent.h>
	24	#include <sys/types.h>
4bbed4ec DW	25	#include <sys/statvfs.h>
	26	#include <unistr.h>
	27	#include <uninorm.h>
4bbed4ec DW	28	#include "path.h"
	29	#include "xfs_scrub.h"
	30	#include "common.h"
	31
	32	/*
	33	* Detect collisions of Unicode-normalized names.
	34	*
	35	* Record all the name->ino mappings in a directory/xattr, with a twist!
	36	* The twist is that we perform unicode normalization on every name we
	37	* see, so that we can warn about a directory containing more than one
	38	* directory entries that normalize to the same Unicode string. These
	39	* entries are at best a sign of Unicode mishandling, or some sort of
	40	* weird name substitution attack if the entries do not point to the
	41	* same inode. Warn if we see multiple dirents that do not all point to
	42	* the same inode.
	43	*
	44	* For extended attributes we perform the same collision checks on the
	45	* attribute, though any collision is enough to trigger a warning.
	46	*
	47	* We flag these collisions as warnings and not errors because XFS
	48	* treats names as a sequence of arbitrary nonzero bytes. While a
	49	* Unicode collision is not technically a filesystem corruption, we
	50	* ought to say something if there's a possibility for misleading a
	51	* user.
	52	*
	53	* To normalize, we use Unicode NFKC. We use the composing
	54	* normalization mode (e.g. "E WITH ACUTE" instead of "E" then "ACUTE")
	55	* because that's what W3C (and in general Linux) uses. This enables us
	56	* to detect multiple object names that normalize to the same name and
	57	* could be confusing to users. Furthermore, we use the compatibility
	58	* mode to detect names with compatible but different code points to
	59	* strengthen those checks.
	60	*/
	61
	62	struct name_entry {
	63	struct name_entry *next;
	64	xfs_ino_t ino;
	65	size_t uninamelen;
	66	uint8_t uniname[0];
	67	};
	68	#define NAME_ENTRY_SZ(nl) (sizeof(struct name_entry) + 1 + \
	69	(nl * sizeof(uint8_t)))
	70
	71	struct unicrash {
	72	struct scrub_ctx *ctx;
	73	bool compare_ino;
	74	size_t nr_buckets;
	75	struct name_entry *buckets[0];
	76	};
	77	#define UNICRASH_SZ(nr) (sizeof(struct unicrash) + \
	78	(nr * sizeof(struct name_entry *)))
	79
	80	/*
	81	* We only care about validating utf8 collisions if the underlying
	82	* system configuration says we're using utf8. If the language
	83	* specifier string used to output messages has ".UTF-8" somewhere in
	84	* its name, then we conclude utf8 is in use. Otherwise, no checking is
	85	* performed.
	86	*
	87	* Most modern Linux systems default to utf8, so the only time this
	88	* check will return false is if the administrator configured things
	89	* this way or if things are so messed up there is no locale data at
	90	* all.
	91	*/
92	#define UTF8_STR ".UTF-8"
93	#define UTF8_STRLEN (sizeof(UTF8_STR) - 1)
94	static bool
95	is_utf8_locale(void)
96	{
97	const char *msg_locale;
98	static int answer = -1;
99
100	if (answer != -1)
101	return answer;
102
103	msg_locale = setlocale(LC_MESSAGES, NULL);
104	if (msg_locale == NULL)
105	return false;
106
107	if (strstr(msg_locale, UTF8_STR) != NULL)
108	answer = 1;
109	else
110	answer = 0;
111	return answer;
112	}
113
114	/* Initialize the collision detector. */
115	static bool
116	unicrash_init(
117	struct unicrash **ucp,
118	struct scrub_ctx *ctx,
119	bool compare_ino,
120	size_t nr_buckets)
121	{
122	struct unicrash *p;
123
124	if (!is_utf8_locale()) {
125	*ucp = NULL;
126	return true;
127	}
128
129	if (nr_buckets > 65536)
130	nr_buckets = 65536;
131	else if (nr_buckets < 16)
132	nr_buckets = 16;
133
134	p = calloc(1, UNICRASH_SZ(nr_buckets));
135	if (!p)
136	return false;
137	p->ctx = ctx;
138	p->nr_buckets = nr_buckets;
139	p->compare_ino = compare_ino;
140	*ucp = p;
141
142	return true;
143	}
144
145	/* Initialize the collision detector for a directory. */
146	bool
147	unicrash_dir_init(
148	struct unicrash **ucp,
149	struct scrub_ctx *ctx,
150	struct xfs_bstat *bstat)
151	{
152	/*
153	* Assume 64 bytes per dentry, clamp buckets between 16 and 64k.
154	* Same general idea as dir_hash_init in xfs_repair.
155	*/
156	return unicrash_init(ucp, ctx, true, bstat->bs_size / 64);
157	}
158
159	/* Initialize the collision detector for an extended attribute. */
160	bool
161	unicrash_xattr_init(
162	struct unicrash **ucp,
163	struct scrub_ctx *ctx,
164	struct xfs_bstat *bstat)
165	{
166	/* Assume 16 attributes per extent for lack of a better idea. */
167	return unicrash_init(ucp, ctx, false, 16 * (1 + bstat->bs_aextents));
168	}
169
170	/* Free the crash detector. */
171	void
172	unicrash_free(
173	struct unicrash *uc)
174	{
175	struct name_entry *ne;
176	struct name_entry *x;
177	size_t i;
178
179	if (!uc)
180	return;
181
182	for (i = 0; i < uc->nr_buckets; i++) {
183	for (ne = uc->buckets[i]; ne != NULL; ne = x) {
184	x = ne->next;
185	free(ne);
186	}
187	}
188	free(uc);
189	}
190
191	/* Steal the dirhash function from libxfs, avoid linking with libxfs. */
192
193	#define rol32(x, y) (((x) << (y)) \| ((x) >> (32 - (y))))
194
195	/*
196	* Implement a simple hash on a character string.
197	* Rotate the hash value by 7 bits, then XOR each character in.
198	* This is implemented with some source-level loop unrolling.
199	*/
200	static xfs_dahash_t
201	unicrash_hashname(
202	const uint8_t *name,
203	size_t namelen)
204	{
205	xfs_dahash_t hash;
206
207	/*
208	* Do four characters at a time as long as we can.
209	*/
210	for (hash = 0; namelen >= 4; namelen -= 4, name += 4)
211	hash = (name[0] << 21) ^ (name[1] << 14) ^ (name[2] << 7) ^
212	(name[3] << 0) ^ rol32(hash, 7 * 4);
213
214	/*
215	* Now do the rest of the characters.
216	*/
217	switch (namelen) {
218	case 3:
219	return (name[0] << 14) ^ (name[1] << 7) ^ (name[2] << 0) ^
220	rol32(hash, 7 * 3);
221	case 2:
222	return (name[0] << 7) ^ (name[1] << 0) ^ rol32(hash, 7 * 2);
223	case 1:
224	return (name[0] << 0) ^ rol32(hash, 7 * 1);
225	default: /* case 0: */
226	return hash;
227	}
228	}
229
230	/*
231	* Normalize a name according to Unicode NFKC normalization rules.
232	* Returns true if the name was already normalized.
233	*/
234	static bool
235	unicrash_normalize(
236	const char *in,
237	uint8_t *out,
238	size_t outlen)
239	{
240	size_t inlen = strlen(in);
241
242	assert(inlen <= outlen);
243	if (!u8_normalize(UNINORM_NFKC, (const uint8_t *)in, inlen,
244	out, &outlen)) {
245	/* Didn't normalize, just return the same buffer. */
246	memcpy(out, in, inlen + 1);
247	return true;
248	}
249	out[outlen] = 0;
250	return outlen == inlen ? memcmp(in, out, inlen) == 0 : false;
251	}
252
253	/* Complain about Unicode problems. */
254	static void
255	unicrash_complain(
256	struct unicrash *uc,
257	const char *descr,
258	const char *what,
259	bool normal,
260	bool unique,
261	const char *name,
262	uint8_t *uniname)
263	{
264	char *bad1 = NULL;
265	char *bad2 = NULL;
266
267	bad1 = string_escape(name);
268	bad2 = string_escape((char *)uniname);
269
270	if (!normal && should_warn_about_name(uc->ctx))
271	str_info(uc->ctx, descr,
272	_("Unicode name \"%s\" in %s should be normalized as \"%s\"."),
273	bad1, what, bad2);
274	if (!unique)
275	str_warn(uc->ctx, descr,
276	_("Duplicate normalized Unicode name \"%s\" found in %s."),
277	bad1, what);
278
279	free(bad1);
280	free(bad2);
281	}
282
283	/*
284	* Try to add a name -> ino entry to the collision detector. The name
285	* must be normalized according to Unicode NFKC normalization rules to
286	* detect byte-unique names that map to the same sequence of Unicode
287	* code points.
288	*
289	* This function returns true either if there was no previous mapping or
290	* there was a mapping that matched exactly. It returns false if
291	* there is already a record with that name pointing to a different
292	* inode.
293	*/
294	static bool
295	unicrash_add(
296	struct unicrash *uc,
297	uint8_t *uniname,
298	xfs_ino_t ino,
299	bool *unique)
300	{
301	struct name_entry *ne;
302	struct name_entry *x;
303	struct name_entry **nep;
304	size_t uninamelen = u8_strlen(uniname);
305	size_t bucket;
306	xfs_dahash_t hash;
307
308	/* Do we already know about that name? */
309	hash = unicrash_hashname(uniname, uninamelen);
310	bucket = hash % uc->nr_buckets;
311	for (nep = &uc->buckets[bucket], ne = *nep; ne != NULL; ne = x) {
312	if (u8_strcmp(uniname, ne->uniname) == 0) {
313	*unique = uc->compare_ino ? ne->ino == ino : false;
314	return true;
315	}
316	nep = &ne->next;
317	x = ne->next;
318	}
319
320	/* Remember that name. */
321	x = malloc(NAME_ENTRY_SZ(uninamelen));
322	if (!x)
323	return false;
324	x->next = NULL;
325	x->ino = ino;
326	x->uninamelen = uninamelen;
327	memcpy(x->uniname, uniname, uninamelen + 1);
328	*nep = x;
329	*unique = true;
330
331	return true;
332	}
333
334	/* Check a name for unicode normalization problems or collisions. */
335	static bool
336	__unicrash_check_name(
337	struct unicrash *uc,
338	const char *descr,
339	const char *namedescr,
340	const char *name,
341	xfs_ino_t ino)
342	{
343	uint8_t uniname[(NAME_MAX * 2) + 1];
344	bool moveon;
345	bool normal;
346	bool unique;
347
348	memset(uniname, 0, (NAME_MAX * 2) + 1);
349	normal = unicrash_normalize(name, uniname, NAME_MAX * 2);
350	moveon = unicrash_add(uc, uniname, ino, &unique);
351	if (!moveon)
352	return false;
353
354	if (normal && unique)
355	return true;
356
357	unicrash_complain(uc, descr, namedescr, normal, unique, name,
358	uniname);
359	return true;
360	}
361
362	/* Check a directory entry for unicode normalization problems or collisions. */
363	bool
364	unicrash_check_dir_name(
365	struct unicrash *uc,
366	const char *descr,
367	struct dirent *dentry)
368	{
369	if (!uc)
370	return true;
371	return __unicrash_check_name(uc, descr, _("directory"),
372	dentry->d_name, dentry->d_ino);
373	}
374
375	/*
376	* Check an extended attribute name for unicode normalization problems
377	* or collisions.
378	*/
379	bool
380	unicrash_check_xattr_name(
381	struct unicrash *uc,
382	const char *descr,
383	const char *attrname)
384	{
385	if (!uc)
386	return true;
387	return __unicrash_check_name(uc, descr, _("extended attribute"),
388	attrname, 0);
389	}