]> git.ipfire.org Git - thirdparty/xfsprogs-dev.git/blob - db/obfuscate.c
xfsprogs: Release v6.10.1
[thirdparty/xfsprogs-dev.git] / db / obfuscate.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2007, 2011 SGI
4 * All Rights Reserved.
5 */
6 #include "libxfs.h"
7 #include "init.h"
8 #include "obfuscate.h"
9
10 static inline unsigned char
11 random_filename_char(void)
12 {
13 static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
14 "abcdefghijklmnopqrstuvwxyz"
15 "0123456789-_";
16
17 return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
18 }
19
20 #define rol32(x,y) (((x) << (y)) | ((x) >> (32 - (y))))
21
22 /*
23 * Given a name and its hash value, massage the name in such a way
24 * that the result is another name of equal length which shares the
25 * same hash value.
26 */
27 void
28 obfuscate_name(
29 xfs_dahash_t hash,
30 size_t name_len,
31 unsigned char *name,
32 bool is_dirent)
33 {
34 unsigned char *oldname = NULL;
35 unsigned char *newp;
36 int i;
37 xfs_dahash_t new_hash;
38 unsigned char *first;
39 unsigned char high_bit;
40 int tries = 0;
41 bool is_ci_name = is_dirent && xfs_has_asciici(mp);
42 int shift;
43
44 /*
45 * Our obfuscation algorithm requires at least 5-character
46 * names, so don't bother if the name is too short. We
47 * work backward from a hash value to determine the last
48 * five bytes in a name required to produce a new name
49 * with the same hash.
50 */
51 if (name_len < 5)
52 return;
53
54 if (is_ci_name) {
55 oldname = malloc(name_len);
56 if (!oldname)
57 return;
58 memcpy(oldname, name, name_len);
59 }
60
61 again:
62 newp = name;
63 new_hash = 0;
64
65 /*
66 * If we cannot generate a ci-compatible obfuscated name after 1000
67 * tries, don't bother obfuscating the name.
68 */
69 if (tries++ > 1000) {
70 memcpy(name, oldname, name_len);
71 goto out_free;
72 }
73
74 /*
75 * The beginning of the obfuscated name can be pretty much
76 * anything, so fill it in with random characters.
77 * Accumulate its new hash value as we go.
78 */
79 for (i = 0; i < name_len - 5; i++) {
80 *newp = random_filename_char();
81 if (is_ci_name)
82 new_hash = xfs_ascii_ci_xfrm(*newp) ^
83 rol32(new_hash, 7);
84 else
85 new_hash = *newp ^ rol32(new_hash, 7);
86 newp++;
87 }
88
89 /*
90 * Compute which five bytes need to be used at the end of
91 * the name so the hash of the obfuscated name is the same
92 * as the hash of the original. If any result in an invalid
93 * character, flip a bit and arrange for a corresponding bit
94 * in a neighboring byte to be flipped as well. For the
95 * last byte, the "neighbor" to change is the first byte
96 * we're computing here.
97 */
98 new_hash = rol32(new_hash, 3) ^ hash;
99
100 first = newp;
101 high_bit = 0;
102 for (shift = 28; shift >= 0; shift -= 7) {
103 *newp = (new_hash >> shift & 0x7f) ^ high_bit;
104 if (is_invalid_char(*newp)) {
105 *newp ^= 1;
106 high_bit = 0x80;
107 } else
108 high_bit = 0;
109
110 /*
111 * If ascii-ci is enabled, uppercase characters are converted
112 * to lowercase characters while computing the name hash. If
113 * any of the necessary correction bytes are uppercase, the
114 * hash of the new name will not match. Try again with a
115 * different prefix.
116 */
117 if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
118 goto again;
119
120 ASSERT(!is_invalid_char(*newp));
121 newp++;
122 }
123
124 /*
125 * If we flipped a bit on the last byte, we need to fix up
126 * the matching bit in the first byte. The result will
127 * be a valid character, because we know that first byte
128 * has 0's in its upper four bits (it was produced by a
129 * 28-bit right-shift of a 32-bit unsigned value).
130 */
131 if (high_bit) {
132 *first ^= 0x10;
133
134 if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
135 goto again;
136
137 ASSERT(!is_invalid_char(*first));
138 }
139 out_free:
140 free(oldname);
141 }
142
143 /*
144 * Flip a bit in each of two bytes at the end of the given name.
145 * This is used in generating a series of alternate names to be used
146 * in the event a duplicate is found.
147 *
148 * The bits flipped are selected such that they both affect the same
149 * bit in the name's computed hash value, so flipping them both will
150 * preserve the hash.
151 *
152 * The following diagram aims to show the portion of a computed
153 * hash that a given byte of a name affects.
154 *
155 * 31 28 24 21 14 8 7 3 0
156 * +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
157 * hash: | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
158 * +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
159 * last-4 ->| |<-- last-2 --->| |<--- last ---->|
160 * |<-- last-3 --->| |<-- last-1 --->| |<- last-4
161 * |<-- last-7 --->| |<-- last-5 --->|
162 * |<-- last-8 --->| |<-- last-6 --->|
163 * . . . and so on
164 *
165 * The last byte of the name directly affects the low-order byte of
166 * the hash. The next-to-last affects bits 7-14, the next one back
167 * affects bits 14-21, and so on. The effect wraps around when it
168 * goes beyond the top of the hash (as happens for byte last-4).
169 *
170 * Bits that are flipped together "overlap" on the hash value. As
171 * an example of overlap, the last two bytes both affect bit 7 in
172 * the hash. That pair of bytes (and their overlapping bits) can be
173 * used for this "flip bit" operation (it's the first pair tried,
174 * actually).
175 *
176 * A table defines overlapping pairs--the bytes involved and bits
177 * within them--that can be used this way. The byte offset is
178 * relative to a starting point within the name, which will be set
179 * to affect the bytes at the end of the name. The function is
180 * called with a "bitseq" value which indicates which bit flip is
181 * desired, and this translates directly into selecting which entry
182 * in the bit_to_flip[] table to apply.
183 *
184 * The function returns 1 if the operation was successful. It
185 * returns 0 if the result produced a character that's not valid in
186 * a name (either '/' or a '\0'). Finally, it returns -1 if the bit
187 * sequence number is beyond what is supported for a name of this
188 * length.
189 *
190 * Discussion
191 * ----------
192 * (Also see the discussion above find_alternate(), below.)
193 *
194 * In order to make this function work for any length name, the
195 * table is ordered by increasing byte offset, so that the earliest
196 * entries can apply to the shortest strings. This way all names
197 * are done consistently.
198 *
199 * When bit flips occur, they can convert printable characters
200 * into non-printable ones. In an effort to reduce the impact of
201 * this, the first bit flips are chosen to affect bytes the end of
202 * the name (and furthermore, toward the low bits of a byte). Those
203 * bytes are often non-printable anyway because of the way they are
204 * initially selected by obfuscate_name()). This is accomplished,
205 * using later table entries first.
206 *
207 * Each row in the table doubles the number of alternates that
208 * can be generated. A two-byte name is limited to using only
209 * the first row, so it's possible to generate two alternates
210 * (the original name, plus the alternate produced by flipping
211 * the one pair of bits). In a 5-byte name, the effect of the
212 * first byte overlaps the last by 4 its, and there are 8 bits
213 * to flip, allowing for 256 possible alternates.
214 *
215 * Short names (less than 5 bytes) are never even obfuscated, so for
216 * such names the relatively small number of alternates should never
217 * really be a problem.
218 *
219 * Long names (more than 6 bytes, say) are not likely to exhaust
220 * the number of available alternates. In fact, the table could
221 * probably have stopped at 8 entries, on the assumption that 256
222 * alternates should be enough for most any situation. The entries
223 * beyond those are present mostly for demonstration of how it could
224 * be populated with more entries, should it ever be necessary to do
225 * so.
226 */
227 static int
228 flip_bit(
229 size_t name_len,
230 unsigned char *name,
231 uint32_t bitseq)
232 {
233 int index;
234 size_t offset;
235 unsigned char *p0, *p1;
236 unsigned char m0, m1;
237 struct {
238 int byte; /* Offset from start within name */
239 unsigned char bit; /* Bit within that byte */
240 } bit_to_flip[][2] = { /* Sorted by second entry's byte */
241 { { 0, 0 }, { 1, 7 } }, /* Each row defines a pair */
242 { { 1, 0 }, { 2, 7 } }, /* of bytes and a bit within */
243 { { 2, 0 }, { 3, 7 } }, /* each byte. Each bit in */
244 { { 0, 4 }, { 4, 0 } }, /* a pair affects the same */
245 { { 0, 5 }, { 4, 1 } }, /* bit in the hash, so flipping */
246 { { 0, 6 }, { 4, 2 } }, /* both will change the name */
247 { { 0, 7 }, { 4, 3 } }, /* while preserving the hash. */
248 { { 3, 0 }, { 4, 7 } },
249 { { 0, 0 }, { 5, 3 } }, /* The first entry's byte offset */
250 { { 0, 1 }, { 5, 4 } }, /* must be less than the second. */
251 { { 0, 2 }, { 5, 5 } },
252 { { 0, 3 }, { 5, 6 } }, /* The table can be extended to */
253 { { 0, 4 }, { 5, 7 } }, /* an arbitrary number of entries */
254 { { 4, 0 }, { 5, 7 } }, /* but there's not much point. */
255 /* . . . */
256 };
257
258 /* Find the first entry *not* usable for name of this length */
259
260 for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
261 if (bit_to_flip[index][1].byte >= name_len)
262 break;
263
264 /*
265 * Back up to the last usable entry. If that number is
266 * smaller than the bit sequence number, inform the caller
267 * that nothing this large (or larger) will work.
268 */
269 if (bitseq > --index)
270 return -1;
271
272 /*
273 * We will be switching bits at the end of name, with a
274 * preference for affecting the last bytes first. Compute
275 * where in the name we'll start applying the changes.
276 */
277 offset = name_len - (bit_to_flip[index][1].byte + 1);
278 index -= bitseq; /* Use later table entries first */
279
280 p0 = name + offset + bit_to_flip[index][0].byte;
281 p1 = name + offset + bit_to_flip[index][1].byte;
282 m0 = 1 << bit_to_flip[index][0].bit;
283 m1 = 1 << bit_to_flip[index][1].bit;
284
285 /* Only change the bytes if it produces valid characters */
286
287 if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
288 return 0;
289
290 *p0 ^= m0;
291 *p1 ^= m1;
292
293 return 1;
294 }
295
296 /*
297 * This function generates a well-defined sequence of "alternate"
298 * names for a given name. An alternate is a name having the same
299 * length and same hash value as the original name. This is needed
300 * because the algorithm produces only one obfuscated name to use
301 * for a given original name, and it's possible that result matches
302 * a name already seen. This function checks for this, and if it
303 * occurs, finds another suitable obfuscated name to use.
304 *
305 * Each bit in the binary representation of the sequence number is
306 * used to select one possible "bit flip" operation to perform on
307 * the name. So for example:
308 * seq = 0: selects no bits to flip
309 * seq = 1: selects the 0th bit to flip
310 * seq = 2: selects the 1st bit to flip
311 * seq = 3: selects the 0th and 1st bit to flip
312 * ... and so on.
313 *
314 * The flip_bit() function takes care of the details of the bit
315 * flipping within the name. Note that the "1st bit" in this
316 * context is a bit sequence number; i.e. it doesn't necessarily
317 * mean bit 0x02 will be changed.
318 *
319 * If a valid name (one that contains no '/' or '\0' characters) is
320 * produced by this process for the given sequence number, this
321 * function returns 1. If the result is not valid, it returns 0.
322 * Returns -1 if the sequence number is beyond the the maximum for
323 * names of the given length.
324 *
325 *
326 * Discussion
327 * ----------
328 * The number of alternates available for a given name is dependent
329 * on its length. A "bit flip" involves inverting two bits in
330 * a name--the two bits being selected such that their values
331 * affect the name's hash value in the same way. Alternates are
332 * thus generated by inverting the value of pairs of such
333 * "overlapping" bits in the original name. Each byte after the
334 * first in a name adds at least one bit of overlap to work with.
335 * (See comments above flip_bit() for more discussion on this.)
336 *
337 * So the number of alternates is dependent on the number of such
338 * overlapping bits in a name. If there are N bit overlaps, there
339 * 2^N alternates for that hash value.
340 *
341 * Here are the number of overlapping bits available for generating
342 * alternates for names of specific lengths:
343 * 1 0 (must have 2 bytes to have any overlap)
344 * 2 1 One bit overlaps--so 2 possible alternates
345 * 3 2 Two bits overlap--so 4 possible alternates
346 * 4 4 Three bits overlap, so 2^3 alternates
347 * 5 8 8 bits overlap (due to wrapping), 256 alternates
348 * 6 18 2^18 alternates
349 * 7 28 2^28 alternates
350 * ...
351 * It's clear that the number of alternates grows very quickly with
352 * the length of the name. But note that the set of alternates
353 * includes invalid names. And for certain (contrived) names, the
354 * number of valid names is a fairly small fraction of the total
355 * number of alternates.
356 *
357 * The main driver for this infrastructure for coming up with
358 * alternate names is really related to names 5 (or possibly 6)
359 * bytes in length. 5-byte obfuscated names contain no randomly-
360 * generated bytes in them, and the chance of an obfuscated name
361 * matching an already-seen name is too high to just ignore. This
362 * methodical selection of alternates ensures we don't produce
363 * duplicate names unless we have exhausted our options.
364 */
365 int
366 find_alternate(
367 size_t name_len,
368 unsigned char *name,
369 uint32_t seq)
370 {
371 uint32_t bitseq = 0;
372 uint32_t bits = seq;
373
374 if (!seq)
375 return 1; /* alternate 0 is the original name */
376 if (name_len < 2) /* Must have 2 bytes to flip */
377 return -1;
378
379 for (bitseq = 0; bits; bitseq++) {
380 uint32_t mask = 1 << bitseq;
381 int fb;
382
383 if (!(bits & mask))
384 continue;
385
386 fb = flip_bit(name_len, name, bitseq);
387 if (fb < 1)
388 return fb ? -1 : 0;
389 bits ^= mask;
390 }
391
392 return 1;
393 }