xfs_db: hoist name obfuscation code out of metadump.c

author Darrick J. Wong <djwong@kernel.org>

Thu, 15 Jun 2023 16:12:06 +0000 (09:12 -0700)

committer Carlos Maiolino <cem@kernel.org>

Thu, 22 Jun 2023 12:04:11 +0000 (14:04 +0200)
author Darrick J. Wong <djwong@kernel.org>
Thu, 15 Jun 2023 16:12:06 +0000 (09:12 -0700)
committer Carlos Maiolino <cem@kernel.org>
Thu, 22 Jun 2023 12:04:11 +0000 (14:04 +0200)
diff --git a/db/Makefile b/db/Makefile

index b2e01174571be44ac978794ccb784ff8e1d224c8..2f95f67075df0d084843e2846216e65a2b971df7 100644 (file)
--- a/db/Makefile
+++ b/db/Makefile
@@ -13,7 +13,7 @@ HFILES = addr.h agf.h agfl.h agi.h attr.h attrshort.h bit.h block.h bmap.h \
         flist.h fprint.h frag.h freesp.h hash.h help.h init.h inode.h input.h \
         io.h logformat.h malloc.h metadump.h output.h print.h quit.h sb.h \
         sig.h strvec.h text.h type.h write.h attrset.h symlink.h fsmap.h \
-       fuzz.h
+       fuzz.h obfuscate.h
  CFILES = $(HFILES:.h=.c) btdump.c btheight.c convert.c info.c namei.c \
         timelimit.c
  LSRCFILES = xfs_admin.sh xfs_ncheck.sh xfs_metadump.sh
diff --git a/db/metadump.c b/db/metadump.c

index 9ccee0b7ace98fea93886823d668af7c4118619d..d9a616a92961baa9ce5be3b759ead27351483fef 100644 (file)
--- a/db/metadump.c
+++ b/db/metadump.c
@@ -19,6 +19,7 @@
  #include "faddr.h"
  #include "field.h"
  #include "dir2.h"
+#include "obfuscate.h"
  
  #define DEFAULT_MAX_EXT_SIZE   XFS_MAX_BMBT_EXTLEN
  
@@ -736,19 +737,6 @@ nametable_add(xfs_dahash_t hash, int namelen, unsigned char *name)
         return ent;
  }
  
-#define is_invalid_char(c)     ((c) == '/' || (c) == '\0')
-#define rol32(x,y)             (((x) << (y)) | ((x) >> (32 - (y))))
-
-static inline unsigned char
-random_filename_char(void)
-{
-       static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                                               "abcdefghijklmnopqrstuvwxyz"
-                                               "0123456789-_";
-
-       return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
-}
-
  #define        ORPHANAGE       "lost+found"
  #define        ORPHANAGE_LEN   (sizeof (ORPHANAGE) - 1)
  
@@ -808,380 +796,6 @@ in_lost_found(
         return slen == namelen && !memcmp(name, s, namelen);
  }
  
-/*
- * Given a name and its hash value, massage the name in such a way
- * that the result is another name of equal length which shares the
- * same hash value.
- */
-static void
-obfuscate_name(
-       xfs_dahash_t    hash,
-       size_t          name_len,
-       unsigned char   *name,
-       bool            is_dirent)
-{
-       unsigned char   *oldname = NULL;
-       unsigned char   *newp;
-       int             i;
-       xfs_dahash_t    new_hash;
-       unsigned char   *first;
-       unsigned char   high_bit;
-       int             tries = 0;
-       bool            is_ci_name = is_dirent && xfs_has_asciici(mp);
-       int             shift;
-
-       /*
-        * Our obfuscation algorithm requires at least 5-character
-        * names, so don't bother if the name is too short.  We
-        * work backward from a hash value to determine the last
-        * five bytes in a name required to produce a new name
-        * with the same hash.
-        */
-       if (name_len < 5)
-               return;
-
-       if (is_ci_name) {
-               oldname = malloc(name_len);
-               if (!oldname)
-                       return;
-               memcpy(oldname, name, name_len);
-       }
-
-again:
-       newp = name;
-       new_hash = 0;
-
-       /*
-        * If we cannot generate a ci-compatible obfuscated name after 1000
-        * tries, don't bother obfuscating the name.
-        */
-       if (tries++ > 1000) {
-               memcpy(name, oldname, name_len);
-               goto out_free;
-       }
-
-       /*
-        * The beginning of the obfuscated name can be pretty much
-        * anything, so fill it in with random characters.
-        * Accumulate its new hash value as we go.
-        */
-       for (i = 0; i < name_len - 5; i++) {
-               *newp = random_filename_char();
-               if (is_ci_name)
-                       new_hash = xfs_ascii_ci_xfrm(*newp) ^
-                                                       rol32(new_hash, 7);
-               else
-                       new_hash = *newp ^ rol32(new_hash, 7);
-               newp++;
-       }
-
-       /*
-        * Compute which five bytes need to be used at the end of
-        * the name so the hash of the obfuscated name is the same
-        * as the hash of the original.  If any result in an invalid
-        * character, flip a bit and arrange for a corresponding bit
-        * in a neighboring byte to be flipped as well.  For the
-        * last byte, the "neighbor" to change is the first byte
-        * we're computing here.
-        */
-       new_hash = rol32(new_hash, 3) ^ hash;
-
-       first = newp;
-       high_bit = 0;
-       for (shift = 28; shift >= 0; shift -= 7) {
-               *newp = (new_hash >> shift & 0x7f) ^ high_bit;
-               if (is_invalid_char(*newp)) {
-                       *newp ^= 1;
-                       high_bit = 0x80;
-               } else
-                       high_bit = 0;
-
-               /*
-                * If ascii-ci is enabled, uppercase characters are converted
-                * to lowercase characters while computing the name hash.  If
-                * any of the necessary correction bytes are uppercase, the
-                * hash of the new name will not match.  Try again with a
-                * different prefix.
-                */
-               if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
-                       goto again;
-
-               ASSERT(!is_invalid_char(*newp));
-               newp++;
-       }
-
-       /*
-        * If we flipped a bit on the last byte, we need to fix up
-        * the matching bit in the first byte.  The result will
-        * be a valid character, because we know that first byte
-        * has 0's in its upper four bits (it was produced by a
-        * 28-bit right-shift of a 32-bit unsigned value).
-        */
-       if (high_bit) {
-               *first ^= 0x10;
-
-               if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
-                       goto again;
-
-               ASSERT(!is_invalid_char(*first));
-       }
-
-out_free:
-       free(oldname);
-}
-
-/*
- * Flip a bit in each of two bytes at the end of the given name.
- * This is used in generating a series of alternate names to be used
- * in the event a duplicate is found.
- *
- * The bits flipped are selected such that they both affect the same
- * bit in the name's computed hash value, so flipping them both will
- * preserve the hash.
- *
- * The following diagram aims to show the portion of a computed
- * hash that a given byte of a name affects.
- *
- *        31    28      24    21            14           8 7       3     0
- *        +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
- *        +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
- *       last-4 ->|           |<-- last-2 --->|           |<--- last ---->|
- *              |<-- last-3 --->|           |<-- last-1 --->|     |<- last-4
- *                      |<-- last-7 --->|           |<-- last-5 --->|
- *        |<-- last-8 --->|           |<-- last-6 --->|
- *                     . . . and so on
- *
- * The last byte of the name directly affects the low-order byte of
- * the hash.  The next-to-last affects bits 7-14, the next one back
- * affects bits 14-21, and so on.  The effect wraps around when it
- * goes beyond the top of the hash (as happens for byte last-4).
- *
- * Bits that are flipped together "overlap" on the hash value.  As
- * an example of overlap, the last two bytes both affect bit 7 in
- * the hash.  That pair of bytes (and their overlapping bits) can be
- * used for this "flip bit" operation (it's the first pair tried,
- * actually).
- *
- * A table defines overlapping pairs--the bytes involved and bits
- * within them--that can be used this way.  The byte offset is
- * relative to a starting point within the name, which will be set
- * to affect the bytes at the end of the name.  The function is
- * called with a "bitseq" value which indicates which bit flip is
- * desired, and this translates directly into selecting which entry
- * in the bit_to_flip[] table to apply.
- *
- * The function returns 1 if the operation was successful.  It
- * returns 0 if the result produced a character that's not valid in
- * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
- * sequence number is beyond what is supported for a name of this
- * length.
- *
- * Discussion
- * ----------
- * (Also see the discussion above find_alternate(), below.)
- *
- * In order to make this function work for any length name, the
- * table is ordered by increasing byte offset, so that the earliest
- * entries can apply to the shortest strings.  This way all names
- * are done consistently.
- *
- * When bit flips occur, they can convert printable characters
- * into non-printable ones.  In an effort to reduce the impact of
- * this, the first bit flips are chosen to affect bytes the end of
- * the name (and furthermore, toward the low bits of a byte).  Those
- * bytes are often non-printable anyway because of the way they are
- * initially selected by obfuscate_name()).  This is accomplished,
- * using later table entries first.
- *
- * Each row in the table doubles the number of alternates that
- * can be generated.  A two-byte name is limited to using only
- * the first row, so it's possible to generate two alternates
- * (the original name, plus the alternate produced by flipping
- * the one pair of bits).  In a 5-byte name, the effect of the
- * first byte overlaps the last by 4 its, and there are 8 bits
- * to flip, allowing for 256 possible alternates.
- *
- * Short names (less than 5 bytes) are never even obfuscated, so for
- * such names the relatively small number of alternates should never
- * really be a problem.
- *
- * Long names (more than 6 bytes, say) are not likely to exhaust
- * the number of available alternates.  In fact, the table could
- * probably have stopped at 8 entries, on the assumption that 256
- * alternates should be enough for most any situation.  The entries
- * beyond those are present mostly for demonstration of how it could
- * be populated with more entries, should it ever be necessary to do
- * so.
- */
-static int
-flip_bit(
-       size_t          name_len,
-       unsigned char   *name,
-       uint32_t        bitseq)
-{
-       int     index;
-       size_t  offset;
-       unsigned char *p0, *p1;
-       unsigned char m0, m1;
-       struct {
-           int         byte;   /* Offset from start within name */
-           unsigned char bit;  /* Bit within that byte */
-       } bit_to_flip[][2] = {  /* Sorted by second entry's byte */
-           { { 0, 0 }, { 1, 7 } },     /* Each row defines a pair */
-           { { 1, 0 }, { 2, 7 } },     /* of bytes and a bit within */
-           { { 2, 0 }, { 3, 7 } },     /* each byte.  Each bit in */
-           { { 0, 4 }, { 4, 0 } },     /* a pair affects the same */
-           { { 0, 5 }, { 4, 1 } },     /* bit in the hash, so flipping */
-           { { 0, 6 }, { 4, 2 } },     /* both will change the name */
-           { { 0, 7 }, { 4, 3 } },     /* while preserving the hash. */
-           { { 3, 0 }, { 4, 7 } },
-           { { 0, 0 }, { 5, 3 } },     /* The first entry's byte offset */
-           { { 0, 1 }, { 5, 4 } },     /* must be less than the second. */
-           { { 0, 2 }, { 5, 5 } },
-           { { 0, 3 }, { 5, 6 } },     /* The table can be extended to */
-           { { 0, 4 }, { 5, 7 } },     /* an arbitrary number of entries */
-           { { 4, 0 }, { 5, 7 } },     /* but there's not much point. */
-               /* . . . */
-       };
-
-       /* Find the first entry *not* usable for name of this length */
-
-       for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
-               if (bit_to_flip[index][1].byte >= name_len)
-                       break;
-
-       /*
-        * Back up to the last usable entry.  If that number is
-        * smaller than the bit sequence number, inform the caller
-        * that nothing this large (or larger) will work.
-        */
-       if (bitseq > --index)
-               return -1;
-
-       /*
-        * We will be switching bits at the end of name, with a
-        * preference for affecting the last bytes first.  Compute
-        * where in the name we'll start applying the changes.
-        */
-       offset = name_len - (bit_to_flip[index][1].byte + 1);
-       index -= bitseq;        /* Use later table entries first */
-
-       p0 = name + offset + bit_to_flip[index][0].byte;
-       p1 = name + offset + bit_to_flip[index][1].byte;
-       m0 = 1 << bit_to_flip[index][0].bit;
-       m1 = 1 << bit_to_flip[index][1].bit;
-
-       /* Only change the bytes if it produces valid characters */
-
-       if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
-               return 0;
-
-       *p0 ^= m0;
-       *p1 ^= m1;
-
-       return 1;
-}
-
-/*
- * This function generates a well-defined sequence of "alternate"
- * names for a given name.  An alternate is a name having the same
- * length and same hash value as the original name.  This is needed
- * because the algorithm produces only one obfuscated name to use
- * for a given original name, and it's possible that result matches
- * a name already seen.  This function checks for this, and if it
- * occurs, finds another suitable obfuscated name to use.
- *
- * Each bit in the binary representation of the sequence number is
- * used to select one possible "bit flip" operation to perform on
- * the name.  So for example:
- *    seq = 0: selects no bits to flip
- *    seq = 1: selects the 0th bit to flip
- *    seq = 2: selects the 1st bit to flip
- *    seq = 3: selects the 0th and 1st bit to flip
- *    ... and so on.
- *
- * The flip_bit() function takes care of the details of the bit
- * flipping within the name.  Note that the "1st bit" in this
- * context is a bit sequence number; i.e. it doesn't necessarily
- * mean bit 0x02 will be changed.
- *
- * If a valid name (one that contains no '/' or '\0' characters) is
- * produced by this process for the given sequence number, this
- * function returns 1.  If the result is not valid, it returns 0.
- * Returns -1 if the sequence number is beyond the the maximum for
- * names of the given length.
- *
- *
- * Discussion
- * ----------
- * The number of alternates available for a given name is dependent
- * on its length.  A "bit flip" involves inverting two bits in
- * a name--the two bits being selected such that their values
- * affect the name's hash value in the same way.  Alternates are
- * thus generated by inverting the value of pairs of such
- * "overlapping" bits in the original name.  Each byte after the
- * first in a name adds at least one bit of overlap to work with.
- * (See comments above flip_bit() for more discussion on this.)
- *
- * So the number of alternates is dependent on the number of such
- * overlapping bits in a name.  If there are N bit overlaps, there
- * 2^N alternates for that hash value.
- *
- * Here are the number of overlapping bits available for generating
- * alternates for names of specific lengths:
- *     1       0       (must have 2 bytes to have any overlap)
- *     2       1       One bit overlaps--so 2 possible alternates
- *     3       2       Two bits overlap--so 4 possible alternates
- *     4       4       Three bits overlap, so 2^3 alternates
- *     5       8       8 bits overlap (due to wrapping), 256 alternates
- *     6       18      2^18 alternates
- *     7       28      2^28 alternates
- *        ...
- * It's clear that the number of alternates grows very quickly with
- * the length of the name.  But note that the set of alternates
- * includes invalid names.  And for certain (contrived) names, the
- * number of valid names is a fairly small fraction of the total
- * number of alternates.
- *
- * The main driver for this infrastructure for coming up with
- * alternate names is really related to names 5 (or possibly 6)
- * bytes in length.  5-byte obfuscated names contain no randomly-
- * generated bytes in them, and the chance of an obfuscated name
- * matching an already-seen name is too high to just ignore.  This
- * methodical selection of alternates ensures we don't produce
- * duplicate names unless we have exhausted our options.
- */
-static int
-find_alternate(
-       size_t          name_len,
-       unsigned char   *name,
-       uint32_t        seq)
-{
-       uint32_t        bitseq = 0;
-       uint32_t        bits = seq;
-
-       if (!seq)
-               return 1;       /* alternate 0 is the original name */
-       if (name_len < 2)       /* Must have 2 bytes to flip */
-               return -1;
-
-       for (bitseq = 0; bits; bitseq++) {
-               uint32_t        mask = 1 << bitseq;
-               int             fb;
-
-               if (!(bits & mask))
-                       continue;
-
-               fb = flip_bit(name_len, name, bitseq);
-               if (fb < 1)
-                       return fb ? -1 : 0;
-               bits ^= mask;
-       }
-
-       return 1;
-}
-
  /*
   * Look up the given name in the name table.  If it is already
   * present, iterate through a well-defined sequence of alternate
diff --git a/db/obfuscate.c b/db/obfuscate.c

new file mode 100644 (file)

index 0000000..cd950b4
--- /dev/null
+++ b/db/obfuscate.c
@@ -0,0 +1,393 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#include "libxfs.h"
+#include "init.h"
+#include "obfuscate.h"
+
+static inline unsigned char
+random_filename_char(void)
+{
+       static unsigned char filename_alphabet[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+                                               "abcdefghijklmnopqrstuvwxyz"
+                                               "0123456789-_";
+
+       return filename_alphabet[random() % (sizeof filename_alphabet - 1)];
+}
+
+#define rol32(x,y)             (((x) << (y)) | ((x) >> (32 - (y))))
+
+/*
+ * Given a name and its hash value, massage the name in such a way
+ * that the result is another name of equal length which shares the
+ * same hash value.
+ */
+void
+obfuscate_name(
+       xfs_dahash_t    hash,
+       size_t          name_len,
+       unsigned char   *name,
+       bool            is_dirent)
+{
+       unsigned char   *oldname = NULL;
+       unsigned char   *newp;
+       int             i;
+       xfs_dahash_t    new_hash;
+       unsigned char   *first;
+       unsigned char   high_bit;
+       int             tries = 0;
+       bool            is_ci_name = is_dirent && xfs_has_asciici(mp);
+       int             shift;
+
+       /*
+        * Our obfuscation algorithm requires at least 5-character
+        * names, so don't bother if the name is too short.  We
+        * work backward from a hash value to determine the last
+        * five bytes in a name required to produce a new name
+        * with the same hash.
+        */
+       if (name_len < 5)
+               return;
+
+       if (is_ci_name) {
+               oldname = malloc(name_len);
+               if (!oldname)
+                       return;
+               memcpy(oldname, name, name_len);
+       }
+
+again:
+       newp = name;
+       new_hash = 0;
+
+       /*
+        * If we cannot generate a ci-compatible obfuscated name after 1000
+        * tries, don't bother obfuscating the name.
+        */
+       if (tries++ > 1000) {
+               memcpy(name, oldname, name_len);
+               goto out_free;
+       }
+
+       /*
+        * The beginning of the obfuscated name can be pretty much
+        * anything, so fill it in with random characters.
+        * Accumulate its new hash value as we go.
+        */
+       for (i = 0; i < name_len - 5; i++) {
+               *newp = random_filename_char();
+               if (is_ci_name)
+                       new_hash = xfs_ascii_ci_xfrm(*newp) ^
+                                                       rol32(new_hash, 7);
+               else
+                       new_hash = *newp ^ rol32(new_hash, 7);
+               newp++;
+       }
+
+       /*
+        * Compute which five bytes need to be used at the end of
+        * the name so the hash of the obfuscated name is the same
+        * as the hash of the original.  If any result in an invalid
+        * character, flip a bit and arrange for a corresponding bit
+        * in a neighboring byte to be flipped as well.  For the
+        * last byte, the "neighbor" to change is the first byte
+        * we're computing here.
+        */
+       new_hash = rol32(new_hash, 3) ^ hash;
+
+       first = newp;
+       high_bit = 0;
+       for (shift = 28; shift >= 0; shift -= 7) {
+               *newp = (new_hash >> shift & 0x7f) ^ high_bit;
+               if (is_invalid_char(*newp)) {
+                       *newp ^= 1;
+                       high_bit = 0x80;
+               } else
+                       high_bit = 0;
+
+               /*
+                * If ascii-ci is enabled, uppercase characters are converted
+                * to lowercase characters while computing the name hash.  If
+                * any of the necessary correction bytes are uppercase, the
+                * hash of the new name will not match.  Try again with a
+                * different prefix.
+                */
+               if (is_ci_name && xfs_ascii_ci_need_xfrm(*newp))
+                       goto again;
+
+               ASSERT(!is_invalid_char(*newp));
+               newp++;
+       }
+
+       /*
+        * If we flipped a bit on the last byte, we need to fix up
+        * the matching bit in the first byte.  The result will
+        * be a valid character, because we know that first byte
+        * has 0's in its upper four bits (it was produced by a
+        * 28-bit right-shift of a 32-bit unsigned value).
+        */
+       if (high_bit) {
+               *first ^= 0x10;
+
+               if (is_ci_name && xfs_ascii_ci_need_xfrm(*first))
+                       goto again;
+
+               ASSERT(!is_invalid_char(*first));
+       }
+out_free:
+       free(oldname);
+}
+
+/*
+ * Flip a bit in each of two bytes at the end of the given name.
+ * This is used in generating a series of alternate names to be used
+ * in the event a duplicate is found.
+ *
+ * The bits flipped are selected such that they both affect the same
+ * bit in the name's computed hash value, so flipping them both will
+ * preserve the hash.
+ *
+ * The following diagram aims to show the portion of a computed
+ * hash that a given byte of a name affects.
+ *
+ *        31    28      24    21            14           8 7       3     0
+ *        +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ * hash:   | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+ *        +-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-|-+-+-+-+-+-+-+-+
+ *       last-4 ->|           |<-- last-2 --->|           |<--- last ---->|
+ *              |<-- last-3 --->|           |<-- last-1 --->|     |<- last-4
+ *                      |<-- last-7 --->|           |<-- last-5 --->|
+ *        |<-- last-8 --->|           |<-- last-6 --->|
+ *                     . . . and so on
+ *
+ * The last byte of the name directly affects the low-order byte of
+ * the hash.  The next-to-last affects bits 7-14, the next one back
+ * affects bits 14-21, and so on.  The effect wraps around when it
+ * goes beyond the top of the hash (as happens for byte last-4).
+ *
+ * Bits that are flipped together "overlap" on the hash value.  As
+ * an example of overlap, the last two bytes both affect bit 7 in
+ * the hash.  That pair of bytes (and their overlapping bits) can be
+ * used for this "flip bit" operation (it's the first pair tried,
+ * actually).
+ *
+ * A table defines overlapping pairs--the bytes involved and bits
+ * within them--that can be used this way.  The byte offset is
+ * relative to a starting point within the name, which will be set
+ * to affect the bytes at the end of the name.  The function is
+ * called with a "bitseq" value which indicates which bit flip is
+ * desired, and this translates directly into selecting which entry
+ * in the bit_to_flip[] table to apply.
+ *
+ * The function returns 1 if the operation was successful.  It
+ * returns 0 if the result produced a character that's not valid in
+ * a name (either '/' or a '\0').  Finally, it returns -1 if the bit
+ * sequence number is beyond what is supported for a name of this
+ * length.
+ *
+ * Discussion
+ * ----------
+ * (Also see the discussion above find_alternate(), below.)
+ *
+ * In order to make this function work for any length name, the
+ * table is ordered by increasing byte offset, so that the earliest
+ * entries can apply to the shortest strings.  This way all names
+ * are done consistently.
+ *
+ * When bit flips occur, they can convert printable characters
+ * into non-printable ones.  In an effort to reduce the impact of
+ * this, the first bit flips are chosen to affect bytes the end of
+ * the name (and furthermore, toward the low bits of a byte).  Those
+ * bytes are often non-printable anyway because of the way they are
+ * initially selected by obfuscate_name()).  This is accomplished,
+ * using later table entries first.
+ *
+ * Each row in the table doubles the number of alternates that
+ * can be generated.  A two-byte name is limited to using only
+ * the first row, so it's possible to generate two alternates
+ * (the original name, plus the alternate produced by flipping
+ * the one pair of bits).  In a 5-byte name, the effect of the
+ * first byte overlaps the last by 4 its, and there are 8 bits
+ * to flip, allowing for 256 possible alternates.
+ *
+ * Short names (less than 5 bytes) are never even obfuscated, so for
+ * such names the relatively small number of alternates should never
+ * really be a problem.
+ *
+ * Long names (more than 6 bytes, say) are not likely to exhaust
+ * the number of available alternates.  In fact, the table could
+ * probably have stopped at 8 entries, on the assumption that 256
+ * alternates should be enough for most any situation.  The entries
+ * beyond those are present mostly for demonstration of how it could
+ * be populated with more entries, should it ever be necessary to do
+ * so.
+ */
+static int
+flip_bit(
+       size_t          name_len,
+       unsigned char   *name,
+       uint32_t        bitseq)
+{
+       int     index;
+       size_t  offset;
+       unsigned char *p0, *p1;
+       unsigned char m0, m1;
+       struct {
+           int         byte;   /* Offset from start within name */
+           unsigned char bit;  /* Bit within that byte */
+       } bit_to_flip[][2] = {  /* Sorted by second entry's byte */
+           { { 0, 0 }, { 1, 7 } },     /* Each row defines a pair */
+           { { 1, 0 }, { 2, 7 } },     /* of bytes and a bit within */
+           { { 2, 0 }, { 3, 7 } },     /* each byte.  Each bit in */
+           { { 0, 4 }, { 4, 0 } },     /* a pair affects the same */
+           { { 0, 5 }, { 4, 1 } },     /* bit in the hash, so flipping */
+           { { 0, 6 }, { 4, 2 } },     /* both will change the name */
+           { { 0, 7 }, { 4, 3 } },     /* while preserving the hash. */
+           { { 3, 0 }, { 4, 7 } },
+           { { 0, 0 }, { 5, 3 } },     /* The first entry's byte offset */
+           { { 0, 1 }, { 5, 4 } },     /* must be less than the second. */
+           { { 0, 2 }, { 5, 5 } },
+           { { 0, 3 }, { 5, 6 } },     /* The table can be extended to */
+           { { 0, 4 }, { 5, 7 } },     /* an arbitrary number of entries */
+           { { 4, 0 }, { 5, 7 } },     /* but there's not much point. */
+               /* . . . */
+       };
+
+       /* Find the first entry *not* usable for name of this length */
+
+       for (index = 0; index < ARRAY_SIZE(bit_to_flip); index++)
+               if (bit_to_flip[index][1].byte >= name_len)
+                       break;
+
+       /*
+        * Back up to the last usable entry.  If that number is
+        * smaller than the bit sequence number, inform the caller
+        * that nothing this large (or larger) will work.
+        */
+       if (bitseq > --index)
+               return -1;
+
+       /*
+        * We will be switching bits at the end of name, with a
+        * preference for affecting the last bytes first.  Compute
+        * where in the name we'll start applying the changes.
+        */
+       offset = name_len - (bit_to_flip[index][1].byte + 1);
+       index -= bitseq;        /* Use later table entries first */
+
+       p0 = name + offset + bit_to_flip[index][0].byte;
+       p1 = name + offset + bit_to_flip[index][1].byte;
+       m0 = 1 << bit_to_flip[index][0].bit;
+       m1 = 1 << bit_to_flip[index][1].bit;
+
+       /* Only change the bytes if it produces valid characters */
+
+       if (is_invalid_char(*p0 ^ m0) || is_invalid_char(*p1 ^ m1))
+               return 0;
+
+       *p0 ^= m0;
+       *p1 ^= m1;
+
+       return 1;
+}
+
+/*
+ * This function generates a well-defined sequence of "alternate"
+ * names for a given name.  An alternate is a name having the same
+ * length and same hash value as the original name.  This is needed
+ * because the algorithm produces only one obfuscated name to use
+ * for a given original name, and it's possible that result matches
+ * a name already seen.  This function checks for this, and if it
+ * occurs, finds another suitable obfuscated name to use.
+ *
+ * Each bit in the binary representation of the sequence number is
+ * used to select one possible "bit flip" operation to perform on
+ * the name.  So for example:
+ *    seq = 0: selects no bits to flip
+ *    seq = 1: selects the 0th bit to flip
+ *    seq = 2: selects the 1st bit to flip
+ *    seq = 3: selects the 0th and 1st bit to flip
+ *    ... and so on.
+ *
+ * The flip_bit() function takes care of the details of the bit
+ * flipping within the name.  Note that the "1st bit" in this
+ * context is a bit sequence number; i.e. it doesn't necessarily
+ * mean bit 0x02 will be changed.
+ *
+ * If a valid name (one that contains no '/' or '\0' characters) is
+ * produced by this process for the given sequence number, this
+ * function returns 1.  If the result is not valid, it returns 0.
+ * Returns -1 if the sequence number is beyond the the maximum for
+ * names of the given length.
+ *
+ *
+ * Discussion
+ * ----------
+ * The number of alternates available for a given name is dependent
+ * on its length.  A "bit flip" involves inverting two bits in
+ * a name--the two bits being selected such that their values
+ * affect the name's hash value in the same way.  Alternates are
+ * thus generated by inverting the value of pairs of such
+ * "overlapping" bits in the original name.  Each byte after the
+ * first in a name adds at least one bit of overlap to work with.
+ * (See comments above flip_bit() for more discussion on this.)
+ *
+ * So the number of alternates is dependent on the number of such
+ * overlapping bits in a name.  If there are N bit overlaps, there
+ * 2^N alternates for that hash value.
+ *
+ * Here are the number of overlapping bits available for generating
+ * alternates for names of specific lengths:
+ *     1       0       (must have 2 bytes to have any overlap)
+ *     2       1       One bit overlaps--so 2 possible alternates
+ *     3       2       Two bits overlap--so 4 possible alternates
+ *     4       4       Three bits overlap, so 2^3 alternates
+ *     5       8       8 bits overlap (due to wrapping), 256 alternates
+ *     6       18      2^18 alternates
+ *     7       28      2^28 alternates
+ *        ...
+ * It's clear that the number of alternates grows very quickly with
+ * the length of the name.  But note that the set of alternates
+ * includes invalid names.  And for certain (contrived) names, the
+ * number of valid names is a fairly small fraction of the total
+ * number of alternates.
+ *
+ * The main driver for this infrastructure for coming up with
+ * alternate names is really related to names 5 (or possibly 6)
+ * bytes in length.  5-byte obfuscated names contain no randomly-
+ * generated bytes in them, and the chance of an obfuscated name
+ * matching an already-seen name is too high to just ignore.  This
+ * methodical selection of alternates ensures we don't produce
+ * duplicate names unless we have exhausted our options.
+ */
+int
+find_alternate(
+       size_t          name_len,
+       unsigned char   *name,
+       uint32_t        seq)
+{
+       uint32_t        bitseq = 0;
+       uint32_t        bits = seq;
+
+       if (!seq)
+               return 1;       /* alternate 0 is the original name */
+       if (name_len < 2)       /* Must have 2 bytes to flip */
+               return -1;
+
+       for (bitseq = 0; bits; bitseq++) {
+               uint32_t        mask = 1 << bitseq;
+               int             fb;
+
+               if (!(bits & mask))
+                       continue;
+
+               fb = flip_bit(name_len, name, bitseq);
+               if (fb < 1)
+                       return fb ? -1 : 0;
+               bits ^= mask;
+       }
+
+       return 1;
+}
diff --git a/db/obfuscate.h b/db/obfuscate.h

new file mode 100644 (file)

index 0000000..afaaca3
--- /dev/null
+++ b/db/obfuscate.h
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2007, 2011 SGI
+ * All Rights Reserved.
+ */
+#ifndef __DB_OBFUSCATE_H__
+#define __DB_OBFUSCATE_H__
+
+/* Routines to obfuscate directory filenames and xattr names. */
+
+#define is_invalid_char(c)     ((c) == '/' || (c) == '\0')
+
+void obfuscate_name(xfs_dahash_t hash, size_t name_len, unsigned char *name,
+               bool is_dirent);
+int find_alternate(size_t name_len, unsigned char *name, uint32_t seq);
+
+#endif /* __DB_OBFUSCATE_H__ */
author	Darrick J. Wong <djwong@kernel.org>
	Thu, 15 Jun 2023 16:12:06 +0000 (09:12 -0700)
committer	Carlos Maiolino <cem@kernel.org>
	Thu, 22 Jun 2023 12:04:11 +0000 (14:04 +0200)
db/Makefile		patch \| blob \| blame \| history
db/metadump.c		patch \| blob \| blame \| history
db/obfuscate.c	[new file with mode: 0644]	patch \| blob
db/obfuscate.h	[new file with mode: 0644]	patch \| blob