hostname: improve the algorithm in hostname_pick_word()

author Michael Vogt <michael@amutable.com>

Wed, 17 Jun 2026 07:51:53 +0000 (09:51 +0200)

committer Michael Vogt <michael@amutable.com>

Sat, 20 Jun 2026 12:26:41 +0000 (14:26 +0200)
author Michael Vogt <michael@amutable.com>
Wed, 17 Jun 2026 07:51:53 +0000 (09:51 +0200)
committer Michael Vogt <michael@amutable.com>
Sat, 20 Jun 2026 12:26:41 +0000 (14:26 +0200)
diff --git a/hostname-wordlist/README b/hostname-wordlist/README

index 1cda8ab7cc2aa9165a45b5686f0a340f20ded3fd..0ef3a3fc5fd377286226cd0e585b69bcf9918994 100644 (file)
--- a/hostname-wordlist/README
+++ b/hostname-wordlist/README
@@ -24,13 +24,11 @@ the symlinks) while the actual word lists keep meaningful names.
  Files
  -----
  
-Each file is a plain list of words, one per line, with no comment or blank
-lines: a word is picked by hashing the machine ID to a byte offset into the
-file, so comment/blank lines (although skipped) would bias the selection and
-should be avoided. Each word must be a valid single hostname label (lowercase
-letters, digits, hyphens); invalid entries are skipped. The file is used as-is
-from the highest-priority directory that provides it (/etc wins over /run wins
-over /usr/lib); files are not merged across directories.
+Each file is a plain list of words, one per line. Blank lines and lines starting
+with "#" are treated as comments and skipped. Each word must be a valid single
+hostname label (lowercase letters, digits, hyphens); invalid entries are skipped.
+The file is used as-is from the highest-priority directory that provides it (/etc
+-> /run -> /usr/local/lib -> /usr/lib); files are not merged across directories.
  
  Search path (highest priority first):
  
@@ -49,15 +47,10 @@ list. Changing a word list may change the name a machine gets. If a referenced
  list is missing the name is treated as invalid and the built-in fallback
  hostname is used.
  
-Because a word is chosen by byte offset into the file (rather than loading and
-indexing the whole list), the words are not all equally likely: a word's chance
-tracks the length of the word that precedes it in the list (not its own length),
-so a word listed right after a long word is slightly more likely to be picked.
-The effect is small: about a 12% non-uniformity, i.e. the effective name space
-is ~88% of the nominal product for $-$-$. This is an accepted trade for not
-reading the whole list into memory. If exact uniformity is ever needed, pad
-every word to a fixed width (e.g. with trailing '#') and have the loader strip
-the padding.
+Words are picked uniformly without reading the whole list into memory: an offset
+is chosen by hashing and accepted only when it lands on the start of a line
+(otherwise another offset is tried), so a word's chance does not depend on its
+own length or that of its neighbours.
  
  Origin
  ------
diff --git a/src/shared/hostname-setup.c b/src/shared/hostname-setup.c

index aa04a1ceab34f7593d588f404f2714a82f781980..22225e203091cf5170182b22785ff2b229540cfa 100644 (file)
--- a/src/shared/hostname-setup.c
+++ b/src/shared/hostname-setup.c
@@ -321,12 +321,58 @@ static int hostname_open_wordlist(const char *file, FILE **ret) {
          return 0;
  }
  
+static bool normalize_and_validate_word(char *word) {
+        assert(word);
+
+        if (IN_SET(word[0], '\0', '#')) /* empty line or comment */
+                return false;
+
+        ascii_strlower(word);
+        return hostname_is_valid(word, /* flags= */ 0);
+}
+
+static int pick_word_linear_scan(FILE *f, off_t offset, char **ret) {
+        int r;
+
+        assert(f);
+        assert(ret);
+
+        if (fseeko(f, offset, SEEK_SET) < 0)
+                return -errno;
+
+        bool wrapped = false;
+        r = read_line(f, LONG_LINE_MAX, NULL); /* discard the partial line we landed in */
+        if (r < 0)
+                return r;
+        if (r == 0) {
+                wrapped = true;
+                rewind(f);
+        }
+
+        for (;;) {
+                _cleanup_free_ char *line = NULL;
+
+                r = read_stripped_line(f, LONG_LINE_MAX, &line);
+                if (r < 0)
+                        return r;
+                if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
+                        if (wrapped) /* already wrapped once, the file contains no usable word at all */
+                                return -ENOENT;
+                        wrapped = true;
+                        rewind(f);
+                        continue;
+                }
+                if (normalize_and_validate_word(line)) {
+                        *ret = TAKE_PTR(line);
+                        return 0;
+                }
+        }
+}
+
  static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
          static const sd_id128_t word_key = SD_ID128_MAKE(2d,9f,1c,7a,4b,8e,43,11,9a,6d,5f,02,c8,77,e3,14);
          _cleanup_fclose_ FILE *f = NULL;
          struct stat st;
-        bool wrapped = false;
-        uint64_t h;
          int r;
  
          assert(pos >= 1);
@@ -348,55 +394,52 @@ static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
          if (st.st_size == 0)
                  return -ENOENT;
  
-        /* Pick a word without reading the whole list into memory: hash the machine ID and word position to a
-         * byte offset. This stream is independent of the '?' nibble stream, so pure-'?' templates keep
-         * producing byte-identical output. Stable as long as the wordlist is stable. */
-        struct siphash state;
-        siphash24_init(&state, word_key.bytes);
-        siphash24_compress_typesafe(mid, &state);
-        siphash24_compress_typesafe(pos, &state);
-        h = siphash24_finalize(&state);
-
-        if (fseeko(f, (off_t) (h % (uint64_t) st.st_size), SEEK_SET) < 0)
-                return -errno;
-
-        /* We mostly landed mid-line, so read/discard the current line here. If the file was shrunk by a
-         * concurrent modification we might have seeked at/past EOF, so wrap around to the beginning. */
-        r = read_line(f, LONG_LINE_MAX, NULL);
-        if (r < 0)
-                return r;
-        if (r == 0) {
-                wrapped = true;
-                rewind(f);
-        }
-
-        for (;;) {
+        /* Pick a word without reading the whole list into memory:
+         * 1. pick a random offset in the file [0 … st.st_size-1]
+         * 2. if offset is zero, read a full line from the beginning of the file, use that.
+         * 3. otherwise, seek to offset minus 1 and read one character.
+         * 4. if that character is newline, then read a full line after it, and use that as result
+         * 5. otherwise, goto 1
+         *
+         * As a safety net, terminate after a fixed number of iterations (for pathological wordlists).
+         * This stream is independent of the '?' nibble stream, so pure-'?' templates keep producing
+         * byte-identical output. Stable as long as the wordlist is stable. */
+        off_t offset = 0;
+        const unsigned int MAX_ITERATIONS = 64;
+        for (unsigned i = 0; i < MAX_ITERATIONS; i++) {
                  _cleanup_free_ char *line = NULL;
  
+                struct siphash state;
+                siphash24_init(&state, word_key.bytes);
+                siphash24_compress_typesafe(mid, &state);
+                siphash24_compress_typesafe(pos, &state);
+                siphash24_compress_typesafe(i, &state); /* counter mode */
+                offset = (off_t) (siphash24_finalize(&state) % (uint64_t) st.st_size);
+
+                if (offset > 0) {
+                        if (fseeko(f, offset - 1, SEEK_SET) < 0)
+                                return -errno;
+                        if (fgetc(f) != '\n')
+                                continue; /* not a line start */
+                } else if (fseeko(f, 0, SEEK_SET) < 0) /* offset 0 always begins the first line */
+                        return -errno;
+
                  r = read_stripped_line(f, LONG_LINE_MAX, &line);
                  if (r < 0)
                          return r;
-                if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
-                        if (wrapped) /* already wrapped once, the file contains no usable word at all */
-                                return -ENOENT;
-                        wrapped = true;
-                        rewind(f);
+                if (r == 0) /* raced with truncation */
                          continue;
+                if (normalize_and_validate_word(line)) {
+                        *ret = TAKE_PTR(line);
+                        return 0;
                  }
-
-                /* Skip empty lines and comments */
-                if (IN_SET(line[0], '\0', '#'))
-                        continue;
-
-                /* Each word must be a valid single hostname label on its own; lowercase it and silently skip
-                 * bogus entries. */
-                ascii_strlower(line);
-                if (!hostname_is_valid(line, /* flags= */ 0))
-                        continue;
-
-                *ret = TAKE_PTR(line);
-                return 0;
+                /* Comment/empty/invalid line: resample rather than advancing, to keep the pick uniform. */
          }
+
+        /* We exhausted the uniform attempts. This should never happen, but if it does, fall back to picking
+         * the next word after our last attempt. */
+        log_warning("hostname_pick_word did not find a usable word after %u in wordlist %zu", MAX_ITERATIONS, pos);
+        return pick_word_linear_scan(f, offset, ret);
  }
  
  int hostname_substitute_wildcards(const char *name, char **ret) {
author	Michael Vogt <michael@amutable.com>
	Wed, 17 Jun 2026 07:51:53 +0000 (09:51 +0200)
committer	Michael Vogt <michael@amutable.com>
	Sat, 20 Jun 2026 12:26:41 +0000 (14:26 +0200)
hostname-wordlist/README		patch \| blob \| blame \| history
src/shared/hostname-setup.c		patch \| blob \| blame \| history