Files
-----
-Each file is a plain list of words, one per line, with no comment or blank
-lines: a word is picked by hashing the machine ID to a byte offset into the
-file, so comment/blank lines (although skipped) would bias the selection and
-should be avoided. Each word must be a valid single hostname label (lowercase
-letters, digits, hyphens); invalid entries are skipped. The file is used as-is
-from the highest-priority directory that provides it (/etc wins over /run wins
-over /usr/lib); files are not merged across directories.
+Each file is a plain list of words, one per line. Blank lines and comment lines
+(those starting with "#") are skipped. Each word must be a valid single hostname
+label (lowercase letters, digits, hyphens); invalid entries are skipped. The file
+is used as-is from the highest-priority directory that provides it (/etc, then
+/run, /usr/local/lib, /usr/lib); files are not merged across directories.
Search path (highest priority first):
list is missing the name is treated as invalid and the built-in fallback
hostname is used.
-Because a word is chosen by byte offset into the file (rather than loading and
-indexing the whole list), the words are not all equally likely: a word's chance
-tracks the length of the word that precedes it in the list (not its own length),
-so a word listed right after a long word is slightly more likely to be picked.
-The effect is small: about a 12% non-uniformity, i.e. the effective name space
-is ~88% of the nominal product for $-$-$. This is an accepted trade for not
-reading the whole list into memory. If exact uniformity is ever needed, pad
-every word to a fixed width (e.g. with trailing '#') and have the loader strip
-the padding.
+Words are picked uniformly without reading the whole list into memory: an offset
+is chosen by hashing and accepted only when it lands on the start of a line
+(otherwise another offset is tried), so a word's chance does not depend on its
+own length or that of its neighbours. Should a bounded number of attempts all
+fail, the loader falls back to scanning forward from the last offset tried.
Origin
------
return 0;
}
+/* Decides whether one (already whitespace-stripped) wordlist line is usable as a word.
+ *
+ * Empty lines and lines whose first character is '#' are rejected without being modified. Anything
+ * else is lowercased in place and then checked with hostname_is_valid() using no flags.
+ *
+ * Returns true if the (now lowercased) word passed validation, false otherwise.
+ *
+ * NOTE(review): the wordlist documentation promises a "single hostname label"; confirm that
+ * hostname_is_valid() with flags=0 actually rejects multi-label (dotted) names. */
+static bool normalize_and_validate_word(char *word) {
+        assert(word);
+
+        /* Nothing to validate on a blank line or a comment. */
+        if (word[0] == '\0' || word[0] == '#')
+                return false;
+
+        /* Fold case first, so that validation sees the final form of the word. */
+        ascii_strlower(word);
+
+        return hostname_is_valid(word, /* flags= */ 0);
+}
+
+/* Sequential fallback picker: seeks to 'offset', drops the line that offset falls into, and then
+ * returns the first line accepted by normalize_and_validate_word(), wrapping around to the beginning
+ * of the file at most once.
+ *
+ * On success returns 0 and stores the newly allocated word in *ret (the caller owns it). Returns
+ * -ENOENT if EOF is reached a second time without finding a usable word, -errno if seeking fails,
+ * and otherwise passes through the negative error returned by the line readers. */
+static int pick_word_linear_scan(FILE *f, off_t offset, char **ret) {
+        bool wrapped;
+        int r;
+
+        assert(f);
+        assert(ret);
+
+        if (fseeko(f, offset, SEEK_SET) < 0)
+                return -errno;
+
+        /* The offset is arbitrary, so whatever line it points into is thrown away. Getting nothing
+         * back means we were already at EOF, which counts as our one permitted wrap-around. */
+        r = read_line(f, LONG_LINE_MAX, NULL);
+        if (r < 0)
+                return r;
+
+        wrapped = r == 0;
+        if (wrapped)
+                rewind(f);
+
+        for (;;) {
+                _cleanup_free_ char *candidate = NULL;
+
+                r = read_stripped_line(f, LONG_LINE_MAX, &candidate);
+                if (r < 0)
+                        return r;
+
+                if (r > 0) {
+                        /* Got a line: hand it out if it is a usable word, otherwise try the next one. */
+                        if (!normalize_and_validate_word(candidate))
+                                continue;
+
+                        *ret = TAKE_PTR(candidate);
+                        return 0;
+                }
+
+                /* EOF. A second EOF means the whole file has been searched without a usable word. */
+                if (wrapped)
+                        return -ENOENT;
+
+                wrapped = true;
+                rewind(f);
+        }
+}
+
static int hostname_pick_word(sd_id128_t mid, size_t pos, char **ret) {
static const sd_id128_t word_key = SD_ID128_MAKE(2d,9f,1c,7a,4b,8e,43,11,9a,6d,5f,02,c8,77,e3,14);
_cleanup_fclose_ FILE *f = NULL;
struct stat st;
- bool wrapped = false;
- uint64_t h;
int r;
assert(pos >= 1);
if (st.st_size == 0)
return -ENOENT;
- /* Pick a word without reading the whole list into memory: hash the machine ID and word position to a
- * byte offset. This stream is independent of the '?' nibble stream, so pure-'?' templates keep
- * producing byte-identical output. Stable as long as the wordlist is stable. */
- struct siphash state;
- siphash24_init(&state, word_key.bytes);
- siphash24_compress_typesafe(mid, &state);
- siphash24_compress_typesafe(pos, &state);
- h = siphash24_finalize(&state);
-
- if (fseeko(f, (off_t) (h % (uint64_t) st.st_size), SEEK_SET) < 0)
- return -errno;
-
- /* We mostly landed mid-line, so read/discard the current line here. If the file was shrunk by a
- * concurrent modification we might have seeked at/past EOF, so wrap around to the beginning. */
- r = read_line(f, LONG_LINE_MAX, NULL);
- if (r < 0)
- return r;
- if (r == 0) {
- wrapped = true;
- rewind(f);
- }
-
- for (;;) {
+ /* Pick a word without reading the whole list into memory:
+ * 1. pick a random offset in the file [0 … st.st_size-1]
+ * 2. if offset is zero, read a full line from the beginning of the file, use that.
+ * 3. otherwise, seek to offset minus 1 and read one character.
+ * 4. if that character is newline, then read a full line after it, and use that as result
+ * 5. otherwise, goto 1
+ *
+ * As a safety net, terminate after a fixed number of iterations (for pathological wordlists).
+ * This stream is independent of the '?' nibble stream, so pure-'?' templates keep producing
+ * byte-identical output. Stable as long as the wordlist is stable. */
+ off_t offset = 0;
+ const unsigned int MAX_ITERATIONS = 64;
+ for (unsigned i = 0; i < MAX_ITERATIONS; i++) {
_cleanup_free_ char *line = NULL;
+ struct siphash state;
+ siphash24_init(&state, word_key.bytes);
+ siphash24_compress_typesafe(mid, &state);
+ siphash24_compress_typesafe(pos, &state);
+ siphash24_compress_typesafe(i, &state); /* counter mode */
+ offset = (off_t) (siphash24_finalize(&state) % (uint64_t) st.st_size);
+
+ if (offset > 0) {
+ if (fseeko(f, offset - 1, SEEK_SET) < 0)
+ return -errno;
+ if (fgetc(f) != '\n')
+ continue; /* not a line start */
+ } else if (fseeko(f, 0, SEEK_SET) < 0) /* offset 0 always begins the first line */
+ return -errno;
+
r = read_stripped_line(f, LONG_LINE_MAX, &line);
if (r < 0)
return r;
- if (r == 0) { /* hit EOF: we started at a random offset, wrap around to the beginning */
- if (wrapped) /* already wrapped once, the file contains no usable word at all */
- return -ENOENT;
- wrapped = true;
- rewind(f);
+ if (r == 0) /* raced with truncation */
continue;
+ if (normalize_and_validate_word(line)) {
+ *ret = TAKE_PTR(line);
+ return 0;
}
-
- /* Skip empty lines and comments */
- if (IN_SET(line[0], '\0', '#'))
- continue;
-
- /* Each word must be a valid single hostname label on its own; lowercase it and silently skip
- * bogus entries. */
- ascii_strlower(line);
- if (!hostname_is_valid(line, /* flags= */ 0))
- continue;
-
- *ret = TAKE_PTR(line);
- return 0;
+ /* Comment/empty/invalid line: resample rather than advancing, to keep the pick uniform. */
}
+
+ /* We exhausted the uniform attempts. This should never happen, but if it does, fall back to picking
+ * the next word after our last attempt. */
+ log_warning("hostname_pick_word did not find a usable word after %u in wordlist %zu", MAX_ITERATIONS, pos);
+ return pick_word_linear_scan(f, offset, ret);
}
int hostname_substitute_wildcards(const char *name, char **ret) {