From 9294e8822f6f359ea9dba6f19b97fb75a72c433e Mon Sep 17 00:00:00 2001
From: Willy Tarreau <w@1wt.eu>
Date: Mon, 15 Mar 2021 09:34:27 +0100
Subject: [PATCH] MINOR: tools: improve word fingerprinting by counting
 presence

The distance between two words can be high due to a sub-word being missing
and in this case it happens that other totally unrealted words are proposed
because their average score looks lower thanks to being shorter. Here we're
introducing the notion of presence of each character so that word sequences
that contain existing sub-words are favored against the shorter ones having
nothing in common. In addition we do not distinguish being/end from a
regular delimitor anymore. That made it harder to spot inverted words.
---
 include/haproxy/tools.h | 24 +++++++++++++-----------
 src/tools.c             | 12 +++++++-----
 2 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/include/haproxy/tools.h b/include/haproxy/tools.h
index 901dca0bb9..3121fea51a 100644
--- a/include/haproxy/tools.h
+++ b/include/haproxy/tools.h
@@ -1077,28 +1077,30 @@ static inline unsigned int statistical_prng()
  * is zero, it's assumed that <curr> is the first character. If <curr> is zero
  * its assumed to mark the end. Both may be zero. <fp> is a 1024-entries array
  * indexed as 32*from+to. Positions for 'from' and 'to' are:
- *   0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
+ *   1..26=letter, 27=digit, 28=other/begin/end.
+ * Row "from=0" is used to mark the character's presence. Others unused.
  */
 static inline void update_char_fingerprint(uint8_t *fp, char prev, char curr)
 {
 	int from, to;
 
 	switch (prev) {
-	case 0:         from = 26; break; // begin
-	case 'a'...'z': from = prev - 'a'; break;
-	case 'A'...'Z': from = tolower(prev) - 'a'; break;
-	case '0'...'9': from = 26; break;
-	default:        from = 27; break;
+	case 0:         from = 28; break; // begin
+	case 'a'...'z': from = prev - 'a' + 1; break;
+	case 'A'...'Z': from = tolower(prev) - 'a' + 1; break;
+	case '0'...'9': from = 27; break;
+	default:        from = 28; break;
 	}
 
 	switch (curr) {
 	case 0:         to = 28; break; // end
-	case 'a'...'z': to = curr - 'a'; break;
-	case 'A'...'Z': to = tolower(curr) - 'a'; break;
-	case '0'...'9': to = 26; break;
-	default:        to = 27; break;
+	case 'a'...'z': to = curr - 'a' + 1; break;
+	case 'A'...'Z': to = tolower(curr) - 'a' + 1; break;
+	case '0'...'9': to = 27; break;
+	default:        to = 28; break;
 	}
-
+	if (curr)
+		fp[to] = 1;
 	fp[32 * from + to]++;
 }
 
diff --git a/src/tools.c b/src/tools.c
index 1255e748b8..ffd167a24f 100644
--- a/src/tools.c
+++ b/src/tools.c
@@ -5372,7 +5372,8 @@ size_t sanitize_for_printing(char *line, size_t pos, size_t width)
 /* Update array <fp> with the fingerprint of word <word> by counting the
  * transitions between characters. <fp> is a 1024-entries array indexed as
  * 32*from+to. Positions for 'from' and 'to' are:
- *   0..25=letter, 26=digit, 27=other, 28=begin, 29=end, others unused.
+ *   1..26=letter, 27=digit, 28=other/begin/end.
+ * Row "from=0" is used to mark the character's presence. Others unused.
  */
 void update_word_fingerprint(uint8_t *fp, const char *word)
 {
@@ -5384,11 +5385,12 @@ void update_word_fingerprint(uint8_t *fp, const char *word)
 	for (p = word; *p; p++) {
 		c = tolower(*p);
 		switch(c) {
-		case 'a'...'z': to = c - 'a'; break;
-		case 'A'...'Z': to = tolower(c) - 'a'; break;
-		case '0'...'9': to = 26; break;
-		default: to = 27; break;
+		case 'a'...'z': to = c - 'a' + 1; break;
+		case 'A'...'Z': to = tolower(c) - 'a' + 1; break;
+		case '0'...'9': to = 27; break;
+		default:        to = 28; break;
 		}
+		fp[to] = 1;
 		fp[32 * from + to]++;
 		from = to;
 	}
-- 
2.39.5