From: Lennart Poettering Date: Mon, 21 Aug 2023 11:38:03 +0000 (+0200) Subject: string-util: add a function to determine levenshtein distance of two strings X-Git-Tag: v255-rc1~677^2~1 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=7ef5b0a4d83c4075abfffd80eed9c47bc9e073ff;p=thirdparty%2Fsystemd.git string-util: add a function to determine levenshtein distance of two strings --- diff --git a/src/basic/string-util.c b/src/basic/string-util.c index 7cddec712b0..854cf963acb 100644 --- a/src/basic/string-util.c +++ b/src/basic/string-util.c @@ -1446,3 +1446,67 @@ bool version_is_valid_versionspec(const char *s) { return true; } + +ssize_t strlevenshtein(const char *x, const char *y) { + _cleanup_free_ size_t *t0 = NULL, *t1 = NULL, *t2 = NULL; + size_t xl, yl; + + /* This is inspired from the Linux kernel's Levenshtein implementation */ + + if (streq_ptr(x, y)) + return 0; + + xl = strlen_ptr(x); + if (xl > SSIZE_MAX) + return -E2BIG; + + yl = strlen_ptr(y); + if (yl > SSIZE_MAX) + return -E2BIG; + + if (isempty(x)) + return yl; + if (isempty(y)) + return xl; + + t0 = new0(size_t, yl + 1); + if (!t0) + return -ENOMEM; + t1 = new0(size_t, yl + 1); + if (!t1) + return -ENOMEM; + t2 = new0(size_t, yl + 1); + if (!t2) + return -ENOMEM; + + for (size_t i = 0; i <= yl; i++) + t1[i] = i; + + for (size_t i = 0; i < xl; i++) { + t2[0] = i + 1; + + for (size_t j = 0; j < yl; j++) { + /* Substitution */ + t2[j+1] = t1[j] + (x[i] != y[j]); + + /* Swap */ + if (i > 0 && j > 0 && x[i-1] == y[j] && x[i] == y[j-1] && t2[j+1] > t0[j-1] + 1) + t2[j+1] = t0[j-1] + 1; + + /* Deletion */ + if (t2[j+1] > t1[j+1] + 1) + t2[j+1] = t1[j+1] + 1; + + /* Insertion */ + if (t2[j+1] > t2[j] + 1) + t2[j+1] = t2[j] + 1; + } + + size_t *dummy = t0; + t0 = t1; + t1 = t2; + t2 = dummy; + } + + return t1[yl]; +} diff --git a/src/basic/string-util.h b/src/basic/string-util.h index f473946864e..73d586d4c54 100644 --- a/src/basic/string-util.h +++ b/src/basic/string-util.h @@ -284,3 +284,5 @@ char *startswith_strv(const char *string, char **strv); bool version_is_valid(const char *s); bool version_is_valid_versionspec(const char *s); + +ssize_t strlevenshtein(const char *x, const char *y); diff --git a/src/test/test-string-util.c b/src/test/test-string-util.c index 6ec70054e78..b5f0008d76b 100644 --- a/src/test/test-string-util.c +++ b/src/test/test-string-util.c @@ -1292,4 +1292,28 @@ TEST(strextendn) { x = mfree(x); } +TEST(strlevenshtein) { + assert_se(strlevenshtein(NULL, NULL) == 0); + assert_se(strlevenshtein("", "") == 0); + assert_se(strlevenshtein("", NULL) == 0); + assert_se(strlevenshtein(NULL, "") == 0); + + assert_se(strlevenshtein("a", "a") == 0); + assert_se(strlevenshtein("a", "b") == 1); + assert_se(strlevenshtein("b", "a") == 1); + assert_se(strlevenshtein("a", "") == 1); + assert_se(strlevenshtein("", "a") == 1); + + assert_se(strlevenshtein("xxx", "xxx") == 0); + assert_se(strlevenshtein("xxx", "yyy") == 3); + assert_se(strlevenshtein("yyy", "xxx") == 3); + assert_se(strlevenshtein("xx", "xxx") == 1); + assert_se(strlevenshtein("xxx", "xx") == 1); + assert_se(strlevenshtein("x", "xxx") == 2); + assert_se(strlevenshtein("xxx", "x") == 2); + + assert_se(strlevenshtein("sitting", "kitten") == 3); + assert_se(strlevenshtein("sunday", "saturday") == 3); +} + DEFINE_TEST_MAIN(LOG_DEBUG);