From: Michael Tremer
Date: Sat, 4 Jan 2025 15:21:03 +0000 (+0000)
Subject: strings: Implement a fast search that even works over large buffers
X-Git-Tag: 0.9.30~539
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=33d627d1d7244f4bef9f8b1210c11120538fe24d;p=pakfire.git

strings: Implement a fast search that even works over large buffers

Signed-off-by: Michael Tremer
---

diff --git a/src/pakfire/string.c b/src/pakfire/string.c
index e409bba3c..7e59d09af 100644
--- a/src/pakfire/string.c
+++ b/src/pakfire/string.c
@@ -309,6 +309,82 @@ char* pakfire_string_join(const char** list, const char* delim) {
 	return string;
 }
 
+/*
+	Finds the first occurrence of needle in haystack using a Rabin-Karp
+	rolling hash.
+
+	A negative length means that the corresponding string is NUL-terminated
+	and its length is determined with strlen().
+
+	Returns the offset of the first match, -1 if needle does not occur in
+	haystack, or -EINVAL if a pointer is NULL or a length is zero.
+*/
+int pakfire_string_search(const char* haystack, ssize_t lhaystack,
+		const char* needle, ssize_t lneedle) {
+	// Check inputs
+	if (!haystack || !needle || !lhaystack || !lneedle)
+		return -EINVAL;
+
+	// Fill lengths
+	if (lhaystack < 0)
+		lhaystack = strlen(haystack);
+
+	if (lneedle < 0)
+		lneedle = strlen(needle);
+
+	// The needle cannot match if it is longer than the haystack. Bail out
+	// before hashing the first window, which would read past the haystack.
+	if (lneedle > lhaystack)
+		return -1;
+
+	// Hash the bytes as unsigned values so that the hashes never become
+	// negative and do not depend on the signedness of char
+	const unsigned char* text = (const unsigned char*)haystack;
+	const unsigned char* pattern = (const unsigned char*)needle;
+
+	// Alphabet size
+	const unsigned int d = 256;
+
+	// A prime number
+	const unsigned int q = 101;
+
+	// Hash constant (d^(lneedle - 1) mod q)
+	unsigned int c = 1;
+
+	// Hash value for the haystack
+	unsigned int h = 0;
+
+	// Hash value for the needle
+	unsigned int n = 0;
+
+	// Compute the hash constant
+	for (ssize_t i = 0; i < lneedle - 1; i++)
+		c = (c * d) % q;
+
+	// Calculate the hash value of the needle and the first window
+	for (ssize_t i = 0; i < lneedle; i++) {
+		h = (d * h + text[i]) % q;
+		n = (d * n + pattern[i]) % q;
+	}
+
+	// Slide the needle over the haystack...
+	for (ssize_t i = 0; i <= lhaystack - lneedle; i++) {
+		// Check the hash values of the current window.
+		// If they match, check if we actually found a match
+		if (h == n && memcmp(text + i, pattern, lneedle) == 0)
+			return i;
+
+		// Calculate the hash value for the next window
+		if (i < lhaystack - lneedle) {
+			// Remove the leading byte and append the next one. Adding q
+			// before subtracting keeps the intermediate value positive.
+			h = (d * (h + q - (text[i] * c) % q) + text[i + lneedle]) % q;
+		}
+	}
+
+	return -1;
+}
+
 size_t pakfire_strings_length(char** array) {
 	size_t length = 0;
 
diff --git a/src/pakfire/string.h b/src/pakfire/string.h
index f9f0c2d81..42d72f00a 100644
--- a/src/pakfire/string.h
+++ b/src/pakfire/string.h
@@ -82,6 +82,9 @@ static inline int pakfire_string_contains_whitespace(const char* s) {
 	return 0;
 }
 
+int pakfire_string_search(const char* haystack, ssize_t lhaystack,
+	const char* needle, ssize_t lneedle);
+
 /*
 	String Arrays
 */
diff --git a/tests/libpakfire/string.c b/tests/libpakfire/string.c
index 2668336a0..75de6882d 100644
--- a/tests/libpakfire/string.c
+++ b/tests/libpakfire/string.c
@@ -347,6 +347,33 @@ FAIL:
 	return r;
 }
 
+static int test_string_search(const struct test* t) {
+	int r = EXIT_FAILURE;
+
+	// Check if we find the pattern in the right place
+	ASSERT_EQUALS(pakfire_string_search("ABCDEFGHIJKLMNOPQ", -1, "ABC", -1), 0);
+	ASSERT_EQUALS(pakfire_string_search("ABCDEFGHIJKLMNOPQ", -1, "DEF", -1), 3);
+	ASSERT_EQUALS(pakfire_string_search("ABCDEFGHIJKLMNOPQ", -1, "OPQ", -1), 14);
+
+	// Check that we don't find a false positive
+	ASSERT_EQUALS(pakfire_string_search("ABCDEFGHIJKLMNOPQ", -1, "XYZ", -1), -1);
+
+	// Check that a needle longer than the haystack is not found
+	ASSERT_EQUALS(pakfire_string_search("AB", -1, "ABCDEF", -1), -1);
+
+	// Check that bytes outside of the ASCII range are found
+	ASSERT_EQUALS(pakfire_string_search("a\xffz", -1, "\xffz", -1), 1);
+
+	// Check invalid inputs
+	ASSERT_ERROR(pakfire_string_search(NULL, -1, NULL, -1), EINVAL);
+
+	// Everything passed
+	r = EXIT_SUCCESS;
+
+FAIL:
+	return r;
+}
+
 int main(int argc, const char* argv[]) {
 	testsuite_add_test(test_string_set, 0);
 	testsuite_add_test(test_string_startswith, 0);
@@ -359,6 +386,7 @@ int main(int argc, const char* argv[]) {
 	testsuite_add_test(test_parse_bytes, 0);
 	testsuite_add_test(test_intervals, 0);
 	testsuite_add_test(test_string_contains_whitespace, 0);
+	testsuite_add_test(test_string_search, 0);
 
 	return testsuite_run(argc, argv);
 }