]>
Commit | Line | Data |
---|---|---|
ca9431f7 HPN |
1 | // Copyright (C) 2010-2013 Free Software Foundation, Inc. |
2 | // | |
3 | // This file is part of the GNU ISO C++ Library. This library is free | |
4 | // software; you can redistribute it and/or modify it under the | |
5 | // terms of the GNU General Public License as published by the | |
6 | // Free Software Foundation; either version 3, or (at your option) | |
7 | // any later version. | |
8 | // | |
9 | // This library is distributed in the hope that it will be useful, | |
10 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
11 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
12 | // GNU General Public License for more details. | |
13 | // | |
14 | // You should have received a copy of the GNU General Public License | |
15 | // along with this library; see the file COPYING3. If not see | |
16 | // <http://www.gnu.org/licenses/>. | |
17 | ||
18 | // This file uses the chi^2 test to measure the quality of a hash | |
19 | // function, by computing the uniformity with which it distributes a set | |
20 | // of N strings into k buckets (where k is significantly greater than N). | |
21 | // | |
22 | // Each bucket has B[i] strings in it. The expected value of each bucket | |
23 | // for a uniform distribution is z = N/k, so | |
24 | // chi^2 = Sum_i (B[i] - z)^2 / z. | |
25 | // | |
26 | // We check whether chi^2 is small enough to be consistent with the | |
27 | // hypothesis of a uniform distribution. If F(chi^2, k-1) is close to | |
28 | // 0 (where F is the cumulative probability distribution), we can | |
29 | // reject that hypothesis. So we don't want F to be too small, which | |
30 | // for large k, means we want chi^2 to be not too much larger than k. | |
31 | // | |
32 | // We use the chi^2 test for several sets of strings. Any non-horrible | |
33 | // hash function should do well with purely random strings. A really | |
34 | // good hash function will also do well with more structured sets, | |
35 | // including ones where the strings differ by only a few bits. | |
36 | ||
37 | #include <algorithm> | |
38 | #include <cstdlib> | |
39 | #include <cstdio> | |
40 | #include <fstream> | |
41 | #include <functional> | |
42 | #include <iostream> | |
43 | #include <iterator> | |
44 | #include <string> | |
45 | #include <unordered_set> | |
46 | #include <vector> | |
47 | #include <testsuite_hooks.h> | |
48 | ||
49 | #ifndef SAMPLES | |
50 | #define SAMPLES 300000 | |
51 | #endif | |
52 | ||
53 | template <typename Container> | |
54 | double | |
55 | chi2_hash(const Container& c, long buckets) | |
56 | { | |
57 | std::vector<int> counts(buckets); | |
58 | std::hash<std::string> hasher; | |
59 | double elements = 0; | |
60 | for (auto i = c.begin(); i != c.end(); ++i) | |
61 | { | |
62 | ++counts[hasher(*i) % buckets]; | |
63 | ++elements; | |
64 | } | |
65 | ||
66 | const double z = elements / buckets; | |
67 | double sum = 0; | |
68 | for (long i = 0; i < buckets; ++i) | |
69 | { | |
70 | double delta = counts[i] - z; | |
71 | sum += delta*delta; | |
72 | } | |
73 | return sum/z; | |
74 | } |