]>
Commit | Line | Data |
---|---|---|
48c7b524 UD |
1 | // Optimizations for random number functions, x86 version -*- C++ -*- |
2 | ||
a5544970 | 3 | // Copyright (C) 2012-2019 Free Software Foundation, Inc. |
48c7b524 UD |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the | |
7 | // terms of the GNU General Public License as published by the | |
8 | // Free Software Foundation; either version 3, or (at your option) | |
9 | // any later version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, | |
12 | // but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
14 | // GNU General Public License for more details. | |
15 | ||
16 | // Under Section 7 of GPL version 3, you are granted additional | |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
19 | ||
20 | // You should have received a copy of the GNU General Public License and | |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
24 | ||
25 | /** @file bits/opt_random.h | |
26 | * This is an internal header file, included by other library headers. | |
27 | * Do not attempt to use it directly. @headername{random} | |
28 | */ | |
29 | ||
30 | #ifndef _BITS_OPT_RANDOM_H | |
31 | #define _BITS_OPT_RANDOM_H 1 | |
32 | ||
141aa58b NF |
33 | #ifdef __SSE3__ |
34 | #include <pmmintrin.h> | |
35 | #endif | |
48c7b524 UD |
36 | |
37 | ||
38 | #pragma GCC system_header | |
39 | ||
40 | ||
41 | namespace std _GLIBCXX_VISIBILITY(default) | |
42 | { | |
43 | _GLIBCXX_BEGIN_NAMESPACE_VERSION | |
44 | ||
45 | #ifdef __SSE3__ | |
46 | template<> | |
47 | template<typename _UniformRandomNumberGenerator> | |
48 | void | |
49 | normal_distribution<double>:: | |
50 | __generate(typename normal_distribution<double>::result_type* __f, | |
51 | typename normal_distribution<double>::result_type* __t, | |
52 | _UniformRandomNumberGenerator& __urng, | |
53 | const param_type& __param) | |
54 | { | |
55 | typedef uint64_t __uctype; | |
56 | ||
57 | if (__f == __t) | |
58 | return; | |
59 | ||
60 | if (_M_saved_available) | |
61 | { | |
62 | _M_saved_available = false; | |
63 | *__f++ = _M_saved * __param.stddev() + __param.mean(); | |
64 | ||
65 | if (__f == __t) | |
66 | return; | |
67 | } | |
68 | ||
fb95b3df | 69 | constexpr uint64_t __maskval = 0xfffffffffffffull; |
48c7b524 UD |
70 | static const __m128i __mask = _mm_set1_epi64x(__maskval); |
71 | static const __m128i __two = _mm_set1_epi64x(0x4000000000000000ull); | |
72 | static const __m128d __three = _mm_set1_pd(3.0); | |
73 | const __m128d __av = _mm_set1_pd(__param.mean()); | |
74 | ||
75 | const __uctype __urngmin = __urng.min(); | |
76 | const __uctype __urngmax = __urng.max(); | |
77 | const __uctype __urngrange = __urngmax - __urngmin; | |
78 | const __uctype __uerngrange = __urngrange + 1; | |
79 | ||
80 | while (__f + 1 < __t) | |
81 | { | |
82 | double __le; | |
83 | __m128d __x; | |
84 | do | |
85 | { | |
86 | union | |
87 | { | |
88 | __m128i __i; | |
89 | __m128d __d; | |
90 | } __v; | |
91 | ||
92 | if (__urngrange > __maskval) | |
93 | { | |
94 | if (__detail::_Power_of_2(__uerngrange)) | |
95 | __v.__i = _mm_and_si128(_mm_set_epi64x(__urng(), | |
96 | __urng()), | |
97 | __mask); | |
98 | else | |
99 | { | |
100 | const __uctype __uerange = __maskval + 1; | |
101 | const __uctype __scaling = __urngrange / __uerange; | |
102 | const __uctype __past = __uerange * __scaling; | |
103 | uint64_t __v1; | |
104 | do | |
105 | __v1 = __uctype(__urng()) - __urngmin; | |
106 | while (__v1 >= __past); | |
107 | __v1 /= __scaling; | |
108 | uint64_t __v2; | |
109 | do | |
110 | __v2 = __uctype(__urng()) - __urngmin; | |
111 | while (__v2 >= __past); | |
112 | __v2 /= __scaling; | |
113 | ||
114 | __v.__i = _mm_set_epi64x(__v1, __v2); | |
115 | } | |
116 | } | |
117 | else if (__urngrange == __maskval) | |
118 | __v.__i = _mm_set_epi64x(__urng(), __urng()); | |
119 | else if ((__urngrange + 2) * __urngrange >= __maskval | |
120 | && __detail::_Power_of_2(__uerngrange)) | |
121 | { | |
122 | uint64_t __v1 = __urng() * __uerngrange + __urng(); | |
123 | uint64_t __v2 = __urng() * __uerngrange + __urng(); | |
124 | ||
125 | __v.__i = _mm_and_si128(_mm_set_epi64x(__v1, __v2), | |
126 | __mask); | |
127 | } | |
128 | else | |
129 | { | |
130 | size_t __nrng = 2; | |
131 | __uctype __high = __maskval / __uerngrange / __uerngrange; | |
132 | while (__high > __uerngrange) | |
133 | { | |
134 | ++__nrng; | |
135 | __high /= __uerngrange; | |
136 | } | |
137 | const __uctype __highrange = __high + 1; | |
138 | const __uctype __scaling = __urngrange / __highrange; | |
139 | const __uctype __past = __highrange * __scaling; | |
140 | __uctype __tmp; | |
141 | ||
142 | uint64_t __v1; | |
143 | do | |
144 | { | |
145 | do | |
146 | __tmp = __uctype(__urng()) - __urngmin; | |
147 | while (__tmp >= __past); | |
148 | __v1 = __tmp / __scaling; | |
149 | for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) | |
150 | { | |
151 | __tmp = __v1; | |
152 | __v1 *= __uerngrange; | |
153 | __v1 += __uctype(__urng()) - __urngmin; | |
154 | } | |
155 | } | |
156 | while (__v1 > __maskval || __v1 < __tmp); | |
157 | ||
158 | uint64_t __v2; | |
159 | do | |
160 | { | |
161 | do | |
162 | __tmp = __uctype(__urng()) - __urngmin; | |
163 | while (__tmp >= __past); | |
164 | __v2 = __tmp / __scaling; | |
165 | for (size_t __cnt = 0; __cnt < __nrng; ++__cnt) | |
166 | { | |
167 | __tmp = __v2; | |
168 | __v2 *= __uerngrange; | |
169 | __v2 += __uctype(__urng()) - __urngmin; | |
170 | } | |
171 | } | |
172 | while (__v2 > __maskval || __v2 < __tmp); | |
f92ab29f | 173 | |
48c7b524 UD |
174 | __v.__i = _mm_set_epi64x(__v1, __v2); |
175 | } | |
176 | ||
177 | __v.__i = _mm_or_si128(__v.__i, __two); | |
178 | __x = _mm_sub_pd(__v.__d, __three); | |
179 | __m128d __m = _mm_mul_pd(__x, __x); | |
180 | __le = _mm_cvtsd_f64(_mm_hadd_pd (__m, __m)); | |
181 | } | |
182 | while (__le == 0.0 || __le >= 1.0); | |
183 | ||
184 | double __mult = (std::sqrt(-2.0 * std::log(__le) / __le) | |
185 | * __param.stddev()); | |
186 | ||
187 | __x = _mm_add_pd(_mm_mul_pd(__x, _mm_set1_pd(__mult)), __av); | |
188 | ||
189 | _mm_storeu_pd(__f, __x); | |
190 | __f += 2; | |
191 | } | |
192 | ||
193 | if (__f != __t) | |
194 | { | |
195 | result_type __x, __y, __r2; | |
196 | ||
197 | __detail::_Adaptor<_UniformRandomNumberGenerator, result_type> | |
198 | __aurng(__urng); | |
199 | ||
200 | do | |
201 | { | |
202 | __x = result_type(2.0) * __aurng() - 1.0; | |
203 | __y = result_type(2.0) * __aurng() - 1.0; | |
204 | __r2 = __x * __x + __y * __y; | |
205 | } | |
206 | while (__r2 > 1.0 || __r2 == 0.0); | |
207 | ||
208 | const result_type __mult = std::sqrt(-2 * std::log(__r2) / __r2); | |
209 | _M_saved = __x * __mult; | |
210 | _M_saved_available = true; | |
211 | *__f = __y * __mult * __param.stddev() + __param.mean(); | |
212 | } | |
213 | } | |
214 | #endif | |
215 | ||
216 | ||
217 | _GLIBCXX_END_NAMESPACE_VERSION | |
218 | } // namespace | |
219 | ||
220 | ||
221 | #endif // _BITS_OPT_RANDOM_H |