]> git.ipfire.org Git - thirdparty/gcc.git/blob - libstdc++-v3/include/parallel/random_shuffle.h
Licensing changes to GPLv3 resp. GPLv3 with GCC Runtime Exception.
[thirdparty/gcc.git] / libstdc++-v3 / include / parallel / random_shuffle.h
1 // -*- C++ -*-
2
3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 3, or (at your option) any later
9 // version.
10
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
15
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24
25 /** @file parallel/random_shuffle.h
26 * @brief Parallel implementation of std::random_shuffle().
27 * This file is a GNU parallel extension to the Standard C++ Library.
28 */
29
30 // Written by Johannes Singler.
31
32 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
33 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
34
35 #include <limits>
36 #include <bits/stl_numeric.h>
37 #include <parallel/parallel.h>
38 #include <parallel/random_number.h>
39
40 namespace __gnu_parallel
41 {
42 /** @brief Type to hold the index of a bin.
43 *
44 * Since many variables of this type are allocated, it should be
45 * chosen as small as possible.
46 */
47 typedef unsigned short bin_index;
48
49 /** @brief Data known to every thread participating in
50 __gnu_parallel::parallel_random_shuffle(). */
51 template<typename RandomAccessIterator>
52 struct DRandomShufflingGlobalData
53 {
54 typedef std::iterator_traits<RandomAccessIterator> traits_type;
55 typedef typename traits_type::value_type value_type;
56 typedef typename traits_type::difference_type difference_type;
57
58 /** @brief Begin iterator of the source. */
59 RandomAccessIterator& source;
60
61 /** @brief Temporary arrays for each thread. */
62 value_type** temporaries;
63
64 /** @brief Two-dimensional array to hold the thread-bin distribution.
65 *
66 * Dimensions (num_threads + 1) x (num_bins + 1). */
67 difference_type** dist;
68
69 /** @brief Start indexes of the threads' chunks. */
70 difference_type* starts;
71
72 /** @brief Number of the thread that will further process the
73 corresponding bin. */
74 thread_index_t* bin_proc;
75
76 /** @brief Number of bins to distribute to. */
77 int num_bins;
78
79 /** @brief Number of bits needed to address the bins. */
80 int num_bits;
81
82 /** @brief Constructor. */
83 DRandomShufflingGlobalData(RandomAccessIterator& _source)
84 : source(_source) { }
85 };
86
87 /** @brief Local data for a thread participating in
88 __gnu_parallel::parallel_random_shuffle().
89 */
90 template<typename RandomAccessIterator, typename RandomNumberGenerator>
91 struct DRSSorterPU
92 {
93 /** @brief Number of threads participating in total. */
94 int num_threads;
95
96 /** @brief Begin index for bins taken care of by this thread. */
97 bin_index bins_begin;
98
99 /** @brief End index for bins taken care of by this thread. */
100 bin_index bins_end;
101
102 /** @brief Random seed for this thread. */
103 uint32 seed;
104
105 /** @brief Pointer to global data. */
106 DRandomShufflingGlobalData<RandomAccessIterator>* sd;
107 };
108
109 /** @brief Generate a random number in @c [0,2^logp).
110 * @param logp Logarithm (basis 2) of the upper range bound.
111 * @param rng Random number generator to use.
112 */
113 template<typename RandomNumberGenerator>
114 inline int
115 random_number_pow2(int logp, RandomNumberGenerator& rng)
116 { return rng.genrand_bits(logp); }
117
118 /** @brief Random shuffle code executed by each thread.
119 * @param pus Array of thread-local data records. */
120 template<typename RandomAccessIterator, typename RandomNumberGenerator>
121 void
122 parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
123 RandomNumberGenerator>* pus)
124 {
125 typedef std::iterator_traits<RandomAccessIterator> traits_type;
126 typedef typename traits_type::value_type value_type;
127 typedef typename traits_type::difference_type difference_type;
128
129 thread_index_t iam = omp_get_thread_num();
130 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
131 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
132
133 // Indexing: dist[bin][processor]
134 difference_type length = sd->starts[iam + 1] - sd->starts[iam];
135 bin_index* oracles = new bin_index[length];
136 difference_type* dist = new difference_type[sd->num_bins + 1];
137 bin_index* bin_proc = new bin_index[sd->num_bins];
138 value_type** temporaries = new value_type*[d->num_threads];
139
140 // Compute oracles and count appearances.
141 for (bin_index b = 0; b < sd->num_bins + 1; ++b)
142 dist[b] = 0;
143 int num_bits = sd->num_bits;
144
145 random_number rng(d->seed);
146
147 // First main loop.
148 for (difference_type i = 0; i < length; ++i)
149 {
150 bin_index oracle = random_number_pow2(num_bits, rng);
151 oracles[i] = oracle;
152
153 // To allow prefix (partial) sum.
154 ++(dist[oracle + 1]);
155 }
156
157 for (bin_index b = 0; b < sd->num_bins + 1; ++b)
158 sd->dist[b][iam + 1] = dist[b];
159
160 # pragma omp barrier
161
162 # pragma omp single
163 {
164 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
165 // total number of items in bin s
166 for (bin_index s = 0; s < sd->num_bins; ++s)
167 __gnu_sequential::partial_sum(sd->dist[s + 1],
168 sd->dist[s + 1] + d->num_threads + 1,
169 sd->dist[s + 1]);
170 }
171
172 # pragma omp barrier
173
174 sequence_index_t offset = 0, global_offset = 0;
175 for (bin_index s = 0; s < d->bins_begin; ++s)
176 global_offset += sd->dist[s + 1][d->num_threads];
177
178 # pragma omp barrier
179
180 for (bin_index s = d->bins_begin; s < d->bins_end; ++s)
181 {
182 for (int t = 0; t < d->num_threads + 1; ++t)
183 sd->dist[s + 1][t] += offset;
184 offset = sd->dist[s + 1][d->num_threads];
185 }
186
187 sd->temporaries[iam] = static_cast<value_type*>(
188 ::operator new(sizeof(value_type) * offset));
189
190 # pragma omp barrier
191
192 // Draw local copies to avoid false sharing.
193 for (bin_index b = 0; b < sd->num_bins + 1; ++b)
194 dist[b] = sd->dist[b][iam];
195 for (bin_index b = 0; b < sd->num_bins; ++b)
196 bin_proc[b] = sd->bin_proc[b];
197 for (thread_index_t t = 0; t < d->num_threads; ++t)
198 temporaries[t] = sd->temporaries[t];
199
200 RandomAccessIterator source = sd->source;
201 difference_type start = sd->starts[iam];
202
203 // Distribute according to oracles, second main loop.
204 for (difference_type i = 0; i < length; ++i)
205 {
206 bin_index target_bin = oracles[i];
207 thread_index_t target_p = bin_proc[target_bin];
208
209 // Last column [d->num_threads] stays unchanged.
210 ::new(&(temporaries[target_p][dist[target_bin + 1]++]))
211 value_type(*(source + i + start));
212 }
213
214 delete[] oracles;
215 delete[] dist;
216 delete[] bin_proc;
217 delete[] temporaries;
218
219 # pragma omp barrier
220
221 // Shuffle bins internally.
222 for (bin_index b = d->bins_begin; b < d->bins_end; ++b)
223 {
224 value_type* begin =
225 sd->temporaries[iam] +
226 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
227 * end =
228 sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
229 sequential_random_shuffle(begin, end, rng);
230 std::copy(begin, end, sd->source + global_offset +
231 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
232 }
233
234 ::operator delete(sd->temporaries[iam]);
235 }
236
237 /** @brief Round up to the next greater power of 2.
238 * @param x Integer to round up */
239 template<typename T>
240 T
241 round_up_to_pow2(T x)
242 {
243 if (x <= 1)
244 return 1;
245 else
246 return (T)1 << (__log2(x - 1) + 1);
247 }
248
249 /** @brief Main parallel random shuffle step.
250 * @param begin Begin iterator of sequence.
251 * @param end End iterator of sequence.
252 * @param n Length of sequence.
253 * @param num_threads Number of threads to use.
254 * @param rng Random number generator to use.
255 */
256 template<typename RandomAccessIterator, typename RandomNumberGenerator>
257 void
258 parallel_random_shuffle_drs(RandomAccessIterator begin,
259 RandomAccessIterator end,
260 typename std::iterator_traits
261 <RandomAccessIterator>::difference_type n,
262 thread_index_t num_threads,
263 RandomNumberGenerator& rng)
264 {
265 typedef std::iterator_traits<RandomAccessIterator> traits_type;
266 typedef typename traits_type::value_type value_type;
267 typedef typename traits_type::difference_type difference_type;
268
269 _GLIBCXX_CALL(n)
270
271 const _Settings& __s = _Settings::get();
272
273 if (num_threads > n)
274 num_threads = static_cast<thread_index_t>(n);
275
276 bin_index num_bins, num_bins_cache;
277
278 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
279 // Try the L1 cache first.
280
281 // Must fit into L1.
282 num_bins_cache = std::max<difference_type>(
283 1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
284 num_bins_cache = round_up_to_pow2(num_bins_cache);
285
286 // No more buckets than TLB entries, power of 2
287 // Power of 2 and at least one element per bin, at most the TLB size.
288 num_bins = std::min<difference_type>(n, num_bins_cache);
289
290 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
291 // 2 TLB entries needed per bin.
292 num_bins = std::min<difference_type>(__s.TLB_size / 2, num_bins);
293 #endif
294 num_bins = round_up_to_pow2(num_bins);
295
296 if (num_bins < num_bins_cache)
297 {
298 #endif
299 // Now try the L2 cache
300 // Must fit into L2
301 num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
302 1, n / (__s.L2_cache_size / sizeof(value_type))));
303 num_bins_cache = round_up_to_pow2(num_bins_cache);
304
305 // No more buckets than TLB entries, power of 2.
306 num_bins = static_cast<bin_index>(
307 std::min(n, static_cast<difference_type>(num_bins_cache)));
308 // Power of 2 and at least one element per bin, at most the TLB size.
309 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
310 // 2 TLB entries needed per bin.
311 num_bins = std::min(
312 static_cast<difference_type>(__s.TLB_size / 2), num_bins);
313 #endif
314 num_bins = round_up_to_pow2(num_bins);
315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
316 }
317 #endif
318
319 num_threads = std::min<bin_index>(num_threads, num_bins);
320
321 if (num_threads <= 1)
322 return sequential_random_shuffle(begin, end, rng);
323
324 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
325 DRSSorterPU<RandomAccessIterator, random_number >* pus;
326 difference_type* starts;
327
328 # pragma omp parallel num_threads(num_threads)
329 {
330 thread_index_t num_threads = omp_get_num_threads();
331 # pragma omp single
332 {
333 pus = new DRSSorterPU<RandomAccessIterator, random_number>
334 [num_threads];
335
336 sd.temporaries = new value_type*[num_threads];
337 sd.dist = new difference_type*[num_bins + 1];
338 sd.bin_proc = new thread_index_t[num_bins];
339 for (bin_index b = 0; b < num_bins + 1; ++b)
340 sd.dist[b] = new difference_type[num_threads + 1];
341 for (bin_index b = 0; b < (num_bins + 1); ++b)
342 {
343 sd.dist[0][0] = 0;
344 sd.dist[b][0] = 0;
345 }
346 starts = sd.starts = new difference_type[num_threads + 1];
347 int bin_cursor = 0;
348 sd.num_bins = num_bins;
349 sd.num_bits = __log2(num_bins);
350
351 difference_type chunk_length = n / num_threads,
352 split = n % num_threads, start = 0;
353 difference_type bin_chunk_length = num_bins / num_threads,
354 bin_split = num_bins % num_threads;
355 for (thread_index_t i = 0; i < num_threads; ++i)
356 {
357 starts[i] = start;
358 start += (i < split) ? (chunk_length + 1) : chunk_length;
359 int j = pus[i].bins_begin = bin_cursor;
360
361 // Range of bins for this processor.
362 bin_cursor += (i < bin_split) ?
363 (bin_chunk_length + 1) : bin_chunk_length;
364 pus[i].bins_end = bin_cursor;
365 for (; j < bin_cursor; ++j)
366 sd.bin_proc[j] = i;
367 pus[i].num_threads = num_threads;
368 pus[i].seed = rng(std::numeric_limits<uint32>::max());
369 pus[i].sd = &sd;
370 }
371 starts[num_threads] = start;
372 } //single
373 // Now shuffle in parallel.
374 parallel_random_shuffle_drs_pu(pus);
375 } // parallel
376
377 delete[] starts;
378 delete[] sd.bin_proc;
379 for (int s = 0; s < (num_bins + 1); ++s)
380 delete[] sd.dist[s];
381 delete[] sd.dist;
382 delete[] sd.temporaries;
383
384 delete[] pus;
385 }
386
387 /** @brief Sequential cache-efficient random shuffle.
388 * @param begin Begin iterator of sequence.
389 * @param end End iterator of sequence.
390 * @param rng Random number generator to use.
391 */
392 template<typename RandomAccessIterator, typename RandomNumberGenerator>
393 void
394 sequential_random_shuffle(RandomAccessIterator begin,
395 RandomAccessIterator end,
396 RandomNumberGenerator& rng)
397 {
398 typedef std::iterator_traits<RandomAccessIterator> traits_type;
399 typedef typename traits_type::value_type value_type;
400 typedef typename traits_type::difference_type difference_type;
401
402 difference_type n = end - begin;
403 const _Settings& __s = _Settings::get();
404
405 bin_index num_bins, num_bins_cache;
406
407 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
408 // Try the L1 cache first, must fit into L1.
409 num_bins_cache =
410 std::max<difference_type>
411 (1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
412 num_bins_cache = round_up_to_pow2(num_bins_cache);
413
414 // No more buckets than TLB entries, power of 2
415 // Power of 2 and at least one element per bin, at most the TLB size
416 num_bins = std::min(n, (difference_type)num_bins_cache);
417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
418 // 2 TLB entries needed per bin
419 num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins);
420 #endif
421 num_bins = round_up_to_pow2(num_bins);
422
423 if (num_bins < num_bins_cache)
424 {
425 #endif
426 // Now try the L2 cache, must fit into L2.
427 num_bins_cache =
428 static_cast<bin_index>(std::max<difference_type>(
429 1, n / (__s.L2_cache_size / sizeof(value_type))));
430 num_bins_cache = round_up_to_pow2(num_bins_cache);
431
432 // No more buckets than TLB entries, power of 2
433 // Power of 2 and at least one element per bin, at most the TLB size.
434 num_bins = static_cast<bin_index>
435 (std::min(n, static_cast<difference_type>(num_bins_cache)));
436
437 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
438 // 2 TLB entries needed per bin
439 num_bins =
440 std::min<difference_type>(__s.TLB_size / 2, num_bins);
441 #endif
442 num_bins = round_up_to_pow2(num_bins);
443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
444 }
445 #endif
446
447 int num_bits = __log2(num_bins);
448
449 if (num_bins > 1)
450 {
451 value_type* target = static_cast<value_type*>(
452 ::operator new(sizeof(value_type) * n));
453 bin_index* oracles = new bin_index[n];
454 difference_type* dist0 = new difference_type[num_bins + 1],
455 * dist1 = new difference_type[num_bins + 1];
456
457 for (int b = 0; b < num_bins + 1; ++b)
458 dist0[b] = 0;
459
460 random_number bitrng(rng(0xFFFFFFFF));
461
462 for (difference_type i = 0; i < n; ++i)
463 {
464 bin_index oracle = random_number_pow2(num_bits, bitrng);
465 oracles[i] = oracle;
466
467 // To allow prefix (partial) sum.
468 ++(dist0[oracle + 1]);
469 }
470
471 // Sum up bins.
472 __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
473
474 for (int b = 0; b < num_bins + 1; ++b)
475 dist1[b] = dist0[b];
476
477 // Distribute according to oracles.
478 for (difference_type i = 0; i < n; ++i)
479 ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i));
480
481 for (int b = 0; b < num_bins; ++b)
482 {
483 sequential_random_shuffle(target + dist1[b],
484 target + dist1[b + 1],
485 rng);
486 }
487
488 // Copy elements back.
489 std::copy(target, target + n, begin);
490
491 delete[] dist0;
492 delete[] dist1;
493 delete[] oracles;
494 ::operator delete(target);
495 }
496 else
497 __gnu_sequential::random_shuffle(begin, end, rng);
498 }
499
500 /** @brief Parallel random public call.
501 * @param begin Begin iterator of sequence.
502 * @param end End iterator of sequence.
503 * @param rng Random number generator to use.
504 */
505 template<typename RandomAccessIterator, typename RandomNumberGenerator>
506 inline void
507 parallel_random_shuffle(RandomAccessIterator begin,
508 RandomAccessIterator end,
509 RandomNumberGenerator rng = random_number())
510 {
511 typedef std::iterator_traits<RandomAccessIterator> traits_type;
512 typedef typename traits_type::difference_type difference_type;
513 difference_type n = end - begin;
514 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
515 }
516
517 }
518
519 #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */