]> git.ipfire.org Git - thirdparty/gcc.git/blame - libstdc++-v3/include/parallel/random_shuffle.h
Licensing changes to GPLv3 resp. GPLv3 with GCC Runtime Exception.
[thirdparty/gcc.git] / libstdc++-v3 / include / parallel / random_shuffle.h
CommitLineData
c2ba9709
JS
1// -*- C++ -*-
2
748086b7 3// Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
c2ba9709
JS
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the terms
7// of the GNU General Public License as published by the Free Software
748086b7 8// Foundation; either version 3, or (at your option) any later
c2ba9709
JS
9// version.
10
11// This library is distributed in the hope that it will be useful, but
12// WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14// General Public License for more details.
15
748086b7
JJ
16// Under Section 7 of GPL version 3, you are granted additional
17// permissions described in the GCC Runtime Library Exception, version
18// 3.1, as published by the Free Software Foundation.
c2ba9709 19
748086b7
JJ
20// You should have received a copy of the GNU General Public License and
21// a copy of the GCC Runtime Library Exception along with this program;
22// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23// <http://www.gnu.org/licenses/>.
c2ba9709
JS
24
25/** @file parallel/random_shuffle.h
26 * @brief Parallel implementation of std::random_shuffle().
27 * This file is a GNU parallel extension to the Standard C++ Library.
28 */
29
30// Written by Johannes Singler.
31
32#ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
33#define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
34
35#include <limits>
6f95a65a 36#include <bits/stl_numeric.h>
c2ba9709 37#include <parallel/parallel.h>
c2ba9709 38#include <parallel/random_number.h>
c2ba9709
JS
39
40namespace __gnu_parallel
41{
e683ee2a
JS
42/** @brief Type to hold the index of a bin.
43 *
44 * Since many variables of this type are allocated, it should be
45 * chosen as small as possible.
46 */
47typedef unsigned short bin_index;
48
49/** @brief Data known to every thread participating in
50 __gnu_parallel::parallel_random_shuffle(). */
51template<typename RandomAccessIterator>
c2ba9709
JS
52 struct DRandomShufflingGlobalData
53 {
54 typedef std::iterator_traits<RandomAccessIterator> traits_type;
55 typedef typename traits_type::value_type value_type;
56 typedef typename traits_type::difference_type difference_type;
57
58 /** @brief Begin iterator of the source. */
59 RandomAccessIterator& source;
60
61 /** @brief Temporary arrays for each thread. */
62 value_type** temporaries;
63
64 /** @brief Two-dimensional array to hold the thread-bin distribution.
65 *
66 * Dimensions (num_threads + 1) x (num_bins + 1). */
67 difference_type** dist;
68
69 /** @brief Start indexes of the threads' chunks. */
70 difference_type* starts;
71
72 /** @brief Number of the thread that will further process the
73 corresponding bin. */
74 thread_index_t* bin_proc;
75
76 /** @brief Number of bins to distribute to. */
77 int num_bins;
78
79 /** @brief Number of bits needed to address the bins. */
80 int num_bits;
81
82 /** @brief Constructor. */
83 DRandomShufflingGlobalData(RandomAccessIterator& _source)
84 : source(_source) { }
85 };
86
e683ee2a
JS
87/** @brief Local data for a thread participating in
88 __gnu_parallel::parallel_random_shuffle().
89 */
90template<typename RandomAccessIterator, typename RandomNumberGenerator>
c2ba9709
JS
91 struct DRSSorterPU
92 {
93 /** @brief Number of threads participating in total. */
94 int num_threads;
95
c2ba9709
JS
96 /** @brief Begin index for bins taken care of by this thread. */
97 bin_index bins_begin;
98
99 /** @brief End index for bins taken care of by this thread. */
100 bin_index bins_end;
101
102 /** @brief Random seed for this thread. */
6df548d2 103 uint32 seed;
c2ba9709
JS
104
105 /** @brief Pointer to global data. */
106 DRandomShufflingGlobalData<RandomAccessIterator>* sd;
107 };
108
e683ee2a
JS
109/** @brief Generate a random number in @c [0,2^logp).
110 * @param logp Logarithm (basis 2) of the upper range bound.
111 * @param rng Random number generator to use.
112 */
113template<typename RandomNumberGenerator>
6f95a65a
BK
114 inline int
115 random_number_pow2(int logp, RandomNumberGenerator& rng)
116 { return rng.genrand_bits(logp); }
c2ba9709 117
e683ee2a
JS
118/** @brief Random shuffle code executed by each thread.
119 * @param pus Array of thread-local data records. */
120template<typename RandomAccessIterator, typename RandomNumberGenerator>
5817ff8e 121 void
e683ee2a
JS
122 parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
123 RandomNumberGenerator>* pus)
c2ba9709
JS
124 {
125 typedef std::iterator_traits<RandomAccessIterator> traits_type;
126 typedef typename traits_type::value_type value_type;
127 typedef typename traits_type::difference_type difference_type;
128
e683ee2a
JS
129 thread_index_t iam = omp_get_thread_num();
130 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
c2ba9709 131 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
c2ba9709
JS
132
133 // Indexing: dist[bin][processor]
134 difference_type length = sd->starts[iam + 1] - sd->starts[iam];
135 bin_index* oracles = new bin_index[length];
136 difference_type* dist = new difference_type[sd->num_bins + 1];
137 bin_index* bin_proc = new bin_index[sd->num_bins];
138 value_type** temporaries = new value_type*[d->num_threads];
139
140 // Compute oracles and count appearances.
1661473b 141 for (bin_index b = 0; b < sd->num_bins + 1; ++b)
c2ba9709
JS
142 dist[b] = 0;
143 int num_bits = sd->num_bits;
144
145 random_number rng(d->seed);
146
147 // First main loop.
1661473b 148 for (difference_type i = 0; i < length; ++i)
c2ba9709 149 {
e683ee2a
JS
150 bin_index oracle = random_number_pow2(num_bits, rng);
151 oracles[i] = oracle;
c2ba9709 152
e683ee2a 153 // To allow prefix (partial) sum.
1661473b 154 ++(dist[oracle + 1]);
c2ba9709
JS
155 }
156
1661473b 157 for (bin_index b = 0; b < sd->num_bins + 1; ++b)
c2ba9709
JS
158 sd->dist[b][iam + 1] = dist[b];
159
e683ee2a 160# pragma omp barrier
c2ba9709 161
e683ee2a 162# pragma omp single
c2ba9709
JS
163 {
164 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
165 // total number of items in bin s
1661473b 166 for (bin_index s = 0; s < sd->num_bins; ++s)
e683ee2a
JS
167 __gnu_sequential::partial_sum(sd->dist[s + 1],
168 sd->dist[s + 1] + d->num_threads + 1,
169 sd->dist[s + 1]);
c2ba9709
JS
170 }
171
e683ee2a 172# pragma omp barrier
c2ba9709 173
c2ba9709 174 sequence_index_t offset = 0, global_offset = 0;
1661473b 175 for (bin_index s = 0; s < d->bins_begin; ++s)
c2ba9709
JS
176 global_offset += sd->dist[s + 1][d->num_threads];
177
e683ee2a 178# pragma omp barrier
c2ba9709 179
1661473b 180 for (bin_index s = d->bins_begin; s < d->bins_end; ++s)
c2ba9709 181 {
1661473b 182 for (int t = 0; t < d->num_threads + 1; ++t)
c2ba9709
JS
183 sd->dist[s + 1][t] += offset;
184 offset = sd->dist[s + 1][d->num_threads];
185 }
186
e683ee2a
JS
187 sd->temporaries[iam] = static_cast<value_type*>(
188 ::operator new(sizeof(value_type) * offset));
c2ba9709 189
e683ee2a 190# pragma omp barrier
c2ba9709 191
c2ba9709 192 // Draw local copies to avoid false sharing.
1661473b 193 for (bin_index b = 0; b < sd->num_bins + 1; ++b)
c2ba9709 194 dist[b] = sd->dist[b][iam];
1661473b 195 for (bin_index b = 0; b < sd->num_bins; ++b)
c2ba9709 196 bin_proc[b] = sd->bin_proc[b];
1661473b 197 for (thread_index_t t = 0; t < d->num_threads; ++t)
c2ba9709
JS
198 temporaries[t] = sd->temporaries[t];
199
200 RandomAccessIterator source = sd->source;
201 difference_type start = sd->starts[iam];
202
203 // Distribute according to oracles, second main loop.
1661473b 204 for (difference_type i = 0; i < length; ++i)
c2ba9709 205 {
e683ee2a
JS
206 bin_index target_bin = oracles[i];
207 thread_index_t target_p = bin_proc[target_bin];
c2ba9709 208
e683ee2a 209 // Last column [d->num_threads] stays unchanged.
5817ff8e
PC
210 ::new(&(temporaries[target_p][dist[target_bin + 1]++]))
211 value_type(*(source + i + start));
c2ba9709
JS
212 }
213
214 delete[] oracles;
215 delete[] dist;
216 delete[] bin_proc;
217 delete[] temporaries;
218
e683ee2a 219# pragma omp barrier
c2ba9709 220
c2ba9709 221 // Shuffle bins internally.
1661473b 222 for (bin_index b = d->bins_begin; b < d->bins_end; ++b)
c2ba9709 223 {
e683ee2a
JS
224 value_type* begin =
225 sd->temporaries[iam] +
226 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
227 * end =
228 sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
229 sequential_random_shuffle(begin, end, rng);
230 std::copy(begin, end, sd->source + global_offset +
231 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
c2ba9709
JS
232 }
233
fac9044d 234 ::operator delete(sd->temporaries[iam]);
c2ba9709
JS
235 }
236
e683ee2a
JS
237/** @brief Round up to the next greater power of 2.
238 * @param x Integer to round up */
239template<typename T>
6f95a65a
BK
240 T
241 round_up_to_pow2(T x)
c2ba9709
JS
242 {
243 if (x <= 1)
244 return 1;
245 else
c38b84d8 246 return (T)1 << (__log2(x - 1) + 1);
c2ba9709
JS
247 }
248
e683ee2a
JS
249/** @brief Main parallel random shuffle step.
250 * @param begin Begin iterator of sequence.
251 * @param end End iterator of sequence.
252 * @param n Length of sequence.
253 * @param num_threads Number of threads to use.
254 * @param rng Random number generator to use.
255 */
256template<typename RandomAccessIterator, typename RandomNumberGenerator>
5817ff8e
PC
257 void
258 parallel_random_shuffle_drs(RandomAccessIterator begin,
259 RandomAccessIterator end,
260 typename std::iterator_traits
261 <RandomAccessIterator>::difference_type n,
262 thread_index_t num_threads,
263 RandomNumberGenerator& rng)
c2ba9709
JS
264 {
265 typedef std::iterator_traits<RandomAccessIterator> traits_type;
266 typedef typename traits_type::value_type value_type;
267 typedef typename traits_type::difference_type difference_type;
268
269 _GLIBCXX_CALL(n)
270
ee1b5fc5
BK
271 const _Settings& __s = _Settings::get();
272
c2ba9709
JS
273 if (num_threads > n)
274 num_threads = static_cast<thread_index_t>(n);
275
276 bin_index num_bins, num_bins_cache;
277
278#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
279 // Try the L1 cache first.
280
281 // Must fit into L1.
e683ee2a 282 num_bins_cache = std::max<difference_type>(
ee1b5fc5 283 1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
c2ba9709
JS
284 num_bins_cache = round_up_to_pow2(num_bins_cache);
285
286 // No more buckets than TLB entries, power of 2
287 // Power of 2 and at least one element per bin, at most the TLB size.
e683ee2a 288 num_bins = std::min<difference_type>(n, num_bins_cache);
c2ba9709
JS
289
290#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
291 // 2 TLB entries needed per bin.
ee1b5fc5 292 num_bins = std::min<difference_type>(__s.TLB_size / 2, num_bins);
c2ba9709
JS
293#endif
294 num_bins = round_up_to_pow2(num_bins);
295
296 if (num_bins < num_bins_cache)
297 {
298#endif
e683ee2a
JS
299 // Now try the L2 cache
300 // Must fit into L2
301 num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
ee1b5fc5 302 1, n / (__s.L2_cache_size / sizeof(value_type))));
e683ee2a
JS
303 num_bins_cache = round_up_to_pow2(num_bins_cache);
304
305 // No more buckets than TLB entries, power of 2.
306 num_bins = static_cast<bin_index>(
307 std::min(n, static_cast<difference_type>(num_bins_cache)));
308 // Power of 2 and at least one element per bin, at most the TLB size.
c2ba9709 309#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
e683ee2a
JS
310 // 2 TLB entries needed per bin.
311 num_bins = std::min(
ee1b5fc5 312 static_cast<difference_type>(__s.TLB_size / 2), num_bins);
c2ba9709 313#endif
e683ee2a 314 num_bins = round_up_to_pow2(num_bins);
c2ba9709
JS
315#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
316 }
317#endif
318
e683ee2a 319 num_threads = std::min<bin_index>(num_threads, num_bins);
c2ba9709
JS
320
321 if (num_threads <= 1)
322 return sequential_random_shuffle(begin, end, rng);
323
324 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
e683ee2a
JS
325 DRSSorterPU<RandomAccessIterator, random_number >* pus;
326 difference_type* starts;
c2ba9709 327
e683ee2a 328# pragma omp parallel num_threads(num_threads)
c2ba9709 329 {
3611e176 330 thread_index_t num_threads = omp_get_num_threads();
e683ee2a
JS
331# pragma omp single
332 {
333 pus = new DRSSorterPU<RandomAccessIterator, random_number>
334 [num_threads];
335
336 sd.temporaries = new value_type*[num_threads];
337 sd.dist = new difference_type*[num_bins + 1];
338 sd.bin_proc = new thread_index_t[num_bins];
1661473b 339 for (bin_index b = 0; b < num_bins + 1; ++b)
e683ee2a 340 sd.dist[b] = new difference_type[num_threads + 1];
1661473b 341 for (bin_index b = 0; b < (num_bins + 1); ++b)
e683ee2a
JS
342 {
343 sd.dist[0][0] = 0;
344 sd.dist[b][0] = 0;
345 }
346 starts = sd.starts = new difference_type[num_threads + 1];
347 int bin_cursor = 0;
348 sd.num_bins = num_bins;
c38b84d8 349 sd.num_bits = __log2(num_bins);
e683ee2a
JS
350
351 difference_type chunk_length = n / num_threads,
352 split = n % num_threads, start = 0;
353 difference_type bin_chunk_length = num_bins / num_threads,
354 bin_split = num_bins % num_threads;
1661473b 355 for (thread_index_t i = 0; i < num_threads; ++i)
e683ee2a
JS
356 {
357 starts[i] = start;
358 start += (i < split) ? (chunk_length + 1) : chunk_length;
359 int j = pus[i].bins_begin = bin_cursor;
360
361 // Range of bins for this processor.
362 bin_cursor += (i < bin_split) ?
363 (bin_chunk_length + 1) : bin_chunk_length;
364 pus[i].bins_end = bin_cursor;
1661473b 365 for (; j < bin_cursor; ++j)
e683ee2a
JS
366 sd.bin_proc[j] = i;
367 pus[i].num_threads = num_threads;
6df548d2 368 pus[i].seed = rng(std::numeric_limits<uint32>::max());
e683ee2a
JS
369 pus[i].sd = &sd;
370 }
371 starts[num_threads] = start;
372 } //single
3611e176
JS
373 // Now shuffle in parallel.
374 parallel_random_shuffle_drs_pu(pus);
375 } // parallel
c2ba9709
JS
376
377 delete[] starts;
378 delete[] sd.bin_proc;
1661473b 379 for (int s = 0; s < (num_bins + 1); ++s)
c2ba9709
JS
380 delete[] sd.dist[s];
381 delete[] sd.dist;
382 delete[] sd.temporaries;
383
384 delete[] pus;
385 }
386
e683ee2a
JS
387/** @brief Sequential cache-efficient random shuffle.
388 * @param begin Begin iterator of sequence.
389 * @param end End iterator of sequence.
390 * @param rng Random number generator to use.
391 */
392template<typename RandomAccessIterator, typename RandomNumberGenerator>
5817ff8e 393 void
6f95a65a 394 sequential_random_shuffle(RandomAccessIterator begin,
e683ee2a
JS
395 RandomAccessIterator end,
396 RandomNumberGenerator& rng)
c2ba9709
JS
397 {
398 typedef std::iterator_traits<RandomAccessIterator> traits_type;
399 typedef typename traits_type::value_type value_type;
400 typedef typename traits_type::difference_type difference_type;
401
402 difference_type n = end - begin;
ee1b5fc5 403 const _Settings& __s = _Settings::get();
c2ba9709
JS
404
405 bin_index num_bins, num_bins_cache;
406
407#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
408 // Try the L1 cache first, must fit into L1.
e683ee2a
JS
409 num_bins_cache =
410 std::max<difference_type>
ee1b5fc5 411 (1, n / (__s.L1_cache_size_lb / sizeof(value_type)));
c2ba9709
JS
412 num_bins_cache = round_up_to_pow2(num_bins_cache);
413
414 // No more buckets than TLB entries, power of 2
415 // Power of 2 and at least one element per bin, at most the TLB size
416 num_bins = std::min(n, (difference_type)num_bins_cache);
417#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
418 // 2 TLB entries needed per bin
ee1b5fc5 419 num_bins = std::min((difference_type)__s.TLB_size / 2, num_bins);
c2ba9709
JS
420#endif
421 num_bins = round_up_to_pow2(num_bins);
422
423 if (num_bins < num_bins_cache)
424 {
425#endif
e683ee2a
JS
426 // Now try the L2 cache, must fit into L2.
427 num_bins_cache =
428 static_cast<bin_index>(std::max<difference_type>(
ee1b5fc5 429 1, n / (__s.L2_cache_size / sizeof(value_type))));
e683ee2a 430 num_bins_cache = round_up_to_pow2(num_bins_cache);
c2ba9709 431
e683ee2a
JS
432 // No more buckets than TLB entries, power of 2
433 // Power of 2 and at least one element per bin, at most the TLB size.
434 num_bins = static_cast<bin_index>
435 (std::min(n, static_cast<difference_type>(num_bins_cache)));
c2ba9709
JS
436
437#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
e683ee2a
JS
438 // 2 TLB entries needed per bin
439 num_bins =
ee1b5fc5 440 std::min<difference_type>(__s.TLB_size / 2, num_bins);
c2ba9709 441#endif
e683ee2a 442 num_bins = round_up_to_pow2(num_bins);
c2ba9709
JS
443#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
444 }
445#endif
446
c38b84d8 447 int num_bits = __log2(num_bins);
c2ba9709
JS
448
449 if (num_bins > 1)
450 {
e683ee2a
JS
451 value_type* target = static_cast<value_type*>(
452 ::operator new(sizeof(value_type) * n));
453 bin_index* oracles = new bin_index[n];
454 difference_type* dist0 = new difference_type[num_bins + 1],
455 * dist1 = new difference_type[num_bins + 1];
456
1661473b 457 for (int b = 0; b < num_bins + 1; ++b)
e683ee2a
JS
458 dist0[b] = 0;
459
460 random_number bitrng(rng(0xFFFFFFFF));
461
1661473b 462 for (difference_type i = 0; i < n; ++i)
e683ee2a
JS
463 {
464 bin_index oracle = random_number_pow2(num_bits, bitrng);
465 oracles[i] = oracle;
466
467 // To allow prefix (partial) sum.
1661473b 468 ++(dist0[oracle + 1]);
e683ee2a
JS
469 }
470
471 // Sum up bins.
472 __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
473
1661473b 474 for (int b = 0; b < num_bins + 1; ++b)
e683ee2a
JS
475 dist1[b] = dist0[b];
476
477 // Distribute according to oracles.
1661473b 478 for (difference_type i = 0; i < n; ++i)
58a6ef4b 479 ::new(&(target[(dist0[oracles[i]])++])) value_type(*(begin + i));
e683ee2a 480
1661473b 481 for (int b = 0; b < num_bins; ++b)
e683ee2a
JS
482 {
483 sequential_random_shuffle(target + dist1[b],
484 target + dist1[b + 1],
485 rng);
486 }
487
361eefe7
JS
488 // Copy elements back.
489 std::copy(target, target + n, begin);
490
e683ee2a
JS
491 delete[] dist0;
492 delete[] dist1;
493 delete[] oracles;
fac9044d 494 ::operator delete(target);
c2ba9709
JS
495 }
496 else
497 __gnu_sequential::random_shuffle(begin, end, rng);
498 }
499
e683ee2a
JS
500/** @brief Parallel random public call.
501 * @param begin Begin iterator of sequence.
502 * @param end End iterator of sequence.
503 * @param rng Random number generator to use.
504 */
505template<typename RandomAccessIterator, typename RandomNumberGenerator>
c2ba9709 506 inline void
e683ee2a
JS
507 parallel_random_shuffle(RandomAccessIterator begin,
508 RandomAccessIterator end,
509 RandomNumberGenerator rng = random_number())
c2ba9709
JS
510 {
511 typedef std::iterator_traits<RandomAccessIterator> traits_type;
512 typedef typename traits_type::difference_type difference_type;
513 difference_type n = end - begin;
514 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
515 }
516
517}
518
cbcd1e45 519#endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */