]> git.ipfire.org Git - thirdparty/gcc.git/blob - libstdc++-v3/include/parallel/random_shuffle.h
re PR libstdc++/33893 ([parallel mode] Algorithms rely on omp_set_dynamic(false))
[thirdparty/gcc.git] / libstdc++-v3 / include / parallel / random_shuffle.h
1 // -*- C++ -*-
2
3 // Copyright (C) 2007 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
9 // version.
10
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
15
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
20
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
29 // Public License.
30
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
34 */
35
36 // Written by Johannes Singler.
37
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
40
41 #include <limits>
42 #include <bits/stl_numeric.h>
43 #include <parallel/parallel.h>
44 #include <parallel/random_number.h>
45
46 namespace __gnu_parallel
47 {
48 /** @brief Type to hold the index of a bin.
49 *
50 * Since many variables of this type are allocated, it should be
51 * chosen as small as possible.
52 */
53 typedef unsigned short bin_index;
54
55 /** @brief Data known to every thread participating in
56 __gnu_parallel::parallel_random_shuffle(). */
57 template<typename RandomAccessIterator>
58 struct DRandomShufflingGlobalData
59 {
60 typedef std::iterator_traits<RandomAccessIterator> traits_type;
61 typedef typename traits_type::value_type value_type;
62 typedef typename traits_type::difference_type difference_type;
63
64 /** @brief Begin iterator of the source. */
65 RandomAccessIterator& source;
66
67 /** @brief Temporary arrays for each thread. */
68 value_type** temporaries;
69
70 /** @brief Two-dimensional array to hold the thread-bin distribution.
71 *
72 * Dimensions (num_threads + 1) x (num_bins + 1). */
73 difference_type** dist;
74
75 /** @brief Start indexes of the threads' chunks. */
76 difference_type* starts;
77
78 /** @brief Number of the thread that will further process the
79 corresponding bin. */
80 thread_index_t* bin_proc;
81
82 /** @brief Number of bins to distribute to. */
83 int num_bins;
84
85 /** @brief Number of bits needed to address the bins. */
86 int num_bits;
87
88 /** @brief Constructor. */
89 DRandomShufflingGlobalData(RandomAccessIterator& _source)
90 : source(_source) { }
91 };
92
93 /** @brief Local data for a thread participating in
94 __gnu_parallel::parallel_random_shuffle().
95 */
96 template<typename RandomAccessIterator, typename RandomNumberGenerator>
97 struct DRSSorterPU
98 {
99 /** @brief Number of threads participating in total. */
100 int num_threads;
101
102 /** @brief Begin index for bins taken care of by this thread. */
103 bin_index bins_begin;
104
105 /** @brief End index for bins taken care of by this thread. */
106 bin_index bins_end;
107
108 /** @brief Random seed for this thread. */
109 uint32 seed;
110
111 /** @brief Pointer to global data. */
112 DRandomShufflingGlobalData<RandomAccessIterator>* sd;
113 };
114
115 /** @brief Generate a random number in @c [0,2^logp).
116 * @param logp Logarithm (basis 2) of the upper range bound.
117 * @param rng Random number generator to use.
118 */
119 template<typename RandomNumberGenerator>
120 inline int
121 random_number_pow2(int logp, RandomNumberGenerator& rng)
122 { return rng.genrand_bits(logp); }
123
124 /** @brief Random shuffle code executed by each thread.
125 * @param pus Array of thread-local data records. */
126 template<typename RandomAccessIterator, typename RandomNumberGenerator>
127 inline void
128 parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator,
129 RandomNumberGenerator>* pus)
130 {
131 typedef std::iterator_traits<RandomAccessIterator> traits_type;
132 typedef typename traits_type::value_type value_type;
133 typedef typename traits_type::difference_type difference_type;
134
135 thread_index_t iam = omp_get_thread_num();
136 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[iam];
137 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
138
139 // Indexing: dist[bin][processor]
140 difference_type length = sd->starts[iam + 1] - sd->starts[iam];
141 bin_index* oracles = new bin_index[length];
142 difference_type* dist = new difference_type[sd->num_bins + 1];
143 bin_index* bin_proc = new bin_index[sd->num_bins];
144 value_type** temporaries = new value_type*[d->num_threads];
145
146 // Compute oracles and count appearances.
147 for (bin_index b = 0; b < sd->num_bins + 1; b++)
148 dist[b] = 0;
149 int num_bits = sd->num_bits;
150
151 random_number rng(d->seed);
152
153 // First main loop.
154 for (difference_type i = 0; i < length; i++)
155 {
156 bin_index oracle = random_number_pow2(num_bits, rng);
157 oracles[i] = oracle;
158
159 // To allow prefix (partial) sum.
160 dist[oracle + 1]++;
161 }
162
163 for (bin_index b = 0; b < sd->num_bins + 1; b++)
164 sd->dist[b][iam + 1] = dist[b];
165
166 # pragma omp barrier
167
168 # pragma omp single
169 {
170 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
171 // total number of items in bin s
172 for (bin_index s = 0; s < sd->num_bins; s++)
173 __gnu_sequential::partial_sum(sd->dist[s + 1],
174 sd->dist[s + 1] + d->num_threads + 1,
175 sd->dist[s + 1]);
176 }
177
178 # pragma omp barrier
179
180 sequence_index_t offset = 0, global_offset = 0;
181 for (bin_index s = 0; s < d->bins_begin; s++)
182 global_offset += sd->dist[s + 1][d->num_threads];
183
184 # pragma omp barrier
185
186 for (bin_index s = d->bins_begin; s < d->bins_end; s++)
187 {
188 for (int t = 0; t < d->num_threads + 1; t++)
189 sd->dist[s + 1][t] += offset;
190 offset = sd->dist[s + 1][d->num_threads];
191 }
192
193 sd->temporaries[iam] = static_cast<value_type*>(
194 ::operator new(sizeof(value_type) * offset));
195
196 # pragma omp barrier
197
198 // Draw local copies to avoid false sharing.
199 for (bin_index b = 0; b < sd->num_bins + 1; b++)
200 dist[b] = sd->dist[b][iam];
201 for (bin_index b = 0; b < sd->num_bins; b++)
202 bin_proc[b] = sd->bin_proc[b];
203 for (thread_index_t t = 0; t < d->num_threads; t++)
204 temporaries[t] = sd->temporaries[t];
205
206 RandomAccessIterator source = sd->source;
207 difference_type start = sd->starts[iam];
208
209 // Distribute according to oracles, second main loop.
210 for (difference_type i = 0; i < length; i++)
211 {
212 bin_index target_bin = oracles[i];
213 thread_index_t target_p = bin_proc[target_bin];
214
215 // Last column [d->num_threads] stays unchanged.
216 temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
217 }
218
219 delete[] oracles;
220 delete[] dist;
221 delete[] bin_proc;
222 delete[] temporaries;
223
224 # pragma omp barrier
225
226 // Shuffle bins internally.
227 for (bin_index b = d->bins_begin; b < d->bins_end; b++)
228 {
229 value_type* begin =
230 sd->temporaries[iam] +
231 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
232 * end =
233 sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
234 sequential_random_shuffle(begin, end, rng);
235 std::copy(begin, end, sd->source + global_offset +
236 ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
237 }
238
239 delete[] sd->temporaries[iam];
240 }
241
242 /** @brief Round up to the next greater power of 2.
243 * @param x Integer to round up */
244 template<typename T>
245 T
246 round_up_to_pow2(T x)
247 {
248 if (x <= 1)
249 return 1;
250 else
251 return (T)1 << (log2(x - 1) + 1);
252 }
253
254 /** @brief Main parallel random shuffle step.
255 * @param begin Begin iterator of sequence.
256 * @param end End iterator of sequence.
257 * @param n Length of sequence.
258 * @param num_threads Number of threads to use.
259 * @param rng Random number generator to use.
260 */
261 template<typename RandomAccessIterator, typename RandomNumberGenerator>
262 inline void
263 parallel_random_shuffle_drs(
264 RandomAccessIterator begin,
265 RandomAccessIterator end,
266 typename std::iterator_traits<RandomAccessIterator>::difference_type n,
267 thread_index_t num_threads,
268 RandomNumberGenerator& rng)
269 {
270 typedef std::iterator_traits<RandomAccessIterator> traits_type;
271 typedef typename traits_type::value_type value_type;
272 typedef typename traits_type::difference_type difference_type;
273
274 _GLIBCXX_CALL(n)
275
276 if (num_threads > n)
277 num_threads = static_cast<thread_index_t>(n);
278
279 bin_index num_bins, num_bins_cache;
280
281 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
282 // Try the L1 cache first.
283
284 // Must fit into L1.
285 num_bins_cache = std::max<difference_type>(
286 1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
287 num_bins_cache = round_up_to_pow2(num_bins_cache);
288
289 // No more buckets than TLB entries, power of 2
290 // Power of 2 and at least one element per bin, at most the TLB size.
291 num_bins = std::min<difference_type>(n, num_bins_cache);
292
293 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
294 // 2 TLB entries needed per bin.
295 num_bins = std::min<difference_type>(Settings::TLB_size / 2, num_bins);
296 #endif
297 num_bins = round_up_to_pow2(num_bins);
298
299 if (num_bins < num_bins_cache)
300 {
301 #endif
302 // Now try the L2 cache
303 // Must fit into L2
304 num_bins_cache = static_cast<bin_index>(std::max<difference_type>(
305 1, n / (Settings::L2_cache_size / sizeof(value_type))));
306 num_bins_cache = round_up_to_pow2(num_bins_cache);
307
308 // No more buckets than TLB entries, power of 2.
309 num_bins = static_cast<bin_index>(
310 std::min(n, static_cast<difference_type>(num_bins_cache)));
311 // Power of 2 and at least one element per bin, at most the TLB size.
312 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
313 // 2 TLB entries needed per bin.
314 num_bins = std::min(
315 static_cast<difference_type>(Settings::TLB_size / 2), num_bins);
316 #endif
317 num_bins = round_up_to_pow2(num_bins);
318 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
319 }
320 #endif
321
322 num_threads = std::min<bin_index>(num_threads, num_bins);
323
324 if (num_threads <= 1)
325 return sequential_random_shuffle(begin, end, rng);
326
327 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
328 DRSSorterPU<RandomAccessIterator, random_number >* pus;
329 difference_type* starts;
330
331 # pragma omp parallel num_threads(num_threads)
332 {
333 # pragma omp single
334 {
335 pus = new DRSSorterPU<RandomAccessIterator, random_number>
336 [num_threads];
337
338 sd.temporaries = new value_type*[num_threads];
339 sd.dist = new difference_type*[num_bins + 1];
340 sd.bin_proc = new thread_index_t[num_bins];
341 for (bin_index b = 0; b < num_bins + 1; b++)
342 sd.dist[b] = new difference_type[num_threads + 1];
343 for (bin_index b = 0; b < (num_bins + 1); b++)
344 {
345 sd.dist[0][0] = 0;
346 sd.dist[b][0] = 0;
347 }
348 starts = sd.starts = new difference_type[num_threads + 1];
349 int bin_cursor = 0;
350 sd.num_bins = num_bins;
351 sd.num_bits = log2(num_bins);
352
353 difference_type chunk_length = n / num_threads,
354 split = n % num_threads, start = 0;
355 difference_type bin_chunk_length = num_bins / num_threads,
356 bin_split = num_bins % num_threads;
357 for (thread_index_t i = 0; i < num_threads; i++)
358 {
359 starts[i] = start;
360 start += (i < split) ? (chunk_length + 1) : chunk_length;
361 int j = pus[i].bins_begin = bin_cursor;
362
363 // Range of bins for this processor.
364 bin_cursor += (i < bin_split) ?
365 (bin_chunk_length + 1) : bin_chunk_length;
366 pus[i].bins_end = bin_cursor;
367 for (; j < bin_cursor; j++)
368 sd.bin_proc[j] = i;
369 pus[i].num_threads = num_threads;
370 pus[i].seed = rng(std::numeric_limits<uint32>::max());
371 pus[i].sd = &sd;
372 }
373 starts[num_threads] = start;
374 } //single
375 // Now shuffle in parallel.
376 parallel_random_shuffle_drs_pu(pus);
377 }
378
379 delete[] starts;
380 delete[] sd.bin_proc;
381 for (int s = 0; s < (num_bins + 1); s++)
382 delete[] sd.dist[s];
383 delete[] sd.dist;
384 delete[] sd.temporaries;
385
386 delete[] pus;
387 }
388
389 /** @brief Sequential cache-efficient random shuffle.
390 * @param begin Begin iterator of sequence.
391 * @param end End iterator of sequence.
392 * @param rng Random number generator to use.
393 */
394 template<typename RandomAccessIterator, typename RandomNumberGenerator>
395 inline void
396 sequential_random_shuffle(RandomAccessIterator begin,
397 RandomAccessIterator end,
398 RandomNumberGenerator& rng)
399 {
400 typedef std::iterator_traits<RandomAccessIterator> traits_type;
401 typedef typename traits_type::value_type value_type;
402 typedef typename traits_type::difference_type difference_type;
403
404 difference_type n = end - begin;
405
406 bin_index num_bins, num_bins_cache;
407
408 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
409 // Try the L1 cache first, must fit into L1.
410 num_bins_cache =
411 std::max<difference_type>
412 (1, n / (Settings::L1_cache_size_lb / sizeof(value_type)));
413 num_bins_cache = round_up_to_pow2(num_bins_cache);
414
415 // No more buckets than TLB entries, power of 2
416 // Power of 2 and at least one element per bin, at most the TLB size
417 num_bins = std::min(n, (difference_type)num_bins_cache);
418 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
419 // 2 TLB entries needed per bin
420 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
421 #endif
422 num_bins = round_up_to_pow2(num_bins);
423
424 if (num_bins < num_bins_cache)
425 {
426 #endif
427 // Now try the L2 cache, must fit into L2.
428 num_bins_cache =
429 static_cast<bin_index>(std::max<difference_type>(
430 1, n / (Settings::L2_cache_size / sizeof(value_type))));
431 num_bins_cache = round_up_to_pow2(num_bins_cache);
432
433 // No more buckets than TLB entries, power of 2
434 // Power of 2 and at least one element per bin, at most the TLB size.
435 num_bins = static_cast<bin_index>
436 (std::min(n, static_cast<difference_type>(num_bins_cache)));
437
438 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
439 // 2 TLB entries needed per bin
440 num_bins =
441 std::min<difference_type>(Settings::TLB_size / 2, num_bins);
442 #endif
443 num_bins = round_up_to_pow2(num_bins);
444 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
445 }
446 #endif
447
448 int num_bits = log2(num_bins);
449
450 if (num_bins > 1)
451 {
452 value_type* target = static_cast<value_type*>(
453 ::operator new(sizeof(value_type) * n));
454 bin_index* oracles = new bin_index[n];
455 difference_type* dist0 = new difference_type[num_bins + 1],
456 * dist1 = new difference_type[num_bins + 1];
457
458 for (int b = 0; b < num_bins + 1; b++)
459 dist0[b] = 0;
460
461 random_number bitrng(rng(0xFFFFFFFF));
462
463 for (difference_type i = 0; i < n; i++)
464 {
465 bin_index oracle = random_number_pow2(num_bits, bitrng);
466 oracles[i] = oracle;
467
468 // To allow prefix (partial) sum.
469 dist0[oracle + 1]++;
470 }
471
472 // Sum up bins.
473 __gnu_sequential::partial_sum(dist0, dist0 + num_bins + 1, dist0);
474
475 for (int b = 0; b < num_bins + 1; b++)
476 dist1[b] = dist0[b];
477
478 // Distribute according to oracles.
479 for (difference_type i = 0; i < n; i++)
480 target[(dist0[oracles[i]])++] = *(begin + i);
481
482 for (int b = 0; b < num_bins; b++)
483 {
484 sequential_random_shuffle(target + dist1[b],
485 target + dist1[b + 1],
486 rng);
487 }
488
489 delete[] dist0;
490 delete[] dist1;
491 delete[] oracles;
492 delete[] target;
493 }
494 else
495 __gnu_sequential::random_shuffle(begin, end, rng);
496 }
497
498 /** @brief Parallel random public call.
499 * @param begin Begin iterator of sequence.
500 * @param end End iterator of sequence.
501 * @param rng Random number generator to use.
502 */
503 template<typename RandomAccessIterator, typename RandomNumberGenerator>
504 inline void
505 parallel_random_shuffle(RandomAccessIterator begin,
506 RandomAccessIterator end,
507 RandomNumberGenerator rng = random_number())
508 {
509 typedef std::iterator_traits<RandomAccessIterator> traits_type;
510 typedef typename traits_type::difference_type difference_type;
511 difference_type n = end - begin;
512 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
513 }
514
515 }
516
517 #endif