]> git.ipfire.org Git - thirdparty/gcc.git/blame - libstdc++-v3/include/parallel/random_shuffle.h
Add parallel mode.
[thirdparty/gcc.git] / libstdc++-v3 / include / parallel / random_shuffle.h
CommitLineData
c2ba9709
JS
1// -*- C++ -*-
2
3// Copyright (C) 2007 Free Software Foundation, Inc.
4//
5// This file is part of the GNU ISO C++ Library. This library is free
6// software; you can redistribute it and/or modify it under the terms
7// of the GNU General Public License as published by the Free Software
8// Foundation; either version 2, or (at your option) any later
9// version.
10
11// This library is distributed in the hope that it will be useful, but
12// WITHOUT ANY WARRANTY; without even the implied warranty of
13// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14// General Public License for more details.
15
16// You should have received a copy of the GNU General Public License
17// along with this library; see the file COPYING. If not, write to
18// the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19// MA 02111-1307, USA.
20
21// As a special exception, you may use this file as part of a free
22// software library without restriction. Specifically, if other files
23// instantiate templates or use macros or inline functions from this
24// file, or you compile this file and link it with other files to
25// produce an executable, this file does not by itself cause the
26// resulting executable to be covered by the GNU General Public
27// License. This exception does not however invalidate any other
28// reasons why the executable file might be covered by the GNU General
29// Public License.
30
31/** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
34 */
35
36// Written by Johannes Singler.
37
38#ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39#define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
40
41#include <limits>
42
43#include <parallel/basic_iterator.h>
44#include <bits/stl_algo.h>
45
46#include <parallel/parallel.h>
47#include <parallel/base.h>
48#include <parallel/random_number.h>
49#include <parallel/timing.h>
50
51namespace __gnu_parallel
52{
53 /** @brief Type to hold the index of a bin.
54 *
55 * Since many variables of this type are allocated, it should be
56 * chosen as small as possible.
57 */
58 typedef unsigned short bin_index;
59
60 /** @brief Data known to every thread participating in
61 __gnu_parallel::parallel_random_shuffle(). */
62 template<typename RandomAccessIterator>
63 struct DRandomShufflingGlobalData
64 {
65 typedef std::iterator_traits<RandomAccessIterator> traits_type;
66 typedef typename traits_type::value_type value_type;
67 typedef typename traits_type::difference_type difference_type;
68
69 /** @brief Begin iterator of the source. */
70 RandomAccessIterator& source;
71
72 /** @brief Temporary arrays for each thread. */
73 value_type** temporaries;
74
75 /** @brief Two-dimensional array to hold the thread-bin distribution.
76 *
77 * Dimensions (num_threads + 1) x (num_bins + 1). */
78 difference_type** dist;
79
80 /** @brief Start indexes of the threads' chunks. */
81 difference_type* starts;
82
83 /** @brief Number of the thread that will further process the
84 corresponding bin. */
85 thread_index_t* bin_proc;
86
87 /** @brief Number of bins to distribute to. */
88 int num_bins;
89
90 /** @brief Number of bits needed to address the bins. */
91 int num_bits;
92
93 /** @brief Constructor. */
94 DRandomShufflingGlobalData(RandomAccessIterator& _source)
95 : source(_source) { }
96 };
97
98 /** @brief Local data for a thread participating in
99 __gnu_parallel::parallel_random_shuffle().
100 */
101 template<typename RandomAccessIterator, typename RandomNumberGenerator>
102 struct DRSSorterPU
103 {
104 /** @brief Number of threads participating in total. */
105 int num_threads;
106
107 /** @brief Number of owning thread. */
108 int iam;
109
110 /** @brief Begin index for bins taken care of by this thread. */
111 bin_index bins_begin;
112
113 /** @brief End index for bins taken care of by this thread. */
114 bin_index bins_end;
115
116 /** @brief Random seed for this thread. */
117 uint32 seed;
118
119 /** @brief Pointer to global data. */
120 DRandomShufflingGlobalData<RandomAccessIterator>* sd;
121 };
122
123 /** @brief Generate a random number in @c [0,2^logp).
124 * @param logp Logarithm (basis 2) of the upper range bound.
125 * @param rng Random number generator to use.
126 */
127 template<typename RandomNumberGenerator>
128 inline int random_number_pow2(int logp, RandomNumberGenerator& rng)
129 {
130 return rng.genrand_bits(logp);
131 }
132
133 /** @brief Random shuffle code executed by each thread.
134 * @param pus Arary of thread-local data records. */
135 template<typename RandomAccessIterator, typename RandomNumberGenerator>
136 inline void parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* pus)
137 {
138 typedef std::iterator_traits<RandomAccessIterator> traits_type;
139 typedef typename traits_type::value_type value_type;
140 typedef typename traits_type::difference_type difference_type;
141
142 Timing<sequential_tag> t;
143 t.tic();
144
145 DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()];
146 DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd;
147 thread_index_t iam = d->iam;
148
149 // Indexing: dist[bin][processor]
150 difference_type length = sd->starts[iam + 1] - sd->starts[iam];
151 bin_index* oracles = new bin_index[length];
152 difference_type* dist = new difference_type[sd->num_bins + 1];
153 bin_index* bin_proc = new bin_index[sd->num_bins];
154 value_type** temporaries = new value_type*[d->num_threads];
155
156 // Compute oracles and count appearances.
157 for (bin_index b = 0; b < sd->num_bins + 1; b++)
158 dist[b] = 0;
159 int num_bits = sd->num_bits;
160
161 random_number rng(d->seed);
162
163 // First main loop.
164 for (difference_type i = 0; i < length; i++)
165 {
166 bin_index oracle = random_number_pow2(num_bits, rng);
167 oracles[i] = oracle;
168
169 // To allow prefix (partial) sum.
170 dist[oracle + 1]++;
171 }
172
173 for (bin_index b = 0; b < sd->num_bins + 1; b++)
174 sd->dist[b][iam + 1] = dist[b];
175
176 t.tic();
177
178#pragma omp barrier
179
180 t.tic();
181
182#pragma omp single
183 {
184 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
185 // total number of items in bin s
186 for (bin_index s = 0; s < sd->num_bins; s++)
187 partial_sum(sd->dist[s + 1], sd->dist[s + 1] + d->num_threads + 1, sd->dist[s + 1]);
188 }
189
190#pragma omp barrier
191
192 t.tic();
193
194 sequence_index_t offset = 0, global_offset = 0;
195 for (bin_index s = 0; s < d->bins_begin; s++)
196 global_offset += sd->dist[s + 1][d->num_threads];
197
198#pragma omp barrier
199
200 for (bin_index s = d->bins_begin; s < d->bins_end; s++)
201 {
202 for (int t = 0; t < d->num_threads + 1; t++)
203 sd->dist[s + 1][t] += offset;
204 offset = sd->dist[s + 1][d->num_threads];
205 }
206
207 sd->temporaries[iam] = new value_type[offset];
208
209 t.tic();
210
211#pragma omp barrier
212
213 t.tic();
214
215 // Draw local copies to avoid false sharing.
216 for (bin_index b = 0; b < sd->num_bins + 1; b++)
217 dist[b] = sd->dist[b][iam];
218 for (bin_index b = 0; b < sd->num_bins; b++)
219 bin_proc[b] = sd->bin_proc[b];
220 for (thread_index_t t = 0; t < d->num_threads; t++)
221 temporaries[t] = sd->temporaries[t];
222
223 RandomAccessIterator source = sd->source;
224 difference_type start = sd->starts[iam];
225
226 // Distribute according to oracles, second main loop.
227 for (difference_type i = 0; i < length; i++)
228 {
229 bin_index target_bin = oracles[i];
230 thread_index_t target_p = bin_proc[target_bin];
231
232 // Last column [d->num_threads] stays unchanged.
233 temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start);
234 }
235
236 delete[] oracles;
237 delete[] dist;
238 delete[] bin_proc;
239 delete[] temporaries;
240
241 t.tic();
242
243#pragma omp barrier
244
245 t.tic();
246
247 // Shuffle bins internally.
248 for (bin_index b = d->bins_begin; b < d->bins_end; b++)
249 {
250 value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]),
251 * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads];
252 sequential_random_shuffle(begin, end, rng);
253 std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]));
254 }
255
256 delete[] sd->temporaries[iam];
257
258 t.tic();
259
260 t.print();
261 }
262
263 /** @brief Round up to the next greater power of 2.
264 * @param x Integer to round up */
265 template<typename T>
266 T round_up_to_pow2(T x)
267 {
268 if (x <= 1)
269 return 1;
270 else
271 return (T)1 << (log2(x - 1) + 1);
272 }
273
274 /** @brief Main parallel random shuffle step.
275 * @param begin Begin iterator of sequence.
276 * @param end End iterator of sequence.
277 * @param n Length of sequence.
278 * @param num_threads Number of threads to use.
279 * @param rng Random number generator to use.
280 */
281 template<typename RandomAccessIterator, typename RandomNumberGenerator>
282 inline void
283 parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng)
284 {
285 typedef std::iterator_traits<RandomAccessIterator> traits_type;
286 typedef typename traits_type::value_type value_type;
287 typedef typename traits_type::difference_type difference_type;
288
289 _GLIBCXX_CALL(n)
290
291 if (num_threads > n)
292 num_threads = static_cast<thread_index_t>(n);
293
294 bin_index num_bins, num_bins_cache;
295
296#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
297 // Try the L1 cache first.
298
299 // Must fit into L1.
300 num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
301 num_bins_cache = round_up_to_pow2(num_bins_cache);
302
303 // No more buckets than TLB entries, power of 2
304 // Power of 2 and at least one element per bin, at most the TLB size.
305 num_bins = std::min(n, (difference_type)num_bins_cache);
306
307#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
308 // 2 TLB entries needed per bin.
309 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
310#endif
311 num_bins = round_up_to_pow2(num_bins);
312
313 if (num_bins < num_bins_cache)
314 {
315#endif
316 // Now try the L2 cache
317 // Must fit into L2
318 num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
319 num_bins_cache = round_up_to_pow2(num_bins_cache);
320
321 // No more buckets than TLB entries, power of 2.
322 num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
323 // Power of 2 and at least one element per bin, at most the TLB size.
324#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
325 // 2 TLB entries needed per bin.
326 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
327#endif
328 num_bins = round_up_to_pow2(num_bins);
329#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
330 }
331#endif
332
333 num_threads = std::min((bin_index)num_threads, (bin_index)num_bins);
334
335 if (num_threads <= 1)
336 return sequential_random_shuffle(begin, end, rng);
337
338 DRandomShufflingGlobalData<RandomAccessIterator> sd(begin);
339
340 DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads];
341
342 sd.temporaries = new value_type*[num_threads];
343 //sd.oracles = new bin_index[n];
344 sd.dist = new difference_type*[num_bins + 1];
345 sd.bin_proc = new thread_index_t[num_bins];
346 for (bin_index b = 0; b < num_bins + 1; b++)
347 sd.dist[b] = new difference_type[num_threads + 1];
348 for (bin_index b = 0; b < (num_bins + 1); b++)
349 {
350 sd.dist[0][0] = 0;
351 sd.dist[b][0] = 0;
352 }
353 difference_type* starts = sd.starts = new difference_type[num_threads + 1];
354 int bin_cursor = 0;
355 sd.num_bins = num_bins;
356 sd.num_bits = log2(num_bins);
357
358 difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0;
359 int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads;
360 for (int i = 0; i < num_threads; i++)
361 {
362 starts[i] = start;
363 start += (i < split) ? (chunk_length + 1) : chunk_length;
364 int j = pus[i].bins_begin = bin_cursor;
365
366 // Range of bins for this processor.
367 bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length;
368 pus[i].bins_end = bin_cursor;
369 for (; j < bin_cursor; j++)
370 sd.bin_proc[j] = i;
371 pus[i].num_threads = num_threads;
372 pus[i].iam = i;
373 pus[i].seed = rng(std::numeric_limits<uint32>::max());
374 pus[i].sd = &sd;
375 }
376 starts[num_threads] = start;
377
378 // Now shuffle in parallel.
379#pragma omp parallel num_threads(num_threads)
380 parallel_random_shuffle_drs_pu(pus);
381
382 delete[] starts;
383 delete[] sd.bin_proc;
384 for (int s = 0; s < (num_bins + 1); s++)
385 delete[] sd.dist[s];
386 delete[] sd.dist;
387 delete[] sd.temporaries;
388
389 delete[] pus;
390 }
391
392 /** @brief Sequential cache-efficient random shuffle.
393 * @param begin Begin iterator of sequence.
394 * @param end End iterator of sequence.
395 * @param rng Random number generator to use.
396 */
397 template<typename RandomAccessIterator, typename RandomNumberGenerator>
398 inline void
399 sequential_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator& rng)
400 {
401 typedef std::iterator_traits<RandomAccessIterator> traits_type;
402 typedef typename traits_type::value_type value_type;
403 typedef typename traits_type::difference_type difference_type;
404
405 difference_type n = end - begin;
406
407 bin_index num_bins, num_bins_cache;
408
409#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
410 // Try the L1 cache first, must fit into L1.
411 num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type))));
412 num_bins_cache = round_up_to_pow2(num_bins_cache);
413
414 // No more buckets than TLB entries, power of 2
415 // Power of 2 and at least one element per bin, at most the TLB size
416 num_bins = std::min(n, (difference_type)num_bins_cache);
417#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
418 // 2 TLB entries needed per bin
419 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
420#endif
421 num_bins = round_up_to_pow2(num_bins);
422
423 if (num_bins < num_bins_cache)
424 {
425#endif
426 // Now try the L2 cache, must fit into L2.
427 num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type)))));
428 num_bins_cache = round_up_to_pow2(num_bins_cache);
429
430 // No more buckets than TLB entries, power of 2
431 // Power of 2 and at least one element per bin, at most the TLB size.
432 num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache));
433
434#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
435 // 2 TLB entries needed per bin
436 num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins);
437#endif
438 num_bins = round_up_to_pow2(num_bins);
439#if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
440 }
441#endif
442
443 int num_bits = log2(num_bins);
444
445 if (num_bins > 1)
446 {
447 value_type* target = new value_type[n];
448 bin_index* oracles = new bin_index[n];
449 difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1];
450
451 for (int b = 0; b < num_bins + 1; b++)
452 dist0[b] = 0;
453
454 Timing<sequential_tag> t;
455 t.tic();
456
457 random_number bitrng(rng(0xFFFFFFFF));
458
459 for (difference_type i = 0; i < n; i++)
460 {
461 bin_index oracle = random_number_pow2(num_bits, bitrng);
462 oracles[i] = oracle;
463
464 // To allow prefix (partial) sum.
465 dist0[oracle + 1]++;
466 }
467
468 t.tic();
469
470 // Sum up bins.
471 partial_sum(dist0, dist0 + num_bins + 1, dist0);
472
473 for (int b = 0; b < num_bins + 1; b++)
474 dist1[b] = dist0[b];
475
476 t.tic();
477
478 // Distribute according to oracles.
479 for (difference_type i = 0; i < n; i++)
480 target[(dist0[oracles[i]])++] = *(begin + i);
481
482 for (int b = 0; b < num_bins; b++)
483 {
484 sequential_random_shuffle(target + dist1[b], target + dist1[b + 1],
485 rng);
486 t.tic();
487 }
488 t.print();
489
490 delete[] dist0;
491 delete[] dist1;
492 delete[] oracles;
493 delete[] target;
494 }
495 else
496 __gnu_sequential::random_shuffle(begin, end, rng);
497 }
498
499 /** @brief Parallel random public call.
500 * @param begin Begin iterator of sequence.
501 * @param end End iterator of sequence.
502 * @param rng Random number generator to use.
503 */
504 template<typename RandomAccessIterator, typename RandomNumberGenerator>
505 inline void
506 parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator rng = random_number())
507 {
508 typedef std::iterator_traits<RandomAccessIterator> traits_type;
509 typedef typename traits_type::difference_type difference_type;
510 difference_type n = end - begin;
511 parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ;
512 }
513
514}
515
516#endif