]>
Commit | Line | Data |
---|---|---|
c2ba9709 JS |
1 | // -*- C++ -*- |
2 | ||
3 | // Copyright (C) 2007 Free Software Foundation, Inc. | |
4 | // | |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the terms | |
7 | // of the GNU General Public License as published by the Free Software | |
8 | // Foundation; either version 2, or (at your option) any later | |
9 | // version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, but | |
12 | // WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | // General Public License for more details. | |
15 | ||
16 | // You should have received a copy of the GNU General Public License | |
17 | // along with this library; see the file COPYING. If not, write to | |
18 | // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, | |
19 | // MA 02111-1307, USA. | |
20 | ||
21 | // As a special exception, you may use this file as part of a free | |
22 | // software library without restriction. Specifically, if other files | |
23 | // instantiate templates or use macros or inline functions from this | |
24 | // file, or you compile this file and link it with other files to | |
25 | // produce an executable, this file does not by itself cause the | |
26 | // resulting executable to be covered by the GNU General Public | |
27 | // License. This exception does not however invalidate any other | |
28 | // reasons why the executable file might be covered by the GNU General | |
29 | // Public License. | |
30 | ||
31 | /** @file parallel/random_shuffle.h | |
32 | * @brief Parallel implementation of std::random_shuffle(). | |
33 | * This file is a GNU parallel extension to the Standard C++ Library. | |
34 | */ | |
35 | ||
36 | // Written by Johannes Singler. | |
37 | ||
38 | #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H | |
39 | #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1 | |
40 | ||
41 | #include <limits> | |
42 | ||
43 | #include <parallel/basic_iterator.h> | |
44 | #include <bits/stl_algo.h> | |
45 | ||
46 | #include <parallel/parallel.h> | |
47 | #include <parallel/base.h> | |
48 | #include <parallel/random_number.h> | |
49 | #include <parallel/timing.h> | |
50 | ||
51 | namespace __gnu_parallel | |
52 | { | |
53 | /** @brief Type to hold the index of a bin. | |
54 | * | |
55 | * Since many variables of this type are allocated, it should be | |
56 | * chosen as small as possible. | |
57 | */ | |
58 | typedef unsigned short bin_index; | |
59 | ||
60 | /** @brief Data known to every thread participating in | |
61 | __gnu_parallel::parallel_random_shuffle(). */ | |
62 | template<typename RandomAccessIterator> | |
63 | struct DRandomShufflingGlobalData | |
64 | { | |
65 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
66 | typedef typename traits_type::value_type value_type; | |
67 | typedef typename traits_type::difference_type difference_type; | |
68 | ||
69 | /** @brief Begin iterator of the source. */ | |
70 | RandomAccessIterator& source; | |
71 | ||
72 | /** @brief Temporary arrays for each thread. */ | |
73 | value_type** temporaries; | |
74 | ||
75 | /** @brief Two-dimensional array to hold the thread-bin distribution. | |
76 | * | |
77 | * Dimensions (num_threads + 1) x (num_bins + 1). */ | |
78 | difference_type** dist; | |
79 | ||
80 | /** @brief Start indexes of the threads' chunks. */ | |
81 | difference_type* starts; | |
82 | ||
83 | /** @brief Number of the thread that will further process the | |
84 | corresponding bin. */ | |
85 | thread_index_t* bin_proc; | |
86 | ||
87 | /** @brief Number of bins to distribute to. */ | |
88 | int num_bins; | |
89 | ||
90 | /** @brief Number of bits needed to address the bins. */ | |
91 | int num_bits; | |
92 | ||
93 | /** @brief Constructor. */ | |
94 | DRandomShufflingGlobalData(RandomAccessIterator& _source) | |
95 | : source(_source) { } | |
96 | }; | |
97 | ||
98 | /** @brief Local data for a thread participating in | |
99 | __gnu_parallel::parallel_random_shuffle(). | |
100 | */ | |
101 | template<typename RandomAccessIterator, typename RandomNumberGenerator> | |
102 | struct DRSSorterPU | |
103 | { | |
104 | /** @brief Number of threads participating in total. */ | |
105 | int num_threads; | |
106 | ||
107 | /** @brief Number of owning thread. */ | |
108 | int iam; | |
109 | ||
110 | /** @brief Begin index for bins taken care of by this thread. */ | |
111 | bin_index bins_begin; | |
112 | ||
113 | /** @brief End index for bins taken care of by this thread. */ | |
114 | bin_index bins_end; | |
115 | ||
116 | /** @brief Random seed for this thread. */ | |
117 | uint32 seed; | |
118 | ||
119 | /** @brief Pointer to global data. */ | |
120 | DRandomShufflingGlobalData<RandomAccessIterator>* sd; | |
121 | }; | |
122 | ||
123 | /** @brief Generate a random number in @c [0,2^logp). | |
124 | * @param logp Logarithm (basis 2) of the upper range bound. | |
125 | * @param rng Random number generator to use. | |
126 | */ | |
127 | template<typename RandomNumberGenerator> | |
128 | inline int random_number_pow2(int logp, RandomNumberGenerator& rng) | |
129 | { | |
130 | return rng.genrand_bits(logp); | |
131 | } | |
132 | ||
133 | /** @brief Random shuffle code executed by each thread. | |
134 | * @param pus Arary of thread-local data records. */ | |
135 | template<typename RandomAccessIterator, typename RandomNumberGenerator> | |
136 | inline void parallel_random_shuffle_drs_pu(DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* pus) | |
137 | { | |
138 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
139 | typedef typename traits_type::value_type value_type; | |
140 | typedef typename traits_type::difference_type difference_type; | |
141 | ||
142 | Timing<sequential_tag> t; | |
143 | t.tic(); | |
144 | ||
145 | DRSSorterPU<RandomAccessIterator, RandomNumberGenerator>* d = &pus[omp_get_thread_num()]; | |
146 | DRandomShufflingGlobalData<RandomAccessIterator>* sd = d->sd; | |
147 | thread_index_t iam = d->iam; | |
148 | ||
149 | // Indexing: dist[bin][processor] | |
150 | difference_type length = sd->starts[iam + 1] - sd->starts[iam]; | |
151 | bin_index* oracles = new bin_index[length]; | |
152 | difference_type* dist = new difference_type[sd->num_bins + 1]; | |
153 | bin_index* bin_proc = new bin_index[sd->num_bins]; | |
154 | value_type** temporaries = new value_type*[d->num_threads]; | |
155 | ||
156 | // Compute oracles and count appearances. | |
157 | for (bin_index b = 0; b < sd->num_bins + 1; b++) | |
158 | dist[b] = 0; | |
159 | int num_bits = sd->num_bits; | |
160 | ||
161 | random_number rng(d->seed); | |
162 | ||
163 | // First main loop. | |
164 | for (difference_type i = 0; i < length; i++) | |
165 | { | |
166 | bin_index oracle = random_number_pow2(num_bits, rng); | |
167 | oracles[i] = oracle; | |
168 | ||
169 | // To allow prefix (partial) sum. | |
170 | dist[oracle + 1]++; | |
171 | } | |
172 | ||
173 | for (bin_index b = 0; b < sd->num_bins + 1; b++) | |
174 | sd->dist[b][iam + 1] = dist[b]; | |
175 | ||
176 | t.tic(); | |
177 | ||
178 | #pragma omp barrier | |
179 | ||
180 | t.tic(); | |
181 | ||
182 | #pragma omp single | |
183 | { | |
184 | // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the | |
185 | // total number of items in bin s | |
186 | for (bin_index s = 0; s < sd->num_bins; s++) | |
187 | partial_sum(sd->dist[s + 1], sd->dist[s + 1] + d->num_threads + 1, sd->dist[s + 1]); | |
188 | } | |
189 | ||
190 | #pragma omp barrier | |
191 | ||
192 | t.tic(); | |
193 | ||
194 | sequence_index_t offset = 0, global_offset = 0; | |
195 | for (bin_index s = 0; s < d->bins_begin; s++) | |
196 | global_offset += sd->dist[s + 1][d->num_threads]; | |
197 | ||
198 | #pragma omp barrier | |
199 | ||
200 | for (bin_index s = d->bins_begin; s < d->bins_end; s++) | |
201 | { | |
202 | for (int t = 0; t < d->num_threads + 1; t++) | |
203 | sd->dist[s + 1][t] += offset; | |
204 | offset = sd->dist[s + 1][d->num_threads]; | |
205 | } | |
206 | ||
207 | sd->temporaries[iam] = new value_type[offset]; | |
208 | ||
209 | t.tic(); | |
210 | ||
211 | #pragma omp barrier | |
212 | ||
213 | t.tic(); | |
214 | ||
215 | // Draw local copies to avoid false sharing. | |
216 | for (bin_index b = 0; b < sd->num_bins + 1; b++) | |
217 | dist[b] = sd->dist[b][iam]; | |
218 | for (bin_index b = 0; b < sd->num_bins; b++) | |
219 | bin_proc[b] = sd->bin_proc[b]; | |
220 | for (thread_index_t t = 0; t < d->num_threads; t++) | |
221 | temporaries[t] = sd->temporaries[t]; | |
222 | ||
223 | RandomAccessIterator source = sd->source; | |
224 | difference_type start = sd->starts[iam]; | |
225 | ||
226 | // Distribute according to oracles, second main loop. | |
227 | for (difference_type i = 0; i < length; i++) | |
228 | { | |
229 | bin_index target_bin = oracles[i]; | |
230 | thread_index_t target_p = bin_proc[target_bin]; | |
231 | ||
232 | // Last column [d->num_threads] stays unchanged. | |
233 | temporaries[target_p][dist[target_bin + 1]++] = *(source + i + start); | |
234 | } | |
235 | ||
236 | delete[] oracles; | |
237 | delete[] dist; | |
238 | delete[] bin_proc; | |
239 | delete[] temporaries; | |
240 | ||
241 | t.tic(); | |
242 | ||
243 | #pragma omp barrier | |
244 | ||
245 | t.tic(); | |
246 | ||
247 | // Shuffle bins internally. | |
248 | for (bin_index b = d->bins_begin; b < d->bins_end; b++) | |
249 | { | |
250 | value_type* begin = sd->temporaries[iam] + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads]), | |
251 | * end = sd->temporaries[iam] + sd->dist[b + 1][d->num_threads]; | |
252 | sequential_random_shuffle(begin, end, rng); | |
253 | std::copy(begin, end, sd->source + global_offset + ((b == d->bins_begin) ? 0 : sd->dist[b][d->num_threads])); | |
254 | } | |
255 | ||
256 | delete[] sd->temporaries[iam]; | |
257 | ||
258 | t.tic(); | |
259 | ||
260 | t.print(); | |
261 | } | |
262 | ||
263 | /** @brief Round up to the next greater power of 2. | |
264 | * @param x Integer to round up */ | |
265 | template<typename T> | |
266 | T round_up_to_pow2(T x) | |
267 | { | |
268 | if (x <= 1) | |
269 | return 1; | |
270 | else | |
271 | return (T)1 << (log2(x - 1) + 1); | |
272 | } | |
273 | ||
274 | /** @brief Main parallel random shuffle step. | |
275 | * @param begin Begin iterator of sequence. | |
276 | * @param end End iterator of sequence. | |
277 | * @param n Length of sequence. | |
278 | * @param num_threads Number of threads to use. | |
279 | * @param rng Random number generator to use. | |
280 | */ | |
281 | template<typename RandomAccessIterator, typename RandomNumberGenerator> | |
282 | inline void | |
283 | parallel_random_shuffle_drs(RandomAccessIterator begin, RandomAccessIterator end, typename std::iterator_traits<RandomAccessIterator>::difference_type n, int num_threads, RandomNumberGenerator& rng) | |
284 | { | |
285 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
286 | typedef typename traits_type::value_type value_type; | |
287 | typedef typename traits_type::difference_type difference_type; | |
288 | ||
289 | _GLIBCXX_CALL(n) | |
290 | ||
291 | if (num_threads > n) | |
292 | num_threads = static_cast<thread_index_t>(n); | |
293 | ||
294 | bin_index num_bins, num_bins_cache; | |
295 | ||
296 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 | |
297 | // Try the L1 cache first. | |
298 | ||
299 | // Must fit into L1. | |
300 | num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type)))); | |
301 | num_bins_cache = round_up_to_pow2(num_bins_cache); | |
302 | ||
303 | // No more buckets than TLB entries, power of 2 | |
304 | // Power of 2 and at least one element per bin, at most the TLB size. | |
305 | num_bins = std::min(n, (difference_type)num_bins_cache); | |
306 | ||
307 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB | |
308 | // 2 TLB entries needed per bin. | |
309 | num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); | |
310 | #endif | |
311 | num_bins = round_up_to_pow2(num_bins); | |
312 | ||
313 | if (num_bins < num_bins_cache) | |
314 | { | |
315 | #endif | |
316 | // Now try the L2 cache | |
317 | // Must fit into L2 | |
318 | num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type))))); | |
319 | num_bins_cache = round_up_to_pow2(num_bins_cache); | |
320 | ||
321 | // No more buckets than TLB entries, power of 2. | |
322 | num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache)); | |
323 | // Power of 2 and at least one element per bin, at most the TLB size. | |
324 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB | |
325 | // 2 TLB entries needed per bin. | |
326 | num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); | |
327 | #endif | |
328 | num_bins = round_up_to_pow2(num_bins); | |
329 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 | |
330 | } | |
331 | #endif | |
332 | ||
333 | num_threads = std::min((bin_index)num_threads, (bin_index)num_bins); | |
334 | ||
335 | if (num_threads <= 1) | |
336 | return sequential_random_shuffle(begin, end, rng); | |
337 | ||
338 | DRandomShufflingGlobalData<RandomAccessIterator> sd(begin); | |
339 | ||
340 | DRSSorterPU<RandomAccessIterator, random_number >* pus = new DRSSorterPU<RandomAccessIterator, random_number >[num_threads]; | |
341 | ||
342 | sd.temporaries = new value_type*[num_threads]; | |
343 | //sd.oracles = new bin_index[n]; | |
344 | sd.dist = new difference_type*[num_bins + 1]; | |
345 | sd.bin_proc = new thread_index_t[num_bins]; | |
346 | for (bin_index b = 0; b < num_bins + 1; b++) | |
347 | sd.dist[b] = new difference_type[num_threads + 1]; | |
348 | for (bin_index b = 0; b < (num_bins + 1); b++) | |
349 | { | |
350 | sd.dist[0][0] = 0; | |
351 | sd.dist[b][0] = 0; | |
352 | } | |
353 | difference_type* starts = sd.starts = new difference_type[num_threads + 1]; | |
354 | int bin_cursor = 0; | |
355 | sd.num_bins = num_bins; | |
356 | sd.num_bits = log2(num_bins); | |
357 | ||
358 | difference_type chunk_length = n / num_threads, split = n % num_threads, start = 0; | |
359 | int bin_chunk_length = num_bins / num_threads, bin_split = num_bins % num_threads; | |
360 | for (int i = 0; i < num_threads; i++) | |
361 | { | |
362 | starts[i] = start; | |
363 | start += (i < split) ? (chunk_length + 1) : chunk_length; | |
364 | int j = pus[i].bins_begin = bin_cursor; | |
365 | ||
366 | // Range of bins for this processor. | |
367 | bin_cursor += (i < bin_split) ? (bin_chunk_length + 1) : bin_chunk_length; | |
368 | pus[i].bins_end = bin_cursor; | |
369 | for (; j < bin_cursor; j++) | |
370 | sd.bin_proc[j] = i; | |
371 | pus[i].num_threads = num_threads; | |
372 | pus[i].iam = i; | |
373 | pus[i].seed = rng(std::numeric_limits<uint32>::max()); | |
374 | pus[i].sd = &sd; | |
375 | } | |
376 | starts[num_threads] = start; | |
377 | ||
378 | // Now shuffle in parallel. | |
379 | #pragma omp parallel num_threads(num_threads) | |
380 | parallel_random_shuffle_drs_pu(pus); | |
381 | ||
382 | delete[] starts; | |
383 | delete[] sd.bin_proc; | |
384 | for (int s = 0; s < (num_bins + 1); s++) | |
385 | delete[] sd.dist[s]; | |
386 | delete[] sd.dist; | |
387 | delete[] sd.temporaries; | |
388 | ||
389 | delete[] pus; | |
390 | } | |
391 | ||
392 | /** @brief Sequential cache-efficient random shuffle. | |
393 | * @param begin Begin iterator of sequence. | |
394 | * @param end End iterator of sequence. | |
395 | * @param rng Random number generator to use. | |
396 | */ | |
397 | template<typename RandomAccessIterator, typename RandomNumberGenerator> | |
398 | inline void | |
399 | sequential_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator& rng) | |
400 | { | |
401 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
402 | typedef typename traits_type::value_type value_type; | |
403 | typedef typename traits_type::difference_type difference_type; | |
404 | ||
405 | difference_type n = end - begin; | |
406 | ||
407 | bin_index num_bins, num_bins_cache; | |
408 | ||
409 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 | |
410 | // Try the L1 cache first, must fit into L1. | |
411 | num_bins_cache = std::max((difference_type)1, (difference_type)(n / (Settings::L1_cache_size_lb / sizeof(value_type)))); | |
412 | num_bins_cache = round_up_to_pow2(num_bins_cache); | |
413 | ||
414 | // No more buckets than TLB entries, power of 2 | |
415 | // Power of 2 and at least one element per bin, at most the TLB size | |
416 | num_bins = std::min(n, (difference_type)num_bins_cache); | |
417 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB | |
418 | // 2 TLB entries needed per bin | |
419 | num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); | |
420 | #endif | |
421 | num_bins = round_up_to_pow2(num_bins); | |
422 | ||
423 | if (num_bins < num_bins_cache) | |
424 | { | |
425 | #endif | |
426 | // Now try the L2 cache, must fit into L2. | |
427 | num_bins_cache = static_cast<bin_index>(std::max((difference_type)1, (difference_type)(n / (Settings::L2_cache_size / sizeof(value_type))))); | |
428 | num_bins_cache = round_up_to_pow2(num_bins_cache); | |
429 | ||
430 | // No more buckets than TLB entries, power of 2 | |
431 | // Power of 2 and at least one element per bin, at most the TLB size. | |
432 | num_bins = static_cast<bin_index>(std::min(n, (difference_type)num_bins_cache)); | |
433 | ||
434 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB | |
435 | // 2 TLB entries needed per bin | |
436 | num_bins = std::min((difference_type)Settings::TLB_size / 2, num_bins); | |
437 | #endif | |
438 | num_bins = round_up_to_pow2(num_bins); | |
439 | #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1 | |
440 | } | |
441 | #endif | |
442 | ||
443 | int num_bits = log2(num_bins); | |
444 | ||
445 | if (num_bins > 1) | |
446 | { | |
447 | value_type* target = new value_type[n]; | |
448 | bin_index* oracles = new bin_index[n]; | |
449 | difference_type* dist0 = new difference_type[num_bins + 1], * dist1 = new difference_type[num_bins + 1]; | |
450 | ||
451 | for (int b = 0; b < num_bins + 1; b++) | |
452 | dist0[b] = 0; | |
453 | ||
454 | Timing<sequential_tag> t; | |
455 | t.tic(); | |
456 | ||
457 | random_number bitrng(rng(0xFFFFFFFF)); | |
458 | ||
459 | for (difference_type i = 0; i < n; i++) | |
460 | { | |
461 | bin_index oracle = random_number_pow2(num_bits, bitrng); | |
462 | oracles[i] = oracle; | |
463 | ||
464 | // To allow prefix (partial) sum. | |
465 | dist0[oracle + 1]++; | |
466 | } | |
467 | ||
468 | t.tic(); | |
469 | ||
470 | // Sum up bins. | |
471 | partial_sum(dist0, dist0 + num_bins + 1, dist0); | |
472 | ||
473 | for (int b = 0; b < num_bins + 1; b++) | |
474 | dist1[b] = dist0[b]; | |
475 | ||
476 | t.tic(); | |
477 | ||
478 | // Distribute according to oracles. | |
479 | for (difference_type i = 0; i < n; i++) | |
480 | target[(dist0[oracles[i]])++] = *(begin + i); | |
481 | ||
482 | for (int b = 0; b < num_bins; b++) | |
483 | { | |
484 | sequential_random_shuffle(target + dist1[b], target + dist1[b + 1], | |
485 | rng); | |
486 | t.tic(); | |
487 | } | |
488 | t.print(); | |
489 | ||
490 | delete[] dist0; | |
491 | delete[] dist1; | |
492 | delete[] oracles; | |
493 | delete[] target; | |
494 | } | |
495 | else | |
496 | __gnu_sequential::random_shuffle(begin, end, rng); | |
497 | } | |
498 | ||
499 | /** @brief Parallel random public call. | |
500 | * @param begin Begin iterator of sequence. | |
501 | * @param end End iterator of sequence. | |
502 | * @param rng Random number generator to use. | |
503 | */ | |
504 | template<typename RandomAccessIterator, typename RandomNumberGenerator> | |
505 | inline void | |
506 | parallel_random_shuffle(RandomAccessIterator begin, RandomAccessIterator end, RandomNumberGenerator rng = random_number()) | |
507 | { | |
508 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
509 | typedef typename traits_type::difference_type difference_type; | |
510 | difference_type n = end - begin; | |
511 | parallel_random_shuffle_drs(begin, end, n, get_max_threads(), rng) ; | |
512 | } | |
513 | ||
514 | } | |
515 | ||
516 | #endif |