3 // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 3, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
25 /** @file parallel/random_shuffle.h
26 * @brief Parallel implementation of std::random_shuffle().
27 * This file is a GNU parallel extension to the Standard C++ Library.
30 // Written by Johannes Singler.
32 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
33 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
36 #include <bits/stl_numeric.h>
37 #include <parallel/parallel.h>
38 #include <parallel/random_number.h>
40 namespace __gnu_parallel
42 /** @brief Type to hold the index of a bin.
44 * Since many variables of this type are allocated, it should be
45 * chosen as small as possible.
47 typedef unsigned short bin_index
;
49 /** @brief Data known to every thread participating in
50 __gnu_parallel::parallel_random_shuffle(). */
51 template<typename RandomAccessIterator
>
52 struct DRandomShufflingGlobalData
54 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
55 typedef typename
traits_type::value_type value_type
;
56 typedef typename
traits_type::difference_type difference_type
;
58 /** @brief Begin iterator of the source. */
59 RandomAccessIterator
& source
;
61 /** @brief Temporary arrays for each thread. */
62 value_type
** temporaries
;
64 /** @brief Two-dimensional array to hold the thread-bin distribution.
66 * Dimensions (num_threads + 1) x (num_bins + 1). */
67 difference_type
** dist
;
69 /** @brief Start indexes of the threads' chunks. */
70 difference_type
* starts
;
72 /** @brief Number of the thread that will further process the
74 thread_index_t
* bin_proc
;
76 /** @brief Number of bins to distribute to. */
79 /** @brief Number of bits needed to address the bins. */
82 /** @brief Constructor. */
83 DRandomShufflingGlobalData(RandomAccessIterator
& _source
)
87 /** @brief Local data for a thread participating in
88 __gnu_parallel::parallel_random_shuffle().
90 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
93 /** @brief Number of threads participating in total. */
96 /** @brief Begin index for bins taken care of by this thread. */
99 /** @brief End index for bins taken care of by this thread. */
102 /** @brief Random seed for this thread. */
105 /** @brief Pointer to global data. */
106 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
;
109 /** @brief Generate a random number in @c [0,2^logp).
110 * @param logp Logarithm (basis 2) of the upper range bound.
111 * @param rng Random number generator to use.
113 template<typename RandomNumberGenerator
>
115 random_number_pow2(int logp
, RandomNumberGenerator
& rng
)
116 { return rng
.genrand_bits(logp
); }
118 /** @brief Random shuffle code executed by each thread.
119 * @param pus Array of thread-local data records. */
120 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
122 parallel_random_shuffle_drs_pu(DRSSorterPU
<RandomAccessIterator
,
123 RandomNumberGenerator
>* pus
)
125 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
126 typedef typename
traits_type::value_type value_type
;
127 typedef typename
traits_type::difference_type difference_type
;
129 thread_index_t iam
= omp_get_thread_num();
130 DRSSorterPU
<RandomAccessIterator
, RandomNumberGenerator
>* d
= &pus
[iam
];
131 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
= d
->sd
;
133 // Indexing: dist[bin][processor]
134 difference_type length
= sd
->starts
[iam
+ 1] - sd
->starts
[iam
];
135 bin_index
* oracles
= new bin_index
[length
];
136 difference_type
* dist
= new difference_type
[sd
->num_bins
+ 1];
137 bin_index
* bin_proc
= new bin_index
[sd
->num_bins
];
138 value_type
** temporaries
= new value_type
*[d
->num_threads
];
140 // Compute oracles and count appearances.
141 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
143 int num_bits
= sd
->num_bits
;
145 random_number
rng(d
->seed
);
148 for (difference_type i
= 0; i
< length
; ++i
)
150 bin_index oracle
= random_number_pow2(num_bits
, rng
);
153 // To allow prefix (partial) sum.
154 ++(dist
[oracle
+ 1]);
157 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
158 sd
->dist
[b
][iam
+ 1] = dist
[b
];
164 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
165 // total number of items in bin s
166 for (bin_index s
= 0; s
< sd
->num_bins
; ++s
)
167 __gnu_sequential::partial_sum(sd
->dist
[s
+ 1],
168 sd
->dist
[s
+ 1] + d
->num_threads
+ 1,
174 sequence_index_t offset
= 0, global_offset
= 0;
175 for (bin_index s
= 0; s
< d
->bins_begin
; ++s
)
176 global_offset
+= sd
->dist
[s
+ 1][d
->num_threads
];
180 for (bin_index s
= d
->bins_begin
; s
< d
->bins_end
; ++s
)
182 for (int t
= 0; t
< d
->num_threads
+ 1; ++t
)
183 sd
->dist
[s
+ 1][t
] += offset
;
184 offset
= sd
->dist
[s
+ 1][d
->num_threads
];
187 sd
->temporaries
[iam
] = static_cast<value_type
*>(
188 ::operator new(sizeof(value_type
) * offset
));
192 // Draw local copies to avoid false sharing.
193 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
194 dist
[b
] = sd
->dist
[b
][iam
];
195 for (bin_index b
= 0; b
< sd
->num_bins
; ++b
)
196 bin_proc
[b
] = sd
->bin_proc
[b
];
197 for (thread_index_t t
= 0; t
< d
->num_threads
; ++t
)
198 temporaries
[t
] = sd
->temporaries
[t
];
200 RandomAccessIterator source
= sd
->source
;
201 difference_type start
= sd
->starts
[iam
];
203 // Distribute according to oracles, second main loop.
204 for (difference_type i
= 0; i
< length
; ++i
)
206 bin_index target_bin
= oracles
[i
];
207 thread_index_t target_p
= bin_proc
[target_bin
];
209 // Last column [d->num_threads] stays unchanged.
210 ::new(&(temporaries
[target_p
][dist
[target_bin
+ 1]++]))
211 value_type(*(source
+ i
+ start
));
217 delete[] temporaries
;
221 // Shuffle bins internally.
222 for (bin_index b
= d
->bins_begin
; b
< d
->bins_end
; ++b
)
225 sd
->temporaries
[iam
] +
226 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]),
228 sd
->temporaries
[iam
] + sd
->dist
[b
+ 1][d
->num_threads
];
229 sequential_random_shuffle(begin
, end
, rng
);
230 std::copy(begin
, end
, sd
->source
+ global_offset
+
231 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]));
234 ::operator delete(sd
->temporaries
[iam
]);
237 /** @brief Round up to the next greater power of 2.
238 * @param x Integer to round up */
241 round_up_to_pow2(T x
)
246 return (T
)1 << (__log2(x
- 1) + 1);
249 /** @brief Main parallel random shuffle step.
250 * @param begin Begin iterator of sequence.
251 * @param end End iterator of sequence.
252 * @param n Length of sequence.
253 * @param num_threads Number of threads to use.
254 * @param rng Random number generator to use.
256 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
258 parallel_random_shuffle_drs(RandomAccessIterator begin
,
259 RandomAccessIterator end
,
260 typename
std::iterator_traits
261 <RandomAccessIterator
>::difference_type n
,
262 thread_index_t num_threads
,
263 RandomNumberGenerator
& rng
)
265 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
266 typedef typename
traits_type::value_type value_type
;
267 typedef typename
traits_type::difference_type difference_type
;
271 const _Settings
& __s
= _Settings::get();
274 num_threads
= static_cast<thread_index_t
>(n
);
276 bin_index num_bins
, num_bins_cache
;
278 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
279 // Try the L1 cache first.
282 num_bins_cache
= std::max
<difference_type
>(
283 1, n
/ (__s
.L1_cache_size_lb
/ sizeof(value_type
)));
284 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
286 // No more buckets than TLB entries, power of 2
287 // Power of 2 and at least one element per bin, at most the TLB size.
288 num_bins
= std::min
<difference_type
>(n
, num_bins_cache
);
290 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
291 // 2 TLB entries needed per bin.
292 num_bins
= std::min
<difference_type
>(__s
.TLB_size
/ 2, num_bins
);
294 num_bins
= round_up_to_pow2(num_bins
);
296 if (num_bins
< num_bins_cache
)
299 // Now try the L2 cache
301 num_bins_cache
= static_cast<bin_index
>(std::max
<difference_type
>(
302 1, n
/ (__s
.L2_cache_size
/ sizeof(value_type
))));
303 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
305 // No more buckets than TLB entries, power of 2.
306 num_bins
= static_cast<bin_index
>(
307 std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
308 // Power of 2 and at least one element per bin, at most the TLB size.
309 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
310 // 2 TLB entries needed per bin.
312 static_cast<difference_type
>(__s
.TLB_size
/ 2), num_bins
);
314 num_bins
= round_up_to_pow2(num_bins
);
315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
319 num_threads
= std::min
<bin_index
>(num_threads
, num_bins
);
321 if (num_threads
<= 1)
322 return sequential_random_shuffle(begin
, end
, rng
);
324 DRandomShufflingGlobalData
<RandomAccessIterator
> sd(begin
);
325 DRSSorterPU
<RandomAccessIterator
, random_number
>* pus
;
326 difference_type
* starts
;
328 # pragma omp parallel num_threads(num_threads)
330 thread_index_t num_threads
= omp_get_num_threads();
333 pus
= new DRSSorterPU
<RandomAccessIterator
, random_number
>
336 sd
.temporaries
= new value_type
*[num_threads
];
337 sd
.dist
= new difference_type
*[num_bins
+ 1];
338 sd
.bin_proc
= new thread_index_t
[num_bins
];
339 for (bin_index b
= 0; b
< num_bins
+ 1; ++b
)
340 sd
.dist
[b
] = new difference_type
[num_threads
+ 1];
341 for (bin_index b
= 0; b
< (num_bins
+ 1); ++b
)
346 starts
= sd
.starts
= new difference_type
[num_threads
+ 1];
348 sd
.num_bins
= num_bins
;
349 sd
.num_bits
= __log2(num_bins
);
351 difference_type chunk_length
= n
/ num_threads
,
352 split
= n
% num_threads
, start
= 0;
353 difference_type bin_chunk_length
= num_bins
/ num_threads
,
354 bin_split
= num_bins
% num_threads
;
355 for (thread_index_t i
= 0; i
< num_threads
; ++i
)
358 start
+= (i
< split
) ? (chunk_length
+ 1) : chunk_length
;
359 int j
= pus
[i
].bins_begin
= bin_cursor
;
361 // Range of bins for this processor.
362 bin_cursor
+= (i
< bin_split
) ?
363 (bin_chunk_length
+ 1) : bin_chunk_length
;
364 pus
[i
].bins_end
= bin_cursor
;
365 for (; j
< bin_cursor
; ++j
)
367 pus
[i
].num_threads
= num_threads
;
368 pus
[i
].seed
= rng(std::numeric_limits
<uint32
>::max());
371 starts
[num_threads
] = start
;
373 // Now shuffle in parallel.
374 parallel_random_shuffle_drs_pu(pus
);
378 delete[] sd
.bin_proc
;
379 for (int s
= 0; s
< (num_bins
+ 1); ++s
)
382 delete[] sd
.temporaries
;
387 /** @brief Sequential cache-efficient random shuffle.
388 * @param begin Begin iterator of sequence.
389 * @param end End iterator of sequence.
390 * @param rng Random number generator to use.
392 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
394 sequential_random_shuffle(RandomAccessIterator begin
,
395 RandomAccessIterator end
,
396 RandomNumberGenerator
& rng
)
398 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
399 typedef typename
traits_type::value_type value_type
;
400 typedef typename
traits_type::difference_type difference_type
;
402 difference_type n
= end
- begin
;
403 const _Settings
& __s
= _Settings::get();
405 bin_index num_bins
, num_bins_cache
;
407 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
408 // Try the L1 cache first, must fit into L1.
410 std::max
<difference_type
>
411 (1, n
/ (__s
.L1_cache_size_lb
/ sizeof(value_type
)));
412 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
414 // No more buckets than TLB entries, power of 2
415 // Power of 2 and at least one element per bin, at most the TLB size
416 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
417 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
418 // 2 TLB entries needed per bin
419 num_bins
= std::min((difference_type
)__s
.TLB_size
/ 2, num_bins
);
421 num_bins
= round_up_to_pow2(num_bins
);
423 if (num_bins
< num_bins_cache
)
426 // Now try the L2 cache, must fit into L2.
428 static_cast<bin_index
>(std::max
<difference_type
>(
429 1, n
/ (__s
.L2_cache_size
/ sizeof(value_type
))));
430 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
432 // No more buckets than TLB entries, power of 2
433 // Power of 2 and at least one element per bin, at most the TLB size.
434 num_bins
= static_cast<bin_index
>
435 (std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
437 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
438 // 2 TLB entries needed per bin
440 std::min
<difference_type
>(__s
.TLB_size
/ 2, num_bins
);
442 num_bins
= round_up_to_pow2(num_bins
);
443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
447 int num_bits
= __log2(num_bins
);
451 value_type
* target
= static_cast<value_type
*>(
452 ::operator new(sizeof(value_type
) * n
));
453 bin_index
* oracles
= new bin_index
[n
];
454 difference_type
* dist0
= new difference_type
[num_bins
+ 1],
455 * dist1
= new difference_type
[num_bins
+ 1];
457 for (int b
= 0; b
< num_bins
+ 1; ++b
)
460 random_number
bitrng(rng(0xFFFFFFFF));
462 for (difference_type i
= 0; i
< n
; ++i
)
464 bin_index oracle
= random_number_pow2(num_bits
, bitrng
);
467 // To allow prefix (partial) sum.
468 ++(dist0
[oracle
+ 1]);
472 __gnu_sequential::partial_sum(dist0
, dist0
+ num_bins
+ 1, dist0
);
474 for (int b
= 0; b
< num_bins
+ 1; ++b
)
477 // Distribute according to oracles.
478 for (difference_type i
= 0; i
< n
; ++i
)
479 ::new(&(target
[(dist0
[oracles
[i
]])++])) value_type(*(begin
+ i
));
481 for (int b
= 0; b
< num_bins
; ++b
)
483 sequential_random_shuffle(target
+ dist1
[b
],
484 target
+ dist1
[b
+ 1],
488 // Copy elements back.
489 std::copy(target
, target
+ n
, begin
);
494 ::operator delete(target
);
497 __gnu_sequential::random_shuffle(begin
, end
, rng
);
500 /** @brief Parallel random public call.
501 * @param begin Begin iterator of sequence.
502 * @param end End iterator of sequence.
503 * @param rng Random number generator to use.
505 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
507 parallel_random_shuffle(RandomAccessIterator begin
,
508 RandomAccessIterator end
,
509 RandomNumberGenerator rng
= random_number())
511 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
512 typedef typename
traits_type::difference_type difference_type
;
513 difference_type n
= end
- begin
;
514 parallel_random_shuffle_drs(begin
, end
, n
, get_max_threads(), rng
) ;
519 #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */