3 // Copyright (C) 2007 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
36 // Written by Johannes Singler.
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
42 #include <bits/stl_numeric.h>
43 #include <parallel/parallel.h>
44 #include <parallel/random_number.h>
46 namespace __gnu_parallel
48 /** @brief Type to hold the index of a bin.
50 * Since many variables of this type are allocated, it should be
51 * chosen as small as possible.
53 typedef unsigned short bin_index
;
55 /** @brief Data known to every thread participating in
56 __gnu_parallel::parallel_random_shuffle(). */
57 template<typename RandomAccessIterator
>
58 struct DRandomShufflingGlobalData
60 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
61 typedef typename
traits_type::value_type value_type
;
62 typedef typename
traits_type::difference_type difference_type
;
64 /** @brief Begin iterator of the source. */
65 RandomAccessIterator
& source
;
67 /** @brief Temporary arrays for each thread. */
68 value_type
** temporaries
;
70 /** @brief Two-dimensional array to hold the thread-bin distribution.
72 * Dimensions (num_threads + 1) x (num_bins + 1). */
73 difference_type
** dist
;
75 /** @brief Start indexes of the threads' chunks. */
76 difference_type
* starts
;
78 /** @brief Number of the thread that will further process the
80 thread_index_t
* bin_proc
;
82 /** @brief Number of bins to distribute to. */
85 /** @brief Number of bits needed to address the bins. */
88 /** @brief Constructor. */
89 DRandomShufflingGlobalData(RandomAccessIterator
& _source
)
93 /** @brief Local data for a thread participating in
94 __gnu_parallel::parallel_random_shuffle().
96 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
99 /** @brief Number of threads participating in total. */
102 /** @brief Begin index for bins taken care of by this thread. */
103 bin_index bins_begin
;
105 /** @brief End index for bins taken care of by this thread. */
108 /** @brief Random seed for this thread. */
111 /** @brief Pointer to global data. */
112 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
;
115 /** @brief Generate a random number in @c [0,2^logp).
116 * @param logp Logarithm (basis 2) of the upper range bound.
117 * @param rng Random number generator to use.
119 template<typename RandomNumberGenerator
>
121 random_number_pow2(int logp
, RandomNumberGenerator
& rng
)
122 { return rng
.genrand_bits(logp
); }
124 /** @brief Random shuffle code executed by each thread.
125 * @param pus Array of thread-local data records. */
126 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
128 parallel_random_shuffle_drs_pu(DRSSorterPU
<RandomAccessIterator
,
129 RandomNumberGenerator
>* pus
)
131 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
132 typedef typename
traits_type::value_type value_type
;
133 typedef typename
traits_type::difference_type difference_type
;
135 thread_index_t iam
= omp_get_thread_num();
136 DRSSorterPU
<RandomAccessIterator
, RandomNumberGenerator
>* d
= &pus
[iam
];
137 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
= d
->sd
;
139 // Indexing: dist[bin][processor]
140 difference_type length
= sd
->starts
[iam
+ 1] - sd
->starts
[iam
];
141 bin_index
* oracles
= new bin_index
[length
];
142 difference_type
* dist
= new difference_type
[sd
->num_bins
+ 1];
143 bin_index
* bin_proc
= new bin_index
[sd
->num_bins
];
144 value_type
** temporaries
= new value_type
*[d
->num_threads
];
146 // Compute oracles and count appearances.
147 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
149 int num_bits
= sd
->num_bits
;
151 random_number
rng(d
->seed
);
154 for (difference_type i
= 0; i
< length
; i
++)
156 bin_index oracle
= random_number_pow2(num_bits
, rng
);
159 // To allow prefix (partial) sum.
163 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
164 sd
->dist
[b
][iam
+ 1] = dist
[b
];
170 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
171 // total number of items in bin s
172 for (bin_index s
= 0; s
< sd
->num_bins
; s
++)
173 __gnu_sequential::partial_sum(sd
->dist
[s
+ 1],
174 sd
->dist
[s
+ 1] + d
->num_threads
+ 1,
180 sequence_index_t offset
= 0, global_offset
= 0;
181 for (bin_index s
= 0; s
< d
->bins_begin
; s
++)
182 global_offset
+= sd
->dist
[s
+ 1][d
->num_threads
];
186 for (bin_index s
= d
->bins_begin
; s
< d
->bins_end
; s
++)
188 for (int t
= 0; t
< d
->num_threads
+ 1; t
++)
189 sd
->dist
[s
+ 1][t
] += offset
;
190 offset
= sd
->dist
[s
+ 1][d
->num_threads
];
193 sd
->temporaries
[iam
] = static_cast<value_type
*>(
194 ::operator new(sizeof(value_type
) * offset
));
198 // Draw local copies to avoid false sharing.
199 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; b
++)
200 dist
[b
] = sd
->dist
[b
][iam
];
201 for (bin_index b
= 0; b
< sd
->num_bins
; b
++)
202 bin_proc
[b
] = sd
->bin_proc
[b
];
203 for (thread_index_t t
= 0; t
< d
->num_threads
; t
++)
204 temporaries
[t
] = sd
->temporaries
[t
];
206 RandomAccessIterator source
= sd
->source
;
207 difference_type start
= sd
->starts
[iam
];
209 // Distribute according to oracles, second main loop.
210 for (difference_type i
= 0; i
< length
; i
++)
212 bin_index target_bin
= oracles
[i
];
213 thread_index_t target_p
= bin_proc
[target_bin
];
215 // Last column [d->num_threads] stays unchanged.
216 temporaries
[target_p
][dist
[target_bin
+ 1]++] = *(source
+ i
+ start
);
222 delete[] temporaries
;
226 // Shuffle bins internally.
227 for (bin_index b
= d
->bins_begin
; b
< d
->bins_end
; b
++)
230 sd
->temporaries
[iam
] +
231 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]),
233 sd
->temporaries
[iam
] + sd
->dist
[b
+ 1][d
->num_threads
];
234 sequential_random_shuffle(begin
, end
, rng
);
235 std::copy(begin
, end
, sd
->source
+ global_offset
+
236 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]));
239 delete[] sd
->temporaries
[iam
];
242 /** @brief Round up to the next greater power of 2.
243 * @param x Integer to round up */
246 round_up_to_pow2(T x
)
251 return (T
)1 << (log2(x
- 1) + 1);
254 /** @brief Main parallel random shuffle step.
255 * @param begin Begin iterator of sequence.
256 * @param end End iterator of sequence.
257 * @param n Length of sequence.
258 * @param num_threads Number of threads to use.
259 * @param rng Random number generator to use.
261 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
263 parallel_random_shuffle_drs(
264 RandomAccessIterator begin
,
265 RandomAccessIterator end
,
266 typename
std::iterator_traits
<RandomAccessIterator
>::difference_type n
,
267 thread_index_t num_threads
,
268 RandomNumberGenerator
& rng
)
270 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
271 typedef typename
traits_type::value_type value_type
;
272 typedef typename
traits_type::difference_type difference_type
;
277 num_threads
= static_cast<thread_index_t
>(n
);
279 bin_index num_bins
, num_bins_cache
;
281 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
282 // Try the L1 cache first.
285 num_bins_cache
= std::max
<difference_type
>(
286 1, n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
)));
287 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
289 // No more buckets than TLB entries, power of 2
290 // Power of 2 and at least one element per bin, at most the TLB size.
291 num_bins
= std::min
<difference_type
>(n
, num_bins_cache
);
293 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
294 // 2 TLB entries needed per bin.
295 num_bins
= std::min
<difference_type
>(Settings::TLB_size
/ 2, num_bins
);
297 num_bins
= round_up_to_pow2(num_bins
);
299 if (num_bins
< num_bins_cache
)
302 // Now try the L2 cache
304 num_bins_cache
= static_cast<bin_index
>(std::max
<difference_type
>(
305 1, n
/ (Settings::L2_cache_size
/ sizeof(value_type
))));
306 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
308 // No more buckets than TLB entries, power of 2.
309 num_bins
= static_cast<bin_index
>(
310 std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
311 // Power of 2 and at least one element per bin, at most the TLB size.
312 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
313 // 2 TLB entries needed per bin.
315 static_cast<difference_type
>(Settings::TLB_size
/ 2), num_bins
);
317 num_bins
= round_up_to_pow2(num_bins
);
318 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
322 num_threads
= std::min
<bin_index
>(num_threads
, num_bins
);
324 if (num_threads
<= 1)
325 return sequential_random_shuffle(begin
, end
, rng
);
327 DRandomShufflingGlobalData
<RandomAccessIterator
> sd(begin
);
328 DRSSorterPU
<RandomAccessIterator
, random_number
>* pus
;
329 difference_type
* starts
;
331 # pragma omp parallel num_threads(num_threads)
335 pus
= new DRSSorterPU
<RandomAccessIterator
, random_number
>
338 sd
.temporaries
= new value_type
*[num_threads
];
339 sd
.dist
= new difference_type
*[num_bins
+ 1];
340 sd
.bin_proc
= new thread_index_t
[num_bins
];
341 for (bin_index b
= 0; b
< num_bins
+ 1; b
++)
342 sd
.dist
[b
] = new difference_type
[num_threads
+ 1];
343 for (bin_index b
= 0; b
< (num_bins
+ 1); b
++)
348 starts
= sd
.starts
= new difference_type
[num_threads
+ 1];
350 sd
.num_bins
= num_bins
;
351 sd
.num_bits
= log2(num_bins
);
353 difference_type chunk_length
= n
/ num_threads
,
354 split
= n
% num_threads
, start
= 0;
355 difference_type bin_chunk_length
= num_bins
/ num_threads
,
356 bin_split
= num_bins
% num_threads
;
357 for (thread_index_t i
= 0; i
< num_threads
; i
++)
360 start
+= (i
< split
) ? (chunk_length
+ 1) : chunk_length
;
361 int j
= pus
[i
].bins_begin
= bin_cursor
;
363 // Range of bins for this processor.
364 bin_cursor
+= (i
< bin_split
) ?
365 (bin_chunk_length
+ 1) : bin_chunk_length
;
366 pus
[i
].bins_end
= bin_cursor
;
367 for (; j
< bin_cursor
; j
++)
369 pus
[i
].num_threads
= num_threads
;
370 pus
[i
].seed
= rng(std::numeric_limits
<uint32
>::max());
373 starts
[num_threads
] = start
;
375 // Now shuffle in parallel.
376 parallel_random_shuffle_drs_pu(pus
);
380 delete[] sd
.bin_proc
;
381 for (int s
= 0; s
< (num_bins
+ 1); s
++)
384 delete[] sd
.temporaries
;
389 /** @brief Sequential cache-efficient random shuffle.
390 * @param begin Begin iterator of sequence.
391 * @param end End iterator of sequence.
392 * @param rng Random number generator to use.
394 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
396 sequential_random_shuffle(RandomAccessIterator begin
,
397 RandomAccessIterator end
,
398 RandomNumberGenerator
& rng
)
400 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
401 typedef typename
traits_type::value_type value_type
;
402 typedef typename
traits_type::difference_type difference_type
;
404 difference_type n
= end
- begin
;
406 bin_index num_bins
, num_bins_cache
;
408 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
409 // Try the L1 cache first, must fit into L1.
411 std::max
<difference_type
>
412 (1, n
/ (Settings::L1_cache_size_lb
/ sizeof(value_type
)));
413 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
415 // No more buckets than TLB entries, power of 2
416 // Power of 2 and at least one element per bin, at most the TLB size
417 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
418 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
419 // 2 TLB entries needed per bin
420 num_bins
= std::min((difference_type
)Settings::TLB_size
/ 2, num_bins
);
422 num_bins
= round_up_to_pow2(num_bins
);
424 if (num_bins
< num_bins_cache
)
427 // Now try the L2 cache, must fit into L2.
429 static_cast<bin_index
>(std::max
<difference_type
>(
430 1, n
/ (Settings::L2_cache_size
/ sizeof(value_type
))));
431 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
433 // No more buckets than TLB entries, power of 2
434 // Power of 2 and at least one element per bin, at most the TLB size.
435 num_bins
= static_cast<bin_index
>
436 (std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
438 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
439 // 2 TLB entries needed per bin
441 std::min
<difference_type
>(Settings::TLB_size
/ 2, num_bins
);
443 num_bins
= round_up_to_pow2(num_bins
);
444 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
448 int num_bits
= log2(num_bins
);
452 value_type
* target
= static_cast<value_type
*>(
453 ::operator new(sizeof(value_type
) * n
));
454 bin_index
* oracles
= new bin_index
[n
];
455 difference_type
* dist0
= new difference_type
[num_bins
+ 1],
456 * dist1
= new difference_type
[num_bins
+ 1];
458 for (int b
= 0; b
< num_bins
+ 1; b
++)
461 random_number
bitrng(rng(0xFFFFFFFF));
463 for (difference_type i
= 0; i
< n
; i
++)
465 bin_index oracle
= random_number_pow2(num_bits
, bitrng
);
468 // To allow prefix (partial) sum.
473 __gnu_sequential::partial_sum(dist0
, dist0
+ num_bins
+ 1, dist0
);
475 for (int b
= 0; b
< num_bins
+ 1; b
++)
478 // Distribute according to oracles.
479 for (difference_type i
= 0; i
< n
; i
++)
480 target
[(dist0
[oracles
[i
]])++] = *(begin
+ i
);
482 for (int b
= 0; b
< num_bins
; b
++)
484 sequential_random_shuffle(target
+ dist1
[b
],
485 target
+ dist1
[b
+ 1],
495 __gnu_sequential::random_shuffle(begin
, end
, rng
);
498 /** @brief Parallel random public call.
499 * @param begin Begin iterator of sequence.
500 * @param end End iterator of sequence.
501 * @param rng Random number generator to use.
503 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
505 parallel_random_shuffle(RandomAccessIterator begin
,
506 RandomAccessIterator end
,
507 RandomNumberGenerator rng
= random_number())
509 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
510 typedef typename
traits_type::difference_type difference_type
;
511 difference_type n
= end
- begin
;
512 parallel_random_shuffle_drs(begin
, end
, n
, get_max_threads(), rng
) ;