3 // Copyright (C) 2007, 2008 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
31 /** @file parallel/random_shuffle.h
32 * @brief Parallel implementation of std::random_shuffle().
33 * This file is a GNU parallel extension to the Standard C++ Library.
36 // Written by Johannes Singler.
38 #ifndef _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H
39 #define _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H 1
42 #include <bits/stl_numeric.h>
43 #include <parallel/parallel.h>
44 #include <parallel/random_number.h>
46 namespace __gnu_parallel
48 /** @brief Type to hold the index of a bin.
50 * Since many variables of this type are allocated, it should be
51 * chosen as small as possible.
53 typedef unsigned short bin_index
;
55 /** @brief Data known to every thread participating in
56 __gnu_parallel::parallel_random_shuffle(). */
57 template<typename RandomAccessIterator
>
58 struct DRandomShufflingGlobalData
60 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
61 typedef typename
traits_type::value_type value_type
;
62 typedef typename
traits_type::difference_type difference_type
;
64 /** @brief Begin iterator of the source. */
65 RandomAccessIterator
& source
;
67 /** @brief Temporary arrays for each thread. */
68 value_type
** temporaries
;
70 /** @brief Two-dimensional array to hold the thread-bin distribution.
72 * Dimensions (num_threads + 1) x (num_bins + 1). */
73 difference_type
** dist
;
75 /** @brief Start indexes of the threads' chunks. */
76 difference_type
* starts
;
78 /** @brief Number of the thread that will further process the
80 thread_index_t
* bin_proc
;
82 /** @brief Number of bins to distribute to. */
85 /** @brief Number of bits needed to address the bins. */
88 /** @brief Constructor. */
89 DRandomShufflingGlobalData(RandomAccessIterator
& _source
)
93 /** @brief Local data for a thread participating in
94 __gnu_parallel::parallel_random_shuffle().
96 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
99 /** @brief Number of threads participating in total. */
102 /** @brief Begin index for bins taken care of by this thread. */
103 bin_index bins_begin
;
105 /** @brief End index for bins taken care of by this thread. */
108 /** @brief Random seed for this thread. */
111 /** @brief Pointer to global data. */
112 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
;
115 /** @brief Generate a random number in @c [0,2^logp).
116 * @param logp Logarithm (basis 2) of the upper range bound.
117 * @param rng Random number generator to use.
119 template<typename RandomNumberGenerator
>
121 random_number_pow2(int logp
, RandomNumberGenerator
& rng
)
122 { return rng
.genrand_bits(logp
); }
124 /** @brief Random shuffle code executed by each thread.
125 * @param pus Array of thread-local data records. */
126 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
128 parallel_random_shuffle_drs_pu(DRSSorterPU
<RandomAccessIterator
,
129 RandomNumberGenerator
>* pus
)
131 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
132 typedef typename
traits_type::value_type value_type
;
133 typedef typename
traits_type::difference_type difference_type
;
135 thread_index_t iam
= omp_get_thread_num();
136 DRSSorterPU
<RandomAccessIterator
, RandomNumberGenerator
>* d
= &pus
[iam
];
137 DRandomShufflingGlobalData
<RandomAccessIterator
>* sd
= d
->sd
;
139 // Indexing: dist[bin][processor]
140 difference_type length
= sd
->starts
[iam
+ 1] - sd
->starts
[iam
];
141 bin_index
* oracles
= new bin_index
[length
];
142 difference_type
* dist
= new difference_type
[sd
->num_bins
+ 1];
143 bin_index
* bin_proc
= new bin_index
[sd
->num_bins
];
144 value_type
** temporaries
= new value_type
*[d
->num_threads
];
146 // Compute oracles and count appearances.
147 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
149 int num_bits
= sd
->num_bits
;
151 random_number
rng(d
->seed
);
154 for (difference_type i
= 0; i
< length
; ++i
)
156 bin_index oracle
= random_number_pow2(num_bits
, rng
);
159 // To allow prefix (partial) sum.
160 ++(dist
[oracle
+ 1]);
163 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
164 sd
->dist
[b
][iam
+ 1] = dist
[b
];
170 // Sum up bins, sd->dist[s + 1][d->num_threads] now contains the
171 // total number of items in bin s
172 for (bin_index s
= 0; s
< sd
->num_bins
; ++s
)
173 __gnu_sequential::partial_sum(sd
->dist
[s
+ 1],
174 sd
->dist
[s
+ 1] + d
->num_threads
+ 1,
180 sequence_index_t offset
= 0, global_offset
= 0;
181 for (bin_index s
= 0; s
< d
->bins_begin
; ++s
)
182 global_offset
+= sd
->dist
[s
+ 1][d
->num_threads
];
186 for (bin_index s
= d
->bins_begin
; s
< d
->bins_end
; ++s
)
188 for (int t
= 0; t
< d
->num_threads
+ 1; ++t
)
189 sd
->dist
[s
+ 1][t
] += offset
;
190 offset
= sd
->dist
[s
+ 1][d
->num_threads
];
193 sd
->temporaries
[iam
] = static_cast<value_type
*>(
194 ::operator new(sizeof(value_type
) * offset
));
198 // Draw local copies to avoid false sharing.
199 for (bin_index b
= 0; b
< sd
->num_bins
+ 1; ++b
)
200 dist
[b
] = sd
->dist
[b
][iam
];
201 for (bin_index b
= 0; b
< sd
->num_bins
; ++b
)
202 bin_proc
[b
] = sd
->bin_proc
[b
];
203 for (thread_index_t t
= 0; t
< d
->num_threads
; ++t
)
204 temporaries
[t
] = sd
->temporaries
[t
];
206 RandomAccessIterator source
= sd
->source
;
207 difference_type start
= sd
->starts
[iam
];
209 // Distribute according to oracles, second main loop.
210 for (difference_type i
= 0; i
< length
; ++i
)
212 bin_index target_bin
= oracles
[i
];
213 thread_index_t target_p
= bin_proc
[target_bin
];
215 // Last column [d->num_threads] stays unchanged.
216 ::new(&(temporaries
[target_p
][dist
[target_bin
+ 1]++]))
217 value_type(*(source
+ i
+ start
));
223 delete[] temporaries
;
227 // Shuffle bins internally.
228 for (bin_index b
= d
->bins_begin
; b
< d
->bins_end
; ++b
)
231 sd
->temporaries
[iam
] +
232 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]),
234 sd
->temporaries
[iam
] + sd
->dist
[b
+ 1][d
->num_threads
];
235 sequential_random_shuffle(begin
, end
, rng
);
236 std::copy(begin
, end
, sd
->source
+ global_offset
+
237 ((b
== d
->bins_begin
) ? 0 : sd
->dist
[b
][d
->num_threads
]));
240 ::operator delete(sd
->temporaries
[iam
]);
243 /** @brief Round up to the next greater power of 2.
244 * @param x Integer to round up */
247 round_up_to_pow2(T x
)
252 return (T
)1 << (log2(x
- 1) + 1);
255 /** @brief Main parallel random shuffle step.
256 * @param begin Begin iterator of sequence.
257 * @param end End iterator of sequence.
258 * @param n Length of sequence.
259 * @param num_threads Number of threads to use.
260 * @param rng Random number generator to use.
262 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
264 parallel_random_shuffle_drs(RandomAccessIterator begin
,
265 RandomAccessIterator end
,
266 typename
std::iterator_traits
267 <RandomAccessIterator
>::difference_type n
,
268 thread_index_t num_threads
,
269 RandomNumberGenerator
& rng
)
271 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
272 typedef typename
traits_type::value_type value_type
;
273 typedef typename
traits_type::difference_type difference_type
;
277 const _Settings
& __s
= _Settings::get();
280 num_threads
= static_cast<thread_index_t
>(n
);
282 bin_index num_bins
, num_bins_cache
;
284 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
285 // Try the L1 cache first.
288 num_bins_cache
= std::max
<difference_type
>(
289 1, n
/ (__s
.L1_cache_size_lb
/ sizeof(value_type
)));
290 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
292 // No more buckets than TLB entries, power of 2
293 // Power of 2 and at least one element per bin, at most the TLB size.
294 num_bins
= std::min
<difference_type
>(n
, num_bins_cache
);
296 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
297 // 2 TLB entries needed per bin.
298 num_bins
= std::min
<difference_type
>(__s
.TLB_size
/ 2, num_bins
);
300 num_bins
= round_up_to_pow2(num_bins
);
302 if (num_bins
< num_bins_cache
)
305 // Now try the L2 cache
307 num_bins_cache
= static_cast<bin_index
>(std::max
<difference_type
>(
308 1, n
/ (__s
.L2_cache_size
/ sizeof(value_type
))));
309 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
311 // No more buckets than TLB entries, power of 2.
312 num_bins
= static_cast<bin_index
>(
313 std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
314 // Power of 2 and at least one element per bin, at most the TLB size.
315 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
316 // 2 TLB entries needed per bin.
318 static_cast<difference_type
>(__s
.TLB_size
/ 2), num_bins
);
320 num_bins
= round_up_to_pow2(num_bins
);
321 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
325 num_threads
= std::min
<bin_index
>(num_threads
, num_bins
);
327 if (num_threads
<= 1)
328 return sequential_random_shuffle(begin
, end
, rng
);
330 DRandomShufflingGlobalData
<RandomAccessIterator
> sd(begin
);
331 DRSSorterPU
<RandomAccessIterator
, random_number
>* pus
;
332 difference_type
* starts
;
334 # pragma omp parallel num_threads(num_threads)
336 thread_index_t num_threads
= omp_get_num_threads();
339 pus
= new DRSSorterPU
<RandomAccessIterator
, random_number
>
342 sd
.temporaries
= new value_type
*[num_threads
];
343 sd
.dist
= new difference_type
*[num_bins
+ 1];
344 sd
.bin_proc
= new thread_index_t
[num_bins
];
345 for (bin_index b
= 0; b
< num_bins
+ 1; ++b
)
346 sd
.dist
[b
] = new difference_type
[num_threads
+ 1];
347 for (bin_index b
= 0; b
< (num_bins
+ 1); ++b
)
352 starts
= sd
.starts
= new difference_type
[num_threads
+ 1];
354 sd
.num_bins
= num_bins
;
355 sd
.num_bits
= log2(num_bins
);
357 difference_type chunk_length
= n
/ num_threads
,
358 split
= n
% num_threads
, start
= 0;
359 difference_type bin_chunk_length
= num_bins
/ num_threads
,
360 bin_split
= num_bins
% num_threads
;
361 for (thread_index_t i
= 0; i
< num_threads
; ++i
)
364 start
+= (i
< split
) ? (chunk_length
+ 1) : chunk_length
;
365 int j
= pus
[i
].bins_begin
= bin_cursor
;
367 // Range of bins for this processor.
368 bin_cursor
+= (i
< bin_split
) ?
369 (bin_chunk_length
+ 1) : bin_chunk_length
;
370 pus
[i
].bins_end
= bin_cursor
;
371 for (; j
< bin_cursor
; ++j
)
373 pus
[i
].num_threads
= num_threads
;
374 pus
[i
].seed
= rng(std::numeric_limits
<uint32
>::max());
377 starts
[num_threads
] = start
;
379 // Now shuffle in parallel.
380 parallel_random_shuffle_drs_pu(pus
);
384 delete[] sd
.bin_proc
;
385 for (int s
= 0; s
< (num_bins
+ 1); ++s
)
388 delete[] sd
.temporaries
;
393 /** @brief Sequential cache-efficient random shuffle.
394 * @param begin Begin iterator of sequence.
395 * @param end End iterator of sequence.
396 * @param rng Random number generator to use.
398 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
400 sequential_random_shuffle(RandomAccessIterator begin
,
401 RandomAccessIterator end
,
402 RandomNumberGenerator
& rng
)
404 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
405 typedef typename
traits_type::value_type value_type
;
406 typedef typename
traits_type::difference_type difference_type
;
408 difference_type n
= end
- begin
;
409 const _Settings
& __s
= _Settings::get();
411 bin_index num_bins
, num_bins_cache
;
413 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
414 // Try the L1 cache first, must fit into L1.
416 std::max
<difference_type
>
417 (1, n
/ (__s
.L1_cache_size_lb
/ sizeof(value_type
)));
418 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
420 // No more buckets than TLB entries, power of 2
421 // Power of 2 and at least one element per bin, at most the TLB size
422 num_bins
= std::min(n
, (difference_type
)num_bins_cache
);
423 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
424 // 2 TLB entries needed per bin
425 num_bins
= std::min((difference_type
)__s
.TLB_size
/ 2, num_bins
);
427 num_bins
= round_up_to_pow2(num_bins
);
429 if (num_bins
< num_bins_cache
)
432 // Now try the L2 cache, must fit into L2.
434 static_cast<bin_index
>(std::max
<difference_type
>(
435 1, n
/ (__s
.L2_cache_size
/ sizeof(value_type
))));
436 num_bins_cache
= round_up_to_pow2(num_bins_cache
);
438 // No more buckets than TLB entries, power of 2
439 // Power of 2 and at least one element per bin, at most the TLB size.
440 num_bins
= static_cast<bin_index
>
441 (std::min(n
, static_cast<difference_type
>(num_bins_cache
)));
443 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_TLB
444 // 2 TLB entries needed per bin
446 std::min
<difference_type
>(__s
.TLB_size
/ 2, num_bins
);
448 num_bins
= round_up_to_pow2(num_bins
);
449 #if _GLIBCXX_RANDOM_SHUFFLE_CONSIDER_L1
453 int num_bits
= log2(num_bins
);
457 value_type
* target
= static_cast<value_type
*>(
458 ::operator new(sizeof(value_type
) * n
));
459 bin_index
* oracles
= new bin_index
[n
];
460 difference_type
* dist0
= new difference_type
[num_bins
+ 1],
461 * dist1
= new difference_type
[num_bins
+ 1];
463 for (int b
= 0; b
< num_bins
+ 1; ++b
)
466 random_number
bitrng(rng(0xFFFFFFFF));
468 for (difference_type i
= 0; i
< n
; ++i
)
470 bin_index oracle
= random_number_pow2(num_bits
, bitrng
);
473 // To allow prefix (partial) sum.
474 ++(dist0
[oracle
+ 1]);
478 __gnu_sequential::partial_sum(dist0
, dist0
+ num_bins
+ 1, dist0
);
480 for (int b
= 0; b
< num_bins
+ 1; ++b
)
483 // Distribute according to oracles.
484 for (difference_type i
= 0; i
< n
; ++i
)
485 ::new(&(target
[(dist0
[oracles
[i
]])++])) value_type(*(begin
+ i
));
487 for (int b
= 0; b
< num_bins
; ++b
)
489 sequential_random_shuffle(target
+ dist1
[b
],
490 target
+ dist1
[b
+ 1],
494 // Copy elements back.
495 std::copy(target
, target
+ n
, begin
);
500 ::operator delete(target
);
503 __gnu_sequential::random_shuffle(begin
, end
, rng
);
506 /** @brief Parallel random public call.
507 * @param begin Begin iterator of sequence.
508 * @param end End iterator of sequence.
509 * @param rng Random number generator to use.
511 template<typename RandomAccessIterator
, typename RandomNumberGenerator
>
513 parallel_random_shuffle(RandomAccessIterator begin
,
514 RandomAccessIterator end
,
515 RandomNumberGenerator rng
= random_number())
517 typedef std::iterator_traits
<RandomAccessIterator
> traits_type
;
518 typedef typename
traits_type::difference_type difference_type
;
519 difference_type n
= end
- begin
;
520 parallel_random_shuffle_drs(begin
, end
, n
, get_max_threads(), rng
) ;
525 #endif /* _GLIBCXX_PARALLEL_RANDOM_SHUFFLE_H */