]>
Commit | Line | Data |
---|---|---|
c2ba9709 JS |
1 | // -*- C++ -*- |
2 | ||
531898c3 | 3 | // Copyright (C) 2007, 2008 Free Software Foundation, Inc. |
c2ba9709 JS |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the terms | |
7 | // of the GNU General Public License as published by the Free Software | |
8 | // Foundation; either version 2, or (at your option) any later | |
9 | // version. | |
10 | ||
11 | // This library is distributed in the hope that it will be useful, but | |
12 | // WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | // General Public License for more details. | |
15 | ||
16 | // You should have received a copy of the GNU General Public License | |
17 | // along with this library; see the file COPYING. If not, write to | |
18 | // the Free Software Foundation, 59 Temple Place - Suite 330, Boston, | |
19 | // MA 02111-1307, USA. | |
20 | ||
21 | // As a special exception, you may use this file as part of a free | |
22 | // software library without restriction. Specifically, if other files | |
23 | // instantiate templates or use macros or inline functions from this | |
24 | // file, or you compile this file and link it with other files to | |
25 | // produce an executable, this file does not by itself cause the | |
26 | // resulting executable to be covered by the GNU General Public | |
27 | // License. This exception does not however invalidate any other | |
28 | // reasons why the executable file might be covered by the GNU General | |
29 | // Public License. | |
30 | ||
31 | /** @file parallel/balanced_quicksort.h | |
32 | * @brief Implementation of a dynamically load-balanced parallel quicksort. | |
33 | * | |
34 | * It works in-place and needs only logarithmic extra memory. | |
1904bef1 JS |
35 | * The algorithm is similar to the one proposed in |
36 | * | |
37 | * P. Tsigas and Y. Zhang. | |
38 | * A simple, fast parallel implementation of quicksort and | |
39 | * its performance evaluation on SUN enterprise 10000. | |
40 | * In 11th Euromicro Conference on Parallel, Distributed and | |
41 | * Network-Based Processing, page 372, 2003. | |
42 | * | |
c2ba9709 JS |
43 | * This file is a GNU parallel extension to the Standard C++ Library. |
44 | */ | |
45 | ||
46 | // Written by Johannes Singler. | |
47 | ||
48 | #ifndef _GLIBCXX_PARALLEL_BAL_QUICKSORT_H | |
49 | #define _GLIBCXX_PARALLEL_BAL_QUICKSORT_H 1 | |
50 | ||
51 | #include <parallel/basic_iterator.h> | |
52 | #include <bits/stl_algo.h> | |
53 | ||
54 | #include <parallel/settings.h> | |
55 | #include <parallel/partition.h> | |
56 | #include <parallel/random_number.h> | |
57 | #include <parallel/queue.h> | |
58 | #include <functional> | |
59 | ||
60 | #if _GLIBCXX_ASSERTIONS | |
61 | #include <parallel/checkers.h> | |
62 | #endif | |
63 | ||
64 | namespace __gnu_parallel | |
65 | { | |
e683ee2a JS |
66 | /** @brief Information local to one thread in the parallel quicksort run. */ |
67 | template<typename RandomAccessIterator> | |
c2ba9709 JS |
68 | struct QSBThreadLocal |
69 | { | |
70 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
71 | typedef typename traits_type::difference_type difference_type; | |
72 | ||
73 | /** @brief Continuous part of the sequence, described by an | |
e683ee2a | 74 | iterator pair. */ |
c2ba9709 JS |
75 | typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; |
76 | ||
77 | /** @brief Initial piece to work on. */ | |
78 | Piece initial; | |
79 | ||
80 | /** @brief Work-stealing queue. */ | |
81 | RestrictedBoundedConcurrentQueue<Piece> leftover_parts; | |
82 | ||
83 | /** @brief Number of threads involved in this algorithm. */ | |
84 | thread_index_t num_threads; | |
85 | ||
86 | /** @brief Pointer to a counter of elements left over to sort. */ | |
87 | volatile difference_type* elements_leftover; | |
88 | ||
89 | /** @brief The complete sequence to sort. */ | |
90 | Piece global; | |
91 | ||
92 | /** @brief Constructor. | |
93 | * @param queue_size Size of the work-stealing queue. */ | |
94 | QSBThreadLocal(int queue_size) : leftover_parts(queue_size) { } | |
95 | }; | |
96 | ||
e683ee2a JS |
97 | /** @brief Balanced quicksort divide step. |
98 | * @param begin Begin iterator of subsequence. | |
99 | * @param end End iterator of subsequence. | |
100 | * @param comp Comparator. | |
101 | * @param num_threads Number of threads that are allowed to work on | |
102 | * this part. | |
103 | * @pre @c (end-begin)>=1 */ | |
104 | template<typename RandomAccessIterator, typename Comparator> | |
531898c3 | 105 | typename std::iterator_traits<RandomAccessIterator>::difference_type |
c2ba9709 | 106 | qsb_divide(RandomAccessIterator begin, RandomAccessIterator end, |
e683ee2a | 107 | Comparator comp, thread_index_t num_threads) |
c2ba9709 JS |
108 | { |
109 | _GLIBCXX_PARALLEL_ASSERT(num_threads > 0); | |
110 | ||
111 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
112 | typedef typename traits_type::value_type value_type; | |
113 | typedef typename traits_type::difference_type difference_type; | |
114 | ||
5817ff8e PC |
115 | RandomAccessIterator pivot_pos = |
116 | median_of_three_iterators(begin, begin + (end - begin) / 2, | |
117 | end - 1, comp); | |
c2ba9709 JS |
118 | |
119 | #if defined(_GLIBCXX_ASSERTIONS) | |
120 | // Must be in between somewhere. | |
121 | difference_type n = end - begin; | |
122 | ||
e683ee2a JS |
123 | _GLIBCXX_PARALLEL_ASSERT( |
124 | (!comp(*pivot_pos, *begin) && !comp(*(begin + n / 2), *pivot_pos)) | |
38a28aab | 125 | || (!comp(*pivot_pos, *begin) && !comp(*(end - 1), *pivot_pos)) |
e683ee2a | 126 | || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*begin, *pivot_pos)) |
38a28aab JS |
127 | || (!comp(*pivot_pos, *(begin + n / 2)) && !comp(*(end - 1), *pivot_pos)) |
128 | || (!comp(*pivot_pos, *(end - 1)) && !comp(*begin, *pivot_pos)) | |
129 | || (!comp(*pivot_pos, *(end - 1)) && !comp(*(begin + n / 2), *pivot_pos))); | |
c2ba9709 JS |
130 | #endif |
131 | ||
132 | // Swap pivot value to end. | |
133 | if (pivot_pos != (end - 1)) | |
134 | std::swap(*pivot_pos, *(end - 1)); | |
135 | pivot_pos = end - 1; | |
136 | ||
e683ee2a JS |
137 | __gnu_parallel::binder2nd<Comparator, value_type, value_type, bool> |
138 | pred(comp, *pivot_pos); | |
c2ba9709 JS |
139 | |
140 | // Divide, returning end - begin - 1 in the worst case. | |
e683ee2a JS |
141 | difference_type split_pos = parallel_partition( |
142 | begin, end - 1, pred, num_threads); | |
c2ba9709 JS |
143 | |
144 | // Swap back pivot to middle. | |
145 | std::swap(*(begin + split_pos), *pivot_pos); | |
146 | pivot_pos = begin + split_pos; | |
147 | ||
148 | #if _GLIBCXX_ASSERTIONS | |
149 | RandomAccessIterator r; | |
5817ff8e | 150 | for (r = begin; r != pivot_pos; ++r) |
c2ba9709 | 151 | _GLIBCXX_PARALLEL_ASSERT(comp(*r, *pivot_pos)); |
5817ff8e | 152 | for (; r != end; ++r) |
c2ba9709 JS |
153 | _GLIBCXX_PARALLEL_ASSERT(!comp(*r, *pivot_pos)); |
154 | #endif | |
155 | ||
156 | return split_pos; | |
157 | } | |
158 | ||
e683ee2a JS |
159 | /** @brief Quicksort conquer step. |
160 | * @param tls Array of thread-local storages. | |
161 | * @param begin Begin iterator of subsequence. | |
162 | * @param end End iterator of subsequence. | |
163 | * @param comp Comparator. | |
164 | * @param iam Number of the thread processing this function. | |
165 | * @param num_threads | |
166 | * Number of threads that are allowed to work on this part. */ | |
167 | template<typename RandomAccessIterator, typename Comparator> | |
531898c3 | 168 | void |
c2ba9709 | 169 | qsb_conquer(QSBThreadLocal<RandomAccessIterator>** tls, |
e683ee2a JS |
170 | RandomAccessIterator begin, RandomAccessIterator end, |
171 | Comparator comp, | |
172 | thread_index_t iam, thread_index_t num_threads, | |
173 | bool parent_wait) | |
c2ba9709 JS |
174 | { |
175 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
176 | typedef typename traits_type::value_type value_type; | |
177 | typedef typename traits_type::difference_type difference_type; | |
178 | ||
179 | difference_type n = end - begin; | |
180 | ||
e683ee2a | 181 | if (num_threads <= 1 || n <= 1) |
c2ba9709 | 182 | { |
e683ee2a JS |
183 | tls[iam]->initial.first = begin; |
184 | tls[iam]->initial.second = end; | |
c2ba9709 | 185 | |
e683ee2a | 186 | qsb_local_sort_with_helping(tls, comp, iam, parent_wait); |
c2ba9709 | 187 | |
e683ee2a | 188 | return; |
c2ba9709 JS |
189 | } |
190 | ||
191 | // Divide step. | |
192 | difference_type split_pos = qsb_divide(begin, end, comp, num_threads); | |
193 | ||
194 | #if _GLIBCXX_ASSERTIONS | |
195 | _GLIBCXX_PARALLEL_ASSERT(0 <= split_pos && split_pos < (end - begin)); | |
196 | #endif | |
197 | ||
e683ee2a JS |
198 | thread_index_t num_threads_leftside = |
199 | std::max<thread_index_t>(1, std::min<thread_index_t>( | |
200 | num_threads - 1, split_pos * num_threads / n)); | |
c2ba9709 | 201 | |
e683ee2a | 202 | # pragma omp atomic |
c2ba9709 JS |
203 | *tls[iam]->elements_leftover -= (difference_type)1; |
204 | ||
205 | // Conquer step. | |
e683ee2a | 206 | # pragma omp parallel num_threads(2) |
c2ba9709 | 207 | { |
e683ee2a JS |
208 | bool wait; |
209 | if(omp_get_num_threads() < 2) | |
210 | wait = false; | |
211 | else | |
212 | wait = parent_wait; | |
213 | ||
214 | # pragma omp sections | |
215 | { | |
216 | # pragma omp section | |
217 | { | |
218 | qsb_conquer(tls, begin, begin + split_pos, comp, | |
219 | iam, | |
220 | num_threads_leftside, | |
221 | wait); | |
222 | wait = parent_wait; | |
223 | } | |
224 | // The pivot_pos is left in place, to ensure termination. | |
225 | # pragma omp section | |
226 | { | |
227 | qsb_conquer(tls, begin + split_pos + 1, end, comp, | |
228 | iam + num_threads_leftside, | |
229 | num_threads - num_threads_leftside, | |
230 | wait); | |
231 | wait = parent_wait; | |
232 | } | |
233 | } | |
c2ba9709 JS |
234 | } |
235 | } | |
236 | ||
e683ee2a JS |
237 | /** |
238 | * @brief Quicksort step doing load-balanced local sort. | |
239 | * @param tls Array of thread-local storages. | |
240 | * @param comp Comparator. | |
241 | * @param iam Number of the thread processing this function. | |
242 | */ | |
243 | template<typename RandomAccessIterator, typename Comparator> | |
531898c3 | 244 | void |
c2ba9709 | 245 | qsb_local_sort_with_helping(QSBThreadLocal<RandomAccessIterator>** tls, |
e683ee2a | 246 | Comparator& comp, int iam, bool wait) |
c2ba9709 JS |
247 | { |
248 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
249 | typedef typename traits_type::value_type value_type; | |
250 | typedef typename traits_type::difference_type difference_type; | |
251 | typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; | |
252 | ||
253 | QSBThreadLocal<RandomAccessIterator>& tl = *tls[iam]; | |
254 | ||
d7066497 JS |
255 | difference_type base_case_n = |
256 | _Settings::get().sort_qsb_base_case_maximal_n; | |
c2ba9709 JS |
257 | if (base_case_n < 2) |
258 | base_case_n = 2; | |
259 | thread_index_t num_threads = tl.num_threads; | |
260 | ||
261 | // Every thread has its own random number generator. | |
262 | random_number rng(iam + 1); | |
263 | ||
264 | Piece current = tl.initial; | |
265 | ||
266 | difference_type elements_done = 0; | |
267 | #if _GLIBCXX_ASSERTIONS | |
268 | difference_type total_elements_done = 0; | |
269 | #endif | |
270 | ||
271 | for (;;) | |
272 | { | |
e683ee2a JS |
273 | // Invariant: current must be a valid (maybe empty) range. |
274 | RandomAccessIterator begin = current.first, end = current.second; | |
275 | difference_type n = end - begin; | |
c2ba9709 | 276 | |
e683ee2a JS |
277 | if (n > base_case_n) |
278 | { | |
279 | // Divide. | |
280 | RandomAccessIterator pivot_pos = begin + rng(n); | |
c2ba9709 | 281 | |
e683ee2a JS |
282 | // Swap pivot_pos value to end. |
283 | if (pivot_pos != (end - 1)) | |
284 | std::swap(*pivot_pos, *(end - 1)); | |
285 | pivot_pos = end - 1; | |
c2ba9709 | 286 | |
e683ee2a JS |
287 | __gnu_parallel::binder2nd |
288 | <Comparator, value_type, value_type, bool> | |
289 | pred(comp, *pivot_pos); | |
c2ba9709 | 290 | |
e683ee2a JS |
291 | // Divide, leave pivot unchanged in last place. |
292 | RandomAccessIterator split_pos1, split_pos2; | |
293 | split_pos1 = __gnu_sequential::partition(begin, end - 1, pred); | |
c2ba9709 | 294 | |
e683ee2a | 295 | // Left side: < pivot_pos; right side: >= pivot_pos. |
c2ba9709 | 296 | #if _GLIBCXX_ASSERTIONS |
e683ee2a | 297 | _GLIBCXX_PARALLEL_ASSERT(begin <= split_pos1 && split_pos1 < end); |
c2ba9709 | 298 | #endif |
e683ee2a JS |
299 | // Swap pivot back to middle. |
300 | if (split_pos1 != pivot_pos) | |
301 | std::swap(*split_pos1, *pivot_pos); | |
302 | pivot_pos = split_pos1; | |
303 | ||
304 | // In case all elements are equal, split_pos1 == 0. | |
305 | if ((split_pos1 + 1 - begin) < (n >> 7) | |
306 | || (end - split_pos1) < (n >> 7)) | |
307 | { | |
308 | // Very unequal split, one part smaller than one 128th | |
309 | // elements not strictly larger than the pivot. | |
310 | __gnu_parallel::unary_negate<__gnu_parallel::binder1st | |
a4797b34 PC |
311 | <Comparator, value_type, value_type, bool>, value_type> |
312 | pred(__gnu_parallel::binder1st | |
313 | <Comparator, value_type, value_type, bool>(comp, | |
314 | *pivot_pos)); | |
e683ee2a JS |
315 | |
316 | // Find other end of pivot-equal range. | |
5817ff8e PC |
317 | split_pos2 = __gnu_sequential::partition(split_pos1 + 1, |
318 | end, pred); | |
e683ee2a JS |
319 | } |
320 | else | |
321 | // Only skip the pivot. | |
322 | split_pos2 = split_pos1 + 1; | |
323 | ||
324 | // Elements equal to pivot are done. | |
325 | elements_done += (split_pos2 - split_pos1); | |
c2ba9709 | 326 | #if _GLIBCXX_ASSERTIONS |
e683ee2a | 327 | total_elements_done += (split_pos2 - split_pos1); |
c2ba9709 | 328 | #endif |
e683ee2a JS |
329 | // Always push larger part onto stack. |
330 | if (((split_pos1 + 1) - begin) < (end - (split_pos2))) | |
331 | { | |
332 | // Right side larger. | |
333 | if ((split_pos2) != end) | |
a4797b34 PC |
334 | tl.leftover_parts.push_front(std::make_pair(split_pos2, |
335 | end)); | |
e683ee2a JS |
336 | |
337 | //current.first = begin; //already set anyway | |
338 | current.second = split_pos1; | |
339 | continue; | |
340 | } | |
341 | else | |
342 | { | |
343 | // Left side larger. | |
344 | if (begin != split_pos1) | |
5817ff8e PC |
345 | tl.leftover_parts.push_front(std::make_pair(begin, |
346 | split_pos1)); | |
e683ee2a JS |
347 | |
348 | current.first = split_pos2; | |
349 | //current.second = end; //already set anyway | |
350 | continue; | |
351 | } | |
352 | } | |
353 | else | |
354 | { | |
355 | __gnu_sequential::sort(begin, end, comp); | |
356 | elements_done += n; | |
c2ba9709 | 357 | #if _GLIBCXX_ASSERTIONS |
e683ee2a | 358 | total_elements_done += n; |
c2ba9709 JS |
359 | #endif |
360 | ||
e683ee2a JS |
361 | // Prefer own stack, small pieces. |
362 | if (tl.leftover_parts.pop_front(current)) | |
363 | continue; | |
c2ba9709 | 364 | |
e683ee2a JS |
365 | # pragma omp atomic |
366 | *tl.elements_leftover -= elements_done; | |
367 | ||
368 | elements_done = 0; | |
c2ba9709 JS |
369 | |
370 | #if _GLIBCXX_ASSERTIONS | |
e683ee2a | 371 | double search_start = omp_get_wtime(); |
c2ba9709 JS |
372 | #endif |
373 | ||
e683ee2a JS |
374 | // Look for new work. |
375 | bool successfully_stolen = false; | |
376 | while (wait && *tl.elements_leftover > 0 && !successfully_stolen | |
c2ba9709 | 377 | #if _GLIBCXX_ASSERTIONS |
e683ee2a JS |
378 | // Possible dead-lock. |
379 | && (omp_get_wtime() < (search_start + 1.0)) | |
c2ba9709 | 380 | #endif |
e683ee2a JS |
381 | ) |
382 | { | |
383 | thread_index_t victim; | |
384 | victim = rng(num_threads); | |
385 | ||
386 | // Large pieces. | |
387 | successfully_stolen = (victim != iam) | |
388 | && tls[victim]->leftover_parts.pop_back(current); | |
389 | if (!successfully_stolen) | |
390 | yield(); | |
c2ba9709 | 391 | #if !defined(__ICC) && !defined(__ECC) |
e683ee2a | 392 | # pragma omp flush |
c2ba9709 | 393 | #endif |
e683ee2a | 394 | } |
c2ba9709 JS |
395 | |
396 | #if _GLIBCXX_ASSERTIONS | |
e683ee2a JS |
397 | if (omp_get_wtime() >= (search_start + 1.0)) |
398 | { | |
399 | sleep(1); | |
5817ff8e PC |
400 | _GLIBCXX_PARALLEL_ASSERT(omp_get_wtime() |
401 | < (search_start + 1.0)); | |
e683ee2a | 402 | } |
c2ba9709 | 403 | #endif |
e683ee2a JS |
404 | if (!successfully_stolen) |
405 | { | |
c2ba9709 | 406 | #if _GLIBCXX_ASSERTIONS |
e683ee2a | 407 | _GLIBCXX_PARALLEL_ASSERT(*tl.elements_leftover == 0); |
c2ba9709 | 408 | #endif |
e683ee2a JS |
409 | return; |
410 | } | |
411 | } | |
c2ba9709 JS |
412 | } |
413 | } | |
414 | ||
e683ee2a JS |
415 | /** @brief Top-level quicksort routine. |
416 | * @param begin Begin iterator of sequence. | |
417 | * @param end End iterator of sequence. | |
418 | * @param comp Comparator. | |
e683ee2a JS |
419 | * @param num_threads Number of threads that are allowed to work on |
420 | * this part. | |
421 | */ | |
422 | template<typename RandomAccessIterator, typename Comparator> | |
531898c3 | 423 | void |
c2ba9709 | 424 | parallel_sort_qsb(RandomAccessIterator begin, RandomAccessIterator end, |
e683ee2a | 425 | Comparator comp, |
e683ee2a | 426 | thread_index_t num_threads) |
c2ba9709 JS |
427 | { |
428 | _GLIBCXX_CALL(end - begin) | |
429 | ||
430 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
431 | typedef typename traits_type::value_type value_type; | |
432 | typedef typename traits_type::difference_type difference_type; | |
433 | typedef std::pair<RandomAccessIterator, RandomAccessIterator> Piece; | |
434 | ||
435 | typedef QSBThreadLocal<RandomAccessIterator> tls_type; | |
436 | ||
d7066497 JS |
437 | difference_type n = end - begin; |
438 | ||
c2ba9709 JS |
439 | if (n <= 1) |
440 | return; | |
441 | ||
442 | // At least one element per processor. | |
443 | if (num_threads > n) | |
444 | num_threads = static_cast<thread_index_t>(n); | |
445 | ||
e683ee2a | 446 | // Initialize thread local storage |
c2ba9709 | 447 | tls_type** tls = new tls_type*[num_threads]; |
e683ee2a JS |
448 | difference_type queue_size = num_threads * (thread_index_t)(log2(n) + 1); |
449 | for (thread_index_t t = 0; t < num_threads; ++t) | |
450 | tls[t] = new QSBThreadLocal<RandomAccessIterator>(queue_size); | |
c2ba9709 JS |
451 | |
452 | // There can never be more than ceil(log2(n)) ranges on the stack, because | |
453 | // 1. Only one processor pushes onto the stack | |
454 | // 2. The largest range has at most length n | |
455 | // 3. Each range is larger than half of the range remaining | |
456 | volatile difference_type elements_leftover = n; | |
5817ff8e | 457 | for (int i = 0; i < num_threads; ++i) |
c2ba9709 | 458 | { |
e683ee2a JS |
459 | tls[i]->elements_leftover = &elements_leftover; |
460 | tls[i]->num_threads = num_threads; | |
461 | tls[i]->global = std::make_pair(begin, end); | |
c2ba9709 | 462 | |
e683ee2a JS |
463 | // Just in case nothing is left to assign. |
464 | tls[i]->initial = std::make_pair(end, end); | |
c2ba9709 JS |
465 | } |
466 | ||
c2ba9709 | 467 | // Main recursion call. |
e683ee2a | 468 | qsb_conquer(tls, begin, begin + n, comp, 0, num_threads, true); |
c2ba9709 JS |
469 | |
470 | #if _GLIBCXX_ASSERTIONS | |
471 | // All stack must be empty. | |
472 | Piece dummy; | |
5817ff8e | 473 | for (int i = 1; i < num_threads; ++i) |
c2ba9709 JS |
474 | _GLIBCXX_PARALLEL_ASSERT(!tls[i]->leftover_parts.pop_back(dummy)); |
475 | #endif | |
476 | ||
5817ff8e | 477 | for (int i = 0; i < num_threads; ++i) |
c2ba9709 JS |
478 | delete tls[i]; |
479 | delete[] tls; | |
480 | } | |
481 | } // namespace __gnu_parallel | |
482 | ||
483 | #endif |