]>
Commit | Line | Data |
---|---|---|
c2ba9709 JS |
1 | // -*- C++ -*- |
2 | ||
7adcbafe | 3 | // Copyright (C) 2007-2022 Free Software Foundation, Inc. |
c2ba9709 JS |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the terms | |
7 | // of the GNU General Public License as published by the Free Software | |
748086b7 | 8 | // Foundation; either version 3, or (at your option) any later |
c2ba9709 JS |
9 | // version. |
10 | ||
11 | // This library is distributed in the hope that it will be useful, but | |
12 | // WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | // General Public License for more details. | |
15 | ||
748086b7 JJ |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
c2ba9709 | 19 | |
748086b7 JJ |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
c2ba9709 JS |
24 | |
25 | /** @file parallel/workstealing.h | |
26 | * @brief Parallelization of embarrassingly parallel execution by | |
27 | * means of work-stealing. | |
1904bef1 JS |
28 | * |
29 | * Work stealing is described in | |
30 | * | |
31 | * R. D. Blumofe and C. E. Leiserson. | |
32 | * Scheduling multithreaded computations by work stealing. | |
3c732e6f | 33 | * Journal of the ACM, 46(5):720-748, 1999. |
1904bef1 | 34 | * |
c2ba9709 JS |
35 | * This file is a GNU parallel extension to the Standard C++ Library. |
36 | */ | |
37 | ||
38 | // Written by Felix Putze. | |
39 | ||
40 | #ifndef _GLIBCXX_PARALLEL_WORKSTEALING_H | |
41 | #define _GLIBCXX_PARALLEL_WORKSTEALING_H 1 | |
42 | ||
43 | #include <parallel/parallel.h> | |
44 | #include <parallel/random_number.h> | |
45 | #include <parallel/compatibility.h> | |
46 | ||
47 | namespace __gnu_parallel | |
48 | { | |
49 | ||
50 | #define _GLIBCXX_JOB_VOLATILE volatile | |
51 | ||
77d16198 PC |
52 | /** @brief One __job for a certain thread. */ |
53 | template<typename _DifferenceTp> | |
54 | struct _Job | |
55 | { | |
56 | typedef _DifferenceTp _DifferenceType; | |
57 | ||
58 | /** @brief First element. | |
59 | * | |
60 | * Changed by owning and stealing thread. By stealing thread, | |
61 | * always incremented. */ | |
62 | _GLIBCXX_JOB_VOLATILE _DifferenceType _M_first; | |
63 | ||
64 | /** @brief Last element. | |
65 | * | |
66 | * Changed by owning thread only. */ | |
67 | _GLIBCXX_JOB_VOLATILE _DifferenceType _M_last; | |
68 | ||
8e32aa11 | 69 | /** @brief Number of elements, i.e. @c _M_last-_M_first+1. |
77d16198 PC |
70 | * |
71 | * Changed by owning thread only. */ | |
72 | _GLIBCXX_JOB_VOLATILE _DifferenceType _M_load; | |
73 | }; | |
74 | ||
75 | /** @brief Work stealing algorithm for random access iterators. | |
76 | * | |
77 | * Uses O(1) additional memory. Synchronization at job lists is | |
78 | * done with atomic operations. | |
79 | * @param __begin Begin iterator of element sequence. | |
80 | * @param __end End iterator of element sequence. | |
81 | * @param __op User-supplied functor (comparator, predicate, adding | |
82 | * functor, ...). | |
2a60a9f6 | 83 | * @param __f Functor to @a process an element with __op (depends on |
77d16198 | 84 | * desired functionality, e. g. for std::for_each(), ...). |
2a60a9f6 | 85 | * @param __r Functor to @a add a single __result to the already |
77d16198 PC |
86 | * processed elements (depends on functionality). |
87 | * @param __base Base value for reduction. | |
88 | * @param __output Pointer to position where final result is written to | |
89 | * @param __bound Maximum number of elements processed (e. g. for | |
90 | * std::count_n()). | |
91 | * @return User-supplied functor (that may contain a part of the result). | |
92 | */ | |
93 | template<typename _RAIter, | |
94 | typename _Op, | |
95 | typename _Fu, | |
96 | typename _Red, | |
97 | typename _Result> | |
98 | _Op | |
99 | __for_each_template_random_access_workstealing(_RAIter __begin, | |
100 | _RAIter __end, _Op __op, | |
101 | _Fu& __f, _Red __r, | |
102 | _Result __base, | |
103 | _Result& __output, | |
104 | typename std::iterator_traits<_RAIter>::difference_type __bound) | |
105 | { | |
106 | _GLIBCXX_CALL(__end - __begin) | |
107 | ||
108 | typedef std::iterator_traits<_RAIter> _TraitsType; | |
109 | typedef typename _TraitsType::difference_type _DifferenceType; | |
110 | ||
111 | const _Settings& __s = _Settings::get(); | |
112 | ||
113 | _DifferenceType __chunk_size = | |
114 | static_cast<_DifferenceType>(__s.workstealing_chunk_size); | |
115 | ||
116 | // How many jobs? | |
117 | _DifferenceType __length = (__bound < 0) ? (__end - __begin) : __bound; | |
118 | ||
119 | // To avoid false sharing in a cache line. | |
120 | const int __stride = (__s.cache_line_size * 10 | |
121 | / sizeof(_Job<_DifferenceType>) + 1); | |
122 | ||
123 | // Total number of threads currently working. | |
124 | _ThreadIndex __busy = 0; | |
125 | ||
126 | _Job<_DifferenceType> *__job; | |
127 | ||
128 | omp_lock_t __output_lock; | |
129 | omp_init_lock(&__output_lock); | |
130 | ||
131 | // Write base value to output. | |
132 | __output = __base; | |
133 | ||
134 | // No more threads than jobs, at least one thread. | |
135 | _ThreadIndex __num_threads = __gnu_parallel::max<_ThreadIndex> | |
136 | (1, __gnu_parallel::min<_DifferenceType>(__length, | |
137 | __get_max_threads())); | |
138 | ||
139 | # pragma omp parallel shared(__busy) num_threads(__num_threads) | |
e683ee2a | 140 | { |
e683ee2a | 141 | # pragma omp single |
77d16198 PC |
142 | { |
143 | __num_threads = omp_get_num_threads(); | |
e683ee2a | 144 | |
77d16198 PC |
145 | // Create job description array. |
146 | __job = new _Job<_DifferenceType>[__num_threads * __stride]; | |
147 | } | |
e683ee2a | 148 | |
77d16198 | 149 | // Initialization phase. |
e683ee2a | 150 | |
77d16198 PC |
151 | // Flags for every thread if it is doing productive work. |
152 | bool __iam_working = false; | |
e683ee2a | 153 | |
77d16198 PC |
154 | // Thread id. |
155 | _ThreadIndex __iam = omp_get_thread_num(); | |
e683ee2a | 156 | |
77d16198 PC |
157 | // This job. |
158 | _Job<_DifferenceType>& __my_job = __job[__iam * __stride]; | |
e683ee2a | 159 | |
77d16198 PC |
160 | // Random number (for work stealing). |
161 | _ThreadIndex __victim; | |
e683ee2a | 162 | |
77d16198 PC |
163 | // Local value for reduction. |
164 | _Result __result = _Result(); | |
e683ee2a | 165 | |
77d16198 PC |
166 | // Number of elements to steal in one attempt. |
167 | _DifferenceType __steal; | |
e683ee2a | 168 | |
77d16198 PC |
169 | // Every thread has its own random number generator |
170 | // (modulo __num_threads). | |
171 | _RandomNumber __rand_gen(__iam, __num_threads); | |
e683ee2a | 172 | |
77d16198 | 173 | // This thread is currently working. |
e683ee2a | 174 | # pragma omp atomic |
77d16198 | 175 | ++__busy; |
e683ee2a | 176 | |
77d16198 | 177 | __iam_working = true; |
e683ee2a | 178 | |
77d16198 PC |
179 | // How many jobs per thread? last thread gets the rest. |
180 | __my_job._M_first = static_cast<_DifferenceType> | |
181 | (__iam * (__length / __num_threads)); | |
e683ee2a | 182 | |
77d16198 PC |
183 | __my_job._M_last = (__iam == (__num_threads - 1) |
184 | ? (__length - 1) | |
185 | : ((__iam + 1) * (__length / __num_threads) - 1)); | |
186 | __my_job._M_load = __my_job._M_last - __my_job._M_first + 1; | |
e683ee2a | 187 | |
77d16198 PC |
188 | // Init result with _M_first value (to have a base value for reduction) |
189 | if (__my_job._M_first <= __my_job._M_last) | |
190 | { | |
191 | // Cannot use volatile variable directly. | |
192 | _DifferenceType __my_first = __my_job._M_first; | |
193 | __result = __f(__op, __begin + __my_first); | |
194 | ++__my_job._M_first; | |
195 | --__my_job._M_load; | |
196 | } | |
e683ee2a | 197 | |
77d16198 | 198 | _RAIter __current; |
e683ee2a JS |
199 | |
200 | # pragma omp barrier | |
201 | ||
77d16198 PC |
202 | // Actual work phase |
203 | // Work on own or stolen current start | |
204 | while (__busy > 0) | |
205 | { | |
206 | // Work until no productive thread left. | |
1acba85b | 207 | # pragma omp flush(__busy) |
e683ee2a | 208 | |
77d16198 PC |
209 | // Thread has own work to do |
210 | while (__my_job._M_first <= __my_job._M_last) | |
211 | { | |
212 | // fetch-and-add call | |
213 | // Reserve current job block (size __chunk_size) in my queue. | |
214 | _DifferenceType __current_job = | |
215 | __fetch_and_add<_DifferenceType>(&(__my_job._M_first), | |
216 | __chunk_size); | |
217 | ||
218 | // Update _M_load, to make the three values consistent, | |
219 | // _M_first might have been changed in the meantime | |
220 | __my_job._M_load = __my_job._M_last - __my_job._M_first + 1; | |
221 | for (_DifferenceType __job_counter = 0; | |
222 | __job_counter < __chunk_size | |
223 | && __current_job <= __my_job._M_last; | |
224 | ++__job_counter) | |
225 | { | |
226 | // Yes: process it! | |
227 | __current = __begin + __current_job; | |
228 | ++__current_job; | |
229 | ||
230 | // Do actual work. | |
231 | __result = __r(__result, __f(__op, __current)); | |
232 | } | |
e683ee2a | 233 | |
1acba85b | 234 | # pragma omp flush(__busy) |
77d16198 | 235 | } |
e683ee2a | 236 | |
77d16198 PC |
237 | // After reaching this point, a thread's __job list is empty. |
238 | if (__iam_working) | |
239 | { | |
240 | // This thread no longer has work. | |
e683ee2a | 241 | # pragma omp atomic |
77d16198 | 242 | --__busy; |
e683ee2a | 243 | |
77d16198 PC |
244 | __iam_working = false; |
245 | } | |
e683ee2a | 246 | |
77d16198 PC |
247 | _DifferenceType __supposed_first, __supposed_last, |
248 | __supposed_load; | |
249 | do | |
250 | { | |
251 | // Find random nonempty deque (not own), do consistency check. | |
252 | __yield(); | |
1acba85b | 253 | # pragma omp flush(__busy) |
77d16198 PC |
254 | __victim = __rand_gen(); |
255 | __supposed_first = __job[__victim * __stride]._M_first; | |
256 | __supposed_last = __job[__victim * __stride]._M_last; | |
257 | __supposed_load = __job[__victim * __stride]._M_load; | |
258 | } | |
259 | while (__busy > 0 | |
260 | && ((__supposed_load <= 0) | |
261 | || ((__supposed_first + __supposed_load - 1) | |
262 | != __supposed_last))); | |
263 | ||
264 | if (__busy == 0) | |
265 | break; | |
266 | ||
267 | if (__supposed_load > 0) | |
268 | { | |
269 | // Has work and work to do. | |
270 | // Number of elements to steal (at least one). | |
271 | __steal = (__supposed_load < 2) ? 1 : __supposed_load / 2; | |
272 | ||
273 | // Push __victim's current start forward. | |
274 | _DifferenceType __stolen_first = | |
275 | __fetch_and_add<_DifferenceType> | |
276 | (&(__job[__victim * __stride]._M_first), __steal); | |
277 | _DifferenceType __stolen_try = (__stolen_first + __steal | |
278 | - _DifferenceType(1)); | |
279 | ||
280 | __my_job._M_first = __stolen_first; | |
281 | __my_job._M_last = __gnu_parallel::min(__stolen_try, | |
282 | __supposed_last); | |
283 | __my_job._M_load = __my_job._M_last - __my_job._M_first + 1; | |
284 | ||
285 | // Has potential work again. | |
e683ee2a | 286 | # pragma omp atomic |
77d16198 PC |
287 | ++__busy; |
288 | __iam_working = true; | |
e683ee2a | 289 | |
1acba85b | 290 | # pragma omp flush(__busy) |
77d16198 | 291 | } |
1acba85b | 292 | # pragma omp flush(__busy) |
77d16198 PC |
293 | } // end while __busy > 0 |
294 | // Add accumulated result to output. | |
295 | omp_set_lock(&__output_lock); | |
296 | __output = __r(__output, __result); | |
297 | omp_unset_lock(&__output_lock); | |
e683ee2a | 298 | } |
c2ba9709 | 299 | |
77d16198 | 300 | delete[] __job; |
c2ba9709 | 301 | |
77d16198 PC |
302 | // Points to last element processed (needed as return value for |
303 | // some algorithms like transform) | |
304 | __f._M_finish_iterator = __begin + __length; | |
c2ba9709 | 305 | |
77d16198 | 306 | omp_destroy_lock(&__output_lock); |
59b567fb | 307 | |
77d16198 PC |
308 | return __op; |
309 | } | |
c2ba9709 JS |
310 | } // end namespace |
311 | ||
cbcd1e45 | 312 | #endif /* _GLIBCXX_PARALLEL_WORKSTEALING_H */ |