]>
Commit | Line | Data |
---|---|---|
c2ba9709 JS |
1 | // -*- C++ -*- |
2 | ||
748086b7 | 3 | // Copyright (C) 2007, 2008, 2009 Free Software Foundation, Inc. |
c2ba9709 JS |
4 | // |
5 | // This file is part of the GNU ISO C++ Library. This library is free | |
6 | // software; you can redistribute it and/or modify it under the terms | |
7 | // of the GNU General Public License as published by the Free Software | |
748086b7 | 8 | // Foundation; either version 3, or (at your option) any later |
c2ba9709 JS |
9 | // version. |
10 | ||
11 | // This library is distributed in the hope that it will be useful, but | |
12 | // WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | // General Public License for more details. | |
15 | ||
748086b7 JJ |
16 | // Under Section 7 of GPL version 3, you are granted additional |
17 | // permissions described in the GCC Runtime Library Exception, version | |
18 | // 3.1, as published by the Free Software Foundation. | |
c2ba9709 | 19 | |
748086b7 JJ |
20 | // You should have received a copy of the GNU General Public License and |
21 | // a copy of the GCC Runtime Library Exception along with this program; | |
22 | // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see | |
23 | // <http://www.gnu.org/licenses/>. | |
c2ba9709 JS |
24 | |
25 | /** @file parallel/workstealing.h | |
26 | * @brief Parallelization of embarrassingly parallel execution by | |
27 | * means of work-stealing. | |
1904bef1 JS |
28 | * |
29 | * Work stealing is described in | |
30 | * | |
31 | * R. D. Blumofe and C. E. Leiserson. | |
32 | * Scheduling multithreaded computations by work stealing. | |
33 | * Journal of the ACM, 46(5):720–748, 1999. | |
34 | * | |
c2ba9709 JS |
35 | * This file is a GNU parallel extension to the Standard C++ Library. |
36 | */ | |
37 | ||
38 | // Written by Felix Putze. | |
39 | ||
40 | #ifndef _GLIBCXX_PARALLEL_WORKSTEALING_H | |
41 | #define _GLIBCXX_PARALLEL_WORKSTEALING_H 1 | |
42 | ||
43 | #include <parallel/parallel.h> | |
44 | #include <parallel/random_number.h> | |
45 | #include <parallel/compatibility.h> | |
46 | ||
47 | namespace __gnu_parallel | |
48 | { | |
49 | ||
50 | #define _GLIBCXX_JOB_VOLATILE volatile | |
51 | ||
e683ee2a JS |
52 | /** @brief One job for a certain thread. */ |
53 | template<typename _DifferenceTp> | |
c2ba9709 JS |
54 | struct Job |
55 | { | |
56 | typedef _DifferenceTp difference_type; | |
57 | ||
58 | /** @brief First element. | |
59 | * | |
60 | * Changed by owning and stealing thread. By stealing thread, | |
61 | * always incremented. */ | |
62 | _GLIBCXX_JOB_VOLATILE difference_type first; | |
63 | ||
64 | /** @brief Last element. | |
65 | * | |
66 | * Changed by owning thread only. */ | |
67 | _GLIBCXX_JOB_VOLATILE difference_type last; | |
68 | ||
69 | /** @brief Number of elements, i. e. @c last-first+1. | |
70 | * | |
71 | * Changed by owning thread only. */ | |
72 | _GLIBCXX_JOB_VOLATILE difference_type load; | |
73 | }; | |
74 | ||
e683ee2a JS |
75 | /** @brief Work stealing algorithm for random access iterators. |
76 | * | |
77 | * Uses O(1) additional memory. Synchronization at job lists is | |
78 | * done with atomic operations. | |
79 | * @param begin Begin iterator of element sequence. | |
80 | * @param end End iterator of element sequence. | |
81 | * @param op User-supplied functor (comparator, predicate, adding | |
82 | * functor, ...). | |
83 | * @param f Functor to "process" an element with op (depends on | |
84 | * desired functionality, e. g. for std::for_each(), ...). | |
85 | * @param r Functor to "add" a single result to the already | |
86 | * processed elements (depends on functionality). | |
87 | * @param base Base value for reduction. | |
88 | * @param output Pointer to position where final result is written to | |
89 | * @param bound Maximum number of elements processed (e. g. for | |
90 | * std::count_n()). | |
91 | * @return User-supplied functor (that may contain a part of the result). | |
92 | */ | |
5817ff8e PC |
93 | template<typename RandomAccessIterator, |
94 | typename Op, | |
95 | typename Fu, | |
96 | typename Red, | |
97 | typename Result> | |
c2ba9709 | 98 | Op |
5817ff8e PC |
99 | for_each_template_random_access_workstealing(RandomAccessIterator begin, |
100 | RandomAccessIterator end, | |
101 | Op op, Fu& f, Red r, | |
102 | Result base, Result& output, | |
103 | typename std::iterator_traits | |
104 | <RandomAccessIterator>:: | |
105 | difference_type bound) | |
c2ba9709 JS |
106 | { |
107 | _GLIBCXX_CALL(end - begin) | |
108 | ||
109 | typedef std::iterator_traits<RandomAccessIterator> traits_type; | |
110 | typedef typename traits_type::difference_type difference_type; | |
ee1b5fc5 BK |
111 | |
112 | const _Settings& __s = _Settings::get(); | |
c2ba9709 | 113 | |
ee1b5fc5 | 114 | difference_type chunk_size = static_cast<difference_type>(__s.workstealing_chunk_size); |
c2ba9709 JS |
115 | |
116 | // How many jobs? | |
117 | difference_type length = (bound < 0) ? (end - begin) : bound; | |
118 | ||
119 | // To avoid false sharing in a cache line. | |
ee1b5fc5 | 120 | const int stride = __s.cache_line_size * 10 / sizeof(Job<difference_type>) + 1; |
c2ba9709 JS |
121 | |
122 | // Total number of threads currently working. | |
123 | thread_index_t busy = 0; | |
e683ee2a JS |
124 | |
125 | Job<difference_type> *job; | |
c2ba9709 | 126 | |
59b567fb JS |
127 | omp_lock_t output_lock; |
128 | omp_init_lock(&output_lock); | |
129 | ||
c2ba9709 JS |
130 | // Write base value to output. |
131 | output = base; | |
132 | ||
e683ee2a JS |
133 | // No more threads than jobs, at least one thread. |
134 | thread_index_t num_threads = | |
135 | __gnu_parallel::max<thread_index_t>(1, | |
136 | __gnu_parallel::min<difference_type>(length, get_max_threads())); | |
137 | ||
138 | # pragma omp parallel shared(busy) num_threads(num_threads) | |
139 | { | |
140 | ||
141 | # pragma omp single | |
142 | { | |
143 | num_threads = omp_get_num_threads(); | |
144 | ||
145 | // Create job description array. | |
146 | job = new Job<difference_type>[num_threads * stride]; | |
147 | } | |
148 | ||
149 | // Initialization phase. | |
150 | ||
151 | // Flags for every thread if it is doing productive work. | |
152 | bool iam_working = false; | |
153 | ||
154 | // Thread id. | |
155 | thread_index_t iam = omp_get_thread_num(); | |
156 | ||
157 | // This job. | |
158 | Job<difference_type>& my_job = job[iam * stride]; | |
159 | ||
160 | // Random number (for work stealing). | |
161 | thread_index_t victim; | |
162 | ||
163 | // Local value for reduction. | |
164 | Result result = Result(); | |
165 | ||
166 | // Number of elements to steal in one attempt. | |
167 | difference_type steal; | |
168 | ||
169 | // Every thread has its own random number generator | |
170 | // (modulo num_threads). | |
171 | random_number rand_gen(iam, num_threads); | |
172 | ||
173 | // This thread is currently working. | |
174 | # pragma omp atomic | |
5817ff8e | 175 | ++busy; |
e683ee2a JS |
176 | |
177 | iam_working = true; | |
178 | ||
179 | // How many jobs per thread? last thread gets the rest. | |
180 | my_job.first = | |
181 | static_cast<difference_type>(iam * (length / num_threads)); | |
182 | ||
183 | my_job.last = (iam == (num_threads - 1)) ? | |
184 | (length - 1) : ((iam + 1) * (length / num_threads) - 1); | |
185 | my_job.load = my_job.last - my_job.first + 1; | |
186 | ||
187 | // Init result with first value (to have a base value for reduction). | |
188 | if (my_job.first <= my_job.last) | |
189 | { | |
190 | // Cannot use volatile variable directly. | |
191 | difference_type my_first = my_job.first; | |
192 | result = f(op, begin + my_first); | |
5817ff8e PC |
193 | ++my_job.first; |
194 | --my_job.load; | |
e683ee2a JS |
195 | } |
196 | ||
197 | RandomAccessIterator current; | |
198 | ||
199 | # pragma omp barrier | |
200 | ||
201 | // Actual work phase | |
202 | // Work on own or stolen start | |
203 | while (busy > 0) | |
204 | { | |
205 | // Work until no productive thread left. | |
206 | # pragma omp flush(busy) | |
207 | ||
208 | // Thread has own work to do | |
209 | while (my_job.first <= my_job.last) | |
210 | { | |
211 | // fetch-and-add call | |
212 | // Reserve current job block (size chunk_size) in my queue. | |
213 | difference_type current_job = | |
214 | fetch_and_add<difference_type>(&(my_job.first), chunk_size); | |
215 | ||
216 | // Update load, to make the three values consistent, | |
217 | // first might have been changed in the meantime | |
218 | my_job.load = my_job.last - my_job.first + 1; | |
219 | for (difference_type job_counter = 0; | |
220 | job_counter < chunk_size && current_job <= my_job.last; | |
5817ff8e | 221 | ++job_counter) |
e683ee2a JS |
222 | { |
223 | // Yes: process it! | |
224 | current = begin + current_job; | |
5817ff8e | 225 | ++current_job; |
e683ee2a JS |
226 | |
227 | // Do actual work. | |
228 | result = r(result, f(op, current)); | |
229 | } | |
230 | ||
231 | # pragma omp flush(busy) | |
232 | } | |
233 | ||
234 | // After reaching this point, a thread's job list is empty. | |
235 | if (iam_working) | |
236 | { | |
237 | // This thread no longer has work. | |
238 | # pragma omp atomic | |
5817ff8e | 239 | --busy; |
e683ee2a JS |
240 | |
241 | iam_working = false; | |
242 | } | |
243 | ||
244 | difference_type supposed_first, supposed_last, supposed_load; | |
245 | do | |
246 | { | |
247 | // Find random nonempty deque (not own), do consistency check. | |
248 | yield(); | |
249 | # pragma omp flush(busy) | |
250 | victim = rand_gen(); | |
251 | supposed_first = job[victim * stride].first; | |
252 | supposed_last = job[victim * stride].last; | |
253 | supposed_load = job[victim * stride].load; | |
254 | } | |
255 | while (busy > 0 | |
256 | && ((supposed_load <= 0) | |
257 | || ((supposed_first + supposed_load - 1) != supposed_last))); | |
258 | ||
259 | if (busy == 0) | |
260 | break; | |
261 | ||
262 | if (supposed_load > 0) | |
263 | { | |
264 | // Has work and work to do. | |
265 | // Number of elements to steal (at least one). | |
266 | steal = (supposed_load < 2) ? 1 : supposed_load / 2; | |
267 | ||
268 | // Push victim's start forward. | |
269 | difference_type stolen_first = | |
270 | fetch_and_add<difference_type>( | |
271 | &(job[victim * stride].first), steal); | |
272 | difference_type stolen_try = | |
273 | stolen_first + steal - difference_type(1); | |
274 | ||
275 | my_job.first = stolen_first; | |
276 | my_job.last = __gnu_parallel::min(stolen_try, supposed_last); | |
277 | my_job.load = my_job.last - my_job.first + 1; | |
278 | ||
279 | // Has potential work again. | |
280 | # pragma omp atomic | |
5817ff8e | 281 | ++busy; |
e683ee2a JS |
282 | iam_working = true; |
283 | ||
284 | # pragma omp flush(busy) | |
285 | } | |
286 | # pragma omp flush(busy) | |
287 | } // end while busy > 0 | |
288 | // Add accumulated result to output. | |
289 | omp_set_lock(&output_lock); | |
290 | output = r(output, result); | |
291 | omp_unset_lock(&output_lock); | |
292 | } | |
c2ba9709 JS |
293 | |
294 | delete[] job; | |
295 | ||
296 | // Points to last element processed (needed as return value for | |
297 | // some algorithms like transform) | |
298 | f.finish_iterator = begin + length; | |
299 | ||
59b567fb JS |
300 | omp_destroy_lock(&output_lock); |
301 | ||
c2ba9709 JS |
302 | return op; |
303 | } | |
304 | } // end namespace | |
305 | ||
cbcd1e45 | 306 | #endif /* _GLIBCXX_PARALLEL_WORKSTEALING_H */ |