]> git.ipfire.org Git - thirdparty/gcc.git/blob - libcilkrts/runtime/cilk-abi-cilk-for.cpp
6e7c08a08af172771843c13eaab61017cd51bcb0
[thirdparty/gcc.git] / libcilkrts / runtime / cilk-abi-cilk-for.cpp
1 /* cilk-abi-cilk-for.cpp -*-C++-*-
2 *
3 *************************************************************************
4 *
5 * Copyright (C) 2011-2016, Intel Corporation
6 * All rights reserved.
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * * Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * * Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 * * Neither the name of Intel Corporation nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
26 * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
29 * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
32 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 * POSSIBILITY OF SUCH DAMAGE.
34 *
35 * *********************************************************************
36 *
37 * PLEASE NOTE: This file is a downstream copy of a file mainitained in
38 * a repository at cilkplus.org. Changes made to this file that are not
39 * submitted through the contribution process detailed at
40 * http://www.cilkplus.org/submit-cilk-contribution will be lost the next
41 * time that a new version is released. Changes only submitted to the
42 * GNU compiler collection or posted to the git repository at
43 * https://bitbucket.org/intelcilkruntime/intel-cilk-runtime.git are
44 * not tracked.
45 *
46 * We welcome your contributions to this open source project. Thank you
47 * for your assistance in helping us improve Cilk Plus.
48 *
49 **************************************************************************/
50
51 /* Implementation of cilk_for ABI.
52 *
53 * This file must be C++, not C, in order to handle C++ exceptions correctly
54 * from within the body of the cilk_for loop
55 */
56
57 #include "internal/abi.h"
58 #include "metacall_impl.h"
59 #include "global_state.h"
60
61 // Icky macros to determine if we're compiled with optimization. Based on
62 // the declaration of __CILKRTS_ASSERT in common.h
63 #if defined(_WIN32)
64 # if defined (_DEBUG)
65 # define CILKRTS_OPTIMIZED 0 // Assumes /MDd is always used with /Od
66 # else
67 # define CILKRTS_OPTIMIZED 1
68 # endif // defined(_DEBUG)
69 #else
70 # if defined(__OPTIMIZE__)
71 # define CILKRTS_OPTIMIZED 1
72 # else
73 # define CILKRTS_OPTIMIZED 0
74 # endif
75 #endif
76
77 template <typename count_t>
78 static inline int grainsize(int req, count_t count)
79 {
80 // A positive requested grain size comes from the user. A very high grain
81 // size risks losing parallelism, but the user told us what they want for
82 // grainsize. Who are we to argue?
83 if (req > 0)
84 return req;
85
86 // At present, a negative requested grain size is treated the same way as
87 // a zero grain size, i.e., the runtime computes the actual grainsize
88 // using a hueristic. In the future, the compiler may give us additional
89 // information about the size of the cilk_for body by passing a negative
90 // grain size.
91
92 // Avoid generating a zero grainsize, even for empty loops.
93 if (count < 1)
94 return 1;
95
96 global_state_t* g = cilkg_get_global_state();
97 if (g->under_ptool)
98 {
99 // Grainsize = 1, when running under PIN, and when the grainsize has
100 // not explicitly been set by the user.
101 return 1;
102 }
103 else
104 {
105 // Divide loop count by 8 times the worker count and round up.
106 const int Px8 = g->P * 8;
107 count_t n = (count + Px8 - 1) / Px8;
108
109 // 2K should be enough to amortize the cost of the cilk_for. Any
110 // larger grainsize risks losing parallelism.
111 if (n > 2048)
112 return 2048;
113 return (int) n; // n <= 2048, so no loss of precision on cast to int
114 }
115 }
116
117 /*
118 * call_cilk_for_loop_body
119 *
120 * Centralizes the code to call the loop body. The compiler should be
121 * inlining this code
122 *
123 * low - Low loop index we're considering in this portion of the algorithm
124 * high - High loop index we're considering in this portion of the algorithm
125 * body - lambda function for the cilk_for loop body
126 * data - data used by the lambda function
127 * w - __cilkrts_worker we're currently executing on
128 * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
129 * the cilk_for loop to flatten out the internal nodes
130 */
131 template <typename count_t, typename F>
132 inline static
133 void call_cilk_for_loop_body(count_t low, count_t high,
134 F body, void *data,
135 __cilkrts_worker *w,
136 __cilkrts_pedigree *loop_root_pedigree)
137 {
138 // Cilkscreen should not report this call in a stack trace
139 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
140
141 // The worker is only valid until the first spawn. Fetch the
142 // __cilkrts_stack_frame out of the worker, since it will be stable across
143 // steals. The sf pointer actually points to the *parent's*
144 // __cilkrts_stack_frame, since this function is a non-spawning function
145 // and therefore has no cilk stack frame of its own.
146 __cilkrts_stack_frame *sf = w->current_stack_frame;
147
148 // Save the pedigree node pointed to by the worker. We'll need to restore
149 // that when we exit since the spawn helpers in the cilk_for call tree
150 // will assume that it's valid
151 const __cilkrts_pedigree *saved_next_pedigree_node = w->pedigree.parent;
152
153 // Add the leaf pedigree node to the chain. The parent is the root node
154 // to flatten the tree regardless of the DAG branches in the cilk_for
155 // divide-and-conquer recursion.
156 //
157 // The rank is initialized to the low index. The user is
158 // expected to call __cilkrts_bump_loop_rank at the end of the cilk_for
159 // loop body.
160 __cilkrts_pedigree loop_leaf_pedigree;
161
162 loop_leaf_pedigree.rank = (uint64_t)low;
163 loop_leaf_pedigree.parent = loop_root_pedigree;
164
165 // The worker's pedigree always starts with a rank of 0
166 w->pedigree.rank = 0;
167 w->pedigree.parent = &loop_leaf_pedigree;
168
169 // Call the compiler generated cilk_for loop body lambda function
170 body(data, low, high);
171
172 // The loop body may have included spawns, so we must refetch the worker
173 // from the __cilkrts_stack_frame, which is stable regardless of which
174 // worker we're executing on.
175 w = sf->worker;
176
177 // Restore the pedigree chain. It must be valid because the spawn helpers
178 // generated by the cilk_for implementation will access it.
179 w->pedigree.parent = saved_next_pedigree_node;
180 }
181
182 /* capture_spawn_arg_stack_frame
183 *
184 * Efficiently get the address of the caller's __cilkrts_stack_frame. The
185 * preconditons are that 'w' is the worker at the time of the call and
186 * 'w->current_stack_frame' points to the __cilkrts_stack_frame within the
187 * spawn helper. This function should be called only within the argument list
188 * of a function that is being spawned because that is the only situation in
189 * which these preconditions hold. This function returns the worker
190 * (unchanged) after storing the captured stack frame pointer is stored in the
191 * sf argument.
192 *
193 * The purpose of this function is to get the caller's stack frame in a
194 * context where the caller's worker is known but its stack frame is not
195 * necessarily initialized. The "shrink wrap" optimization delays
196 * initializing the contents of a spawning function's '__cilkrts_stack_frame'
197 * as well as the 'current_stack_frame' pointer within the worker. By calling
198 * this function within a spawning function's argument list, we can ensure
199 * that these initializations have occured but that a detach (which would
200 * invalidate the worker pointer in the caller) has not yet occured. Once the
201 * '__cilkrts_stack_frame' has been retrieved in this way, it is stable for the
202 * remainder of the caller's execution, and becomes an efficient way to get
203 * the worker (much more efficient than calling '__cilkrts_get_tls_worker()'),
204 * even after a spawn or sync.
205 */
206 inline __cilkrts_worker*
207 capture_spawn_arg_stack_frame(__cilkrts_stack_frame* &sf, __cilkrts_worker* w)
208 {
209 // Get current stack frame
210 sf = w->current_stack_frame;
211 #ifdef __INTEL_COMPILER
212 # if __INTEL_COMPILER <= 1300 && __INTEL_COMPILER_BUILD_DATE < 20130101
213 // In older compilers 'w->current_stack_frame' points to the
214 // spawn-helper's stack frame. In newer compiler's however, it points
215 // directly to the pointer's stack frame. (This change was made to avoid
216 // having the spawn helper in the frame list when evaluating function
217 // arguments, thus avoiding corruption when those arguments themselves
218 // contain cilk_spawns.)
219
220 // w->current_stack_frame is the spawn helper's stack frame.
221 // w->current_stack_frame->call_parent is the caller's stack frame.
222 sf = sf->call_parent;
223 # endif
224 #endif
225 return w;
226 }
227
228 /*
229 * cilk_for_recursive
230 *
231 * Templatized function to implement the recursive divide-and-conquer
232 * algorithm that's how we implement a cilk_for.
233 *
234 * low - Low loop index we're considering in this portion of the algorithm
235 * high - High loop index we're considering in this portion of the algorithm
236 * body - lambda function for the cilk_for loop body
237 * data - data used by the lambda function
238 * grain - grain size (0 if it should be computed)
239 * w - __cilkrts_worker we're currently executing on
240 * loop_root_pedigree - __cilkrts_pedigree node we generated for the root of
241 * the cilk_for loop to flatten out the internal nodes
242 */
243 template <typename count_t, typename F>
244 static
245 void cilk_for_recursive(count_t low, count_t high,
246 F body, void *data, int grain,
247 __cilkrts_worker *w,
248 __cilkrts_pedigree *loop_root_pedigree)
249 {
250 tail_recurse:
251 // Cilkscreen should not report this call in a stack trace
252 // This needs to be done everytime the worker resumes
253 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
254
255 count_t count = high - low;
256 // Invariant: count > 0, grain >= 1
257 if (count > grain)
258 {
259 // Invariant: count >= 2
260 count_t mid = low + count / 2;
261 // The worker is valid only until the first spawn and is expensive to
262 // retrieve (using '__cilkrts_get_tls_worker') after the spawn. The
263 // '__cilkrts_stack_frame' is more stable, but isn't initialized until
264 // the first spawn. Thus, we want to grab the address of the
265 // '__cilkrts_stack_frame' after it is initialized but before the
266 // spawn detaches. The only place we can do that is within the
267 // argument list of the spawned function, hence the call to
268 // capture_spawn_arg_stack_frame().
269 __cilkrts_stack_frame *sf;
270 #if defined(__GNUC__) && ! defined(__INTEL_COMPILER) && ! defined(__clang__)
271 // The current version of gcc initializes the sf structure eagerly.
272 // We can take advantage of this fact to avoid calling
273 // `capture_spawn_arg_stack_frame` when compiling with gcc.
274 // Remove this if the "shrink-wrap" optimization is implemented.
275 sf = w->current_stack_frame;
276 _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain, w,
277 loop_root_pedigree);
278 #else
279 _Cilk_spawn cilk_for_recursive(low, mid, body, data, grain,
280 capture_spawn_arg_stack_frame(sf, w),
281 loop_root_pedigree);
282 #endif
283 w = sf->worker;
284 low = mid;
285
286 goto tail_recurse;
287 }
288
289 // Call the cilk_for loop body lambda function passed in by the compiler to
290 // execute one grain
291 call_cilk_for_loop_body(low, high, body, data, w, loop_root_pedigree);
292 }
293
294 static void noop() { }
295
296 /*
297 * cilk_for_root
298 *
299 * Templatized function to implement the top level of a cilk_for loop.
300 *
301 * body - lambda function for the cilk_for loop body
302 * data - data used by the lambda function
303 * count - trip count for loop
304 * grain - grain size (0 if it should be computed)
305 */
306 template <typename count_t, typename F>
307 static void cilk_for_root(F body, void *data, count_t count, int grain)
308 {
309 // Cilkscreen should not report this call in a stack trace
310 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
311
312 // Pedigree computation:
313 //
314 // If the last pedigree node on entry to the _Cilk_for has value X,
315 // then at the start of each iteration of the loop body, the value of
316 // the last pedigree node should be 0, the value of the second-to-last
317 // node should equal the loop counter, and the value of the
318 // third-to-last node should be X. On return from the _Cilk_for, the
319 // value of the last pedigree should be incremented to X+2. The
320 // pedigree within the loop is thus flattened, such that the depth of
321 // recursion does not affect the results either inside or outside of
322 // the loop. Note that the pedigree after the loop exists is the same
323 // as if a single spawn and sync were executed within this function.
324
325 // TBD: Since the shrink-wrap optimization was turned on in the compiler,
326 // it is not possible to get the current stack frame without actually
327 // forcing a call to bind-thread. This spurious spawn is a temporary
328 // stopgap until the correct intrinsics are added to give us total control
329 // over frame initialization.
330 _Cilk_spawn noop();
331
332 // Fetch the current worker. From that we can get the current stack frame
333 // which will be constant even if we're stolen
334 __cilkrts_worker *w = __cilkrts_get_tls_worker();
335 __cilkrts_stack_frame *sf = w->current_stack_frame;
336
337 // Decrement the rank by one to undo the pedigree change from the
338 // _Cilk_spawn
339 --w->pedigree.rank;
340
341 // Save the current worker pedigree into loop_root_pedigree, which will be
342 // the root node for our flattened pedigree.
343 __cilkrts_pedigree loop_root_pedigree = w->pedigree;
344
345 // Don't splice the loop_root node in yet. It will be done when we
346 // call the loop body lambda function
347 // w->pedigree.rank = 0;
348 // w->pedigree.next = &loop_root_pedigree;
349
350 /* Spawn is necessary at top-level to force runtime to start up.
351 * Runtime must be started in order to call the grainsize() function.
352 */
353 int gs = grainsize(grain, count);
354 cilk_for_recursive((count_t) 0, count, body, data, gs, w,
355 &loop_root_pedigree);
356
357 // Need to refetch the worker after calling a spawning function.
358 w = sf->worker;
359
360 // Restore the pedigree in the worker.
361 w->pedigree = loop_root_pedigree;
362
363 // Bump the worker pedigree.
364 ++w->pedigree.rank;
365
366 // Implicit sync will increment the pedigree leaf rank again, for a total
367 // of two increments. If the noop spawn above is removed, then we'll need
368 // to re-enable the following code:
369 // // If this is an optimized build, then the compiler will have optimized
370 // // out the increment of the worker's pedigree in the implied sync. We
371 // // need to add one to make the pedigree_loop test work correctly.
372 // #if CILKRTS_OPTIMIZED
373 // ++sf->worker->pedigree.rank;
374 // #endif
375 }
376
377 // Use extern "C" to suppress name mangling of __cilkrts_cilk_for_32 and
378 // __cilkrts_cilk_for_64.
379 extern "C" {
380
381 /*
382 * __cilkrts_cilk_for_32
383 *
384 * Implementation of cilk_for for 32-bit trip counts (regardless of processor
385 * word size). Assumes that the range is 0 - count.
386 *
387 * body - lambda function for the cilk_for loop body
388 * data - data used by the lambda function
389 * count - trip count for loop
390 * grain - grain size (0 if it should be computed)
391 */
392
393 CILK_ABI_THROWS_VOID __cilkrts_cilk_for_32(__cilk_abi_f32_t body, void *data,
394 cilk32_t count, int grain)
395 {
396 // Cilkscreen should not report this call in a stack trace
397 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
398
399 // Check for an empty range here as an optimization - don't need to do any
400 // __cilkrts_stack_frame initialization
401 if (count > 0)
402 cilk_for_root(body, data, count, grain);
403 }
404
405 /*
406 * __cilkrts_cilk_for_64
407 *
408 * Implementation of cilk_for for 64-bit trip counts (regardless of processor
409 * word size). Assumes that the range is 0 - count.
410 *
411 * body - lambda function for the cilk_for loop body
412 * data - data used by the lambda function
413 * count - trip count for loop
414 * grain - grain size (0 if it should be computed)
415 */
416 CILK_ABI_THROWS_VOID __cilkrts_cilk_for_64(__cilk_abi_f64_t body, void *data,
417 cilk64_t count, int grain)
418 {
419 // Cilkscreen should not report this call in a stack trace
420 NOTIFY_ZC_INTRINSIC((char *)"cilkscreen_hide_call", 0);
421
422 // Check for an empty range here as an optimization - don't need to do any
423 // __cilkrts_stack_frame initialization
424 if (count > 0)
425 cilk_for_root(body, data, count, grain);
426 }
427
428 } // end extern "C"
429
430 /* End cilk-abi-cilk-for.cpp */