3 // Copyright (C) 2007, 2008 Free Software Foundation, Inc.
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the terms
7 // of the GNU General Public License as published by the Free Software
8 // Foundation; either version 2, or (at your option) any later
11 // This library is distributed in the hope that it will be useful, but
12 // WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 // General Public License for more details.
16 // You should have received a copy of the GNU General Public License
17 // along with this library; see the file COPYING. If not, write to
18 // the Free Software Foundation, 59 Temple Place - Suite 330, Boston,
19 // MA 02111-1307, USA.
21 // As a special exception, you may use this file as part of a free
22 // software library without restriction. Specifically, if other files
23 // instantiate templates or use macros or inline functions from this
24 // file, or you compile this file and link it with other files to
25 // produce an executable, this file does not by itself cause the
26 // resulting executable to be covered by the GNU General Public
27 // License. This exception does not however invalidate any other
28 // reasons why the executable file might be covered by the GNU General
31 /** @file parallel/settings.h
32 * @brief Settings and tuning parameters, heuristics to decide
33 * whether to use parallelized algorithms.
34 * This file is a GNU parallel extension to the Standard C++ Library.
36 * @section parallelization_decision The decision whether to run
37 * an algorithm in parallel.
39 * There are several ways the user can switch on and off the
40 * parallel execution of an algorithm, both at compile- and
43 * Only sequential execution can be forced at compile-time.
44 * This reduces code size and protects code parts that have
45 * non-thread-safe side effects.
47 * Ultimately forcing parallel execution at compile-time does
49 * Often, the sequential algorithm implementation is used as
50 * a subroutine, so no reduction in code size can be achieved.
51 * Also, the machine the program is run on might have only one
52 * processor core, so to avoid overhead, the algorithm is
53 * executed sequentially.
55 * To force sequential execution of an algorithm ultimately
56 * at compile-time, the user must add the tag
57 * __gnu_parallel::sequential_tag() to the end of the
58 * parameter list, e. g.
61 * std::sort(v.begin(), v.end(), __gnu_parallel::sequential_tag());
64 * This is compatible with all overloaded algorithm variants.
65 * No additional code will be instantiated, at all.
66 * The same holds for most algorithm calls with iterators
67 * not providing random access.
69 * If the algorithm call is not forced to be executed sequentially
70 * at compile-time, the decision is made at run-time, for each call.
71 * First, the two (conceptually) global variables
72 * __gnu_parallel::Settings::force_sequential and
73 * __gnu_parallel::Settings::force_parallel are executed.
74 * If the former one is true, the sequential algorithm is executed.
75 * If the latter one is true and the former one is false,
76 * the algorithm is executed in parallel.
78 * If none of these conditions has fired so far, a heuristic is used.
79 * The parallel algorithm implementation is called only if the
80 * input size is sufficiently large.
81 * For most algorithms, the input size is the (combined) length of
82 * the input sequence(s).
83 * The threshold can be set by the user, individually for each
85 * The according variables are called
86 * __gnu_parallel::Settings::[algorithm]_minimal_n .
88 * For some of the algorithms, there are even more tuning options,
89 * e. g. the ability to choose from multiple algorithm variants.
90 * See the __gnu_parallel::Settings class for details.
93 // Written by Johannes Singler and Felix Putze.
95 #ifndef _GLIBCXX_PARALLEL_SETTINGS_H
96 #define _GLIBCXX_PARALLEL_SETTINGS_H 1
99 #include <parallel/types.h>
102 * @brief The extensible condition on whether the parallel variant of
103 * an algorithm should be called.
104 * @param c A condition that is overruled by
105 * __gnu_parallel::Settings::force_parallel, i. e. usually a decision based on
108 #define _GLIBCXX_PARALLEL_CONDITION(c) \
109 (!(__gnu_parallel::Settings::force_sequential) \
110 && ((__gnu_parallel::get_max_threads() > 1 \
111 && (c)) || __gnu_parallel::Settings::force_parallel))
113 namespace __gnu_parallel
115 // NB: Including this file cannot produce (unresolved) symbols from
116 // the OpenMP runtime unless the parallel mode is actually invoked
117 // and active, which implies that the OpenMP runtime is actually
118 // going to be linked in.
121 { return omp_get_max_threads() > 1 ? omp_get_max_threads() : 1; }
125 // XXX look at _Tune in mt_allocator.h
126 /** @brief Run-time settings for the parallel mode. */
129 /** @brief Different parallel sorting algorithms to choose
130 from: multi-way mergesort, quicksort, load-balanced
133 { MWMS
, QS
, QS_BALANCED
};
135 /** @brief Different merging algorithms: bubblesort-alike,
136 loser-tree variants, enum sentinel */
137 enum MultiwayMergeAlgorithm
138 { BUBBLE
, LOSER_TREE_EXPLICIT
, LOSER_TREE
, LOSER_TREE_COMBINED
,
139 LOSER_TREE_SENTINEL
, MWM_ALGORITHM_LAST
};
141 /** @brief Different splitting strategies for sorting/merging:
142 by sampling, exact */
146 /** @brief Different partial sum algorithms: recursive, linear */
147 enum PartialSumAlgorithm
148 { RECURSIVE
, LINEAR
};
150 /** @brief Different find distribution strategies: growing
151 blocks, equal-sized blocks, equal splitting. */
152 enum FindDistribution
153 { GROWING_BLOCKS
, CONSTANT_SIZE_BLOCKS
, EQUAL_SPLIT
};
155 /** @brief Force all algorithms to be executed sequentially.
156 * This setting cannot be overwritten. */
157 static volatile bool force_sequential
;
159 /** @brief Force all algorithms to be executed in parallel.
160 * This setting can be overridden by __gnu_parallel::sequential_tag
161 * (compile-time), and force_sequential (run-time). */
162 static volatile bool force_parallel
;
164 /** @brief Algorithm to use for sorting. */
165 static volatile SortAlgorithm sort_algorithm
;
167 /** @brief Strategy to use for splitting the input when
169 static volatile Splitting sort_splitting
;
171 /** @brief Minimal input size for parallel sorting. */
172 static volatile sequence_index_t sort_minimal_n
;
174 /** @brief Oversampling factor for parallel std::sort (MWMS). */
175 static volatile unsigned int sort_mwms_oversampling
;
177 /** @brief Such many samples to take to find a good pivot
179 static volatile unsigned int sort_qs_num_samples_preset
;
181 /** @brief Maximal subsequence length to switch to unbalanced
182 * base case. Applies to std::sort with dynamically
183 * load-balanced quicksort. */
184 static volatile sequence_index_t sort_qsb_base_case_maximal_n
;
186 /** @brief Minimal input size for parallel std::partition. */
187 static volatile sequence_index_t partition_minimal_n
;
189 /** @brief Chunk size for parallel std::partition. */
190 static volatile sequence_index_t partition_chunk_size
;
192 /** @brief Chunk size for parallel std::partition, relative to
193 * input size. If >0.0, this value overrides
194 * partition_chunk_size. */
195 static volatile double partition_chunk_share
;
197 /** @brief Minimal input size for parallel std::nth_element. */
198 static volatile sequence_index_t nth_element_minimal_n
;
200 /** @brief Minimal input size for parallel std::partial_sort. */
201 static volatile sequence_index_t partial_sort_minimal_n
;
203 /** @brief Minimal input size for parallel std::adjacent_difference. */
204 static volatile unsigned int adjacent_difference_minimal_n
;
206 /** @brief Minimal input size for parallel std::partial_sum. */
207 static volatile unsigned int partial_sum_minimal_n
;
209 /** @brief Algorithm to use for std::partial_sum. */
210 static volatile PartialSumAlgorithm partial_sum_algorithm
;
212 /** @brief Assume "sum and write result" to be that factor
213 * slower than just "sum". This value is used for
214 * std::partial_sum. */
215 static volatile float partial_sum_dilatation
;
217 /** @brief Minimal input size for parallel std::random_shuffle. */
218 static volatile unsigned int random_shuffle_minimal_n
;
220 /** @brief Minimal input size for parallel std::merge. */
221 static volatile sequence_index_t merge_minimal_n
;
223 /** @brief Splitting strategy for parallel std::merge. */
224 static volatile Splitting merge_splitting
;
226 /** @brief Oversampling factor for parallel std::merge.
227 * Such many samples per thread are collected. */
228 static volatile unsigned int merge_oversampling
;
230 /** @brief Algorithm to use for parallel
231 __gnu_parallel::multiway_merge. */
232 static volatile MultiwayMergeAlgorithm multiway_merge_algorithm
;
234 /** @brief Splitting strategy to use for parallel
235 __gnu_parallel::multiway_merge. */
236 static volatile Splitting multiway_merge_splitting
;
238 //// Oversampling factor for parallel __gnu_parallel::multiway_merge.
239 static volatile unsigned int multiway_merge_oversampling
;
241 /// Minimal input size for parallel __gnu_parallel::multiway_merge.
242 static volatile sequence_index_t multiway_merge_minimal_n
;
244 /// Oversampling factor for parallel __gnu_parallel::multiway_merge.
245 static volatile int multiway_merge_minimal_k
;
247 /** @brief Minimal input size for parallel std::unique_copy. */
248 static volatile sequence_index_t unique_copy_minimal_n
;
250 static volatile sequence_index_t workstealing_chunk_size
;
252 /** @brief Minimal input size for parallel std::for_each. */
253 static volatile sequence_index_t for_each_minimal_n
;
255 /** @brief Minimal input size for parallel std::count and
257 static volatile sequence_index_t count_minimal_n
;
259 /** @brief Minimal input size for parallel std::transform. */
260 static volatile sequence_index_t transform_minimal_n
;
262 /** @brief Minimal input size for parallel std::replace and
264 static volatile sequence_index_t replace_minimal_n
;
266 /** @brief Minimal input size for parallel std::generate. */
267 static volatile sequence_index_t generate_minimal_n
;
269 /** @brief Minimal input size for parallel std::fill. */
270 static volatile sequence_index_t fill_minimal_n
;
272 /** @brief Minimal input size for parallel std::min_element. */
273 static volatile sequence_index_t min_element_minimal_n
;
275 /** @brief Minimal input size for parallel std::max_element. */
276 static volatile sequence_index_t max_element_minimal_n
;
278 /** @brief Minimal input size for parallel std::accumulate. */
279 static volatile sequence_index_t accumulate_minimal_n
;
281 /** @brief Distribution strategy for parallel std::find. */
282 static volatile FindDistribution find_distribution
;
284 /** @brief Start with looking for that many elements
285 sequentially, for std::find. */
286 static volatile sequence_index_t find_sequential_search_size
;
288 /** @brief Initial block size for parallel std::find. */
289 static volatile sequence_index_t find_initial_block_size
;
291 /** @brief Maximal block size for parallel std::find. */
292 static volatile sequence_index_t find_maximum_block_size
;
294 /** @brief Block size increase factor for parallel std::find. */
295 static volatile double find_increasing_factor
;
298 /** @brief Minimal input size for parallel std::set_union. */
299 static volatile sequence_index_t set_union_minimal_n
;
301 /** @brief Minimal input size for parallel
302 std::set_symmetric_difference. */
303 static volatile sequence_index_t set_symmetric_difference_minimal_n
;
305 /** @brief Minimal input size for parallel std::set_difference. */
306 static volatile sequence_index_t set_difference_minimal_n
;
308 /** @brief Minimal input size for parallel std::set_intersection. */
309 static volatile sequence_index_t set_intersection_minimal_n
;
311 //hardware dependent tuning parameters
312 /** @brief Size of the L1 cache in bytes (underestimation). */
313 static volatile unsigned long long L1_cache_size
;
315 /** @brief Size of the L2 cache in bytes (underestimation). */
316 static volatile unsigned long long L2_cache_size
;
318 /** @brief Size of the Translation Lookaside Buffer
319 (underestimation). */
320 static volatile unsigned int TLB_size
;
322 /** @brief Overestimation of cache line size. Used to avoid
323 * false sharing, i. e. elements of different threads are at
324 * least this amount apart. */
325 static unsigned int cache_line_size
;
328 /** @brief Statistic on the number of stolen ranges in
329 load-balanced quicksort.*/
330 static volatile sequence_index_t qsb_steals
;
333 volatile bool Settings::force_parallel
= false;
334 volatile bool Settings::force_sequential
= false;
335 volatile Settings::SortAlgorithm
Settings::sort_algorithm
= Settings::MWMS
;
336 volatile Settings::Splitting
Settings::sort_splitting
= Settings::EXACT
;
337 volatile sequence_index_t
Settings::sort_minimal_n
= 1000;
339 volatile unsigned int Settings::sort_mwms_oversampling
= 10;
340 volatile unsigned int Settings::sort_qs_num_samples_preset
= 100;
341 volatile sequence_index_t
Settings::sort_qsb_base_case_maximal_n
= 100;
342 volatile sequence_index_t
Settings::partition_minimal_n
= 1000;
343 volatile sequence_index_t
Settings::nth_element_minimal_n
= 1000;
344 volatile sequence_index_t
Settings::partial_sort_minimal_n
= 1000;
345 volatile sequence_index_t
Settings::partition_chunk_size
= 1000;
346 volatile double Settings::partition_chunk_share
= 0.0;
347 volatile unsigned int Settings::adjacent_difference_minimal_n
= 1000;
348 volatile Settings::PartialSumAlgorithm
Settings::
349 partial_sum_algorithm
= Settings::LINEAR
;
350 volatile unsigned int Settings::partial_sum_minimal_n
= 1000;
351 volatile float Settings::partial_sum_dilatation
= 1.0f
;
352 volatile unsigned int Settings::random_shuffle_minimal_n
= 1000;
353 volatile Settings::Splitting
Settings::merge_splitting
= Settings::EXACT
;
354 volatile sequence_index_t
Settings::merge_minimal_n
= 1000;
355 volatile unsigned int Settings::merge_oversampling
= 10;
356 volatile sequence_index_t
Settings::multiway_merge_minimal_n
= 1000;
357 volatile int Settings::multiway_merge_minimal_k
= 2;
360 volatile sequence_index_t
Settings::unique_copy_minimal_n
= 10000;
361 volatile Settings::MultiwayMergeAlgorithm
Settings::
362 multiway_merge_algorithm
= Settings::LOSER_TREE
;
363 volatile Settings::Splitting
Settings::multiway_merge_splitting
=
365 volatile unsigned int Settings::multiway_merge_oversampling
= 10;
366 volatile Settings::FindDistribution
Settings::find_distribution
=
367 Settings::CONSTANT_SIZE_BLOCKS
;
368 volatile sequence_index_t
Settings::find_sequential_search_size
= 256;
369 volatile sequence_index_t
Settings::find_initial_block_size
= 256;
370 volatile sequence_index_t
Settings::find_maximum_block_size
= 8192;
371 volatile double Settings::find_increasing_factor
= 2.0;
372 volatile sequence_index_t
Settings::workstealing_chunk_size
= 100;
373 volatile sequence_index_t
Settings::for_each_minimal_n
= 1000;
374 volatile sequence_index_t
Settings::count_minimal_n
= 1000;
375 volatile sequence_index_t
Settings::transform_minimal_n
= 1000;
376 volatile sequence_index_t
Settings::replace_minimal_n
= 1000;
377 volatile sequence_index_t
Settings::generate_minimal_n
= 1000;
378 volatile sequence_index_t
Settings::fill_minimal_n
= 1000;
379 volatile sequence_index_t
Settings::min_element_minimal_n
= 1000;
380 volatile sequence_index_t
Settings::max_element_minimal_n
= 1000;
381 volatile sequence_index_t
Settings::accumulate_minimal_n
= 1000;
384 volatile sequence_index_t
Settings::set_union_minimal_n
= 1000;
385 volatile sequence_index_t
Settings::set_intersection_minimal_n
= 1000;
386 volatile sequence_index_t
Settings::set_difference_minimal_n
= 1000;
387 volatile sequence_index_t
Settings::set_symmetric_difference_minimal_n
=
389 volatile unsigned long long Settings::L1_cache_size
= 16 << 10;
390 volatile unsigned long long Settings::L2_cache_size
= 256 << 10;
391 volatile unsigned int Settings::TLB_size
= 128;
392 unsigned int Settings::cache_line_size
= 64;
395 volatile sequence_index_t
Settings::qsb_steals
= 0;
396 } // end anonymous namespace
400 #endif /* _GLIBCXX_SETTINGS_H */