1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
|
// -*- C++ -*-
// Copyright (C) 2007, 2008, 2009, 2010 Free Software Foundation, Inc.
//
// This file is part of the GNU ISO C++ Library. This library is free
// software; you can redistribute it and/or modify it under the terms
// of the GNU General Public License as published by the Free Software
// Foundation; either version 3, or (at your option) any later
// version.
// This library is distributed in the hope that it will be useful, but
// WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// General Public License for more details.
// Under Section 7 of GPL version 3, you are granted additional
// permissions described in the GCC Runtime Library Exception, version
// 3.1, as published by the Free Software Foundation.
// You should have received a copy of the GNU General Public License and
// a copy of the GCC Runtime Library Exception along with this program;
// see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
// <http://www.gnu.org/licenses/>.
/** @file parallel/settings.h
* @brief Runtime settings and tuning parameters, heuristics to decide
* whether to use parallelized algorithms.
* This file is a GNU parallel extension to the Standard C++ Library.
*
* @section parallelization_decision
* The decision whether to run an algorithm in parallel.
*
* There are several ways the user can switch on and __off the parallel
* execution of an algorithm, both at compile- and run-time.
*
* Only sequential execution can be forced at compile-time. This
* reduces code size and protects code parts that have
* non-thread-safe side effects.
*
* Ultimately, forcing parallel execution at compile-time makes
* sense. Often, the sequential algorithm implementation is used as
* a subroutine, so no reduction in code size can be achieved. Also,
* the machine the program is run on might have only one processor
* core, so to avoid overhead, the algorithm is executed
* sequentially.
*
* To force sequential execution of an algorithm ultimately at
* compile-time, the user must add the tag
* gnu_parallel::sequential_tag() to the end of the parameter list,
* e. g.
*
* \code
* std::sort(__v.begin(), __v.end(), __gnu_parallel::sequential_tag());
* \endcode
*
* This is compatible with all overloaded algorithm variants. No
* additional code will be instantiated, at all. The same holds for
* most algorithm calls with iterators not providing random access.
*
* If the algorithm call is not forced to be executed sequentially
* at compile-time, the decision is made at run-time.
* The global variable __gnu_parallel::_Settings::algorithm_strategy
* is checked. _It is a tristate variable corresponding to:
*
* a. force_sequential, meaning the sequential algorithm is executed.
* b. force_parallel, meaning the parallel algorithm is executed.
* c. heuristic
*
* For heuristic, the parallel algorithm implementation is called
* only if the input size is sufficiently large. For most
* algorithms, the input size is the (combined) length of the input
* sequence(__s). The threshold can be set by the user, individually
* for each algorithm. The according variables are called
* gnu_parallel::_Settings::[algorithm]_minimal_n .
*
* For some of the algorithms, there are even more tuning options,
* e. g. the ability to choose from multiple algorithm variants. See
* below for details.
*/
// Written by Johannes Singler and Felix Putze.
#ifndef _GLIBCXX_PARALLEL_SETTINGS_H
#define _GLIBCXX_PARALLEL_SETTINGS_H 1
#include <parallel/types.h>
/**
* @brief Determine at compile(?)-time if the parallel variant of an
* algorithm should be called.
* @param __c A condition that is convertible to bool that is overruled by
* __gnu_parallel::_Settings::algorithm_strategy. Usually a decision
* based on the input size.
*/
#define _GLIBCXX_PARALLEL_CONDITION(__c) \
(__gnu_parallel::_Settings::get().algorithm_strategy \
!= __gnu_parallel::force_sequential \
&& ((__gnu_parallel::__get_max_threads() > 1 && (__c)) \
|| __gnu_parallel::_Settings::get().algorithm_strategy \
== __gnu_parallel::force_parallel))
/*
inline bool
parallel_condition(bool __c)
{
bool ret = false;
const _Settings& __s = _Settings::get();
if (__s.algorithm_strategy != force_seqential)
{
if (__s.algorithm_strategy == force_parallel)
ret = true;
else
ret = __get_max_threads() > 1 && __c;
}
return ret;
}
*/
namespace __gnu_parallel
{
/// class _Settings
/// Run-time settings for the parallel mode including all tunable parameters.
struct _Settings
{
_AlgorithmStrategy algorithm_strategy;
_SortAlgorithm sort_algorithm;
_PartialSumAlgorithm partial_sum_algorithm;
_MultiwayMergeAlgorithm multiway_merge_algorithm;
_FindAlgorithm find_algorithm;
_SplittingAlgorithm sort_splitting;
_SplittingAlgorithm merge_splitting;
_SplittingAlgorithm multiway_merge_splitting;
// Per-algorithm settings.
/// Minimal input size for accumulate.
_SequenceIndex accumulate_minimal_n;
/// Minimal input size for adjacent_difference.
unsigned int adjacent_difference_minimal_n;
/// Minimal input size for count and count_if.
_SequenceIndex count_minimal_n;
/// Minimal input size for fill.
_SequenceIndex fill_minimal_n;
/// Block size increase factor for find.
double find_increasing_factor;
/// Initial block size for find.
_SequenceIndex find_initial_block_size;
/// Maximal block size for find.
_SequenceIndex find_maximum_block_size;
/// Start with looking for this many elements sequentially, for find.
_SequenceIndex find_sequential_search_size;
/// Minimal input size for for_each.
_SequenceIndex for_each_minimal_n;
/// Minimal input size for generate.
_SequenceIndex generate_minimal_n;
/// Minimal input size for max_element.
_SequenceIndex max_element_minimal_n;
/// Minimal input size for merge.
_SequenceIndex merge_minimal_n;
/// Oversampling factor for merge.
unsigned int merge_oversampling;
/// Minimal input size for min_element.
_SequenceIndex min_element_minimal_n;
/// Minimal input size for multiway_merge.
_SequenceIndex multiway_merge_minimal_n;
/// Oversampling factor for multiway_merge.
int multiway_merge_minimal_k;
/// Oversampling factor for multiway_merge.
unsigned int multiway_merge_oversampling;
/// Minimal input size for nth_element.
_SequenceIndex nth_element_minimal_n;
/// Chunk size for partition.
_SequenceIndex partition_chunk_size;
/// Chunk size for partition, relative to input size. If > 0.0,
/// this value overrides partition_chunk_size.
double partition_chunk_share;
/// Minimal input size for partition.
_SequenceIndex partition_minimal_n;
/// Minimal input size for partial_sort.
_SequenceIndex partial_sort_minimal_n;
/// Ratio for partial_sum. Assume "sum and write result" to be
/// this factor slower than just "sum".
float partial_sum_dilation;
/// Minimal input size for partial_sum.
unsigned int partial_sum_minimal_n;
/// Minimal input size for random_shuffle.
unsigned int random_shuffle_minimal_n;
/// Minimal input size for replace and replace_if.
_SequenceIndex replace_minimal_n;
/// Minimal input size for set_difference.
_SequenceIndex set_difference_minimal_n;
/// Minimal input size for set_intersection.
_SequenceIndex set_intersection_minimal_n;
/// Minimal input size for set_symmetric_difference.
_SequenceIndex set_symmetric_difference_minimal_n;
/// Minimal input size for set_union.
_SequenceIndex set_union_minimal_n;
/// Minimal input size for parallel sorting.
_SequenceIndex sort_minimal_n;
/// Oversampling factor for parallel std::sort (MWMS).
unsigned int sort_mwms_oversampling;
/// Such many samples to take to find a good pivot (quicksort).
unsigned int sort_qs_num_samples_preset;
/// Maximal subsequence __length to switch to unbalanced __base case.
/// Applies to std::sort with dynamically load-balanced quicksort.
_SequenceIndex sort_qsb_base_case_maximal_n;
/// Minimal input size for parallel std::transform.
_SequenceIndex transform_minimal_n;
/// Minimal input size for unique_copy.
_SequenceIndex unique_copy_minimal_n;
_SequenceIndex workstealing_chunk_size;
// Hardware dependent tuning parameters.
/// size of the L1 cache in bytes (underestimation).
unsigned long long L1_cache_size;
/// size of the L2 cache in bytes (underestimation).
unsigned long long L2_cache_size;
/// size of the Translation Lookaside Buffer (underestimation).
unsigned int TLB_size;
/// Overestimation of cache line size. Used to avoid false
/// sharing, i.e. elements of different threads are at least this
/// amount apart.
unsigned int cache_line_size;
// Statistics.
/// The number of stolen ranges in load-balanced quicksort.
_SequenceIndex qsb_steals;
/// Minimal input size for search and search_n.
_SequenceIndex search_minimal_n;
/// Block size scale-down factor with respect to current position.
float find_scale_factor;
/// Get the global settings.
_GLIBCXX_CONST static const _Settings&
get() throw();
/// Set the global settings.
static void
set(_Settings&) throw();
explicit
_Settings() :
algorithm_strategy(heuristic),
sort_algorithm(MWMS),
partial_sum_algorithm(LINEAR),
multiway_merge_algorithm(LOSER_TREE),
find_algorithm(CONSTANT_SIZE_BLOCKS),
sort_splitting(EXACT),
merge_splitting(EXACT),
multiway_merge_splitting(EXACT),
accumulate_minimal_n(1000),
adjacent_difference_minimal_n(1000),
count_minimal_n(1000),
fill_minimal_n(1000),
find_increasing_factor(2.0),
find_initial_block_size(256),
find_maximum_block_size(8192),
find_sequential_search_size(256),
for_each_minimal_n(1000),
generate_minimal_n(1000),
max_element_minimal_n(1000),
merge_minimal_n(1000),
merge_oversampling(10),
min_element_minimal_n(1000),
multiway_merge_minimal_n(1000),
multiway_merge_minimal_k(2), multiway_merge_oversampling(10),
nth_element_minimal_n(1000),
partition_chunk_size(1000),
partition_chunk_share(0.0),
partition_minimal_n(1000),
partial_sort_minimal_n(1000),
partial_sum_dilation(1.0f),
partial_sum_minimal_n(1000),
random_shuffle_minimal_n(1000),
replace_minimal_n(1000),
set_difference_minimal_n(1000),
set_intersection_minimal_n(1000),
set_symmetric_difference_minimal_n(1000),
set_union_minimal_n(1000),
sort_minimal_n(1000),
sort_mwms_oversampling(10),
sort_qs_num_samples_preset(100),
sort_qsb_base_case_maximal_n(100),
transform_minimal_n(1000),
unique_copy_minimal_n(10000),
workstealing_chunk_size(100),
L1_cache_size(16 << 10),
L2_cache_size(256 << 10),
TLB_size(128),
cache_line_size(64),
qsb_steals(0),
search_minimal_n(1000),
find_scale_factor(0.01f)
{ }
};
}
#endif /* _GLIBCXX_PARALLEL_SETTINGS_H */
|