1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
|
<chapter xmlns="http://docbook.org/ns/docbook" version="5.0"
xml:id="manual.ext.profile_mode" xreflabel="Profile Mode">
<?dbhtml filename="profile_mode.html"?>
<info><title>Profile Mode</title>
<keywordset>
<keyword>
C++
</keyword>
<keyword>
library
</keyword>
<keyword>
profile
</keyword>
</keywordset>
</info>
<section xml:id="manual.ext.profile_mode.intro" xreflabel="Intro"><info><title>Intro</title></info>
<para>
<emphasis>Goal: </emphasis>Give performance improvement advice based on
recognition of suboptimal usage patterns of the standard library.
</para>
<para>
<emphasis>Method: </emphasis>Wrap the standard library code. Insert
calls to an instrumentation library to record the internal state of
various components at interesting entry/exit points to/from the standard
library. Process trace, recognize suboptimal patterns, give advice.
For details, see
<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://dx.doi.org/10.1109/CGO.2009.36">paper presented at
CGO 2009</link>.
</para>
<para>
<emphasis>Strengths: </emphasis>
<itemizedlist>
<listitem><para>
Unintrusive solution. The application code does not require any
modification.
</para></listitem>
<listitem><para> The advice is call context sensitive, thus capable of
identifying precisely interesting dynamic performance behavior.
</para></listitem>
<listitem><para>
The overhead model is pay-per-view. When you turn off a diagnostic class
at compile time, its overhead disappears.
</para></listitem>
</itemizedlist>
</para>
<para>
<emphasis>Drawbacks: </emphasis>
<itemizedlist>
<listitem><para>
You must recompile the application code with custom options.
</para></listitem>
<listitem><para>You must run the application on representative input.
The advice is input dependent.
</para></listitem>
<listitem><para>
The execution time will increase, in some cases by factors.
</para></listitem>
</itemizedlist>
</para>
<section xml:id="manual.ext.profile_mode.using" xreflabel="Using"><info><title>Using the Profile Mode</title></info>
<para>
This is the anticipated common workflow for program <code>foo.cc</code>:
<programlisting>
$ cat foo.cc
#include &lt;vector&gt;
int main() {
  std::vector&lt;int&gt; v;
  for (int k = 0; k &lt; 1024; ++k) v.insert(v.begin(), k);
}
$ g++ -D_GLIBCXX_PROFILE foo.cc
$ ./a.out
$ cat libstdcxx-profile.txt
vector-to-list: improvement = 5: call stack = 0x804842c ...
: advice = change std::vector to std::list
vector-size: improvement = 3: call stack = 0x804842c ...
: advice = change initial container size from 0 to 1024
</programlisting>
</para>
<para>
Anatomy of a warning:
<itemizedlist>
<listitem>
<para>
Warning id. This is a short descriptive string for the class
that this warning belongs to. E.g., "vector-to-list".
</para>
</listitem>
<listitem>
<para>
Estimated improvement. This is an approximation of the benefit expected
from implementing the change suggested by the warning. It is given on
a log10 scale. Negative values mean that the alternative would actually
do worse than the current choice.
In the example above, 5 comes from the fact that the overhead of
inserting at the beginning of a vector vs. a list is around 1024 * 1024 / 2,
which is around 10^5. The improvement from setting the initial size to
1024 is in the range of 10^3, since the overhead of dynamic resizing is
linear in this case.
</para>
</listitem>
<listitem>
<para>
Call stack. Currently, the addresses are printed without
symbol name or code location attribution.
Users are expected to postprocess the output using, for instance, addr2line.
</para>
</listitem>
<listitem>
<para>
The warning message. For some warnings, this is static text, e.g.,
"change vector to list". For other warnings, such as the one above,
the message contains numeric advice, e.g., the suggested initial size
of the vector.
</para>
</listitem>
</itemizedlist>
</para>
<para>Three files are generated. <code>libstdcxx-profile.txt</code>
contains human readable advice. <code>libstdcxx-profile.raw</code>
contains implementation specific data about each diagnostic.
Their format is not documented. They are sufficient to generate
all the advice given in <code>libstdcxx-profile.txt</code>. The advantage
of keeping this raw format is that traces from multiple executions can
be aggregated simply by concatenating the raw traces. We intend to
offer an external utility program that can issue advice from a trace.
<code>libstdcxx-profile.conf.out</code> lists the actual diagnostic
parameters used. To alter parameters, edit this file and rename it to
<code>libstdcxx-profile.conf</code>.
</para>
<para>Advice is given regardless of whether the transformation is valid.
For instance, we advise changing a map to an unordered_map even if the
application semantics require that data be ordered.
We believe such warnings can help users understand the performance
behavior of their application better, which can lead to changes
at a higher abstraction level.
</para>
</section>
<section xml:id="manual.ext.profile_mode.tuning" xreflabel="Tuning"><info><title>Tuning the Profile Mode</title></info>
<para>Compile time switches and environment variables (see also file
profiler.h). Unless specified otherwise, they can be set at compile time
using -D_&lt;name&gt; or by setting variable &lt;name&gt;
in the environment where the program is run, before starting execution.
<itemizedlist>
<listitem><para>
<code>_GLIBCXX_PROFILE_NO_&lt;diagnostic&gt;</code>:
disable specific diagnostics.
See section Diagnostics for possible values.
(Environment variables not supported.)
</para></listitem>
<listitem><para>
<code>_GLIBCXX_PROFILE_TRACE_PATH_ROOT</code>: set an alternative root
path for the output files.
</para></listitem>
<listitem><para><code>_GLIBCXX_PROFILE_MAX_WARN_COUNT</code>: set it to the maximum
number of warnings desired. The default value is 10.</para></listitem>
<listitem><para>
<code>_GLIBCXX_PROFILE_MAX_STACK_DEPTH</code>: if set to 0,
the advice will
be collected and reported for the program as a whole, and not for each
call context.
This could also be used in continuous regression tests, where you
just need to know whether there is a regression or not.
The default value is 32.
</para></listitem>
<listitem><para>
<code>_GLIBCXX_PROFILE_MEM_PER_DIAGNOSTIC</code>:
set a limit on how much memory to use for the accounting tables for each
diagnostic type. When this limit is reached, new events are ignored
until the memory usage decreases under the limit. Generally, this means
that newly created containers will not be instrumented until some
live containers are deleted. The default is 128 MB.
</para></listitem>
<listitem><para>
<code>_GLIBCXX_PROFILE_NO_THREADS</code>:
Make the library not use threads. If thread local storage (TLS) is not
available, you will get a preprocessor error asking you to set
-D_GLIBCXX_PROFILE_NO_THREADS if your program is single-threaded.
Multithreaded execution without TLS is not supported.
(Environment variable not supported.)
</para></listitem>
<listitem><para>
<code>_GLIBCXX_HAVE_EXECINFO_H</code>:
This name should be defined automatically at library configuration time.
If your library was configured without <code>execinfo.h</code>, but
you have it in your include path, you can define it explicitly. Without
it, advice is collected for the program as a whole, and not for each
call context.
(Environment variable not supported.)
</para></listitem>
</itemizedlist>
</para>
</section>
</section>
<section xml:id="manual.ext.profile_mode.design" xreflabel="Design"><info><title>Design</title></info>
<para>
</para>
<table frame="all">
<title>Profile Code Location</title>
<tgroup cols="2" align="left" colsep="1" rowsep="1">
<colspec colname="c1"/>
<colspec colname="c2"/>
<thead>
<row>
<entry>Code Location</entry>
<entry>Use</entry>
</row>
</thead>
<tbody>
<row>
<entry><code>libstdc++-v3/include/std/*</code></entry>
<entry>Preprocessor code to redirect to profile extension headers.</entry>
</row>
<row>
<entry><code>libstdc++-v3/include/profile/*</code></entry>
<entry>Profile extension public headers (map, vector, ...).</entry>
</row>
<row>
<entry><code>libstdc++-v3/include/profile/impl/*</code></entry>
<entry>Profile extension internals. Implementation files are
only included from <code>impl/profiler.h</code>, which is the only
file included from the public headers.</entry>
</row>
</tbody>
</tgroup>
</table>
<para>
</para>
<section xml:id="manual.ext.profile_mode.design.wrapper" xreflabel="Wrapper"><info><title>Wrapper Model</title></info>
<para>
In order to get our instrumented library version included instead of the
release one,
we use the same wrapper model as the debug mode.
We subclass entities from the release version. Wherever
<code>_GLIBCXX_PROFILE</code> is defined, the release namespace is
<code>std::__norm</code>, whereas the profile namespace is
<code>std::__profile</code>. Using plain <code>std</code> translates
into <code>std::__profile</code>.
</para>
<para>
Whenever possible, we try to wrap at the public interface level, e.g.,
in <code>unordered_set</code> rather than in <code>hashtable</code>,
in order not to depend on implementation.
</para>
<para>
Mixing object files built with and without the profile mode must
not affect the program execution. However, there are no guarantees as to
the accuracy of diagnostics when using even a single object not built with
<code>-D_GLIBCXX_PROFILE</code>.
Currently, mixing the profile mode with debug and parallel extensions is
not allowed. Mixing them at compile time will result in preprocessor errors.
Mixing them at link time is undefined.
</para>
</section>
<section xml:id="manual.ext.profile_mode.design.instrumentation" xreflabel="Instrumentation"><info><title>Instrumentation</title></info>
<para>
Instead of instrumenting every public entry and exit point,
we chose to add instrumentation on demand, as needed
by individual diagnostics.
The main reason is that some diagnostics require us to extract bits of
internal state that are particular only to that diagnostic.
We plan to formalize this later, after we learn more about the requirements
of several diagnostics.
</para>
<para>
All the instrumentation points can be switched on and off using
<code>-D[_NO]_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code> options.
With all the instrumentation calls off, there should be negligible
overhead over the release version. This property is needed to support
diagnostics based on timing of internal operations. For such diagnostics,
we anticipate turning most of the instrumentation off in order to prevent
profiling overhead from polluting time measurements, and thus diagnostics.
</para>
<para>
All the instrumentation on/off compile time switches live in
<code>include/profile/profiler.h</code>.
</para>
</section>
<section xml:id="manual.ext.profile_mode.design.rtlib" xreflabel="Run Time Behavior"><info><title>Run Time Behavior</title></info>
<para>
For practical reasons, the instrumentation library processes the trace
partially
rather than dumping it to disk in raw form. Each event is processed when
it occurs. It is usually assigned a cost and then aggregated into
the database of a specific diagnostic class. The cost model
is based largely on the standard performance guarantees, but in some
cases we use knowledge about GCC's standard library implementation.
</para>
<para>
Information is indexed by (1) call stack and (2) instance id or address
to be able to understand and summarize precise creation-use-destruction
dynamic chains. Although the analysis is sensitive to dynamic instances,
the reports are only sensitive to call context. Whenever a dynamic instance
is destroyed, we accumulate its effect to the corresponding entry for the
call stack of its constructor location.
</para>
<para>
For details, see
<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://dx.doi.org/10.1109/CGO.2009.36">paper presented at
CGO 2009</link>.
</para>
</section>
<section xml:id="manual.ext.profile_mode.design.analysis" xreflabel="Analysis and Diagnostics"><info><title>Analysis and Diagnostics</title></info>
<para>
Final analysis takes place offline, and it is based entirely on the
generated trace and debugging info in the application binary.
See section Diagnostics for a list of analysis types that we plan to support.
</para>
<para>
The input to the analysis is a table indexed by profile type and call stack.
The data type for each entry depends on the profile type.
</para>
</section>
<section xml:id="manual.ext.profile_mode.design.cost-model" xreflabel="Cost Model"><info><title>Cost Model</title></info>
<para>
While it is likely that cost models become complex as we get into
more sophisticated analysis, we will try to follow a simple set of rules
at the beginning.
</para>
<itemizedlist>
<listitem><para><emphasis>Relative benefit estimation:</emphasis>
The idea is to estimate or measure the cost of all operations
in the original scenario versus the scenario we advise to switch to.
For instance, when advising to change a vector to a list, an occurrence
of the <code>insert</code> method will generally count as a benefit.
Its magnitude depends on (1) the number of elements that get shifted
and (2) whether it triggers a reallocation.
</para></listitem>
<listitem><para><emphasis>Synthetic measurements:</emphasis>
We will measure the relative difference between similar operations on
different containers. We plan to write a battery of small tests that
compare the times of the executions of similar methods on different
containers. The idea is to run these tests on the target machine.
If this training phase is very quick, we may decide to perform it at
library initialization time. The results can be cached on disk and reused
across runs.
</para></listitem>
<listitem><para><emphasis>Timers:</emphasis>
We plan to use timers for operations of larger granularity, such as sort.
For instance, we can switch between different sort methods on the fly
and report the one that performs best for each call context.
</para></listitem>
<listitem><para><emphasis>Show stoppers:</emphasis>
We may decide that the presence of an operation nullifies the advice.
For instance, when considering switching from <code>set</code> to
<code>unordered_set</code>, if we detect use of operator <code>++</code>,
we will simply not issue the advice, since this could signal that the use
case requires a sorted container.</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.design.reports" xreflabel="Reports"><info><title>Reports</title></info>
<para>
There are two types of reports. First, if we recognize a pattern for which
we have a substitute that is likely to give better performance, we print
the advice and estimated performance gain. The advice is usually associated
with a code position and possibly a call stack.
</para>
<para>
Second, we report performance characteristics for which we do not have
a clear solution for improvement. For instance, we can point out to the user
the top 10 <code>multimap</code> locations
which have the worst data locality in actual traversals.
Although this does not offer a solution,
it helps the user focus on the key problems and ignore the uninteresting ones.
</para>
</section>
<section xml:id="manual.ext.profile_mode.design.testing" xreflabel="Testing"><info><title>Testing</title></info>
<para>
First, we want to make sure we preserve the behavior of the release mode.
You can just type <code>"make check-profile"</code>, which
builds and runs the whole test suite in profile mode.
</para>
<para>
Second, we want to test the correctness of each diagnostic.
We created a <code>profile</code> directory in the test suite.
Each diagnostic must come with at least two tests, one for false positives
and one for false negatives.
</para>
</section>
</section>
<section xml:id="manual.ext.profile_mode.api" xreflabel="API"><info><title>Extensions for Custom Containers</title></info>
<para>
Many large projects use their own data structures instead of the ones in the
standard library. If these data structures are similar in functionality
to the standard library, they can be instrumented with the same hooks
that are used to instrument the standard library.
The instrumentation API is exposed in file
<code>profiler.h</code> (look for "Instrumentation hooks").
</para>
</section>
<section xml:id="manual.ext.profile_mode.cost_model" xreflabel="Cost Model"><info><title>Empirical Cost Model</title></info>
<para>
Currently, the cost model uses formulas with predefined relative weights
for alternative containers or container implementations. For instance,
iterating through a vector is X times faster than iterating through a list.
</para>
<para>
(Under development.)
We are working on customizing this to a particular machine by providing
an automated way to compute the actual relative weights for operations
on the given machine.
</para>
<para>
(Under development.)
We plan to provide a performance parameter database format that can be
filled in either by hand or by an automated training mechanism.
The analysis module will then use this database instead of the built-in
generic parameters.
</para>
</section>
<section xml:id="manual.ext.profile_mode.implementation" xreflabel="Implementation"><info><title>Implementation Issues</title></info>
<section xml:id="manual.ext.profile_mode.implementation.stack" xreflabel="Stack Traces"><info><title>Stack Traces</title></info>
<para>
Accurate stack traces are needed during profiling since we group events by
call context and dynamic instance. Without accurate traces, diagnostics
may be hard to interpret. For instance, when giving advice to the user
it is imperative to reference application code, not library code.
</para>
<para>
Currently we are using the libc <code>backtrace</code> routine to get
stack traces.
<code>_GLIBCXX_PROFILE_MAX_STACK_DEPTH</code> can be set
to 0 if you are willing to give up call context information, or to a small
positive value to reduce run time overhead.
</para>
</section>
<section xml:id="manual.ext.profile_mode.implementation.symbols" xreflabel="Symbolization"><info><title>Symbolization of Instruction Addresses</title></info>
<para>
The profiling and analysis phases use only instruction addresses.
An external utility such as addr2line is needed to postprocess the result.
We do not plan to add symbolization support in the profile extension.
This would require access to symbol tables, debug information tables,
external programs or libraries and other system dependent information.
</para>
</section>
<section xml:id="manual.ext.profile_mode.implementation.concurrency" xreflabel="Concurrency"><info><title>Concurrency</title></info>
<para>
Our current model is simplistic, but precise.
We cannot afford to approximate because some of our diagnostics require
precise matching of operations to container instance and call context.
During profiling, we keep a single information table per diagnostic.
There is a single lock per information table.
</para>
</section>
<section xml:id="manual.ext.profile_mode.implementation.stdlib-in-proflib" xreflabel="Using the Standard Library in the Runtime Library"><info><title>Using the Standard Library in the Instrumentation Implementation</title></info>
<para>
As much as we would like to avoid uses of libstdc++ within our
instrumentation library, containers such as unordered_map are very
appealing. We plan to use them as long as they are named properly
to avoid ambiguity.
</para>
</section>
<section xml:id="manual.ext.profile_mode.implementation.malloc-hooks" xreflabel="Malloc Hooks"><info><title>Malloc Hooks</title></info>
<para>
User applications/libraries can provide malloc hooks.
When the implementation of the malloc hooks uses libstdc++, there can
be an infinite cycle between the profile mode instrumentation and the
malloc hook code.
</para>
<para>
We protect against reentrance to the profile mode instrumentation code,
which should avoid this problem in most cases.
The protection mechanism is thread safe and exception safe.
This mechanism does not prevent reentrance to the malloc hook itself,
which could still result in deadlock, if, for instance, the malloc hook
uses non-recursive locks.
XXX: A definitive solution to this problem would be for the profile extension
to use a custom allocator internally, and perhaps not to use libstdc++.
</para>
</section>
<section xml:id="manual.ext.profile_mode.implementation.construction-destruction" xreflabel="Construction and Destruction of Global Objects"><info><title>Construction and Destruction of Global Objects</title></info>
<para>
The profiling library state is initialized at the first call to a profiling
method. This allows us to record the construction of all global objects.
However, we cannot do the same at destruction time. The trace is written
by a function registered by <code>atexit</code>, thus invoked by
<code>exit</code>.
</para>
</section>
</section>
<section xml:id="manual.ext.profile_mode.developer" xreflabel="Developer Information"><info><title>Developer Information</title></info>
<section xml:id="manual.ext.profile_mode.developer.bigpic" xreflabel="Big Picture"><info><title>Big Picture</title></info>
<para>The profile mode headers are included with
<code>-D_GLIBCXX_PROFILE</code> through preprocessor directives in
<code>include/std/*</code>.
</para>
<para>Instrumented implementations are provided in
<code>include/profile/*</code>. All instrumentation hooks are macros
defined in <code>include/profile/profiler.h</code>.
</para>
<para>All the implementation of the instrumentation hooks is in
<code>include/profile/impl/*</code>. Although all the code gets included,
thus is publicly visible, only a small number of functions are called from
outside this directory. All calls to hook implementations must be
done through macros defined in <code>profiler.h</code>. The macro
must ensure (1) that the call is guarded against reentrance and
(2) that the call can be turned off at compile time using a
<code>-D_GLIBCXX_PROFILE_...</code> compiler option.
</para>
</section>
<section xml:id="manual.ext.profile_mode.developer.howto" xreflabel="How To Add A Diagnostic"><info><title>How To Add A Diagnostic</title></info>
<para>Let's say the diagnostic name is "magic".
</para>
<para>If you need to instrument a header not already under
<code>include/profile/*</code>, first edit the corresponding header
under <code>include/std/</code> and add a preprocessor directive such
as the one in <code>include/std/vector</code>:
<programlisting>
#ifdef _GLIBCXX_PROFILE
# include &lt;profile/vector&gt;
#endif
</programlisting>
</para>
<para>If the file you need to instrument is not yet under
<code>include/profile/</code>, make a copy of the one in
<code>include/debug</code>, or the main implementation.
You'll need to include the main implementation and inherit the classes
you want to instrument. Then define the methods you want to instrument,
define the instrumentation hooks and add calls to them.
Look at <code>include/profile/vector</code> for an example.
</para>
<para>Add macros for the instrumentation hooks in
<code>include/profile/impl/profiler.h</code>.
Hook names must start with <code>__profcxx_</code>.
Make sure they expand
to no code with <code>-D_NO_GLIBCXX_PROFILE_MAGIC</code>.
Make sure all calls to any method in namespace <code>__gnu_profile</code>
are protected against reentrance using macro
<code>_GLIBCXX_PROFILE_REENTRANCE_GUARD</code>.
All names of methods in namespace <code>__gnu_profile</code> called from
<code>profiler.h</code> must start with <code>__trace_magic_</code>.
</para>
<para>Add the implementation of the diagnostic.
<itemizedlist>
<listitem><para>
Create new file <code>include/profile/impl/profiler_magic.h</code>.
</para></listitem>
<listitem><para>
Define class <code>__magic_info: public __object_info_base</code>.
This is the representation of a line in the object table.
The <code>__merge</code> method is used to aggregate information
across all dynamic instances created at the same call context.
The <code>__magnitude</code> method must return the estimation of the benefit
as a number of small operations, e.g., number of words copied.
The <code>__write</code> method is used to produce the raw trace.
The <code>__advice</code> method is used to produce the advice string.
</para></listitem>
<listitem><para>
Define class <code>__magic_stack_info: public __magic_info</code>.
This defines the content of a line in the stack table.
</para></listitem>
<listitem><para>
Define class <code>__trace_magic: public __trace_base&lt;__magic_info,
__magic_stack_info&gt;</code>.
It defines the content of the trace associated with this diagnostic.
</para></listitem>
</itemizedlist>
</para>
<para>Add initialization and reporting calls in
<code>include/profile/impl/profiler_trace.h</code>. Use
<code>__trace_vector_to_list</code> as an example.
</para>
<para>Add documentation in file <code>doc/xml/manual/profile_mode.xml</code>.
</para>
</section>
</section>
<section xml:id="manual.ext.profile_mode.diagnostics"><info><title>Diagnostics</title></info>
<para>
The table below presents all the diagnostics we intend to implement.
Each diagnostic has a corresponding compile time switch
<code>-D_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
Groups of related diagnostics can be turned on with a single switch.
For instance, <code>-D_GLIBCXX_PROFILE_LOCALITY</code> is equivalent to
<code>-D_GLIBCXX_PROFILE_SOFTWARE_PREFETCH
-D_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
</para>
<para>
The benefit, cost, expected frequency and accuracy of each diagnostic
were each given a grade from 1 to 10, where 10 is highest.
A high benefit means that, if the diagnostic is accurate, the expected
performance improvement is high.
A high cost means that turning this diagnostic on leads to high slowdown.
A high frequency means that we expect this to occur relatively often.
A high accuracy means that the diagnostic is unlikely to be wrong.
These grades are not perfect. They are just meant to guide users with
specific needs or time budgets.
</para>
<table frame="all">
<title>Profile Diagnostics</title>
<tgroup cols="7" align="left" colsep="1" rowsep="1">
<colspec colname="c1"/>
<colspec colname="c2"/>
<colspec colname="c3"/>
<colspec colname="c4"/>
<colspec colname="c5"/>
<colspec colname="c6"/>
<colspec colname="c7"/>
<thead>
<row>
<entry>Group</entry>
<entry>Flag</entry>
<entry>Benefit</entry>
<entry>Cost</entry>
<entry>Freq.</entry>
<entry>Accuracy</entry>
<entry>Implemented</entry>
</row>
</thead>
<tbody>
<row>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.containers">
CONTAINERS</link></entry>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.hashtable_too_small">
HASHTABLE_TOO_SMALL</link></entry>
<entry>10</entry>
<entry>1</entry>
<entry/>
<entry>10</entry>
<entry>yes</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.hashtable_too_large">
HASHTABLE_TOO_LARGE</link></entry>
<entry>5</entry>
<entry>1</entry>
<entry/>
<entry>10</entry>
<entry>yes</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.inefficient_hash">
INEFFICIENT_HASH</link></entry>
<entry>7</entry>
<entry>3</entry>
<entry/>
<entry>10</entry>
<entry>yes</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.vector_too_small">
VECTOR_TOO_SMALL</link></entry>
<entry>8</entry>
<entry>1</entry>
<entry/>
<entry>10</entry>
<entry>yes</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.vector_too_large">
VECTOR_TOO_LARGE</link></entry>
<entry>5</entry>
<entry>1</entry>
<entry/>
<entry>10</entry>
<entry>yes</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.vector_to_hashtable">
VECTOR_TO_HASHTABLE</link></entry>
<entry>7</entry>
<entry>7</entry>
<entry/>
<entry>10</entry>
<entry>no</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.hashtable_to_vector">
HASHTABLE_TO_VECTOR</link></entry>
<entry>7</entry>
<entry>7</entry>
<entry/>
<entry>10</entry>
<entry>no</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.vector_to_list">
VECTOR_TO_LIST</link></entry>
<entry>8</entry>
<entry>5</entry>
<entry/>
<entry>10</entry>
<entry>yes</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.list_to_vector">
LIST_TO_VECTOR</link></entry>
<entry>10</entry>
<entry>5</entry>
<entry/>
<entry>10</entry>
<entry>no</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.assoc_ord_to_unord">
ORDERED_TO_UNORDERED</link></entry>
<entry>10</entry>
<entry>5</entry>
<entry/>
<entry>10</entry>
<entry>only map/unordered_map</entry>
</row>
<row>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.algorithms">
ALGORITHMS</link></entry>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.algorithms.sort">
SORT</link></entry>
<entry>7</entry>
<entry>8</entry>
<entry/>
<entry>7</entry>
<entry>no</entry>
</row>
<row>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.locality">
LOCALITY</link></entry>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.locality.sw_prefetch">
SOFTWARE_PREFETCH</link></entry>
<entry>8</entry>
<entry>8</entry>
<entry/>
<entry>5</entry>
<entry>no</entry>
</row>
<row>
<entry/>
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.locality.linked">
RBTREE_LOCALITY</link></entry>
<entry>4</entry>
<entry>8</entry>
<entry/>
<entry>5</entry>
<entry>no</entry>
</row>
<row>
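<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.mthread">
MULTITHREADED</link></entry>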
<entry><link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#manual.ext.profile_mode.analysis.mthread.false_share">
FALSE_SHARING</link></entry>
<entry>8</entry>
<entry>10</entry>
<entry/>
<entry>10</entry>
<entry>no</entry>
</row>
</tbody>
</tgroup>
</table>
<section xml:id="manual.ext.profile_mode.analysis.template" xreflabel="Template"><info><title>Diagnostic Template</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_&lt;diagnostic&gt;</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> What problem will it diagnose?
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
What is the fundamental reason why this is a problem?</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>
Percentage reduction in execution time. When reduction is more than
a constant factor, describe the reduction rate formula.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>
What would the advice look like?</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis>
What libstdc++ components need to be instrumented?</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
How do we decide when to issue the advice?</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
How do we measure benefits? Math goes here.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
program code
...
advice sample
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.containers" xreflabel="Containers"><info><title>Containers</title></info>
<para>
<emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_CONTAINERS</code>.
</para>
<section xml:id="manual.ext.profile_mode.analysis.hashtable_too_small" xreflabel="Hashtable Too Small"><info><title>Hashtable Too Small</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_HASHTABLE_TOO_SMALL</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect hashtables with many
rehash operations, small construction size and large destruction size.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis> Rehash is very expensive.
Read content, follow chains within bucket, evaluate hash function, place at
new location in different order.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis> 36%.
Code similar to example below.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>
Set initial size to N at construction site S.
</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis>
<code>unordered_set, unordered_map</code> constructor, destructor, rehash.
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
For each dynamic instance of <code>unordered_[multi]set|map</code>,
record initial size and call context of the constructor.
Record size increase, if any, after each relevant operation such as insert.
Record the estimated rehash cost.</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Number of individual rehash operations * cost per rehash.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 unordered_set<int> us;
2 for (int k = 0; k < 1000000; ++k) {
3 us.insert(k);
4 }
foo.cc:1: advice: Changing initial unordered_set size from 10 to 1000000 saves 1025530 rehash operations.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.hashtable_too_large" xreflabel="Hashtable Too Large"><info><title>Hashtable Too Large</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_HASHTABLE_TOO_LARGE</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect hashtables which are
never filled up because fewer elements than reserved are ever
inserted.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis> Save memory, which
is good in itself and may also improve memory reference performance through
fewer cache and TLB misses.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis> unknown.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>
Set initial size to N at construction site S.
</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis>
<code>unordered_set, unordered_map</code> constructor, destructor, rehash.
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
For each dynamic instance of <code>unordered_[multi]set|map</code>,
record initial size and call context of the constructor, and correlate it
with its size at destruction time.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Number of iteration operations + memory saved.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 vector<unordered_set<int>> v(100000, unordered_set<int>(100)) ;
2 for (int k = 0; k < 100000; ++k) {
3 for (int j = 0; j < 10; ++j) {
4 v[k].insert(k + j);
5 }
6 }
foo.cc:1: advice: Changing initial unordered_set size from 100 to 10 saves N
bytes of memory and M iteration steps.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.inefficient_hash" xreflabel="Inefficient Hash"><info><title>Inefficient Hash</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_INEFFICIENT_HASH</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect hashtables with polarized
distribution.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis> A non-uniform
distribution may lead to long chains, thus possibly increasing complexity
by a factor up to the number of elements.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis> factor up
to container size.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis> Change hash function
for container built at site S. Distribution score = N. Access score = S.
Longest chain = C, in bucket B.
</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis>
<code>unordered_set, unordered_map</code> constructor, destructor, [],
insert, iterator.
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
Count the exact number of link traversals.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Total number of links traversed.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
class dumb_hash {
public:
size_t operator() (int i) const { return 0; }
};
...
unordered_set<int, dumb_hash> hs;
...
for (int i = 0; i < COUNT; ++i) {
hs.find(i);
}
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.vector_too_small" xreflabel="Vector Too Small"><info><title>Vector Too Small</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_VECTOR_TOO_SMALL</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect vectors with many
resize operations, small construction size and large destruction size.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>Resizing can be expensive.
Copying large amounts of data takes time. Resizing many small vectors may
have allocation overhead and affect locality.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>
Set initial size to N at construction site S.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis><code>vector</code>.
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
For each dynamic instance of <code>vector</code>,
record initial size and call context of the constructor.
Record size increase, if any, after each relevant operation such as
<code>push_back</code>. Record the estimated resize cost.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Total number of words copied * time to copy a word.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 vector<int> v;
2 for (int k = 0; k < 1000000; ++k) {
3 v.push_back(k);
4 }
foo.cc:1: advice: Changing initial vector size from 10 to 1000000 saves
copying 4000000 bytes and 20 memory allocations and deallocations.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.vector_too_large" xreflabel="Vector Too Large"><info><title>Vector Too Large</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_VECTOR_TOO_LARGE</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis>Detect vectors which are
never filled up because fewer elements than reserved are ever
inserted.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>Save memory, which
is good in itself and may also improve memory reference performance through
fewer cache and TLB misses.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>
Set initial size to N at construction site S.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis><code>vector</code>.
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
For each dynamic instance of <code>vector</code>,
record initial size and call context of the constructor, and correlate it
with its size at destruction time.</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Total amount of memory saved.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 vector<vector<int>> v(100000, vector<int>(100)) ;
2 for (int k = 0; k < 100000; ++k) {
3 for (int j = 0; j < 10; ++j) {
4 v[k].insert(k + j);
5 }
6 }
foo.cc:1: advice: Changing initial vector size from 100 to 10 saves N
bytes of memory and may reduce the number of cache and TLB misses.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.vector_to_hashtable" xreflabel="Vector to Hashtable"><info><title>Vector to Hashtable</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_VECTOR_TO_HASHTABLE</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect uses of
<code>vector</code> that can be substituted with <code>unordered_set</code>
to reduce execution time.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
Linear search in a vector is very expensive, whereas searching in a hashtable
is very quick.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>factor up
to container size.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>Replace
<code>vector</code> with <code>unordered_set</code> at site S.
</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis><code>vector</code>
operations and access methods.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
For each dynamic instance of <code>vector</code>,
record call context of the constructor. Issue the advice only if the
only methods called on this <code>vector</code> are <code>push_back</code>,
<code>insert</code> and <code>find</code>.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
(Cost(vector::push_back) + cost(vector::insert) + cost(find, vector)) -
(cost(unordered_set::insert) + cost(unordered_set::find)).
</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 vector<int> v;
...
2 for (int i = 0; i < 1000; ++i) {
3 find(v.begin(), v.end(), i);
4 }
foo.cc:1: advice: Changing "vector" to "unordered_set" will save about 500,000
comparisons.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.hashtable_to_vector" xreflabel="Hashtable to Vector"><info><title>Hashtable to Vector</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_HASHTABLE_TO_VECTOR</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect uses of
<code>unordered_set</code> that can be substituted with <code>vector</code>
to reduce execution time.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
Hashtable iterator is slower than vector iterator.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>95%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>Replace
<code>unordered_set</code> with <code>vector</code> at site S.
</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis><code>unordered_set</code>
operations and access methods.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
For each dynamic instance of <code>unordered_set</code>,
record call context of the constructor. Issue the advice only if the
number of <code>find</code>, <code>insert</code> and <code>[]</code>
operations on this <code>unordered_set</code> are small relative to the
number of elements, and methods <code>begin</code> or <code>end</code>
are invoked (suggesting iteration).</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Number of indirections saved when iterating.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 unordered_set<int> us;
...
2 int s = 0;
3 for (unordered_set<int>::iterator it = us.begin(); it != us.end(); ++it) {
4 s += *it;
5 }
foo.cc:1: advice: Changing "unordered_set" to "vector" will save about N
indirections and may achieve better data locality.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.vector_to_list" xreflabel="Vector to List"><info><title>Vector to List</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_VECTOR_TO_LIST</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect cases where
<code>vector</code> could be substituted with <code>list</code> for
better performance.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
Inserting in the middle of a vector is expensive compared to inserting in a
list.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>factor up to
container size.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>Replace vector with list
at site S.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis><code>vector</code>
operations and access methods.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
For each dynamic instance of <code>vector</code>,
record the call context of the constructor. Record the overhead of each
<code>insert</code> operation based on current size and insert position.
Report instance with high insertion overhead.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
(Sum(cost(vector::method)) - Sum(cost(list::method)), for
method in [push_back, insert, erase])
+ (Cost(iterate vector) - Cost(iterate list))</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 vector<int> v;
2 for (int i = 0; i < 10000; ++i) {
3 v.insert(v.begin(), i);
4 }
foo.cc:1: advice: Changing "vector" to "list" will save about 5,000,000
operations.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.list_to_vector" xreflabel="List to Vector"><info><title>List to Vector</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_LIST_TO_VECTOR</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect cases where
<code>list</code> could be substituted with <code>vector</code> for
better performance.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
Iterating through a vector is faster than through a list.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>64%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>Replace list with vector
at site S.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis> <code>list</code>
operations and access methods.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
Issue the advice if there are no <code>insert</code> operations.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
(Sum(cost(vector::method)) - Sum(cost(list::method)), for
method in [push_back, insert, erase])
+ (Cost(iterate vector) - Cost(iterate list))</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 list<int> l;
...
2 int sum = 0;
3 for (list<int>::iterator it = l.begin(); it != l.end(); ++it) {
4 sum += *it;
5 }
foo.cc:1: advice: Changing "list" to "vector" will save about 1000000 indirect
memory references.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.list_to_slist" xreflabel="List to Forward List"><info><title>List to Forward List (Slist)</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_LIST_TO_SLIST</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect cases where
<code>list</code> could be substituted with <code>forward_list</code> for
better performance.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
The memory footprint of a forward_list is smaller than that of a list.
This has beneficial effects on memory subsystem, e.g., fewer cache misses.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis> 40%.
Note that the reduction is only noticeable if the size of the forward_list
node is in fact smaller than that of the list node. For memory allocators
with size classes, you will only notice an effect when the two node sizes
belong to different allocator size classes.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>Replace list with
forward_list at site S.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis><code>list</code>
operations and iteration methods.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
Issue the advice if there are no <code>backwards</code> traversals
or insertion before a given node.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Always true.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 list<int> l;
...
2 int sum = 0;
3 for (list<int>::iterator it = l.begin(); it != l.end(); ++it) {
4 sum += *it;
5 }
foo.cc:1: advice: Change "list" to "forward_list".
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.assoc_ord_to_unord" xreflabel="Ordered to Unordered Associative Container"><info><title>Ordered to Unordered Associative Container</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_ORDERED_TO_UNORDERED</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect cases where ordered
associative containers can be replaced with unordered ones.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
Insert and search are quicker in a hashtable than in
a red-black tree.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>52%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>
Replace set with unordered_set at site S.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis>
<code>set</code>, <code>multiset</code>, <code>map</code>,
<code>multimap</code> methods.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
Issue the advice only if we are not using operator <code>++</code> on any
iterator on a particular <code>[multi]set|map</code>.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
(Sum(cost(hashtable::method)) - Sum(cost(rbtree::method)), for
method in [insert, erase, find])
+ (Cost(iterate hashtable) - Cost(iterate rbtree))</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 set<int> s;
2 for (int i = 0; i < 100000; ++i) {
3 s.insert(i);
4 }
5 int sum = 0;
6 for (int i = 0; i < 100000; ++i) {
7 sum += *s.find(i);
8 }
</programlisting>
</para></listitem>
</itemizedlist>
</section>
</section>
<section xml:id="manual.ext.profile_mode.analysis.algorithms" xreflabel="Algorithms"><info><title>Algorithms</title></info>
<para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_ALGORITHMS</code>.
</para>
<section xml:id="manual.ext.profile_mode.analysis.algorithms.sort" xreflabel="Sorting"><info><title>Sort Algorithm Performance</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_SORT</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Give measure of sort algorithm
performance based on actual input. For instance, advise Radix Sort over
Quick Sort for a particular call context.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
See papers:
<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://portal.acm.org/citation.cfm?doid=1065944.1065981">
A framework for adaptive algorithm selection in STAPL</link> and
<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://ieeexplore.ieee.org/search/wrapper.jsp?arnumber=4228227">
Optimizing Sorting with Machine Learning Algorithms</link>.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>60%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis> Change sort algorithm
at site S from X Sort to Y Sort.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis> <code>sort</code>
algorithm.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
Issue the advice if the cost model tells us that another sort algorithm
would do better on this input. Requires us to know what algorithm we
are using in our sort implementation in release mode.</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Runtime(algo) for algo in [radix, quick, merge, ...]</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
</programlisting>
</para></listitem>
</itemizedlist>
</section>
</section>
<section xml:id="manual.ext.profile_mode.analysis.locality" xreflabel="Data Locality"><info><title>Data Locality</title></info>
<para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_LOCALITY</code>.
</para>
<section xml:id="manual.ext.profile_mode.analysis.locality.sw_prefetch" xreflabel="Need Software Prefetch"><info><title>Need Software Prefetch</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_SOFTWARE_PREFETCH</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Discover sequences of indirect
memory accesses that are not regular, thus cannot be predicted by
hardware prefetchers.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
Indirect references are hard to predict and are very expensive when they
miss in caches.</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>25%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis> Insert prefetch
instruction.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis> Vector iterator and
access operator [].
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
First, get cache line size and page size from system.
Then record iterator dereference sequences for which the value is a pointer.
For each sequence within a container, issue a warning if successive pointer
addresses are not within cache lines and do not form a linear pattern
(otherwise they may be prefetched by hardware).
If they also step across page boundaries, make the warning stronger.
</para>
<para>The same analysis applies to containers other than vector.
However, we cannot give the same advice for linked structures, such as list,
as there is no random access to the n-th element. The user may still be
able to benefit from this information, for instance by employing frays (user
level light weight threads) to hide the latency of chasing pointers.
</para>
<para>
This analysis is a little oversimplified. A better cost model could be
created by understanding the capability of the hardware prefetcher.
This model could be trained automatically by running a set of synthetic
cases.
</para>
</listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Total distance between pointer values of successive elements in vectors
of pointers.</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 int zero = 0;
2 vector<int*> v(10000000, &zero);
3 for (int k = 0; k < 10000000; ++k) {
4 v[random() % 10000000] = new int(k);
5 }
6 for (int j = 0; j < 10000000; ++j) {
7 count += (*v[j] == 0 ? 0 : 1);
8 }
foo.cc:7: advice: Insert prefetch instruction.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.locality.linked" xreflabel="Linked Structure Locality"><info><title>Linked Structure Locality</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_RBTREE_LOCALITY</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Give measure of locality of
objects stored in linked structures (lists, red-black trees and hashtables)
with respect to their actual traversal patterns.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>Allocation can be tuned
to a specific traversal pattern, to result in better data locality.
See paper:
<link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://www.springerlink.com/content/8085744l00x72662/">
Custom Memory Allocation for Free</link>.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis>30%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis>
High scatter score N for container built at site S.
Consider changing allocation sequence or choosing a structure conscious
allocator.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis> Methods of all
containers using linked structures.</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
First, get cache line size and page size from system.
Then record the number of successive elements that are on different line
or page, for each traversal method such as <code>find</code>. Give advice
only if the ratio between this number and the number of total node hops
is above a threshold.</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Sum(same_cache_line(this,previous))</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 set&lt;int&gt; s;
2 for (int i = 0; i &lt; 10000000; ++i) {
3 s.insert(i);
4 }
5 set&lt;int&gt; s1, s2;
6 for (int i = 0; i &lt; 10000000; ++i) {
7 s1.insert(i);
8 s2.insert(i);
9 }
...
// Fast, better locality.
10 for (set&lt;int&gt;::iterator it = s.begin(); it != s.end(); ++it) {
11 sum += *it;
12 }
// Slow, elements are further apart.
13 for (set&lt;int&gt;::iterator it = s1.begin(); it != s1.end(); ++it) {
14 sum += *it;
15 }
foo.cc:5: advice: High scatter score NNN for set built here. Consider changing
the allocation sequence or switching to a structure conscious allocator.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
</section>
<section xml:id="manual.ext.profile_mode.analysis.mthread" xreflabel="Multithreaded Data Access"><info><title>Multithreaded Data Access</title></info>
<para>
The diagnostics in this group are not meant to be implemented in the short term.
They require compiler support to know when container elements are written
to. Instrumentation can only tell us when elements are referenced.
</para>
<para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_MULTITHREADED</code>.
</para>
<section xml:id="manual.ext.profile_mode.analysis.mthread.ddtest" xreflabel="Dependence Violations at Container Level"><info><title>Data Dependence Violations at Container Level</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_DDTEST</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect container elements
that are referenced from multiple threads in the parallel region or
across parallel regions.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis>
Sharing data between threads requires communication and perhaps locking,
which may be expensive.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis> ?%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis> Change data
distribution or parallel algorithm.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis> Container access methods
and iterators.
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
Keep a shadow for each container. Record iterator dereferences and
container member accesses. Issue advice for elements referenced by
multiple threads.
See paper: <link xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="http://portal.acm.org/citation.cfm?id=207110.207148">
The LRPD test: speculative run-time parallelization of loops with
privatization and reduction parallelization</link>.
</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Number of accesses to elements referenced from multiple threads.
</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
</programlisting>
</para></listitem>
</itemizedlist>
</section>
<section xml:id="manual.ext.profile_mode.analysis.mthread.false_share" xreflabel="False Sharing"><info><title>False Sharing</title></info>
<itemizedlist>
<listitem><para><emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_FALSE_SHARING</code>.
</para></listitem>
<listitem><para><emphasis>Goal:</emphasis> Detect elements in the
same container which share a cache line, are written by at least one
thread, and accessed by different threads.
</para></listitem>
<listitem><para><emphasis>Fundamentals:</emphasis> Under these assumptions,
cache protocols require
communication to invalidate lines, which may be expensive.
</para></listitem>
<listitem><para><emphasis>Sample runtime reduction:</emphasis> 68%.
</para></listitem>
<listitem><para><emphasis>Recommendation:</emphasis> Reorganize container
or use padding to avoid false sharing.</para></listitem>
<listitem><para><emphasis>To instrument:</emphasis> Container access methods
and iterators.
</para></listitem>
<listitem><para><emphasis>Analysis:</emphasis>
First, get the cache line size.
For each shared container, record all the associated iterator dereferences
and member access methods with the thread id. Compare the address lists
across threads to detect references in two different threads to the same
cache line. Issue a warning only if the ratio to total references is
significant. Do the same for iterator dereference values if they are
pointers.</para></listitem>
<listitem><para><emphasis>Cost model:</emphasis>
Number of accesses to same cache line from different threads.
</para></listitem>
<listitem><para><emphasis>Example:</emphasis>
<programlisting>
1 vector&lt;int&gt; v(2, 0);
2 #pragma omp parallel for shared(v, SIZE) schedule(static, 1)
3 for (i = 0; i &lt; SIZE; ++i) {
4 v[i % 2] += i;
5 }
OMP_NUM_THREADS=2 ./a.out
foo.cc:1: advice: Change container structure or padding to avoid false
sharing in multithreaded access at foo.cc:4. Detected N shared cache lines.
</programlisting>
</para></listitem>
</itemizedlist>
</section>
</section>
<section xml:id="manual.ext.profile_mode.analysis.statistics" xreflabel="Statistics"><info><title>Statistics</title></info>
<para>
<emphasis>Switch:</emphasis>
<code>_GLIBCXX_PROFILE_STATISTICS</code>.
</para>
<para>
In some cases the cost model may not tell us anything because the costs
appear to offset the benefits. Consider the choice between a vector and
a list. When there are both inserts and iteration, advice may not be
issued automatically. However, the programmer may still be able to make use
of this information in a different way.
</para>
<para>
This diagnostic will not issue any advice, but it will print statistics for
each container construction site. The statistics will contain the cost
of each operation actually performed on the container.
</para>
</section>
</section>
<bibliography xml:id="profile_mode.biblio"><info><title>Bibliography</title></info>
<biblioentry>
<citetitle>
Perflint: A Context Sensitive Performance Advisor for C++ Programs
</citetitle>
<author><personname><firstname>Lixia</firstname><surname>Liu</surname></personname></author>
<author><personname><firstname>Silvius</firstname><surname>Rus</surname></personname></author>
<copyright>
<year>2009</year>
<holder/>
</copyright>
<publisher>
<publishername>
Proceedings of the 2009 International Symposium on Code Generation
and Optimization
</publishername>
</publisher>
</biblioentry>
</bibliography>
</chapter>