77namespace stan {
88namespace math {
99
10+ // Internal macro used to modify global pointer definition to the
11+ // global AD instance.
12+ #ifdef STAN_THREADS
13+ // Whenever STAN_THREADS is set a TLS keyword is used. For reasons
14+ // explained below we use the GNU compiler extension __thread if
15+ // supported by the compiler while the generic thread_local C++11
16+ // keyword is used otherwise.
17+ #ifdef __GNUC__
18+ #define STAN_THREADS_DEF __thread
19+ #else
20+ #define STAN_THREADS_DEF thread_local
21+ #endif
22+ #else
23+ // In case STAN_THREADS is not set, then no modifier is needed.
24+ #define STAN_THREADS_DEF
25+ #endif
26+
1027/* *
11- * Provides a thread_local singleton if needed. Read warnings below!
12- * For performance reasons the singleton is a global static for the
13- * case of no threading which is returned by a function. This design
14- * should allow the compiler to apply necessary inlining to get
15- * maximal performance. However, this design suffers from "the static
16- * init order fiasco"[0]. Anywhere this is used, we must be
17- * absolutely positive that it doesn't matter when the singleton will
18- * get initialized relative to other static variables. In exchange,
19- * we get a more performant singleton pattern for the non-threading
20- * case. In the threading case we use the defacto standard C++11
21- * singleton pattern relying on a function wrapping a static local
22- * variable. This standard pattern is expected to be well supported
23- * by the major compilers (as its standard), but it does incur some
24- * performance penalty. There has been some discussion on this; see
25- * [1] and [2] and the discussions those PRs link to as well.
28+ * This struct always provides access to the autodiff stack using
29+ * the singleton pattern. Read warnings below!
30+ *
31+ * The singleton <code>instance_</code> is a global static pointer,
32+ * which is thread local (TLS) if the STAN_THREADS preprocess variable
33+ * is defined.
2634 *
27- * These are thread_local only if the user asks for it with
28- * -DSTAN_THREADS. This is primarily because Apple clang compilers
29- * before 2016 don't support thread_local and the additional
30- * performance cost. We have proposed removing support for those[3],
31- * and at that time we should evaluate the performance of a switch to
32- * thread_local. If there is no loss in performance, we can remove
33- * this ifdef.
35+ * The use of a pointer is motivated by performance reasons for the
36+ * threading case. When a TLS is used, initialization with a constant
37+ * expression at compile time is required for fast access to the
38+ * TLS. As the autodiff storage struct is non-POD, its initialization
39+ * is dynamic (performed at run time). Dynamic initializations are
40+ * wrapped, in the TLS case, by a TLS wrapper function which slows down
41+ * access. Using a pointer instead allows initialization at
42+ * compile time to <code>nullptr</code>, which is a compile time
43+ * constant. In this case, the compiler avoids the use of a TLS
44+ * wrapper function.
45+ *
46+ * For performance reasons we use the __thread keyword on compilers
47+ * which support it. The __thread keyword is a GNU compiler-specific
48+ * (gcc, clang, Intel) extension which requires initialization with a
49+ * compile time constant expression. The C++11 keyword thread_local
50+ * does allow for constant and dynamic initialization of the
51+ * TLS. Thus, only the __thread keyword guarantees that constant
52+ * initialization, and its implied speedup, is used.
53+ *
54+ * The initialization of the AD instance at run-time is handled by the
55+ * lifetime of an AutodiffStackSingleton object. More specifically, the
56+ * first instance of the AutodiffStackSingleton object will initialize
57+ * the AD instance and take ownership (it is the only instance
58+ * with the private member own_instance_ being true). Thus, whenever
59+ * the first instance of the AutodiffStackSingleton object gets
60+ * destructed, the AD tape will be destructed as well. Within
61+ * stan-math the initialization of the AD instance for the main thread
62+ * of the program is handled by instantiating the singleton once in
63+ * the init_chainablestack.hpp file. Whenever STAN_THREADS is defined
64+ * then all created child threads must instantiate an
65+ * AutodiffStackSingleton object within the child thread before
66+ * accessing the AD system in order to initialize the TLS AD tape
67+ * within the child thread.
68+ *
69+ * The design of a globally held (optionally TLS) pointer, which is
70+ * globally initialized, allows the compiler to apply necessary
71+ * inlining to get maximal performance. However, the design suffers
72+ * from "the static init order fiasco"[0]. Whenever the static init
73+ * order fiasco occurs, the C++ client of the library may instantiate
74+ * an AutodiffStackSingleton object at the adequate code position prior
75+ * to any AD tape access to ensure proper initialization order. In
76+ * exchange, we get a more performant singleton pattern with automatic
77+ * initialization of the AD stack for the main thread. There has been
78+ * some discussion on earlier designs using the Meyers singleton
79+ * approach; see [1] and [2] and the discussions those PRs link to as
80+ * well.
3481 *
3582 * [0] https://isocpp.org/wiki/faq/ctors#static-init-order
3683 * [1] https://github.com/stan-dev/math/pull/840
3784 * [2] https://github.com/stan-dev/math/pull/826
3885 * [3]
3986 * http://discourse.mc-stan.org/t/potentially-dropping-support-for-older-versions-of-apples-version-of-clang/3780/
87+ * [4] https://github.com/stan-dev/math/pull/1135
4088 */
4189template <typename ChainableT, typename ChainableAllocT>
4290struct AutodiffStackSingleton {
4391 typedef AutodiffStackSingleton<ChainableT, ChainableAllocT>
4492 AutodiffStackSingleton_t;
4593
94+ AutodiffStackSingleton () : own_instance_(init()) {}
95+ ~AutodiffStackSingleton () {
96+ if (own_instance_) {
97+ delete instance_;
98+ instance_ = nullptr ;
99+ }
100+ }
101+
46102 struct AutodiffStackStorage {
47103 AutodiffStackStorage &operator =(const AutodiffStackStorage &) = delete ;
48104
@@ -57,30 +113,32 @@ struct AutodiffStackSingleton {
57113 std::vector<size_t > nested_var_alloc_stack_starts_;
58114 };
59115
60- AutodiffStackSingleton () = delete ;
61116 explicit AutodiffStackSingleton (AutodiffStackSingleton_t const &) = delete;
62117 AutodiffStackSingleton &operator =(const AutodiffStackSingleton_t &) = delete ;
63118
64- static inline AutodiffStackStorage &instance () {
65- #ifdef STAN_THREADS
66- thread_local static AutodiffStackStorage instance_;
67- #endif
68- return instance_;
119+ static inline constexpr AutodiffStackStorage &instance () {
120+ return *instance_;
69121 }
70122
71- #ifndef STAN_THREADS
72-
73123 private:
74- static AutodiffStackStorage instance_;
75- #endif
124+ static bool init () {
125+ if (!instance_) {
126+ instance_ = new AutodiffStackStorage ();
127+ return true ;
128+ }
129+ return false ;
130+ }
131+
132+ static STAN_THREADS_DEF AutodiffStackStorage *instance_;
133+ const bool own_instance_;
76134};
77135
78- #ifndef STAN_THREADS
79136template <typename ChainableT, typename ChainableAllocT>
80- typename AutodiffStackSingleton<ChainableT,
81- ChainableAllocT>::AutodiffStackStorage
82- AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_;
83- #endif
137+ STAN_THREADS_DEF
138+ typename AutodiffStackSingleton<ChainableT,
139+ ChainableAllocT>::AutodiffStackStorage
140+ *AutodiffStackSingleton<ChainableT, ChainableAllocT>::instance_
141+ = nullptr ;
84142
85143} // namespace math
86144} // namespace stan
0 commit comments