timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
configuration.hpp
Go to the documentation of this file.
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25/** \file timemory/ert/configuration.hpp
26 * \headerfile timemory/ert/configuration.hpp "timemory/ert/configuration.hpp"
27 * Provides the configuration for executing the empirical roofline toolkit (ERT)
28 *
29 */
30
31#pragma once
32
33#include "timemory/backends/device.hpp"
34#include "timemory/components/cuda/backends.hpp"
35#include "timemory/defines.h"
38#include "timemory/ert/data.hpp"
42
43#include <cstdint>
44#include <functional>
45
46// default vectorization width
47#if !defined(TIMEMORY_VEC)
48# define TIMEMORY_VEC 256
49#endif
50
51#if !defined(TIMEMORY_USER_ERT_FLOPS)
52# define TIMEMORY_USER_ERT_FLOPS
53#endif
54
55namespace tim
56{
57namespace ert
58{
59//======================================================================================//
60
61template <typename DeviceT, typename Tp, typename CounterT>
63{
66 using device_t = DeviceT;
67 using counter_t = CounterT;
69 using ert_data_ptr_t = std::shared_ptr<ert_data_t>;
71 using get_uint64_t = std::function<uint64_t()>;
72 using skip_ops_t = std::unordered_set<size_t>;
73 using get_skip_ops_t = std::function<skip_ops_t()>;
74
75 //----------------------------------------------------------------------------------//
76
78 {
79 static get_uint64_t _instance = []() {
82 // for checking if gpu
83 static constexpr bool is_gpu = device::is_gpu<DeviceT>::value;
84 return (is_gpu) ? settings::ert_num_threads_gpu()
86 };
87 return _instance;
88 }
89
90 //----------------------------------------------------------------------------------//
91
93 {
94 static get_uint64_t _instance = []() { return settings::ert_num_streams(); };
95 return _instance;
96 }
97
98 //----------------------------------------------------------------------------------//
99
101 {
102 static get_uint64_t _instance = []() { return settings::ert_grid_size(); };
103 return _instance;
104 }
105
106 //----------------------------------------------------------------------------------//
107
109 {
110 static get_uint64_t _instance = []() { return settings::ert_block_size(); };
111 return _instance;
112 }
113
114 //----------------------------------------------------------------------------------//
115
117 {
118 static get_uint64_t _instance = []() {
119 return std::max<uint64_t>(settings::ert_alignment(), 8 * sizeof(Tp));
120 };
121 return _instance;
122 }
123
124 //----------------------------------------------------------------------------------//
125
127 {
128 static get_uint64_t _instance = []() {
131 static constexpr bool is_gpu = device::is_gpu<DeviceT>::value;
132 return (is_gpu) ? settings::ert_min_working_size_gpu()
134 };
135 return _instance;
136 }
137
138 //----------------------------------------------------------------------------------//
139
141 {
142 static get_uint64_t _instance = []() -> uint64_t {
145 static constexpr bool is_gpu = device::is_gpu<DeviceT>::value;
146 if(is_gpu)
147 {
149 }
150 {
151 return 2 * ert::cache_size::get_max();
152 }
153 };
154 return _instance;
155 }
156
157 //----------------------------------------------------------------------------------//
158
160 {
161 static get_skip_ops_t _instance = []() {
162 auto _skipstr = settings::ert_skip_ops();
163 auto _skipstrvec = delimit(_skipstr, ",; \t");
164 skip_ops_t _result;
165 for(const auto& itr : _skipstrvec)
166 {
167 if(itr.find_first_not_of("0123456789") == std::string::npos)
168 _result.insert(atol(itr.c_str()));
169 }
170 return _result;
171 };
172 return _instance;
173 }
174
175 //----------------------------------------------------------------------------------//
176 /// configure the number of threads, number of streams, block size, grid size, and
177 /// alignment
178 template <typename Dev = DeviceT,
180 static void configure(uint64_t nthreads, uint64_t alignment = sizeof(Tp),
181 uint64_t nstreams = 0, uint64_t block_size = 0,
182 uint64_t grid_size = 0)
183 {
184 get_num_threads() = [=]() -> uint64_t { return nthreads; };
185 get_num_streams() = [=]() -> uint64_t { return nstreams; };
186 get_grid_size() = [=]() -> uint64_t { return grid_size; };
187 get_block_size() = [=]() -> uint64_t { return block_size; };
188 get_alignment() = [=]() -> uint64_t { return alignment; };
189 }
190
191 //----------------------------------------------------------------------------------//
192 /// configure the number of threads, number of streams, block size, grid size, and
193 /// alignment
194 template <typename Dev = DeviceT,
196 static void configure(uint64_t nthreads, uint64_t alignment = sizeof(Tp),
197 uint64_t nstreams = 1, uint64_t block_size = 1024,
198 uint64_t grid_size = 0)
199 {
200 get_num_threads() = [=]() -> uint64_t { return nthreads; };
201 get_num_streams() = [=]() -> uint64_t { return nstreams; };
202 get_grid_size() = [=]() -> uint64_t { return grid_size; };
203 get_block_size() = [=]() -> uint64_t { return block_size; };
204 get_alignment() = [=]() -> uint64_t { return alignment; };
205 }
206
207 //----------------------------------------------------------------------------------//
208
210 {
211 static executor_func_t _instance = [](ert_data_ptr_t data) {
212 using lli = long long int;
213 // configuration sizes
214 auto _mws_size = get_min_working_size()();
215 auto _max_size = get_max_data_size()();
216 auto _num_thread = get_num_threads()();
217 auto _num_stream = get_num_streams()();
218 auto _grid_size = get_grid_size()();
219 auto _block_size = get_block_size()();
220 auto _align_size = get_alignment()();
221 auto _skip_ops = get_skip_ops()();
222
223 // execution parameters
224 exec_params params(_mws_size, _max_size, _num_thread, _num_stream, _grid_size,
225 _block_size);
226 // operation _counter instance
227 ert_counter_t _counter(params, data, _align_size);
228
229 // set bytes per element
230 _counter.bytes_per_element = sizeof(Tp);
231 // set number of memory accesses per element from two functions
232 _counter.memory_accesses_per_element = 2;
233
234 for(const auto& itr : _skip_ops)
235 _counter.add_skip_ops(itr);
236
237 auto dtype = demangle(typeid(Tp).name());
238
239 std::string _dev_name{};
240 if(std::is_same<DeviceT, device::cpu>::value)
241 _dev_name = "[device::cpu]";
242 else if(std::is_same<DeviceT, device::gpu>::value)
243 _dev_name = "[device::gpu]";
244
246 {
247 printf("[ert::executor]%s> "
248 "working-set = %lli, max-size = %lli, num-thread = %lli, "
249 "num-stream = "
250 "%lli, grid-size = %lli, block-size = %lli, align-size = %lli, "
251 "data-type "
252 "= %s\n",
253 _dev_name.c_str(), (lli) _mws_size, (lli) _max_size,
254 (lli) _num_thread, (lli) _num_stream, (lli) _grid_size,
255 (lli) _block_size, (lli) _align_size, dtype.c_str());
256 }
257
258 return _counter;
259 };
260 return _instance;
261 }
262
263public:
264 bool verbose = false;
273
274public:
275 TIMEMORY_DEFAULT_OBJECT(configuration)
276};
277
278//======================================================================================//
279
280template <typename DeviceT, typename Tp, typename CounterT>
282{
283 static_assert(!std::is_same<DeviceT, device::gpu>::value,
284 "Error! Device should not be gpu");
285
286 //----------------------------------------------------------------------------------//
287 // useful aliases
288 //
289 using device_type = DeviceT;
290 using value_type = Tp;
294 using callback_type = std::function<void(counter_type&)>;
296
297public:
298 TIMEMORY_DEFAULT_OBJECT(executor)
299
300 //----------------------------------------------------------------------------------//
301 // standard invocation with no callback specialization
302 //
/// Construct the executor and run it immediately by forwarding \p config and
/// \p _data to operator(). The pointer returned by operator() is discarded here,
/// so results are only reachable through the \p _data instance the caller passed.
303 executor(configuration_type& config, std::shared_ptr<ert_data_t> _data)
304 {
305 (*this)(config, _data);
306 }
307
308 //----------------------------------------------------------------------------------//
309 // specialize the counter callback
310 //
/// Construct the executor and run it immediately, additionally forwarding
/// \p _counter_callback to operator(), which converts it to a
/// std::function<void(uint64_t, counter_type&)> and installs it on the counter.
/// The pointer returned by operator() is discarded here.
311 template <typename FuncT>
312 executor(configuration_type& config, std::shared_ptr<ert_data_t> _data,
313 FuncT&& _counter_callback)
314 {
315 (*this)(config, _data, std::forward<FuncT>(_counter_callback));
316 }
317
318 //----------------------------------------------------------------------------------//
319 // execute
320 //
/// Run the roofline kernels for this executor.
///
/// \param config             configuration whose `executor` function object builds the
///                           counter for the given data
/// \param _data              storage for the results; taken by value, so when it is empty
///                           the instance allocated below is only visible to the caller
///                           through the return value
/// \param _counter_callback  optional per-counter callback; the default-constructed
///                           std::function is empty and is therefore not installed
/// \return the (possibly newly allocated) data pointer
///
/// Any std::exception thrown while building or running the counter is reported on
/// std::cerr and swallowed; \p _data is still returned.
321 template <typename FuncT = std::function<void(uint64_t, counter_type&)>>
322 auto operator()(configuration_type& config, std::shared_ptr<ert_data_t> _data = {},
323 FuncT&& _counter_callback = FuncT{})
324 {
// type-erase the user callback so it can be tested for emptiness below
325 std::function<void(uint64_t, counter_type&)> _cb =
326 std::forward<FuncT>(_counter_callback);
327 try
328 {
// allocate result storage on demand
329 if(!_data)
330 _data.reset(new ert_data_t);
// build the counter from the configuration
331 auto _counter = config.executor(_data);
// only install a callback that actually holds a target
332 if(_cb)
333 _counter.set_callback(std::move(_cb));
// run the kernels through `callback` (its declaration is not in the visible
// listing; presumably the executor's callback_type data member -- confirm)
334 callback(_counter);
// NOTE(review): the exception is only read; catching by const reference would be
// the conventional form
335 } catch(std::exception& e)
336 {
337 std::cerr << "\n\nEXCEPTION:\n";
338 std::cerr << "\t" << e.what() << "\n\n" << std::endl;
339 }
340 return _data;
341 }
342
343public:
344 //----------------------------------------------------------------------------------//
345 //
347
348public:
349 //----------------------------------------------------------------------------------//
350 //
352 {
353 static callback_type _instance = [](counter_type& _counter) {
354 this_type::execute(_counter);
355 };
356 return _instance;
357 }
358
359 //----------------------------------------------------------------------------------//
360 //
/// Default kernel sequence: a single-op "scalar_add" run followed by a "vector_fma" run.
/// The FMA run uses TIMEMORY_USER_ERT_FLOPS when ops_main reports success for it and
/// otherwise falls back to op counts derived from the vector width.
361 static void execute(counter_type& _counter)
362 {
363 // number of Tp elements that fit into one TIMEMORY_VEC-bit vector
364 static constexpr const int SIZE_BITS = sizeof(Tp) * 8;
365 static_assert(SIZE_BITS > 0, "Calculated bits size is not greater than zero");
366 static constexpr const int VEC = TIMEMORY_VEC / SIZE_BITS;
367 static_assert(VEC > 0, "Calculated vector size is zero");
368
369 // kernels handed to ops_main: a store plus the two arithmetic operations
370 auto store_func = [](Tp& a, const Tp& b) { a = b; };
371 auto add_func = [](Tp& a, const Tp& b, const Tp& c) { a = b + c; };
372 auto fma_func = [](Tp& a, const Tp& b, const Tp& c) { a = a * b + c; };
373
374 // set bytes per element
375 _counter.bytes_per_element = sizeof(Tp);
376 // set number of memory accesses per element from two functions
377 _counter.memory_accesses_per_element = 2;
378
379 // set the label
380 _counter.label = "scalar_add";
381 // run the kernels
382 ops_main<1>(_counter, add_func, store_func);
383
384 // set the label
385 _counter.label = "vector_fma";
386 // run the user-specified op counts first (TIMEMORY_USER_ERT_FLOPS is defined empty
387 // by default in this file); if ops_main returns false, run the defaults instead
// NOTE(review): VEC / 2 evaluates to 0 when VEC == 1; how ops_main treats a zero
// entry is not visible here -- confirm
387 if(!ops_main<TIMEMORY_USER_ERT_FLOPS>(_counter, fma_func, store_func))
388 ops_main<VEC / 2, VEC, 2 * VEC, 4 * VEC>(_counter, fma_func, store_func);
389 }
390
391 //----------------------------------------------------------------------------------//
392 // user specified flops as template parameters
/// Run execute_impl once per entry of the \c Flops pack, using one label per entry.
///
/// \param _counter counter passed to every execute_impl call
/// \param _labels  one label per Flops value (taken by value); labels are consumed
///                 through a post-incremented iterator
/// \return true if at least one execute_impl call returned true
///
/// Assumes TIMEMORY_FOLD_EXPRESSION expands the pack left to right so that labels
/// pair with Flops values in declaration order -- TODO confirm against the macro.
393 template <size_t... Flops, enable_if_t<(sizeof...(Flops) > 0)> = 0>
394 static bool execute(counter_type& _counter,
395 std::array<std::string, sizeof...(Flops)> _labels)
396 {
397 bool _executed = false;
398 auto itr = _labels.begin();
399 TIMEMORY_FOLD_EXPRESSION(_executed |= execute_impl<Flops>(_counter, *(itr++)));
400 return _executed;
401 }
402
403private:
404 //----------------------------------------------------------------------------------//
405 //
/// Overload selected when \c Flops == 1: runs ops_main<Flops> with the addition kernel
/// and labels the counter with \p _label.
/// \return the result of ops_main
406 template <size_t Flops, enable_if_t<Flops == 1> = 0>
407 static bool execute_impl(counter_type& _counter, const std::string& _label)
408 {
409 // kernels handed to ops_main
410 auto store_func = [](Tp& a, const Tp& b) { a = b; };
411 auto add_func = [](Tp& a, const Tp& b, const Tp& c) { a = b + c; };
412
413 // set bytes per element
414 _counter.bytes_per_element = sizeof(Tp);
415 // set number of memory accesses per element from two functions
416 _counter.memory_accesses_per_element = 2;
417
418 // set the label
419 _counter.label = _label;
420 // run the kernels
421 return ops_main<Flops>(_counter, add_func, store_func);
422 }
423
424 //----------------------------------------------------------------------------------//
425 //
/// Overload selected when \c Flops > 1: runs ops_main<Flops> with the fused
/// multiply-add kernel (a = a * b + c) and labels the counter with \p _label.
/// \return the result of ops_main
426 template <size_t Flops, enable_if_t<(Flops > 1)> = 0>
427 static bool execute_impl(counter_type& _counter, const std::string& _label)
428 {
429 // kernels handed to ops_main
430 auto store_func = [](Tp& a, const Tp& b) { a = b; };
431 auto fma_func = [](Tp& a, const Tp& b, const Tp& c) { a = a * b + c; };
432
433 // set bytes per element
434 _counter.bytes_per_element = sizeof(Tp);
435 // set number of memory accesses per element from two functions
436 _counter.memory_accesses_per_element = 2;
437
438 // set the label
439 _counter.label = _label;
// run the kernels
440 return ops_main<Flops>(_counter, fma_func, store_func);
441 }
442};
443
444//======================================================================================//
445
446template <typename Tp, typename CounterT>
447struct executor<device::gpu, Tp, CounterT>
448{
449 using DeviceT = device::gpu;
450 static_assert(std::is_same<DeviceT, device::gpu>::value,
451 "Error! Device should be gpu");
452
453 //----------------------------------------------------------------------------------//
454 // useful aliases
455 //
456 using device_type = device::gpu;
457 using value_type = Tp;
461 using callback_type = std::function<void(counter_type&)>;
463
464public:
465 TIMEMORY_DEFAULT_OBJECT(executor)
466
467 //----------------------------------------------------------------------------------//
468 // standard invocation with no callback specialization
469 //
/// Construct the GPU executor and run it immediately by forwarding \p config and
/// \p _data to operator(). The pointer returned by operator() is discarded here,
/// so results are only reachable through the \p _data instance the caller passed.
470 executor(configuration_type& config, std::shared_ptr<ert_data_t> _data)
471 {
472 (*this)(config, _data);
473 }
474
475 //----------------------------------------------------------------------------------//
476 // specialize the counter callback
477 //
/// Construct the GPU executor and run it immediately, additionally forwarding
/// \p _counter_callback to operator(), which converts it to a
/// std::function<void(uint64_t, counter_type&)> and installs it on the counter.
/// The pointer returned by operator() is discarded here.
478 template <typename FuncT>
479 executor(configuration_type& config, std::shared_ptr<ert_data_t> _data,
480 FuncT&& _counter_callback)
481 {
482 (*this)(config, _data, std::forward<FuncT>(_counter_callback));
483 }
484
485 //----------------------------------------------------------------------------------//
486 // execute
487 //
/// Run the roofline kernels for the GPU executor.
///
/// \param config             configuration whose `executor` function object builds the
///                           counter for the given data
/// \param _data              storage for the results; taken by value, so when it is empty
///                           the instance allocated below is only visible to the caller
///                           through the return value
/// \param _counter_callback  optional per-counter callback; the default-constructed
///                           std::function is empty and is therefore not installed
/// \return the (possibly newly allocated) data pointer
///
/// Any std::exception thrown while building or running the counter is reported on
/// std::cerr and swallowed; \p _data is still returned.
488 template <typename FuncT = std::function<void(uint64_t, counter_type&)>>
489 auto operator()(configuration_type& config, std::shared_ptr<ert_data_t> _data = {},
490 FuncT&& _counter_callback = FuncT{})
491 {
// type-erase the user callback so it can be tested for emptiness below
492 std::function<void(uint64_t, counter_type&)> _cb =
493 std::forward<FuncT>(_counter_callback);
494 try
495 {
// allocate result storage on demand
496 if(!_data)
497 _data.reset(new ert_data_t);
// build the counter from the configuration
498 auto _counter = config.executor(_data);
// only install a callback that actually holds a target
499 if(_cb)
500 _counter.set_callback(std::move(_cb));
// run the kernels through `callback` (its declaration is not in the visible
// listing; presumably the executor's callback_type data member -- confirm)
501 callback(_counter);
// NOTE(review): the exception is only read; catching by const reference would be
// the conventional form
502 } catch(std::exception& e)
503 {
504 std::cerr << "\n\nEXCEPTION:\n";
505 std::cerr << "\t" << e.what() << "\n\n" << std::endl;
506 }
507 return _data;
508 }
509
510public:
511 //----------------------------------------------------------------------------------//
512 //
514
515public:
516 //----------------------------------------------------------------------------------//
517 //
519 {
520 static callback_type _instance = [](counter_type& _counter) {
521 this_type::execute(_counter);
522 };
523 return _instance;
524 }
525
526 //----------------------------------------------------------------------------------//
527 // The enclosing parent function for an extended __host__ __device__
528 // lambda must allow its address to be taken
/// Default GPU kernel sequence: a single-op "scalar_add" run followed by a
/// "vector_fma" run. The FMA run uses TIMEMORY_USER_ERT_FLOPS when ops_main reports
/// success for it and otherwise falls back to the fixed list 4, 16, 64, 128, 256, 512.
/// The kernels are lambdas declared with TIMEMORY_DEVICE_LAMBDA (the macro's expansion
/// is not visible in this listing).
529 static void execute(counter_type& _counter)
530 {
531 // kernels handed to ops_main
532 auto store_func = [] TIMEMORY_DEVICE_LAMBDA(Tp & a, const Tp& b) { a = b; };
533 auto add_func = [] TIMEMORY_DEVICE_LAMBDA(Tp & a, const Tp& b, const Tp& c) {
534 a = b + c;
535 };
536 // auto mult_func = [] TIMEMORY_LAMBDA(Tp & a, const Tp& b, const Tp& c) {
537 // a = b * c;
538 //};
539 auto fma_func = [] TIMEMORY_DEVICE_LAMBDA(Tp & a, const Tp& b, const Tp& c) {
540 a = a * b + c;
541 };
542
543 // set bytes per element
544 _counter.bytes_per_element = sizeof(Tp);
545 // set number of memory accesses per element from two functions
546 _counter.memory_accesses_per_element = 2;
547
548 // set the label
549 _counter.label = "scalar_add";
550 // run the kernels
551 ops_main<1>(_counter, add_func, store_func);
552
553 // (disabled) multiplication-only run, kept for reference
554 // _counter.label = "vector_mult";
555 // run the kernels
556 // ops_main<4, 16, 64, 128, 256, 512>(_counter, mult_func, store_func);
557
558 // set the label
559 _counter.label = "vector_fma";
560 // run the user-specified op counts first (TIMEMORY_USER_ERT_FLOPS is defined empty
// by default in this file); if ops_main returns false, run the fixed defaults
561 if(!ops_main<TIMEMORY_USER_ERT_FLOPS>(_counter, fma_func, store_func))
562 ops_main<4, 16, 64, 128, 256, 512>(_counter, fma_func, store_func);
563 }
564
565 //----------------------------------------------------------------------------------//
566 // user specified flops as template parameters
/// Run execute_impl once per entry of the \c Flops pack, using one label per entry.
///
/// \param _counter counter passed to every execute_impl call
/// \param _labels  one label per Flops value (taken by value); labels are consumed
///                 through a post-incremented iterator
/// \return true if at least one execute_impl call returned true
///
/// Assumes TIMEMORY_FOLD_EXPRESSION expands the pack left to right so that labels
/// pair with Flops values in declaration order -- TODO confirm against the macro.
567 template <size_t... Flops, enable_if_t<(sizeof...(Flops) > 0)> = 0>
568 static bool execute(counter_type& _counter,
569 std::array<std::string, sizeof...(Flops)> _labels)
570 {
571 bool _executed = false;
572 auto itr = _labels.begin();
573 TIMEMORY_FOLD_EXPRESSION(_executed |= execute_impl<Flops>(_counter, *(itr++)));
574 return _executed;
575 }
576
577public:
578 //----------------------------------------------------------------------------------//
579 //
580 template <size_t Flops>
582 const std::string& _label)
583 {
584 // functions
585 auto store_func = [] TIMEMORY_DEVICE_LAMBDA(Tp & a, const Tp& b) { a = b; };
586 auto add_func = [] TIMEMORY_DEVICE_LAMBDA(Tp & a, const Tp& b, const Tp& c) {
587 a = b + c;
588 };
589
590 // set bytes per element
591 _counter.bytes_per_element = sizeof(Tp);
592 // set number of memory accesses per element from two functions
593 _counter.memory_accesses_per_element = 2;
594
595 // set the label
596 _counter.label = _label;
597 // run the kernels
598 return ops_main<Flops>(_counter, add_func, store_func);
599 }
600
601 //----------------------------------------------------------------------------------//
602 //
/// Overload selected when \c Flops > 1: runs ops_main<Flops> with the fused
/// multiply-add kernel (a = a * b + c) and labels the counter with \p _label.
/// \return the result of ops_main
603 template <size_t Flops>
604 static enable_if_t<(Flops > 1), bool> execute_impl(counter_type& _counter,
605 const std::string& _label)
606 {
607 // kernels handed to ops_main
608 auto store_func = [] TIMEMORY_DEVICE_LAMBDA(Tp & a, const Tp& b) { a = b; };
609 auto fma_func = [] TIMEMORY_DEVICE_LAMBDA(Tp & a, const Tp& b, const Tp& c) {
610 a = a * b + c;
611 };
612
613 // set bytes per element
614 _counter.bytes_per_element = sizeof(Tp);
615 // set number of memory accesses per element from two functions
616 _counter.memory_accesses_per_element = 2;
617
618 // set the label
619 _counter.label = _label;
// run the kernels
620 return ops_main<Flops>(_counter, fma_func, store_func);
621 }
622};
623
624//======================================================================================//
625/// for variadic expansion to set the callback
626///
627template <typename ExecutorT>
629{
/// Replace the callback obtained from the static ExecutorT::get_callback() with \p f.
/// NOTE(review): \p f is a forwarding reference but is assigned as an lvalue, so it is
/// copied rather than forwarded.
630 template <typename FuncT>
631 callback(FuncT&& f)
632 {
633 ExecutorT::get_callback() = f;
634 }
635
/// Replace the `callback` member of the given executor instance \p _exec with \p f;
/// only that instance is modified.
/// NOTE(review): \p f is a forwarding reference but is assigned as an lvalue, so it is
/// copied rather than forwarded.
636 template <typename FuncT>
637 callback(ExecutorT& _exec, FuncT&& f)
638 {
639 _exec.callback = f;
640 }
641};
642
643//======================================================================================//
644
/// Run the roofline executor for a single data type \c Tp (overload enabled when the
/// trailing \c Types pack is empty).
///
/// \param _data storage for the results; defaults to a freshly allocated DataType
/// \return the same pointer that was passed in
///
/// A default-constructed configuration is used, and the executor is created as a
/// temporary whose constructor performs the run.
/// NOTE(review): the executor receives \p _data by value and allocates its own storage
/// when handed an empty pointer, so an explicitly empty \p _data appears to make this
/// function return an empty pointer and lose the results -- confirm.
645template <typename DeviceT, typename CounterT, typename Tp, typename... Types,
646 typename DataType = exec_data<CounterT>,
647 typename DataPtr = std::shared_ptr<DataType>,
648 typename std::enable_if<(sizeof...(Types) == 0), int>::type = 0>
649std::shared_ptr<DataType>
650execute(std::shared_ptr<DataType> _data = std::make_shared<DataType>())
651{
652 using ConfigType = configuration<DeviceT, Tp, CounterT>;
653 using ExecType = executor<DeviceT, Tp, CounterT>;
654
655 ConfigType _config{};
// constructing the temporary executor runs the kernels
656 ExecType(_config, _data);
657
658 return _data;
659}
660
661//======================================================================================//
662
/// Run the roofline executor for several data types (overload enabled when the
/// trailing \c Types pack is non-empty): executes for \c Tp first, then recurses on
/// the remaining \c Types, passing the same \p _data pointer to every call.
///
/// \param _data storage shared by all runs; defaults to a freshly allocated DataType
/// \return the same pointer that was passed in
663template <typename DeviceT, typename CounterT, typename Tp, typename... Types,
664 typename DataType = exec_data<CounterT>,
665 typename DataPtr = std::shared_ptr<DataType>,
666 typename std::enable_if<(sizeof...(Types) > 0), int>::type = 0>
667std::shared_ptr<DataType>
668execute(std::shared_ptr<DataType> _data = std::make_shared<DataType>())
669{
// head of the type list
670 execute<DeviceT, CounterT, Tp>(_data);
// remaining types (recursion ends at the single-type overload)
671 execute<DeviceT, CounterT, Types...>(_data);
672 return _data;
673}
674
675//======================================================================================//
676
677} // namespace ert
678
679} // namespace tim
#define TIMEMORY_DEVICE_LAMBDA
Definition: attributes.hpp:180
void add_skip_ops(size_t _Nops)
Definition: counter.hpp:280
std::string label
Definition: counter.hpp:300
int memory_accesses_per_element
Definition: counter.hpp:296
#define TIMEMORY_VEC
STL namespace.
bool ops_main(counter< DeviceT, Tp, CounterT > &_counter, OpsFuncT &&ops_func, StoreFuncT &&store_func)
This is the "main" function for ERT.
Definition: kernels.hpp:159
std::shared_ptr< DataType > execute(std::shared_ptr< DataType > _data=std::make_shared< DataType >())
Definition: kokkosp.cpp:39
ert_num_streams
Definition: settings.cpp:1756
ert_min_working_size
Definition: settings.cpp:1764
ert_grid_size
Definition: settings.cpp:1758
ert_max_data_size
Definition: settings.cpp:1770
ert_min_working_size_cpu
Definition: settings.cpp:1766
ert_num_threads_gpu
Definition: settings.cpp:1754
ert_max_data_size_gpu
Definition: settings.cpp:1774
ert_alignment
Definition: settings.cpp:1762
typename std::enable_if< B, T >::type enable_if_t
Alias template for enable_if.
Definition: types.hpp:190
ert_min_working_size_gpu
Definition: settings.cpp:1768
std::string demangle(const char *_mangled_name, int *_status=nullptr)
Definition: demangle.hpp:47
ert_block_size
Definition: settings.cpp:1760
ert_num_threads_cpu
Definition: settings.cpp:1752
tim::mpl::apply< std::string > string
Definition: macros.hpp:53
ert_num_threads
Definition: settings.cpp:1750
ert_skip_ops
Definition: settings.cpp:1776
ContainerT delimit(const std::string &line, const std::string &delimiters="\"',;: ", PredicateT &&predicate=[](const std::string &s) -> std::string { return s;})
Definition: delimit.hpp:68
for variadic expansion to set the callback
callback(ExecutorT &_exec, FuncT &&f)
static get_skip_ops_t & get_skip_ops()
static get_uint64_t & get_alignment()
static get_uint64_t & get_block_size()
std::function< skip_ops_t()> get_skip_ops_t
static void configure(uint64_t nthreads, uint64_t alignment=sizeof(Tp), uint64_t nstreams=1, uint64_t block_size=1024, uint64_t grid_size=0)
configure the number of threads, number of streams, block size, grid size, and alignment
std::function< uint64_t()> get_uint64_t
static get_uint64_t & get_min_working_size()
static get_uint64_t & get_grid_size()
static get_uint64_t & get_max_data_size()
executor_func_t executor
counter< device_t, Tp, counter_t > ert_counter_t
static get_uint64_t & get_num_streams()
std::function< ert_counter_t(ert_data_ptr_t)> executor_func_t
static get_uint64_t & get_num_threads()
std::unordered_set< size_t > skip_ops_t
static void configure(uint64_t nthreads, uint64_t alignment=sizeof(Tp), uint64_t nstreams=0, uint64_t block_size=0, uint64_t grid_size=0)
configure the number of threads, number of streams, block size, grid size, and alignment
static executor_func_t & get_executor()
std::shared_ptr< ert_data_t > ert_data_ptr_t
static void execute(counter_type &_counter)
static bool execute(counter_type &_counter, std::array< std::string, sizeof...(Flops)> _labels)
std::function< void(counter_type &)> callback_type
auto operator()(configuration_type &config, std::shared_ptr< ert_data_t > _data={}, FuncT &&_counter_callback=FuncT{})
static enable_if_t< Flops==1, bool > execute_impl(counter_type &_counter, const std::string &_label)
executor(configuration_type &config, std::shared_ptr< ert_data_t > _data, FuncT &&_counter_callback)
static enable_if_t<(Flops > 1), bool > execute_impl(counter_type &_counter, const std::string &_label)
static callback_type & get_callback()
std::function< void(counter_type &)> callback_type
executor(configuration_type &config, std::shared_ptr< ert_data_t > _data, FuncT &&_counter_callback)
callback_type callback
counter< device_type, value_type, CounterT > counter_type
static void execute(counter_type &_counter)
exec_data< CounterT > ert_data_t
static bool execute(counter_type &_counter, std::array< std::string, sizeof...(Flops)> _labels)
auto operator()(configuration_type &config, std::shared_ptr< ert_data_t > _data={}, FuncT &&_counter_callback=FuncT{})
#define TIMEMORY_FOLD_EXPRESSION(...)
Definition: types.hpp:56