33#include "timemory/backends/device.hpp"
34#include "timemory/components/cuda/backends.hpp"
35#include "timemory/defines.h"
// Fallback vector width, used only when the build has not defined TIMEMORY_VEC.
// Further down it is divided by the element size in bits to obtain VEC.
47#if !defined(TIMEMORY_VEC)
48# define TIMEMORY_VEC 256
// Fallback for the user-supplied FLOP counts: expands to nothing, so it
// contributes no template arguments where it is used.
51#if !defined(TIMEMORY_USER_ERT_FLOPS)
52# define TIMEMORY_USER_ERT_FLOPS
// Parameterized on the device tag, the element type, and the counter type.
61template <
typename DeviceT,
typename Tp,
typename CounterT>
// Compile-time flag taken from the device::is_gpu trait for DeviceT.
// NOTE(review): the three occurrences come from separate scopes whose
// surrounding lines are not visible here.
83 static constexpr bool is_gpu = device::is_gpu<DeviceT>::value;
131 static constexpr bool is_gpu = device::is_gpu<DeviceT>::value;
145 static constexpr bool is_gpu = device::is_gpu<DeviceT>::value;
// Twice the value reported by ert::cache_size::get_max().
151 return 2 * ert::cache_size::get_max();
// Split the skip-ops string on commas, semicolons, spaces, and tabs.
163 auto _skipstrvec =
delimit(_skipstr,
",; \t");
165 for(
const auto& itr : _skipstrvec)
// Keep only tokens that contain nothing but decimal digits, converting each
// with atol before inserting it into the result set.
// NOTE(review): an empty token also passes this test (find_first_not_of
// returns npos) and would insert 0 — confirm whether delimit() can yield
// empty tokens.
167 if(itr.find_first_not_of(
"0123456789") == std::string::npos)
168 _result.insert(atol(itr.c_str()));
// configure() overload whose stream count and block size both default to 0.
// NOTE(review): presumably selected by device type through the Dev parameter;
// the constraint itself is on a line not visible here — confirm.
178 template <
typename Dev = DeviceT,
181 uint64_t nstreams = 0, uint64_t
block_size = 0,
// configure() overload whose stream count defaults to 1 and block size to 1024.
194 template <
typename Dev = DeviceT,
197 uint64_t nstreams = 1, uint64_t
block_size = 1024,
// Shorthand for the cast that matches the %lli conversions in the printf below.
212 using lli =
long long int;
// Build the execution parameters from the working-set size, maximum size,
// thread count, stream count, grid size, and further arguments not visible here.
224 exec_params params(_mws_size, _max_size, _num_thread, _num_stream, _grid_size,
234 for(
const auto& itr : _skip_ops)
// Demangled name of the element type; passed as the last printf argument.
237 auto dtype =
demangle(
typeid(Tp).name());
// Label the report with the device tag that DeviceT matches.
240 if(std::is_same<DeviceT, device::cpu>::value)
241 _dev_name =
"[device::cpu]";
242 else if(std::is_same<DeviceT, device::gpu>::value)
243 _dev_name =
"[device::gpu]";
// Print the chosen settings; each numeric argument is cast to lli so that it
// matches its %lli conversion regardless of the variable's own integer type.
247 printf(
"[ert::executor]%s> "
248 "working-set = %lli, max-size = %lli, num-thread = %lli, "
250 "%lli, grid-size = %lli, block-size = %lli, align-size = %lli, "
253 _dev_name.c_str(), (lli) _mws_size, (lli) _max_size,
254 (lli) _num_thread, (lli) _num_stream, (lli) _grid_size,
255 (lli) _block_size, (lli) _align_size, dtype.c_str());
// Executor variant for non-GPU devices; the static_assert below rejects
// instantiation with device::gpu at compile time.
280template <
typename DeviceT,
typename Tp,
typename CounterT>
283 static_assert(!std::is_same<DeviceT, device::gpu>::value,
284 "Error! Device should not be gpu");
// Delegate to operator() with the configuration and data only.
305 (*this)(config, _data);
// Overload that additionally accepts a counter callback and forwards it
// unchanged to operator().
311 template <
typename FuncT>
313 FuncT&& _counter_callback)
315 (*this)(config, _data, std::forward<FuncT>(_counter_callback));
// operator(): the callback type defaults to a std::function taking a 64-bit
// unsigned integer and a counter reference, and the callback argument defaults
// to a value-initialized FuncT.
321 template <
typename FuncT = std::function<
void(u
int64_t, counter_type&)>>
323 FuncT&& _counter_callback = FuncT{})
326 std::forward<FuncT>(_counter_callback);
// Obtain the counter from the configuration's executor and hand it the callback.
331 auto _counter = config.
executor(_data);
333 _counter.set_callback(std::move(_cb));
// Report std::exception failures on stderr.
// NOTE(review): the remainder of the handler is not visible here.
335 }
catch(std::exception& e)
337 std::cerr <<
"\n\nEXCEPTION:\n";
338 std::cerr <<
"\t" << e.what() <<
"\n\n" << std::endl;
// Size of Tp in bits (assumes 8-bit bytes).
364 static constexpr const int SIZE_BITS =
sizeof(Tp) * 8;
365 static_assert(SIZE_BITS > 0,
"Calculated bits size is not greater than zero");
// Number of Tp elements that fit in TIMEMORY_VEC bits; integer division, so
// the assertion below fails if Tp is wider than TIMEMORY_VEC.
366 static constexpr const int VEC =
TIMEMORY_VEC / SIZE_BITS;
367 static_assert(VEC > 0,
"Calculated vector size is zero");
// Element-wise kernels passed to ops_main: plain store, add, and multiply-add.
370 auto store_func = [](Tp& a,
const Tp& b) { a = b; };
371 auto add_func = [](Tp& a,
const Tp& b,
const Tp& c) { a = b + c; };
372 auto fma_func = [](Tp& a,
const Tp& b,
const Tp& c) { a = a * b + c; };
// First pass: the add kernel with a single template argument of 1.
380 _counter.
label =
"scalar_add";
382 ops_main<1>(_counter, add_func, store_func);
// Second pass: the multiply-add kernel. The user-supplied FLOP counts are
// tried first; if that call returns false, fall back to counts derived from VEC.
385 _counter.
label =
"vector_fma";
387 if(!ops_main<TIMEMORY_USER_ERT_FLOPS>(_counter, fma_func, store_func))
388 ops_main<VEC / 2, VEC, 2 * VEC, 4 * VEC>(_counter, fma_func, store_func);
// Variadic overload, enabled only when at least one Flops value is given.
// It starts with _executed = false and an iterator at the first label; the
// lines that consume them are not visible here.
393 template <
size_t... Flops,
enable_if_t<(
sizeof...(Flops) > 0)> = 0>
397 bool _executed =
false;
398 auto itr = _labels.begin();
// Implementation selected when Flops == 1: uses the add kernel.
406 template <
size_t Flops, enable_if_t<Flops == 1> = 0>
410 auto store_func = [](Tp& a,
const Tp& b) { a = b; };
411 auto add_func = [](Tp& a,
const Tp& b,
const Tp& c) { a = b + c; };
// Record the element size, the accesses-per-element count, and the label on
// the counter before running.
414 _counter.bytes_per_element =
sizeof(Tp);
416 _counter.memory_accesses_per_element = 2;
419 _counter.label = _label;
421 return ops_main<Flops>(_counter, add_func, store_func);
// Implementation selected when Flops > 1: identical setup, but uses the
// multiply-add kernel.
426 template <
size_t Flops, enable_if_t<(Flops > 1)> = 0>
430 auto store_func = [](Tp& a,
const Tp& b) { a = b; };
431 auto fma_func = [](Tp& a,
const Tp& b,
const Tp& c) { a = a * b + c; };
434 _counter.bytes_per_element =
sizeof(Tp);
436 _counter.memory_accesses_per_element = 2;
439 _counter.label = _label;
440 return ops_main<Flops>(_counter, fma_func, store_func);
// Executor variant for the GPU device; the static_assert below requires
// DeviceT to be device::gpu.
// NOTE(review): DeviceT is not a template parameter here, so it is presumably
// fixed by the specialization or an alias on a line not visible — confirm.
446template <
typename Tp,
typename CounterT>
450 static_assert(std::is_same<DeviceT, device::gpu>::value,
451 "Error! Device should be gpu");
// Delegate to operator() with the configuration and data only.
472 (*this)(config, _data);
// Overload that additionally accepts a counter callback and forwards it
// unchanged to operator().
478 template <
typename FuncT>
480 FuncT&& _counter_callback)
482 (*this)(config, _data, std::forward<FuncT>(_counter_callback));
// operator(): the callback type defaults to a std::function taking a 64-bit
// unsigned integer and a counter reference, and the callback argument defaults
// to a value-initialized FuncT.
488 template <
typename FuncT = std::function<
void(u
int64_t, counter_type&)>>
490 FuncT&& _counter_callback = FuncT{})
493 std::forward<FuncT>(_counter_callback);
// Obtain the counter from the configuration's executor and hand it the callback.
498 auto _counter = config.
executor(_data);
500 _counter.set_callback(std::move(_cb));
// Report std::exception failures on stderr.
// NOTE(review): the remainder of the handler is not visible here.
502 }
catch(std::exception& e)
504 std::cerr <<
"\n\nEXCEPTION:\n";
505 std::cerr <<
"\t" << e.what() <<
"\n\n" << std::endl;
// First pass: the add kernel with a single template argument of 1.
// NOTE(review): add_func, store_func and fma_func are defined on lines not
// visible in this part of the listing.
549 _counter.
label =
"scalar_add";
551 ops_main<1>(_counter, add_func, store_func);
// Second pass: the multiply-add kernel. The user-supplied FLOP counts are
// tried first; if that call returns false, fall back to the fixed list
// 4, 16, 64, 128, 256, 512.
559 _counter.
label =
"vector_fma";
561 if(!ops_main<TIMEMORY_USER_ERT_FLOPS>(_counter, fma_func, store_func))
562 ops_main<4, 16, 64, 128, 256, 512>(_counter, fma_func, store_func);
// Variadic overload, enabled only when at least one Flops value is given.
// It starts with _executed = false and an iterator at the first label; the
// lines that consume them are not visible here.
567 template <
size_t... Flops,
enable_if_t<(
sizeof...(Flops) > 0)> = 0>
571 bool _executed =
false;
572 auto itr = _labels.begin();
// Single-Flops implementation that labels the counter and runs the add kernel.
// NOTE(review): the condition selecting this overload is not visible here.
580 template <
size_t Flops>
596 _counter.
label = _label;
598 return ops_main<Flops>(_counter, add_func, store_func);
// Single-Flops implementation that labels the counter and runs the
// multiply-add kernel.
603 template <
size_t Flops>
619 _counter.
label = _label;
620 return ops_main<Flops>(_counter, fma_func, store_func);
// Helper parameterized on the executor type whose callback it sets.
627template <
typename ExecutorT>
630 template <
typename FuncT>
// Assign f to the reference returned by ExecutorT::get_callback().
633 ExecutorT::get_callback() = f;
// Second templated member; its signature and body are not visible here.
636 template <
typename FuncT>
// Single-type overload: enabled only when no extra element types follow Tp
// (sizeof...(Types) == 0). If the caller passes no data object, a new
// DataType is created with std::make_shared.
645template <
typename DeviceT,
typename CounterT,
typename Tp,
typename... Types,
646 typename DataType = exec_data<CounterT>,
647 typename DataPtr = std::shared_ptr<DataType>,
648 typename std::enable_if<(
sizeof...(Types) == 0),
int>::type = 0>
649std::shared_ptr<DataType>
650execute(std::shared_ptr<DataType> _data = std::make_shared<DataType>())
// Value-initialize a configuration and construct an ExecType from it and _data.
// NOTE(review): ConfigType and ExecType are declared on lines not visible here.
655 ConfigType _config{};
656 ExecType(_config, _data);
// Variadic overload: enabled only when at least one extra element type
// follows Tp (sizeof...(Types) > 0).
663template <
typename DeviceT,
typename CounterT,
typename Tp,
typename... Types,
664 typename DataType = exec_data<CounterT>,
665 typename DataPtr = std::shared_ptr<DataType>,
666 typename std::enable_if<(
sizeof...(Types) > 0),
int>::type = 0>
667std::shared_ptr<DataType>
668execute(std::shared_ptr<DataType> _data = std::make_shared<DataType>())
// Run for Tp first, then call execute again for the remaining Types, passing
// the same _data object to both calls.
670 execute<DeviceT, CounterT, Tp>(_data);
671 execute<DeviceT, CounterT, Types...>(_data);
#define TIMEMORY_DEVICE_LAMBDA
void add_skip_ops(size_t _Nops)
int memory_accesses_per_element
bool ops_main(counter< DeviceT, Tp, CounterT > &_counter, OpsFuncT &&ops_func, StoreFuncT &&store_func)
This is the "main" function for ERT.
std::shared_ptr< DataType > execute(std::shared_ptr< DataType > _data=std::make_shared< DataType >())
typename std::enable_if< B, T >::type enable_if_t
Alias template for enable_if.
std::string demangle(const char *_mangled_name, int *_status=nullptr)
tim::mpl::apply< std::string > string
ContainerT delimit(const std::string &line, const std::string &delimiters="\"',;: ", PredicateT &&predicate=[](const std::string &s) -> std::string { return s;})
Helper used in variadic expansions to set the executor callback.
callback(ExecutorT &_exec, FuncT &&f)
static get_skip_ops_t & get_skip_ops()
get_uint64_t max_data_size
get_uint64_t min_working_size
static get_uint64_t & get_alignment()
static get_uint64_t & get_block_size()
std::function< skip_ops_t()> get_skip_ops_t
static void configure(uint64_t nthreads, uint64_t alignment=sizeof(Tp), uint64_t nstreams=1, uint64_t block_size=1024, uint64_t grid_size=0)
Configure the number of threads, number of streams, block size, grid size, and alignment.
std::function< uint64_t()> get_uint64_t
static get_uint64_t & get_min_working_size()
static get_uint64_t & get_grid_size()
static get_uint64_t & get_max_data_size()
counter< device_t, Tp, counter_t > ert_counter_t
static get_uint64_t & get_num_streams()
std::function< ert_counter_t(ert_data_ptr_t)> executor_func_t
static get_uint64_t & get_num_threads()
std::unordered_set< size_t > skip_ops_t
static void configure(uint64_t nthreads, uint64_t alignment=sizeof(Tp), uint64_t nstreams=0, uint64_t block_size=0, uint64_t grid_size=0)
Configure the number of threads, number of streams, block size, grid size, and alignment.
static executor_func_t & get_executor()
std::shared_ptr< ert_data_t > ert_data_ptr_t
static void execute(counter_type &_counter)
static bool execute(counter_type &_counter, std::array< std::string, sizeof...(Flops)> _labels)
std::function< void(counter_type &)> callback_type
auto operator()(configuration_type &config, std::shared_ptr< ert_data_t > _data={}, FuncT &&_counter_callback=FuncT{})
static callback_type & get_callback()
static enable_if_t< Flops==1, bool > execute_impl(counter_type &_counter, const std::string &_label)
executor(configuration_type &config, std::shared_ptr< ert_data_t > _data, FuncT &&_counter_callback)
static enable_if_t<(Flops > 1), bool > execute_impl(counter_type &_counter, const std::string &_label)
static callback_type & get_callback()
std::function< void(counter_type &)> callback_type
executor(configuration_type &config, std::shared_ptr< ert_data_t > _data, FuncT &&_counter_callback)
counter< device_type, value_type, CounterT > counter_type
static void execute(counter_type &_counter)
exec_data< CounterT > ert_data_t
static bool execute(counter_type &_counter, std::array< std::string, sizeof...(Flops)> _labels)
auto operator()(configuration_type &config, std::shared_ptr< ert_data_t > _data={}, FuncT &&_counter_callback=FuncT{})
#define TIMEMORY_FOLD_EXPRESSION(...)