27#include "timemory/backends/papi.hpp"
31#include "timemory/components/roofline/backends.hpp"
69template <
typename... Types>
71:
public base<cpu_roofline<Types...>, std::pair<std::vector<long long>, double>>
74 "Error! No CPU roofline support for gpu::fp16_t");
104 template <
typename Tp>
106 template <
typename Tp>
108 template <
typename Tp>
110 template <
typename Tp>
119 static_assert(std::tuple_size<ert_config_t>::value ==
120 std::tuple_size<types_tuple>::value,
121 "Error! ert_config_t size does not match types_tuple size!");
156 static bool _instance =
true;
175 (_env ==
"op" || _env ==
"hw" || _env ==
"counters")
177 : ((_env ==
"ai" || _env ==
"ac" || _env ==
"activity") ?
MODE::AI
181 std::cerr <<
"[" << demangle<this_type>()
182 <<
"]> roofline mode: " << ((_val ==
MODE::OP) ?
"op" :
"ai")
188 static MODE _instance = _get();
214 static auto _instance = []() {
224 for(
const auto& itr : _extra_events)
225 _events.push_back(itr);
233 _events.push_back(PAPI_SP_OPS);
235 _events.push_back(PAPI_DP_OPS);
245 _events.push_back(PAPI_LD_INS);
246 _events.push_back(PAPI_SR_INS);
247 _events.push_back(PAPI_LST_INS);
248 _events.push_back(PAPI_TOT_INS);
255 for(
const auto& itr : _extra_events)
256 _events.push_back(itr);
277 is_configured() =
true;
278 for(
auto&& itr : _events)
289 PRINT_HERE(
"%s",
"global initialization of cpu_roofline");
298 PRINT_HERE(
"%s",
"thread initialization of cpu_roofline");
308 template <
typename Tp,
typename FuncT>
320 if(_store && _store->
size() > 0)
327 std::cout << *(ert_data) << std::endl;
333 template <
typename Archive>
338 _ert_data = std::make_shared<ert_data_t>();
339 ar(cereal::make_nvp(
"roofline", *_ert_data));
367 auto _units = m_papi_vector->display_unit_array();
368 _units.push_back(m_wall_clock->display_unit());
398 return "Model used to provide performance relative to the peak possible "
399 "performance on a CPU architecture.";
406 auto hwcount = m_papi_vector->record();
407 auto duration = m_wall_clock->record();
416 m_papi_vector = std::make_shared<papi_vector>();
417 m_wall_clock = std::make_shared<wall_clock>();
418 std::tie(value.second, accum.second) = std::make_pair(0, 0);
429 TIMEMORY_NODISCARD
std::vector<
double>
get()
const
431 auto _data = m_papi_vector->get();
432 _data.push_back(m_wall_clock->get());
440 m_wall_clock->start();
441 m_papi_vector->start();
442 value =
value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
449 m_papi_vector->stop();
450 m_wall_clock->stop();
451 value =
value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
452 accum +=
value_type{ m_papi_vector->get_accum(), m_wall_clock->get_accum() };
459 if(rhs.value.first.size() > value.first.size())
460 value.first.resize(rhs.value.first.size());
461 if(rhs.accum.first.size() > accum.first.size())
462 accum.first.resize(rhs.accum.first.size());
472 if(rhs.value.first.size() > value.first.size())
473 value.first.resize(rhs.value.first.size());
474 if(rhs.accum.first.size() > accum.first.size())
475 accum.first.resize(rhs.accum.first.size());
484 using base_type::accum;
488 using base_type::value;
492 trait::uses_value_storage<this_type, value_type>::value>;
510 auto& _obj = obj.
load();
511 std::stringstream sst;
512 auto t_value = _obj.second;
520 sst << std::setw(t_width) << std::setprecision(t_prec) << t_value;
522 sst <<
" " << t_disp;
524 sst <<
" " << t_label;
532 auto _value = obj.
get();
539 std::cout <<
"value: " << _value << std::endl;
540 std::cout <<
"label: " << _label << std::endl;
541 std::cout <<
"displ: " << _disp << std::endl;
544 assert(_value.size() <= _label.size());
545 assert(_value.size() <= _disp.size());
547 auto n = _label.size();
548 for(
size_t i = 0; i < n; ++i)
550 std::stringstream ss_value;
551 std::stringstream ss_extra;
552 ss_value.setf(_flags);
553 ss_value << std::setw(_width) << std::setprecision(_prec) << _value.at(i);
554 if(!_disp.at(i).empty())
556 ss_extra <<
" " << _disp.at(i);
558 else if(!_label.at(i).empty())
560 ss_extra <<
" " << _label.at(i);
562 os << sst.str() << ss_value.str() << ss_extra.str();
572 template <
typename Archive>
573 void load(Archive& ar,
const unsigned int)
578 ar(cereal::make_nvp(
"laps",
laps), cereal::make_nvp(
"labels", labels),
579 cereal::make_nvp(
"papi_vector", m_papi_vector));
580 ar(cereal::make_nvp(
"value", value));
581 ar(cereal::make_nvp(
"accum", accum));
586 template <
typename Archive>
587 void save(Archive& ar,
const unsigned int)
const
592 ar(cereal::make_nvp(
"laps",
laps), cereal::make_nvp(
"display", _disp),
595 cereal::make_nvp(
"labels", labels),
596 cereal::make_nvp(
"papi_vector", m_papi_vector));
599 ar.setNextName(
"repr_data");
601 auto litr = labels.begin();
602 auto ditr = data.begin();
603 for(; litr != labels.end() && ditr != data.end(); ++litr, ++ditr)
604 ar(cereal::make_nvp(*litr,
double(*ditr)));
607 ar(cereal::make_nvp(
"value", value));
608 ar(cereal::make_nvp(
"accum", accum));
618 strvec_t arr = m_papi_vector->label_array();
619 arr.push_back(
"Runtime");
628 strvec_t arr = m_papi_vector->description_array();
629 arr.push_back(
"Runtime");
637 strvec_t arr = m_papi_vector->display_unit_array();
647 auto arr = m_papi_vector->unit_array();
656 std::shared_ptr<papi_vector> m_papi_vector{
nullptr };
657 std::shared_ptr<wall_clock> m_wall_clock{
nullptr };
667 static bool& is_configured()
669 static thread_local bool _instance =
false;
the namespace provides overloads to output complex data types w/ streams
std::string demangle(const char *_mangled_name, int *_status=nullptr)
tim::mpl::apply< std::string > string
const std::string std::ostream * os
typename impl::is_one_of< Tp, Types > is_one_of
check if type is in expansion
static short get_precision()
static int64_t get_unit()
static std::string get_label()
void set_stopped()
store that stop has been called
static std::string get_display_unit()
static fmtflags get_format_flags()
storage< Tp, Value > storage_type
void set_started()
store that start has been called
Combines hardware counters and timers and executes the empirical roofline toolkit during application ...
strvec_t description_array() const
std::vector< int64_t > unit_array() const
std::function< value_type()> record_type
std::tuple< Types... > types_tuple
std::pair< array_type, double > value_type
display_unit_type display_unit()
static void thread_init()
std::vector< int > event_type
static ert_data_ptr_t get_ert_data()
static events_callback_t & get_events_callback()
replace this callback to add in custom HW counters
friend std::ostream & operator<<(std::ostream &os, const this_type &obj)
std::function< intvec_t(const MODE &)> events_callback_t
std::vector< double > get_display() const
static ert_config_t & get_finalizer()
static std::string description()
std::vector< long long > array_type
static void thread_finalize()
static std::string label()
this_type & operator+=(const this_type &rhs)
typename count_type::ratio_t ratio_t
void load(Archive &ar, const unsigned int)
std::shared_ptr< ert_data_t > ert_data_ptr_t
cpu_roofline & operator=(const cpu_roofline &)=default
static void global_init()
std::vector< int > intvec_t
typename array_type::const_iterator const_iterator
static std::string get_type_string()
display_unit_type get_display_unit()
cpu_roofline(cpu_roofline &&rhs) noexcept=default
std::tuple< ert_callback_type< Types >... > ert_callback_t
typename trait::units< this_type >::display_type display_unit_type
static std::string get_mode_string()
strvec_t label_array() const
std::tuple< ert_config_type< Types >... > ert_config_t
static void set_executor_callback(FuncT &&f)
static MODE & event_mode()
std::vector< std::string > strvec_t
static void extra_serialization(Archive &ar)
typename array_type::iterator iterator
std::tuple< ert_executor_type< Types >... > ert_executor_t
std::vector< double > get() const
strvec_t display_unit_array() const
cpu_roofline & operator=(cpu_roofline &&) noexcept=default
static void global_finalize(storage_type *_store)
static event_type get_events()
static const short precision
typename trait::units< this_type >::type unit_type
cpu_roofline(const cpu_roofline &rhs)=default
static bool & use_predefined_enums()
set to false to suppress adding predefined enumerations
cpu_roofline< Types... > this_type
this_type & operator-=(const this_type &rhs)
void save(Archive &ar, const unsigned int) const
std::tuple< ert_counter_type< Types >... > ert_counter_t
A very lightweight storage class which provides nothing.
constexpr size_t size() const
static void add_event(int evt)
for variadic expansion to set the callback
static callback_type & get_callback()
static string_t join(SepT &&separator, Tuple &&__tup, index_sequence< Idx... >) noexcept
This operation attempts to call a member function which the component provides to internally store wh...
This operation attempts to call a member function which the component provides to internally store wh...