27 #include "timemory/backends/papi.hpp"
33 #include "timemory/components/roofline/backends.hpp"
71 template <
typename... Types>
73 :
public base<cpu_roofline<Types...>, std::pair<std::vector<long long>, double>>
75 static_assert(!
is_one_of<cuda::fp16_t, std::tuple<Types...>>::value,
76 "Error! No CPU roofline support for cuda::fp16_t");
106 template <
typename Tp>
108 template <
typename Tp>
110 template <
typename Tp>
112 template <
typename Tp>
121 static_assert(std::tuple_size<ert_config_t>::value ==
122 std::tuple_size<types_tuple>::value,
123 "Error! ert_config_t size does not match types_tuple size!");
158 static bool _instance =
true;
177 (_env ==
"op" || _env ==
"hw" || _env ==
"counters")
179 : ((_env ==
"ai" || _env ==
"ac" || _env ==
"activity") ?
MODE::AI
183 std::cerr <<
"[" << demangle<this_type>()
184 <<
"]> roofline mode: " << ((_val ==
MODE::OP) ?
"op" :
"ai")
190 static MODE _instance = _get();
216 static auto _instance = []() {
226 for(
const auto& itr : _extra_events)
227 _events.push_back(itr);
235 _events.push_back(PAPI_SP_OPS);
237 _events.push_back(PAPI_DP_OPS);
247 _events.push_back(PAPI_LD_INS);
248 _events.push_back(PAPI_SR_INS);
249 _events.push_back(PAPI_LST_INS);
250 _events.push_back(PAPI_TOT_INS);
257 for(
const auto& itr : _extra_events)
258 _events.push_back(itr);
279 is_configured() =
true;
280 for(
auto&& itr : _events)
291 PRINT_HERE(
"%s",
"global initialization of cpu_roofline");
300 PRINT_HERE(
"%s",
"thread initialization of cpu_roofline");
310 template <
typename Tp,
typename FuncT>
322 if(_store && _store->size() > 0)
329 std::cout << *(ert_data) << std::endl;
335 template <
typename Archive>
340 _ert_data = std::make_shared<ert_data_t>();
341 ar(cereal::make_nvp(
"roofline", *_ert_data));
369 auto _units = m_papi_vector->display_unit_array();
370 _units.push_back(m_wall_clock->display_unit());
400 return "Model used to provide performance relative to the peak possible "
401 "performance on a CPU architecture.";
408 auto hwcount = m_papi_vector->record();
409 auto duration = m_wall_clock->record();
418 m_papi_vector = std::make_shared<papi_vector>();
419 m_wall_clock = std::make_shared<wall_clock>();
420 std::tie(value.second, accum.second) = std::make_pair(0, 0);
431 TIMEMORY_NODISCARD std::vector<
double>
get()
const
433 auto _data = m_papi_vector->get();
434 _data.push_back(m_wall_clock->get());
442 m_wall_clock->start();
443 m_papi_vector->start();
444 value =
value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
451 m_papi_vector->stop();
452 m_wall_clock->stop();
453 value =
value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
454 accum +=
value_type{ m_papi_vector->get_accum(), m_wall_clock->get_accum() };
461 if(rhs.value.first.size() > value.first.size())
462 value.first.resize(rhs.value.first.size());
463 if(rhs.accum.first.size() > accum.first.size())
464 accum.first.resize(rhs.accum.first.size());
474 if(rhs.value.first.size() > value.first.size())
475 value.first.resize(rhs.value.first.size());
476 if(rhs.accum.first.size() > accum.first.size())
477 accum.first.resize(rhs.accum.first.size());
486 using base_type::accum;
490 using base_type::value;
494 trait::uses_value_storage<this_type, value_type>::value>;
512 auto& _obj = obj.
load();
513 std::stringstream sst;
514 auto t_value = _obj.second;
522 sst << std::setw(t_width) << std::setprecision(t_prec) << t_value;
524 sst <<
" " << t_disp;
526 sst <<
" " << t_label;
534 auto _value = obj.
get();
541 std::cout <<
"value: " << _value << std::endl;
542 std::cout <<
"label: " << _label << std::endl;
543 std::cout <<
"displ: " << _disp << std::endl;
546 assert(_value.size() <= _label.size());
547 assert(_value.size() <= _disp.size());
549 auto n = _label.size();
550 for(
size_t i = 0; i < n; ++i)
552 std::stringstream ss_value;
553 std::stringstream ss_extra;
554 ss_value.setf(_flags);
555 ss_value << std::setw(_width) << std::setprecision(_prec) << _value.at(i);
556 if(!_disp.at(i).empty())
558 ss_extra <<
" " << _disp.at(i);
560 else if(!_label.at(i).empty())
562 ss_extra <<
" " << _label.at(i);
564 os << sst.str() << ss_value.str() << ss_extra.str();
574 template <
typename Archive>
575 void load(Archive& ar,
const unsigned int)
580 ar(cereal::make_nvp(
"laps",
laps), cereal::make_nvp(
"labels", labels),
581 cereal::make_nvp(
"papi_vector", m_papi_vector));
582 ar(cereal::make_nvp(
"value", value));
583 ar(cereal::make_nvp(
"accum", accum));
588 template <
typename Archive>
589 void save(Archive& ar,
const unsigned int)
const
594 ar(cereal::make_nvp(
"laps",
laps), cereal::make_nvp(
"display", _disp),
597 cereal::make_nvp(
"labels", labels),
598 cereal::make_nvp(
"papi_vector", m_papi_vector));
601 ar.setNextName(
"repr_data");
603 auto litr = labels.begin();
604 auto ditr = data.begin();
605 for(; litr != labels.end() && ditr != data.end(); ++litr, ++ditr)
606 ar(cereal::make_nvp(*litr,
double(*ditr)));
609 ar(cereal::make_nvp(
"value", value));
610 ar(cereal::make_nvp(
"accum", accum));
620 strvec_t arr = m_papi_vector->label_array();
621 arr.push_back(
"Runtime");
630 strvec_t arr = m_papi_vector->description_array();
631 arr.push_back(
"Runtime");
639 strvec_t arr = m_papi_vector->display_unit_array();
649 auto arr = m_papi_vector->unit_array();
658 std::shared_ptr<papi_vector> m_papi_vector{
nullptr };
659 std::shared_ptr<wall_clock> m_wall_clock{
nullptr };
669 static bool& is_configured()
671 static thread_local
bool _instance =
false;
the namespace provides overloads to output complex data types w/ streams
std::string demangle(const char *_mangled_name, int *_status=nullptr)
tim::mpl::apply< std::string > string
typename impl::is_one_of< Tp, Types > is_one_of
check if type is in expansion
The declaration for the types for settings without definitions.
static short get_precision()
static int64_t get_unit()
static std::string get_label()
void set_stopped()
store that stop has been called
static std::string get_display_unit()
static fmtflags get_format_flags()
storage< Tp, Value > storage_type
void set_started()
store that start has been called
Combines hardware counters and timers and executes the empirical roofline toolkit during application ...
strvec_t description_array() const
std::function< value_type()> record_type
std::tuple< Types... > types_tuple
std::pair< array_type, double > value_type
std::vector< int64_t > unit_array() const
std::vector< double > get() const
display_unit_type display_unit()
this_type & operator-=(const this_type &rhs)
static void thread_init()
std::vector< int > event_type
static ert_data_ptr_t get_ert_data()
this_type & operator+=(const this_type &rhs)
typename base_type::storage_type storage_type
std::function< intvec_t(const MODE &)> events_callback_t
static std::string description()
std::vector< long long > array_type
static void thread_finalize()
static std::string label()
typename count_type::ratio_t ratio_t
void load(Archive &ar, const unsigned int)
static ert_config_t & get_finalizer()
std::vector< double > get_display() const
std::shared_ptr< ert_data_t > ert_data_ptr_t
static void global_init()
static events_callback_t & get_events_callback()
replace this callback to add in custom HW counters
cpu_roofline & operator=(cpu_roofline &&) noexcept=default
std::vector< int > intvec_t
typename array_type::const_iterator const_iterator
static std::string get_type_string()
display_unit_type get_display_unit()
static bool & use_predefined_enums()
set to false to suppress adding predefined enumerations
cpu_roofline(cpu_roofline &&rhs) noexcept=default
std::tuple< ert_callback_type< Types >... > ert_callback_t
static MODE & event_mode()
typename trait::units< this_type >::display_type display_unit_type
static std::string get_mode_string()
strvec_t label_array() const
std::tuple< ert_config_type< Types >... > ert_config_t
static void set_executor_callback(FuncT &&f)
std::vector< std::string > strvec_t
static void extra_serialization(Archive &ar)
typename array_type::iterator iterator
std::tuple< ert_executor_type< Types >... > ert_executor_t
strvec_t display_unit_array() const
static void global_finalize(storage_type *_store)
static event_type get_events()
static const short precision
typename trait::units< this_type >::type unit_type
cpu_roofline(const cpu_roofline &rhs)=default
cpu_roofline< Types... > this_type
void save(Archive &ar, const unsigned int) const
friend std::ostream & operator<<(std::ostream &os, const this_type &obj)
cpu_roofline & operator=(const cpu_roofline &)=default
std::tuple< ert_counter_type< Types >... > ert_counter_t
static void add_event(int evt)
for variadic expansion to set the callback
static callback_type & get_callback()
static string_t join(SepT &&separator, Tuple &&__tup, index_sequence< Idx... >) noexcept
This operation attempts to call a member function which the component provides to internally store wh...
This operation attempts to call a member function which the component provides to internally store wh...