30#include "timemory/components/roofline/backends.hpp"
69template <
typename...
Types>
71:
public base<gpu_roofline<Types...>, std::tuple<typename cupti_activity::value_type,
72 typename cupti_counters::value_type>>
99 template <
typename Tp>
101 template <
typename Tp>
103 template <
typename Tp>
105 template <
typename Tp>
114 static_assert(std::tuple_size<ert_config_t>::value ==
115 std::tuple_size<types_tuple>::value,
116 "Error! ert_config_t size does not match types_tuple size!");
154 auto&& _get = [=]() {
165 return (_env ==
"op" || _env ==
"hw" || _env ==
"counters")
167 : ((_env ==
"ai" || _env ==
"ac" || _env ==
"activity")
172 static MODE _instance = _get();
184 is_configured() =
true;
194 strvec_t events = {
"global_load",
"global_store" };
195 strvec_t metrics = {
"ldst_executed" };
196#if defined(TIMEMORY_USE_CUDA_HALF)
199 metrics.push_back(
"flop_count_hp");
205 metrics.push_back(
"flop_count_sp");
210 metrics.push_back(
"flop_count_dp");
217 for(
const auto* itr :
218 {
"ipc",
"inst_executed",
"inst_integer",
"inst_fp_64",
"inst_fp_32",
219 "inst_fp_16",
"local_load_transactions_per_request",
220 "local_store_transactions_per_request",
221 "shared_load_transactions_per_request",
222 "shared_store_transactions_per_request",
223 "gld_transactions_per_request",
"gst_transactions_per_request",
224 "inst_executed_global_reductions",
"inst_executed_global_stores",
225 "inst_executed_global_loads",
"inst_executed_local_loads",
226 "inst_executed_local_stores",
"inst_executed_shared_loads",
227 "inst_executed_shared_stores" })
228 metrics.emplace_back(itr);
233 for(
const auto& itr : _extra_events)
234 events.push_back(itr);
238 for(
const auto& itr : _extra_metrics)
239 metrics.push_back(itr);
241 auto _get_unique = [](
const strvec_t& _vec) {
242 std::set<std::string> _set;
243 for(
const auto& itr : _vec)
246 for(
const auto& itr : _set)
251 metrics = _get_unique(metrics);
252 events = _get_unique(events);
315 template <
typename Tp,
typename FuncT>
338 if(_store && _store->
size() > 0)
340 assert(_store->is_finalizing());
346 std::cout << *(ert_data) << std::endl;
357 template <
typename Archive>
362 _ert_data = std::make_shared<ert_data_t>();
363 ar(cereal::make_nvp(
"roofline", *_ert_data));
382 while(ret.find(
"__") != std::string::npos)
383 ret.erase(ret.find(
"__"), 1);
392 return "Model used to provide performance relative to the peak possible "
393 "performance on a GPU architecture.";
420 static bool& is_configured()
422 static bool _instance =
false;
439 base_type::operator=(rhs);
469 m_data.activity->start();
470 std::get<0>(value) = m_data.activity->get_value();
475 m_data.counters->start();
476 std::get<1>(value) = m_data.counters->get_value();
491 m_data.activity->stop();
492 std::get<0>(accum) = m_data.activity->get_accum();
493 std::get<0>(value) = m_data.activity->get_value();
498 m_data.counters->stop();
499 std::get<1>(accum) = m_data.counters->get_accum();
500 std::get<1>(value) = m_data.counters->get_value();
514 *m_data.activity += *rhs.m_data.activity;
515 std::get<0>(accum) = m_data.activity->get_accum();
516 std::get<0>(value) = m_data.activity->get_value();
521 *m_data.counters += *rhs.m_data.counters;
522 std::get<1>(accum) = m_data.counters->get_accum();
523 std::get<1>(value) = m_data.counters->get_value();
538 *m_data.activity -= *rhs.m_data.activity;
539 std::get<0>(accum) = m_data.activity->get_accum();
540 std::get<0>(value) = m_data.activity->get_value();
545 *m_data.counters -= *rhs.m_data.counters;
546 std::get<1>(accum) = m_data.counters->get_accum();
547 std::get<1>(value) = m_data.counters->get_value();
562 *m_data.activity += std::get<0>(rhs);
563 std::get<0>(accum) = m_data.activity->get_accum();
564 std::get<0>(value) = m_data.activity->get_value();
569 *m_data.counters += std::get<1>(rhs);
570 std::get<1>(accum) = m_data.counters->get_accum();
571 std::get<1>(value) = m_data.counters->get_value();
593 auto&& _tmp = m_data.activity->get_secondary();
594 for(
auto&& itr : _tmp)
603 auto&& _tmp = m_data.counters->get_secondary();
604 for(
auto&& itr : _tmp)
618 using base_type::accum;
622 using base_type::value;
626 trait::uses_value_storage<this_type, value_type>::value>;
637 std::stringstream ss;
640 return m_data.counters->get_display();
643 ss << m_data.activity->get_display();
664 const auto& _labels = get_labels();
679 std::stringstream ss_value;
680 std::stringstream ss_extra;
681 ss_value.setf(_flags);
682 ss_value << std::setw(_width) << std::setprecision(_prec) << _value;
684 ss_extra <<
" " << _disp;
686 ss_extra <<
" " << _label;
688 std::stringstream ss;
689 ss << ss_value.str() << ss_extra.str();
694 static label_type& get_labels() {
return *_get_labels(); }
698 static auto _instance = std::make_unique<label_type>();
699 return _instance.get();
705 cupti_activity* activity =
nullptr;
706 cupti_counters* counters;
726 cupti_data(
const cupti_data& rhs)
730 case MODE::ACTIVITY: activity =
new cupti_activity(*rhs.activity);
break;
731 case MODE::COUNTERS: counters =
new cupti_counters(*rhs.counters);
break;
735 cupti_data(cupti_data&& rhs)
noexcept
741 std::swap(activity, rhs.activity);
745 std::swap(counters, rhs.counters);
750 cupti_data&
operator=(
const cupti_data& rhs)
758 activity =
new cupti_activity(*rhs.activity);
762 counters =
new cupti_counters(*rhs.counters);
768 cupti_data&
operator=(cupti_data&& rhs)
noexcept
777 std::swap(activity, rhs.activity);
782 std::swap(counters, rhs.counters);
794 template <
typename Archive>
795 void save(Archive& ar,
const unsigned int)
const
799 auto _labels = get_labels();
801 ar(cereal::make_nvp(
"laps",
laps), cereal::make_nvp(
"display", _disp),
804 cereal::make_nvp(
"labels", _labels));
806 ar.setNextName(
"repr_data");
808 auto litr = _labels.begin();
809 auto ditr = _data.begin();
810 for(; litr != _labels.end() && ditr != _data.end(); ++litr, ++ditr)
811 ar(cereal::make_nvp(*litr, *ditr));
814 ar.setNextName(
"value");
819 ar(std::get<0>(value));
823 ar(std::get<1>(value));
827 ar.setNextName(
"accum");
832 ar(std::get<0>(accum));
836 ar(std::get<1>(accum));
843 template <
typename Archive>
844 void load(Archive& ar,
const unsigned int)
848 std::vector<std::string> _labels;
852 ar(cereal::make_nvp(
"laps",
laps), cereal::make_nvp(
"display", _disp),
853 cereal::make_nvp(
"mode", _mode_str), cereal::make_nvp(
"type", _type_str),
854 cereal::make_nvp(
"labels", _labels));
856 if(_mode_str ==
"counters")
860 else if(_mode_str ==
"activity")
865 _data.resize(_labels.size());
867 ar.setNextName(
"repr_data");
869 auto litr = _labels.begin();
870 auto ditr = _data.begin();
871 for(; litr != _labels.end() && ditr != _data.end(); ++litr, ++ditr)
872 ar(cereal::make_nvp(*litr, *ditr));
875 ar.setNextName(
"value");
879 ar(std::get<0>(value));
883 ar(std::get<1>(value));
887 ar.setNextName(
"accum");
891 ar(std::get<0>(accum));
895 ar(std::get<1>(accum));
Definition for global and thread-local finalization functions for a component.
Definition for global and thread-local initialization functions for a component.
const hash_alias_ptr_t hash_value_t std::string *& _ret
typename impl::is_one_of_integral< Types > is_one_of_integral
check if type is in expansion
std::integral_constant< int, ModeV > mode_constant
std::string demangle(const char *_mangled_name, int *_status=nullptr)
tim::mpl::apply< std::string > string
const std::string std::ostream * os
typename impl::is_one_of< Tp, Types > is_one_of
check if type is in expansion
static short get_precision()
static std::string get_label()
void set_stopped()
store that stop has been called
static std::string display_unit()
static std::string get_display_unit()
static fmtflags get_format_flags()
storage< Tp, Value > storage_type
void set_started()
store that start has been called
CUPTI activity tracing component for high-precision kernel timing. For low-precision kernel timing,...
static value_type record()
NVprof-style hardware counters via the CUpti callback API. Collecting these hardware counters has a h...
cupti::profiler::results_t value_type
static string_t display_unit()
static value_type record()
static array_t< string_t > label_array()
A very lightweight storage class which provides nothing.
constexpr size_t size() const
Combines hardware counters and timers and executes the empirical roofline toolkit during application ...
this_type & operator+=(const this_type &rhs)
static std::string description()
string_t get_display() const
std::shared_ptr< ert_data_t > ert_data_ptr_t
static void thread_init()
secondary_type get_secondary() const
typename cupti_activity::value_type activity_value_type
std::tuple< ert_counter_type< Types >... > ert_counter_t
std::unordered_multimap< std::string, value_type > secondary_type
std::vector< std::string > label_type
static void configure(const MODE &_mode, int _device=0)
static std::string get_type_string()
friend std::ostream & operator<<(std::ostream &os, const this_type &obj)
static std::string get_mode_string()
static std::string display_unit()
gpu_roofline< Types... > this_type
static void set_executor_callback(FuncT &&f)
std::vector< std::string > strvec_t
void load(Archive &ar, const unsigned int)
std::function< strvec_t()> events_callback_t
typename cupti_counters::value_type counters_value_type
this_type & operator+=(const value_type &rhs)
static label_type label_array()
static events_callback_t & get_events_callback()
gpu_roofline(gpu_roofline &&) noexcept=default
static void global_init()
static void global_finalize(storage_type *_store)
static void thread_finalize()
void save(Archive &ar, const unsigned int) const
std::tuple< ert_callback_type< Types >... > ert_callback_t
events_callback_t metrics_callback_t
this_type & operator-=(const this_type &rhs)
static std::string label()
std::tuple< typename cupti_activity::value_type, typename cupti_counters::value_type > value_type
gpu_roofline & operator=(const gpu_roofline &rhs)
std::tuple< Types... > types_tuple
static MODE & event_mode()
std::tuple< ert_config_type< Types >... > ert_config_t
static void extra_serialization(Archive &ar)
gpu_roofline(const gpu_roofline &rhs)
std::tuple< ert_executor_type< Types >... > ert_executor_t
std::vector< double > result_type
static ert_config_t & get_finalizer()
static value_type record()
static ert_data_ptr_t & get_ert_data()
static metrics_callback_t & get_metrics_callback()
static const short precision
static label_type display_unit_array()
for variadic expansion to set the callback
static callback_type & get_callback()
static string_t join(SepT &&separator, Tuple &&__tup, index_sequence< Idx... >) noexcept
This operation class is used for invoking the static initializer and thread-local initializer of a co...
This operation class is used for invoking the static initializer and thread-local initializer of a co...
This operation attempts to call a member function which the component provides to internally store wh...
This operation attempts to call a member function which the component provides to internally store wh...
trait that signifies that a component will handle printing the label(s)
trait that signifies that a component will handle printing the unit(s)