34#include "timemory/components/cupti/backends.hpp"
119 if(cuda::device_count() < 1)
137 if(_get_profiler().
get() ==
nullptr)
148 if(_get_profiler().
get() ==
nullptr)
164 auto* _labels = _get_labels();
167 value.resize(_labels->size());
168 accum.resize(_labels->size());
169 for(
size_type i = 0; i < _labels->size(); ++i)
171 value[i].name = (*_labels)[i];
172 accum[i].name = (*_labels)[i];
184 base_type::operator=(rhs);
185 m_kernel_value = rhs.m_kernel_value;
186 m_kernel_accum = rhs.m_kernel_accum;
192 static int64_t
unit() {
return 1; }
202 auto& _profiler = _get_profiler();
203 if(!_profiler || !_get_labels())
205 auto& _labels = *_get_labels();
209 tmp = _profiler->get_events_and_metrics(_labels);
211 else if(tmp.size() == _labels.size())
213 auto ret = _profiler->get_events_and_metrics(_labels);
214 for(
size_t j = 0; j < _labels.size(); ++j)
219 fprintf(stderr,
"Warning! mis-matched size in cupti_event::%s @ %s:%i\n",
232 auto& _profiler = _get_profiler();
235 m_kernel_value = _profiler->get_kernel_events_and_metrics(*_get_labels());
246 auto& _profiler = _get_profiler();
251 _profiler->get_kernel_events_and_metrics(*_get_labels());
257 for(
size_type i = 0; i < tmp.size(); ++i)
258 accum[i] -= value[i];
262 for(
size_type i = 0; i < tmp.size(); ++i)
263 accum[i] += (tmp[i] - value[i]);
266 for(
size_t i = 0; i < m_kernel_value.size(); ++i)
267 kernel_tmp[i].second -= m_kernel_value[i].second;
268 for(
size_t i = 0; i < kernel_tmp.size(); ++i)
270 if(i >= m_kernel_accum.size())
272 m_kernel_accum.resize(i + 1, kernel_tmp[i]);
276 m_kernel_accum[i].second += kernel_tmp[i].second;
280 value = std::move(tmp);
281 m_kernel_value = std::move(kernel_data);
286 auto _get_display = [&](std::ostream&
os,
const cupti::result& obj) {
287 auto _label = obj.name;
292 std::stringstream ss;
293 std::stringstream ssv;
294 std::stringstream ssi;
296 ssv << std::setw(_width) << std::setprecision(_prec);
299 ssi <<
" " << _label;
300 ss << ssv.str() << ssi.str();
304 const auto& _data =
load();
305 std::stringstream ss;
306 for(
size_type i = 0; i < _data.size(); ++i)
308 _get_display(ss, _data[i]);
309 if(i + 1 < _data.size())
315 TIMEMORY_NODISCARD std::vector<double>
get()
const
317 std::vector<double> values;
318 const auto& _data =
load();
319 values.reserve(_data.size());
320 for(
const auto& itr : _data)
321 values.push_back(cupti::get<double>(itr.data));
330 for(
const auto& itr : m_kernel_accum)
331 _data.insert({ itr.first, itr.second });
335 template <
typename Tp>
345 return std::find(arr.begin(), arr.end(),
entry) != arr.end();
349 arr.push_back(
entry);
351 auto* _labels = _get_labels();
354 for(
const auto& itr : *_labels)
389 auto& _labels = *_get_labels();
396 for(
size_type i = 0; i < _labels.size(); ++i)
397 _data[i] += _other[i];
401 _combine(value, rhs.value);
402 _combine(accum, rhs.accum);
409 auto& _labels = *_get_labels();
414 for(
size_type i = 0; i < _labels.size(); ++i)
415 _data[i] -= _other[i];
418 _combine(value, rhs.value);
419 _combine(accum, rhs.accum);
432 for(
size_type i = 0; i < _other.size(); ++i)
433 _data[i] += _other[i];
437 _combine(value, rhs);
438 _combine(accum, rhs);
446 template <
typename Archive>
450 std::vector<double> values;
451 for(
const auto& itr : _data)
452 values.push_back(cupti::get<double>(itr.data));
458 ar(cereal::make_nvp(
"laps",
laps), cereal::make_nvp(
"repr_data", _disp),
459 cereal::make_nvp(
"value", _value), cereal::make_nvp(
"accum", _accum),
460 cereal::make_nvp(
"display", _disp));
467 template <
typename Archive>
470 auto& _devices = *_get_device();
471 auto& _events = *_get_events();
472 auto& _metrics = *_get_metrics();
473 auto& _labels = *_get_labels();
475 ar(cereal::make_nvp(
"devices", _devices), cereal::make_nvp(
"events", _events),
476 cereal::make_nvp(
"metrics", _metrics), cereal::make_nvp(
"labels", _labels));
482 template <
typename Tp>
485 using const_iterator =
typename Tp::const_iterator;
491 TIMEMORY_NODISCARD const_iterator begin()
const {
return obj.begin(); }
492 TIMEMORY_NODISCARD const_iterator
end()
const {
return obj.end(); }
494 friend std::ostream&
operator<<(std::ostream&
os,
const writer<Tp>& _obj)
496 auto sz = std::distance(_obj.begin(), _obj.end());
497 for(
auto itr = _obj.begin(); itr != _obj.end(); ++itr)
499 auto idx = std::distance(_obj.begin(), itr);
526 static int*& _get_device()
528 static int* _instance =
new int(0);
540 array_t<string_t> arr;
542 return std::find(arr.begin(), arr.end(),
entry) != arr.end();
546 arr.push_back(
entry);
551 for(
const auto& itr : profiler->get_event_names())
553 for(
const auto& itr : profiler->get_metric_names())
559 static strvec_t get_available_events(
int devid)
561 return cupti::available_events(cupti::get_device(devid));
564 static strvec_t get_available_metrics(
int devid)
566 return cupti::available_metrics(cupti::get_device(devid));
574 if(!_manager || _manager->is_finalized() || _manager->is_finalizing())
577 auto _init_cb = tim::get_env<bool>(
"TIMEMORY_CUPTI_INIT_CB",
true);
578 cupti::init_driver();
583 auto& _profiler = _get_profiler();
584 auto& _events = *_get_events();
585 auto& _metrics = *_get_metrics();
586 auto& _device = *_get_device();
587 auto& _labels = *_get_labels();
591 _device = std::get<0>(_init);
592 _events = std::get<1>(_init);
593 _metrics = std::get<2>(_init);
595 using intset_t = std::set<int>;
596 using strset_t = std::set<string_t>;
602 auto _dev_init = get_available(_init, _device);
603 auto& _dev = std::get<0>(_dev_init);
609 printf(
"Creating CUPTI hardware profiler for device %i...\n", _device);
611 auto& _evt = std::get<1>(_dev_init);
612 auto& _met = std::get<2>(_dev_init);
614 if(!_evt.empty() || !_met.empty())
616 _profiler = std::make_shared<cupti::profiler>(_evt, _met, _dev, _init_cb);
617 _used_devs.insert(_dev);
618 for(
const auto& itr : _evt)
619 _used_evts.insert(itr);
620 for(
const auto& itr : _met)
621 _used_mets.insert(itr);
622 _labels = generate_labels();
626 static int _pass = 0;
628 fprintf(stderr,
"[cupti_counters]> Warning! No events or metrics!\n");
633 fprintf(stderr,
"[cupti_counters]> Warning! No devices available!\n");
636 if(!_used_devs.empty())
640 std::cout <<
"Devices : " << writer<intset_t>(_used_devs) << std::endl;
641 std::cout <<
"Event : " << writer<strset_t>(_used_evts) << std::endl;
642 std::cout <<
"Metrics : " << writer<strset_t>(_used_mets) << std::endl;
643 std::cout <<
"Labels : " << writer<strvec_t>(_labels) << std::endl;
651 _get_metrics()->clear();
653 _get_events()->clear();
654 _get_profiler().reset();
661 delete _get_device();
662 delete _get_events();
663 delete _get_labels();
664 delete _get_metrics();
665 _get_device() =
nullptr;
666 _get_events() =
nullptr;
667 _get_labels() =
nullptr;
668 _get_metrics() =
nullptr;
679cupti_counters::get_available(
const tuple_type& _init,
int devid)
681 if(devid < 0 || devid >= cuda::device_count())
683 int ndev = cuda::device_count();
684 fprintf(stderr,
"[cupti_counters]> Invalid device id: %i. # devices: %i...\n",
689 strvec_t _events = std::get<1>(_init);
690 strvec_t _metrics = std::get<2>(_init);
695 _events = std::get<1>(_tmp_init);
704 _metrics = std::get<2>(_tmp_init);
713 const auto& _avail_events = get_available_events(devid);
714 const auto& _avail_metric = get_available_metrics(devid);
716 std::set<std::string> _discarded_events{};
717 std::set<std::string> _discarded_metrics{};
719 bool _discard =
true;
722 auto _not_event = [&_avail_events, &_discarded_events,
724 bool nf = (std::find(std::begin(_avail_events),
std::end(_avail_events), evt) ==
727 _discarded_events.insert(evt);
732 auto _not_metric = [&_avail_metric, &_discarded_metrics,
734 bool nf = (std::find(std::begin(_avail_metric),
std::end(_avail_metric), met) ==
737 _discarded_metrics.insert(met);
742 _events.erase(std::remove_if(std::begin(_events),
std::end(_events), _not_event),
745 _metrics.erase(std::remove_if(std::begin(_metrics),
std::end(_metrics), _not_metric),
752 for(
const auto& itr : _discarded_events)
754 bool is_metric = !(_not_metric(itr));
757 _metrics.push_back(itr);
762 "[cupti_counters]> Removing unavailable event '%s' on device %i...\n",
768 for(
const auto& itr : _discarded_metrics)
770 bool is_event = !(_not_event(itr));
773 _events.push_back(itr);
778 "[cupti_counters]> Removing unavailable metric '%s' on device "
static pointer_t instance()
Get a shared pointer to the instance for the current thread.
The declaration for the types for manager without definitions.
_reported insert(_hash_id)
void print(std::ostream &os, Args &&... args)
tim::mpl::apply< std::string > string
const std::string std::ostream * os
ContainerT delimit(const std::string &line, const std::string &delimiters="\"',;: ", PredicateT &&predicate=[](const std::string &s) -> std::string { return s;})
#define TIMEMORY_ERROR_FUNCTION_MACRO
static short get_precision()
friend std::ostream & operator<<(std::ostream &os, const base_type &obj)
static fmtflags get_format_flags()
NVprof-style hardware counters via the CUpti callback API. Collecting these hardware counters has a h...
this_type & operator+=(const this_type &rhs)
~cupti_counters()=default
std::tuple< int, strvec_t, strvec_t > tuple_type
std::vector< double > get() const
typename value_type::value_type entry_type
cupti::profiler::results_t value_type
static device_func_t & get_device_initializer()
static const strvec_t & get_metrics()
static array_t< int64_t > unit_array()
static array_t< string_t > description_array()
static void extra_serialization(Archive &ar)
cupti_counters(const cupti_counters &)=default
this_type & operator+=(const results_t &rhs)
static string_t description()
static get_initializer_t & get_initializer()
static string_t display_unit()
static const strvec_t & get_events()
std::function< int()> device_func_t
static event_func_t & get_event_initializer()
std::function< tuple_type()> get_initializer_t
cupti_counters & operator=(cupti_counters &&) noexcept=default
static void configure(int device, const strvec_t &events, const strvec_t &metrics={})
explicitly configure for a device and set of events/metrics.
static value_type record()
std::function< strvec_t()> metric_func_t
std::unordered_multimap< std::string, value_type > secondary_type
static const profptr_t & get_profiler()
std::shared_ptr< cupti::profiler > profptr_t
cupti_counters(cupti_counters &&) noexcept=default
static metric_func_t & get_metric_initializer()
void serialize(Archive &ar, const unsigned int)
std::function< strvec_t()> event_func_t
secondary_type get_secondary() const
static const short precision
std::vector< Tp > array_t
cupti::result kernel_data_t
std::vector< string_t > strvec_t
static const strvec_t & get_labels()
cupti::profiler::results_t results_t
string_t get_display() const
static void global_init()
static void global_finalize()
cupti::profiler::kernel_results_t kernel_results_t
static array_t< string_t > label_array()
this_type & operator-=(const this_type &rhs)
static array_t< string_t > display_unit_array()