timemory  3.2.1
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
cpu_roofline.hpp
Go to the documentation of this file.
1 // MIT License
2 //
3 // Copyright (c) 2020, The Regents of the University of California,
4 // through Lawrence Berkeley National Laboratory (subject to receipt of any
5 // required approvals from the U.S. Dept. of Energy). All rights reserved.
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
8 // of this software and associated documentation files (the "Software"), to deal
9 // in the Software without restriction, including without limitation the rights
10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 // copies of the Software, and to permit persons to whom the Software is
12 // furnished to do so, subject to the following conditions:
13 //
14 // The above copyright notice and this permission notice shall be included in all
15 // copies or substantial portions of the Software.
16 //
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // SOFTWARE.
24 
25 #pragma once
26 
27 #include "timemory/backends/papi.hpp"
31 
33 #include "timemory/components/roofline/backends.hpp"
36 
38 #include "timemory/ert/extern.hpp"
39 
40 #include <array>
41 #include <memory>
42 #include <numeric>
43 #include <utility>
44 
45 //======================================================================================//
46 
47 namespace tim
48 {
49 namespace component
50 {
51 //--------------------------------------------------------------------------------------//
52 // this computes the numerator of the roofline for a given set of PAPI counters.
53 // e.g. for FLOPS roofline (floating point operations / second):
54 //
55 // single precision:
56 // cpu_roofline<float>
57 //
58 // double precision:
59 // cpu_roofline<double>
60 //
61 // generic:
62 // cpu_roofline<T, ...>
63 //
64 /// \struct tim::component::cpu_roofline
65 /// \tparam Types Variadic list of data types for roofline analysis
66 ///
67 /// \brief Combines hardware counters and timers and executes the empirical roofline
68 /// toolkit during application termination to estimate the peak possible performance for
69 /// the machine
70 ///
71 template <typename... Types>
73 : public base<cpu_roofline<Types...>, std::pair<std::vector<long long>, double>>
74 {
75  static_assert(!is_one_of<cuda::fp16_t, std::tuple<Types...>>::value,
76  "Error! No CPU roofline support for cuda::fp16_t");
77 
78  using size_type = std::size_t;
79  using event_type = std::vector<int>;
80  using array_type = std::vector<long long>;
81  using data_type = long long*;
82  using value_type = std::pair<array_type, double>;
86  using record_type = std::function<value_type()>;
87 
88  friend struct operation::record<this_type>;
89  friend struct operation::start<this_type>;
90  friend struct operation::stop<this_type>;
91  friend struct operation::set_started<this_type>;
92  friend struct operation::set_stopped<this_type>;
93 
96 
97  using device_t = device::cpu;
99  using ratio_t = typename count_type::ratio_t;
100  using types_tuple = std::tuple<Types...>;
101 
103  using ert_data_ptr_t = std::shared_ptr<ert_data_t>;
104 
105  // short-hand for variadic expansion
106  template <typename Tp>
108  template <typename Tp>
110  template <typename Tp>
112  template <typename Tp>
114 
115  // variadic expansion for ERT types
116  using ert_config_t = std::tuple<ert_config_type<Types>...>;
117  using ert_counter_t = std::tuple<ert_counter_type<Types>...>;
118  using ert_executor_t = std::tuple<ert_executor_type<Types>...>;
119  using ert_callback_t = std::tuple<ert_callback_type<Types>...>;
120 
121  static_assert(std::tuple_size<ert_config_t>::value ==
122  std::tuple_size<types_tuple>::value,
123  "Error! ert_config_t size does not match types_tuple size!");
124 
125  using iterator = typename array_type::iterator;
126  using const_iterator = typename array_type::const_iterator;
127 
128  static const short precision = 3;
129  static const short width = 8;
130 
131  //----------------------------------------------------------------------------------//
132 
133  // Collection mode of the roofline component.
//   OP: operation counters (in this mode this file pushes PAPI_SP_OPS / PAPI_DP_OPS)
//   AI: arithmetic intensity, i.e. load/store counters (in this mode this file
//       pushes PAPI_LD_INS, PAPI_SR_INS, PAPI_LST_INS and PAPI_TOT_INS)
134  enum class MODE
135  {
136  OP,
137  AI
138  };
139 
140  //----------------------------------------------------------------------------------//
141 
142  using strvec_t = std::vector<std::string>;
143  using intvec_t = std::vector<int>;
144  using events_callback_t = std::function<intvec_t(const MODE&)>;
145 
146  //----------------------------------------------------------------------------------//
147  /// replace this callback to add in custom HW counters
149  {
150  static events_callback_t _instance = [](const MODE&) { return intvec_t{}; };
151  return _instance;
152  }
153 
154  //----------------------------------------------------------------------------------//
/// Toggle for the predefined hardware-counter enumerations. The flag starts out
/// as `true`; assign `false` through the returned reference to suppress adding
/// the predefined enumerations.
/// \return reference to the single, function-local flag
static bool& use_predefined_enums()
{
    static bool enabled{ true };
    return enabled;
}
161 
162  //----------------------------------------------------------------------------------//
163 
/// Returns the collection mode (MODE::OP or MODE::AI) as a reference to a
/// function-local static shared by all callers.
///
/// The mode string is lowercased and mapped as follows: "op", "hw" or "counters"
/// select MODE::OP; "ai", "ac" or "activity" select MODE::AI; any other value
/// falls back to MODE::OP. While is_configured() is still false, the cached value
/// is recomputed on every call.
164  static MODE& event_mode()
165  {
// resolves the mode from the settings and, in verbose/debug runs, logs it to stderr
166  auto&& _get = []() {
// returns a lowercased copy of its argument
167  auto&& aslc = [](std::string str) {
168  for(auto& itr : str)
// NOTE(review): tolower() on a plain char holding a negative value is undefined
// behavior; converting through unsigned char would be safer -- confirm
169  itr = tolower(itr);
170  return str;
171  };
172  // check the standard variable
// NOTE(review): the declaration of `_env` (listing line 173) is missing from this
// rendering of the file and must be restored from the original header
174  if(_env.empty())
// fall back to the generic roofline mode setting
175  _env = aslc(settings::roofline_mode());
176  auto _val =
177  (_env == "op" || _env == "hw" || _env == "counters")
178  ? MODE::OP
179  : ((_env == "ai" || _env == "ac" || _env == "activity") ? MODE::AI
180  : MODE::OP);
181  if(settings::verbose() > 1 || settings::debug())
182  {
183  std::cerr << "[" << demangle<this_type>()
184  << "]> roofline mode: " << ((_val == MODE::OP) ? "op" : "ai")
185  << std::endl;
186  }
187  return _val;
188  };
189 
190  static MODE _instance = _get();
// NOTE(review): is_configured() is thread_local while _instance is shared, so a
// thread that has not been configured yet rewrites the shared value here without
// synchronization -- confirm this is acceptable
191  if(!is_configured())
192  _instance = _get();
193  return _instance;
194  }
195 
196  //----------------------------------------------------------------------------------//
197 
199  {
200  static ert_config_t _instance;
201  return _instance;
202  }
203 
204  //----------------------------------------------------------------------------------//
205 
207  {
208  static ert_data_ptr_t _instance = std::make_shared<ert_data_t>();
209  return _instance;
210  }
211 
212  //----------------------------------------------------------------------------------//
213 
215  {
216  static auto _instance = []() {
217  event_type _events;
218  auto _mode = event_mode();
219  if(_mode == MODE::OP)
220  {
221  //
222  // add in user callback events BEFORE presets based on type so that
223  // the user can override the counters being used
224  //
225  auto _extra_events = get_events_callback()(_mode);
226  for(const auto& itr : _extra_events)
227  _events.push_back(itr);
228 
229  //
230  // add some presets based on data types
231  //
233  {
235  _events.push_back(PAPI_SP_OPS);
237  _events.push_back(PAPI_DP_OPS);
238  }
239  }
240  else if(_mode == MODE::AI)
241  {
242  //
243  // add the load/store hardware counter
244  //
246  {
247  _events.push_back(PAPI_LD_INS);
248  _events.push_back(PAPI_SR_INS);
249  _events.push_back(PAPI_LST_INS);
250  _events.push_back(PAPI_TOT_INS);
251  }
252  //
253  // add in user callback events AFTER load/store so that load/store
254  // instructions are always counted
255  //
256  auto _extra_events = get_events_callback()(_mode);
257  for(const auto& itr : _extra_events)
258  _events.push_back(itr);
259  }
260 
261  return _events;
262  }();
263 
264  return _instance;
265  }
266 
267  //----------------------------------------------------------------------------------//
268 
/// Performs the setup for the calling thread the first time it is invoked there:
/// fetches the event list and then sets the (thread-local) is_configured() flag.
/// Subsequent calls on the same thread do nothing.
269  static void configure()
270  {
271  if(!is_configured())
272  {
273  if(settings::debug() || settings::verbose() > 1)
274  PRINT_HERE("%s", "configuring cpu_roofline");
275 
276  // do this BEFORE setting is_configured to true to ensure mode is updated
277  // properly
278  auto _events = get_events();
279  is_configured() = true;
// NOTE(review): the body of this loop (listing lines 281-282) is missing from this
// rendering of the file; presumably it registers each event -- restore it from
// the original header
280  for(auto&& itr : _events)
283  }
284  }
285 
286  //----------------------------------------------------------------------------------//
287 
288  static void global_init()
289  {
290  if(settings::debug() || settings::verbose() > 2)
291  PRINT_HERE("%s", "global initialization of cpu_roofline");
292  configure();
293  }
294 
295  //----------------------------------------------------------------------------------//
296 
297  static void thread_init()
298  {
299  if(settings::debug() || settings::verbose() > 2)
300  PRINT_HERE("%s", "thread initialization of cpu_roofline");
301  configure();
302  }
303 
304  //----------------------------------------------------------------------------------//
305 
/// Per-thread finalization hook; the body is empty, so no work is performed.
306  static void thread_finalize() {}
307 
308  //----------------------------------------------------------------------------------//
309 
/// Replaces the callback stored by the ERT executor type associated with \p Tp.
/// \tparam Tp    data type selecting which ert_executor_type<Tp> is modified
/// \tparam FuncT callable type of \p f
/// \param f      callable that is perfectly forwarded into the stored callback
310  template <typename Tp, typename FuncT>
311  static void set_executor_callback(FuncT&& f)
312  {
313  ert_executor_type<Tp>::get_callback() = std::forward<FuncT>(f);
314  }
315 
316  //----------------------------------------------------------------------------------//
317 
318  static void global_finalize(storage_type* _store)
319  {
320  // query environment for whether this is part of CI test
321  // auto ci = get_env<bool>("CONTINUOUS_INTEGRATION", false);
322  if(_store && _store->size() > 0)
323  {
324  // run roofline peak generation
325  auto ert_config = get_finalizer();
326  auto ert_data = get_ert_data();
327  mpl::apply<void>::access<ert_executor_t>(ert_config, ert_data);
328  if(ert_data && (settings::verbose() > 1 || settings::debug()))
329  std::cout << *(ert_data) << std::endl;
330  }
331  }
332 
333  //----------------------------------------------------------------------------------//
334 
335  template <typename Archive>
336  static void extra_serialization(Archive& ar)
337  {
338  auto _ert_data = get_ert_data();
339  if(!_ert_data) // for input
340  _ert_data = std::make_shared<ert_data_t>();
341  ar(cereal::make_nvp("roofline", *_ert_data));
342  }
343 
344  //----------------------------------------------------------------------------------//
345 
347  {
348  return (event_mode() == MODE::OP) ? "op" : "ai";
349  }
350 
351  //----------------------------------------------------------------------------------//
352 
354  {
355  return mpl::apply<std::string>::join('_', demangle(typeid(Types).name())...);
356  }
357 
358  //----------------------------------------------------------------------------------//
359 
360  static unit_type unit()
361  {
362  return (event_mode() == MODE::OP) ? (1.0 / count_type::unit()) : 1.0;
363  }
364 
365  //----------------------------------------------------------------------------------//
366 
368  {
369  auto _units = m_papi_vector->display_unit_array();
370  _units.push_back(m_wall_clock->display_unit());
371  return _units;
372  }
373 
374  //----------------------------------------------------------------------------------//
375 
/// Instance-level accessor that forwards to the static unit().
376  unit_type get_unit() { return unit(); }
377 
378  //----------------------------------------------------------------------------------//
379 
381 
382  //----------------------------------------------------------------------------------//
383 
385  {
387  {
388  return std::string("cpu_roofline_") + get_type_string() + "_" +
389  get_mode_string();
390  }
391  {
392  return std::string("cpu_roofline_") + get_mode_string();
393  }
394  }
395 
396  //----------------------------------------------------------------------------------//
397 
399  {
400  return "Model used to provide performance relative to the peak possible "
401  "performance on a CPU architecture.";
402  }
403 
404  //----------------------------------------------------------------------------------//
405 
407  {
408  auto hwcount = m_papi_vector->record();
409  auto duration = m_wall_clock->record();
410  return value_type(hwcount, duration);
411  }
412 
413 public:
415  : base_type()
416  {
417  configure();
418  m_papi_vector = std::make_shared<papi_vector>();
419  m_wall_clock = std::make_shared<wall_clock>();
420  std::tie(value.second, accum.second) = std::make_pair(0, 0);
421  }
422 
423  ~cpu_roofline() = default;
424  cpu_roofline(const cpu_roofline& rhs) = default;
425  cpu_roofline(cpu_roofline&& rhs) noexcept = default;
426  cpu_roofline& operator=(const cpu_roofline&) = default;
427  cpu_roofline& operator=(cpu_roofline&&) noexcept = default;
428 
429  //----------------------------------------------------------------------------------//
430 
431  TIMEMORY_NODISCARD std::vector<double> get() const
432  {
433  auto _data = m_papi_vector->get();
434  _data.push_back(m_wall_clock->get());
435  return _data;
436  }
437 
438  //----------------------------------------------------------------------------------//
439 
440  void start()
441  {
442  m_wall_clock->start();
443  m_papi_vector->start();
444  value = value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
445  }
446 
447  //----------------------------------------------------------------------------------//
448 
449  void stop()
450  {
451  m_papi_vector->stop();
452  m_wall_clock->stop();
453  value = value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
454  accum += value_type{ m_papi_vector->get_accum(), m_wall_clock->get_accum() };
455  }
456 
457  //----------------------------------------------------------------------------------//
458 
460  {
461  if(rhs.value.first.size() > value.first.size())
462  value.first.resize(rhs.value.first.size());
463  if(rhs.accum.first.size() > accum.first.size())
464  accum.first.resize(rhs.accum.first.size());
465  value += rhs.value;
466  accum += rhs.accum;
467  return *this;
468  }
469 
470  //----------------------------------------------------------------------------------//
471 
473  {
474  if(rhs.value.first.size() > value.first.size())
475  value.first.resize(rhs.value.first.size());
476  if(rhs.accum.first.size() > accum.first.size())
477  accum.first.resize(rhs.accum.first.size());
478  value -= rhs.value;
479  accum -= rhs.accum;
480  return *this;
481  }
482 
483  using base_type::load;
484 
485 protected:
486  using base_type::accum;
487  using base_type::laps;
490  using base_type::value;
491 
492  friend struct base<this_type, value_type>;
493  friend class impl::storage<this_type,
494  trait::uses_value_storage<this_type, value_type>::value>;
495 
496 public:
497  //==================================================================================//
498  //
499  // representation as a string
500  //
501  //==================================================================================//
502 
/// Values used for display; returns exactly what get() returns.
503  TIMEMORY_NODISCARD std::vector<double> get_display() const { return get(); }
504 
505  //----------------------------------------------------------------------------------//
506 
507  friend std::ostream& operator<<(std::ostream& os, const this_type& obj)
508  {
509  using namespace tim::stl::ostream;
510 
511  // output the time
512  auto& _obj = obj.load();
513  std::stringstream sst;
514  auto t_value = _obj.second;
515  auto t_label = count_type::get_label();
516  auto t_disp = count_type::get_display_unit();
517  auto t_prec = count_type::get_precision();
518  auto t_width = count_type::get_width();
519  auto t_flags = count_type::get_format_flags();
520 
521  sst.setf(t_flags);
522  sst << std::setw(t_width) << std::setprecision(t_prec) << t_value;
523  if(!t_disp.empty())
524  sst << " " << t_disp;
525  if(!t_label.empty())
526  sst << " " << t_label;
527  sst << ", ";
528 
529  auto _prec = count_type::get_precision();
530  auto _width = this_type::get_width();
531  auto _flags = count_type::get_format_flags();
532 
533  // output the roofline metric
534  auto _value = obj.get();
535  auto _label = obj.label_array();
536  auto _disp = obj.display_unit_array();
537 
538 #if defined(DEBUG)
539  if(settings::debug())
540  {
541  std::cout << "value: " << _value << std::endl;
542  std::cout << "label: " << _label << std::endl;
543  std::cout << "displ: " << _disp << std::endl;
544  }
545 #endif
546  assert(_value.size() <= _label.size());
547  assert(_value.size() <= _disp.size());
548 
549  auto n = _label.size();
550  for(size_t i = 0; i < n; ++i)
551  {
552  std::stringstream ss_value;
553  std::stringstream ss_extra;
554  ss_value.setf(_flags);
555  ss_value << std::setw(_width) << std::setprecision(_prec) << _value.at(i);
556  if(!_disp.at(i).empty())
557  {
558  ss_extra << " " << _disp.at(i);
559  }
560  else if(!_label.at(i).empty())
561  {
562  ss_extra << " " << _label.at(i);
563  }
564  os << sst.str() << ss_value.str() << ss_extra.str();
565  if(i + 1 < n)
566  os << ", ";
567  }
568 
569  return os;
570  }
571 
572  //----------------------------------------------------------------------------------//
573  //
574  template <typename Archive>
575  void load(Archive& ar, const unsigned int)
576  {
577  auto _disp = get_display();
578  auto labels = label_array();
579 
580  ar(cereal::make_nvp("laps", laps), cereal::make_nvp("labels", labels),
581  cereal::make_nvp("papi_vector", m_papi_vector));
582  ar(cereal::make_nvp("value", value));
583  ar(cereal::make_nvp("accum", accum));
584  }
585 
586  //----------------------------------------------------------------------------------//
587  //
588  template <typename Archive>
589  void save(Archive& ar, const unsigned int) const
590  {
591  auto _disp = get_display();
592  auto labels = label_array();
593 
594  ar(cereal::make_nvp("laps", laps), cereal::make_nvp("display", _disp),
595  cereal::make_nvp("mode", get_mode_string()),
596  cereal::make_nvp("type", get_type_string()),
597  cereal::make_nvp("labels", labels),
598  cereal::make_nvp("papi_vector", m_papi_vector));
599 
600  auto data = get();
601  ar.setNextName("repr_data");
602  ar.startNode();
603  auto litr = labels.begin();
604  auto ditr = data.begin();
605  for(; litr != labels.end() && ditr != data.end(); ++litr, ++ditr)
606  ar(cereal::make_nvp(*litr, double(*ditr)));
607  ar.finishNode();
608 
609  ar(cereal::make_nvp("value", value));
610  ar(cereal::make_nvp("accum", accum));
611  // ar(cereal::make_nvp("units", unit_array()));
612  // ar(cereal::make_nvp("display_units", display_unit_array()));
613  }
614 
615  //----------------------------------------------------------------------------------//
616  // array of labels: the labels reported by m_papi_vector followed by "Runtime"
617  //
618  TIMEMORY_NODISCARD strvec_t label_array() const
619  {
620  strvec_t arr = m_papi_vector->label_array();
621  arr.push_back("Runtime");
622  return arr;
623  }
624 
625  //----------------------------------------------------------------------------------//
626  // array of descriptions: the descriptions reported by m_papi_vector followed
// by "Runtime"
627  //
628  TIMEMORY_NODISCARD strvec_t description_array() const
629  {
630  strvec_t arr = m_papi_vector->description_array();
631  arr.push_back("Runtime");
632  return arr;
633  }
634 
635  //----------------------------------------------------------------------------------//
636  //
637  TIMEMORY_NODISCARD strvec_t display_unit_array() const
638  {
639  strvec_t arr = m_papi_vector->display_unit_array();
640  arr.push_back(count_type::get_display_unit());
641  return arr;
642  }
643 
644  //----------------------------------------------------------------------------------//
645  // array of unit values
646  //
647  TIMEMORY_NODISCARD std::vector<int64_t> unit_array() const
648  {
649  auto arr = m_papi_vector->unit_array();
650  arr.push_back(count_type::get_unit());
651  return arr;
652  }
653 
654 private:
655  //----------------------------------------------------------------------------------//
656  // these are needed after the global label array is destroyed
657  //
658  std::shared_ptr<papi_vector> m_papi_vector{ nullptr };
659  std::shared_ptr<wall_clock> m_wall_clock{ nullptr };
660 
661 public:
662  //----------------------------------------------------------------------------------//
663 
/// Cleanup hook; the body is empty, so no work is performed.
664  static void cleanup() {}
665 
666 private:
667  //----------------------------------------------------------------------------------//
668 
/// Per-thread flag telling whether configure() has already completed on the
/// calling thread. It starts out as `false` and is set through the returned
/// reference.
static bool& is_configured()
{
    thread_local static bool configured{ false };
    return configured;
}
674 };
675 
676 //--------------------------------------------------------------------------------------//
677 } // namespace component
678 } // namespace tim
the namespace provides overloads to output complex data types w/ streams
Definition: stl.hpp:68
std::string display_type
Definition: kokkosp.cpp:38
roofline_mode
Definition: settings.cpp:1426
cpu_roofline_mode
Definition: settings.cpp:1428
std::string demangle(const char *_mangled_name, int *_status=nullptr)
Definition: utility.hpp:166
tim::mpl::apply< std::string > string
Definition: macros.hpp:52
roofline_type_labels
Definition: settings.cpp:1436
typename impl::is_one_of< Tp, Types > is_one_of
check if type is in expansion
Definition: types.hpp:738
roofline_type_labels_cpu
Definition: settings.cpp:1438
The declaration for the types for settings without definitions.
decltype(auto) load()
void set_stopped()
store that stop has been called
storage< Tp, Value > storage_type
Definition: declaration.hpp:90
void set_started()
store that start has been called
Combines hardware counters and timers and executes the empirical roofline toolkit during application ...
strvec_t description_array() const
std::function< value_type()> record_type
std::tuple< Types... > types_tuple
std::pair< array_type, double > value_type
std::vector< int64_t > unit_array() const
std::vector< double > get() const
display_unit_type display_unit()
this_type & operator-=(const this_type &rhs)
std::vector< int > event_type
static ert_data_ptr_t get_ert_data()
this_type & operator+=(const this_type &rhs)
typename base_type::storage_type storage_type
std::function< intvec_t(const MODE &)> events_callback_t
static std::string description()
std::vector< long long > array_type
static std::string label()
typename count_type::ratio_t ratio_t
void load(Archive &ar, const unsigned int)
static ert_config_t & get_finalizer()
std::vector< double > get_display() const
std::shared_ptr< ert_data_t > ert_data_ptr_t
static events_callback_t & get_events_callback()
replace this callback to add in custom HW counters
cpu_roofline & operator=(cpu_roofline &&) noexcept=default
std::vector< int > intvec_t
typename array_type::const_iterator const_iterator
static std::string get_type_string()
display_unit_type get_display_unit()
static bool & use_predefined_enums()
set to false to suppress adding predefined enumerations
cpu_roofline(cpu_roofline &&rhs) noexcept=default
std::tuple< ert_callback_type< Types >... > ert_callback_t
typename trait::units< this_type >::display_type display_unit_type
static std::string get_mode_string()
std::tuple< ert_config_type< Types >... > ert_config_t
static void set_executor_callback(FuncT &&f)
std::vector< std::string > strvec_t
static void extra_serialization(Archive &ar)
typename array_type::iterator iterator
std::tuple< ert_executor_type< Types >... > ert_executor_t
strvec_t display_unit_array() const
static void global_finalize(storage_type *_store)
static event_type get_events()
static const short precision
typename trait::units< this_type >::type unit_type
cpu_roofline(const cpu_roofline &rhs)=default
cpu_roofline< Types... > this_type
void save(Archive &ar, const unsigned int) const
friend std::ostream & operator<<(std::ostream &os, const this_type &obj)
cpu_roofline & operator=(const cpu_roofline &)=default
std::tuple< ert_counter_type< Types >... > ert_counter_t
static void add_event(int evt)
for variadic expansion to set the callback
static callback_type & get_callback()
static string_t join(SepT &&separator, Tuple &&__tup, index_sequence< Idx... >) noexcept
Definition: apply.hpp:409
This operation attempts to call a member function which the component provides to internally store wh...
Definition: types.hpp:472
This operation attempts to call a member function which the component provides to internally store wh...
Definition: types.hpp:505
#define PRINT_HERE(...)
Definition: macros.hpp:147