timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
cpu_roofline.hpp
Go to the documentation of this file.
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25#pragma once
26
27#include "timemory/backends/papi.hpp"
31#include "timemory/components/roofline/backends.hpp"
37
38#include <array>
39#include <memory>
40#include <numeric>
41#include <utility>
42
43//======================================================================================//
44
45namespace tim
46{
47namespace component
48{
49//--------------------------------------------------------------------------------------//
50// this computes the numerator of the roofline for a given set of PAPI counters.
51// e.g. for FLOPS roofline (floating point operations / second):
52//
53// single precision:
54// cpu_roofline<float>
55//
56// double precision:
57// cpu_roofline<double>
58//
59// generic:
60// cpu_roofline<T, ...>
61//
62/// \struct tim::component::cpu_roofline
63/// \tparam Types Variadic list of data types for roofline analysis
64///
65/// \brief Combines hardware counters and timers and executes the empirical roofline
66/// toolkit during application termination to estimate the peak possible performance for
67/// the machine
68///
69template <typename... Types>
71: public base<cpu_roofline<Types...>, std::pair<std::vector<long long>, double>>
72{
73 static_assert(!is_one_of<gpu::fp16_t, std::tuple<Types...>>::value,
74 "Error! No CPU roofline support for gpu::fp16_t");
75
76 using size_type = std::size_t;
77 using event_type = std::vector<int>;
78 using array_type = std::vector<long long>;
79 using data_type = long long*;
80 using value_type = std::pair<array_type, double>;
84 using record_type = std::function<value_type()>;
85
86 friend struct operation::record<this_type>;
87 friend struct operation::start<this_type>;
88 friend struct operation::stop<this_type>;
89 friend struct operation::set_started<this_type>;
90 friend struct operation::set_stopped<this_type>;
91
94
95 using device_t = device::cpu;
97 using ratio_t = typename count_type::ratio_t;
98 using types_tuple = std::tuple<Types...>;
99
101 using ert_data_ptr_t = std::shared_ptr<ert_data_t>;
102
103 // short-hand for variadic expansion
104 template <typename Tp>
106 template <typename Tp>
108 template <typename Tp>
110 template <typename Tp>
112
113 // variadic expansion for ERT types
114 using ert_config_t = std::tuple<ert_config_type<Types>...>;
115 using ert_counter_t = std::tuple<ert_counter_type<Types>...>;
116 using ert_executor_t = std::tuple<ert_executor_type<Types>...>;
117 using ert_callback_t = std::tuple<ert_callback_type<Types>...>;
118
119 static_assert(std::tuple_size<ert_config_t>::value ==
120 std::tuple_size<types_tuple>::value,
121 "Error! ert_config_t size does not match types_tuple size!");
122
123 using iterator = typename array_type::iterator;
124 using const_iterator = typename array_type::const_iterator;
125
126 static const short precision = 3;
127 static const short width = 8;
128
129 //----------------------------------------------------------------------------------//
130
131 // collection mode: OP collects the operation counters, AI (arithmetic intensity) collects the load/store counters, e.g. PAPI_LST_INS
// Selects which family of hardware counters get_events() registers.
132 enum class MODE
133 {
// operation counters (PAPI_SP_OPS / PAPI_DP_OPS presets); event_mode() picks this for
// "op", "hw", "counters" and as the fallback for any unrecognized value
134 OP,
// arithmetic-intensity counters (load/store/total instruction presets); event_mode()
// picks this for "ai", "ac", "activity"
135 AI
136 };
137
138 //----------------------------------------------------------------------------------//
139
140 using strvec_t = std::vector<std::string>;
141 using intvec_t = std::vector<int>;
142 using events_callback_t = std::function<intvec_t(const MODE&)>;
143
144 //----------------------------------------------------------------------------------//
145 /// replace this callback to add in custom HW counters
147 {
148 static events_callback_t _instance = [](const MODE&) { return intvec_t{}; };
149 return _instance;
150 }
151
152 //----------------------------------------------------------------------------------//
153 /// set to false to suppress adding predefined enumerations
// Returns a mutable reference to a process-wide flag (function-local static,
// initially true), so callers can assign through the returned reference.
// NOTE(review): the code that reads this flag is not visible in this listing —
// presumably get_events() checks it before pushing the PAPI presets; confirm.
154 static bool& use_predefined_enums()
155 {
156 static bool _instance = true;
157 return _instance;
158 }
159
160 //----------------------------------------------------------------------------------//
161
162 static MODE& event_mode()
163 {
164 auto&& _get = []() {
165 auto&& aslc = [](std::string str) {
166 for(auto& itr : str)
167 itr = tolower(itr);
168 return str;
169 };
170 // check the standard variable
172 if(_env.empty())
173 _env = aslc(settings::roofline_mode());
174 auto _val =
175 (_env == "op" || _env == "hw" || _env == "counters")
176 ? MODE::OP
177 : ((_env == "ai" || _env == "ac" || _env == "activity") ? MODE::AI
178 : MODE::OP);
180 {
181 std::cerr << "[" << demangle<this_type>()
182 << "]> roofline mode: " << ((_val == MODE::OP) ? "op" : "ai")
183 << std::endl;
184 }
185 return _val;
186 };
187
188 static MODE _instance = _get();
189 if(!is_configured())
190 _instance = _get();
191 return _instance;
192 }
193
194 //----------------------------------------------------------------------------------//
195
197 {
198 static ert_config_t _instance;
199 return _instance;
200 }
201
202 //----------------------------------------------------------------------------------//
203
205 {
206 static ert_data_ptr_t _instance = std::make_shared<ert_data_t>();
207 return _instance;
208 }
209
210 //----------------------------------------------------------------------------------//
211
213 {
214 static auto _instance = []() {
215 event_type _events;
216 auto _mode = event_mode();
217 if(_mode == MODE::OP)
218 {
219 //
220 // add in user callback events BEFORE presets based on type so that
221 // the user can override the counters being used
222 //
223 auto _extra_events = get_events_callback()(_mode);
224 for(const auto& itr : _extra_events)
225 _events.push_back(itr);
226
227 //
228 // add some presets based on data types
229 //
231 {
233 _events.push_back(PAPI_SP_OPS);
235 _events.push_back(PAPI_DP_OPS);
236 }
237 }
238 else if(_mode == MODE::AI)
239 {
240 //
241 // add the load/store hardware counter
242 //
244 {
245 _events.push_back(PAPI_LD_INS);
246 _events.push_back(PAPI_SR_INS);
247 _events.push_back(PAPI_LST_INS);
248 _events.push_back(PAPI_TOT_INS);
249 }
250 //
251 // add in user callback events AFTER load/store so that load/store
252 // instructions are always counted
253 //
254 auto _extra_events = get_events_callback()(_mode);
255 for(const auto& itr : _extra_events)
256 _events.push_back(itr);
257 }
258
259 return _events;
260 }();
261
262 return _instance;
263 }
264
265 //----------------------------------------------------------------------------------//
266
267 static void configure()
268 {
269 if(!is_configured())
270 {
272 PRINT_HERE("%s", "configuring cpu_roofline");
273
274 // do this BEFORE setting is_configured to true to ensure mode is updated
275 // properly
276 auto _events = get_events();
277 is_configured() = true;
278 for(auto&& itr : _events)
281 }
282 }
283
284 //----------------------------------------------------------------------------------//
285
286 static void global_init()
287 {
289 PRINT_HERE("%s", "global initialization of cpu_roofline");
290 configure();
291 }
292
293 //----------------------------------------------------------------------------------//
294
295 static void thread_init()
296 {
298 PRINT_HERE("%s", "thread initialization of cpu_roofline");
299 configure();
300 }
301
302 //----------------------------------------------------------------------------------//
303
// Per-thread finalization hook; intentionally empty for this component.
304 static void thread_finalize() {}
305
306 //----------------------------------------------------------------------------------//
307
// Replaces the callback stored by the ERT executor for data type Tp.
// \tparam Tp    data type whose ert_executor_type<Tp> callback is replaced
// \tparam FuncT callable type; perfectly forwarded into the stored callback
// \param f      the new callback, assigned to ert_executor_type<Tp>::get_callback()
308 template <typename Tp, typename FuncT>
309 static void set_executor_callback(FuncT&& f)
310 {
311 ert_executor_type<Tp>::get_callback() = std::forward<FuncT>(f);
312 }
313
314 //----------------------------------------------------------------------------------//
315
// Global finalization hook: when the storage holds at least one entry, runs the
// ERT executors to generate the roofline peak data and optionally prints it.
// \param _store storage for this component; nothing happens when it is null or empty
316 static void global_finalize(storage_type* _store)
317 {
318 // query environment for whether this is part of CI test
319 // auto ci = get_env<bool>("CONTINUOUS_INTEGRATION", false);
320 if(_store && _store->size() > 0)
321 {
322 // run roofline peak generation
// `auto` drops the reference here, so ert_config is a copy of the configuration
// tuple returned by get_finalizer(); ert_data is a copy of the shared pointer
323 auto ert_config = get_finalizer();
324 auto ert_data = get_ert_data();
325 mpl::apply<void>::access<ert_executor_t>(ert_config, ert_data);
// only echo the collected ERT data at verbosity > 1 or in debug mode
326 if(ert_data && (settings::verbose() > 1 || settings::debug()))
327 std::cout << *(ert_data) << std::endl;
328 }
329 }
330
331 //----------------------------------------------------------------------------------//
332
// Serializes the ERT data under the name "roofline" in addition to the regular
// component data.
// \tparam Archive cereal-style archive type
// \param ar       archive that receives (or provides) the "roofline" node
333 template <typename Archive>
334 static void extra_serialization(Archive& ar)
335 {
336 auto _ert_data = get_ert_data();
// NOTE(review): the fallback object below is held only by this local pointer, so
// anything an input archive loads into it is dropped on return — confirm intent
337 if(!_ert_data) // for input
338 _ert_data = std::make_shared<ert_data_t>();
339 ar(cereal::make_nvp("roofline", *_ert_data));
340 }
341
342 //----------------------------------------------------------------------------------//
343
345 {
346 return (event_mode() == MODE::OP) ? "op" : "ai";
347 }
348
349 //----------------------------------------------------------------------------------//
350
352 {
353 return mpl::apply<std::string>::join('_', demangle(typeid(Types).name())...);
354 }
355
356 //----------------------------------------------------------------------------------//
357
359 {
360 return (event_mode() == MODE::OP) ? (1.0 / count_type::unit()) : 1.0;
361 }
362
363 //----------------------------------------------------------------------------------//
364
366 {
367 auto _units = m_papi_vector->display_unit_array();
368 _units.push_back(m_wall_clock->display_unit());
369 return _units;
370 }
371
372 //----------------------------------------------------------------------------------//
373
// Instance-level accessor that simply forwards to the static unit().
374 unit_type get_unit() { return unit(); }
375
376 //----------------------------------------------------------------------------------//
377
379
380 //----------------------------------------------------------------------------------//
381
383 {
385 {
386 return std::string("cpu_roofline_") + get_type_string() + "_" +
388 }
389 {
390 return std::string("cpu_roofline_") + get_mode_string();
391 }
392 }
393
394 //----------------------------------------------------------------------------------//
395
397 {
398 return "Model used to provide performance relative to the peak possible "
399 "performance on a CPU architecture.";
400 }
401
402 //----------------------------------------------------------------------------------//
403
405 {
406 auto hwcount = m_papi_vector->record();
407 auto duration = m_wall_clock->record();
408 return value_type(hwcount, duration);
409 }
410
411public:
413 : base_type()
414 {
415 configure();
416 m_papi_vector = std::make_shared<papi_vector>();
417 m_wall_clock = std::make_shared<wall_clock>();
418 std::tie(value.second, accum.second) = std::make_pair(0, 0);
419 }
420
421 ~cpu_roofline() = default;
422 cpu_roofline(const cpu_roofline& rhs) = default;
423 cpu_roofline(cpu_roofline&& rhs) noexcept = default;
425 cpu_roofline& operator=(cpu_roofline&&) noexcept = default;
426
427 //----------------------------------------------------------------------------------//
428
// Returns the current measurement as a flat vector: the values reported by the
// PAPI vector followed by the wall-clock value as the last element.
429 TIMEMORY_NODISCARD std::vector<double> get() const
430 {
431 auto _data = m_papi_vector->get();
// the wall-clock entry is always appended last (label_array() names it "Runtime")
432 _data.push_back(m_wall_clock->get());
433 return _data;
434 }
435
436 //----------------------------------------------------------------------------------//
437
// Starts the wall clock, then the hardware counters, and snapshots both
// sub-components' current values into `value`.
438 void start()
439 {
// clock is started before the counters; stop() tears them down in reverse order
440 m_wall_clock->start();
441 m_papi_vector->start();
442 value = value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
443 }
444
445 //----------------------------------------------------------------------------------//
446
// Stops the hardware counters, then the wall clock (reverse of start()), stores
// the sub-components' values in `value` and adds their accumulations to `accum`.
447 void stop()
448 {
449 m_papi_vector->stop();
450 m_wall_clock->stop();
451 value = value_type{ m_papi_vector->get_value(), m_wall_clock->get_value() };
// NOTE(review): this adds the sub-components' get_accum() results on every stop();
// whether those are per-lap or running totals is not visible here — confirm
452 accum += value_type{ m_papi_vector->get_accum(), m_wall_clock->get_accum() };
453 }
454
455 //----------------------------------------------------------------------------------//
456
458 {
459 if(rhs.value.first.size() > value.first.size())
460 value.first.resize(rhs.value.first.size());
461 if(rhs.accum.first.size() > accum.first.size())
462 accum.first.resize(rhs.accum.first.size());
463 value += rhs.value;
464 accum += rhs.accum;
465 return *this;
466 }
467
468 //----------------------------------------------------------------------------------//
469
471 {
472 if(rhs.value.first.size() > value.first.size())
473 value.first.resize(rhs.value.first.size());
474 if(rhs.accum.first.size() > accum.first.size())
475 accum.first.resize(rhs.accum.first.size());
476 value -= rhs.value;
477 accum -= rhs.accum;
478 return *this;
479 }
480
481 using base_type::load;
482
483protected:
484 using base_type::accum;
485 using base_type::laps;
488 using base_type::value;
489
490 friend struct base<this_type, value_type>;
491 friend class impl::storage<this_type,
492 trait::uses_value_storage<this_type, value_type>::value>;
493
494public:
495 //==================================================================================//
496 //
497 // representation as a string
498 //
499 //==================================================================================//
500
// Display representation of the measurement; identical to get().
501 TIMEMORY_NODISCARD std::vector<double> get_display() const { return get(); }
502
503 //----------------------------------------------------------------------------------//
504
// Streams a human-readable representation of the component: for every label, the
// formatted runtime followed by the corresponding counter value and its display
// unit (or, when the unit is empty, its label).
// \param os  destination stream
// \param obj component to print
// \return os, to allow chaining
505 friend std::ostream& operator<<(std::ostream& os, const this_type& obj)
506 {
507 using namespace tim::stl::ostream;
508
509 // output the time
510 auto& _obj = obj.load();
511 std::stringstream sst;
512 auto t_value = _obj.second;
513 auto t_label = count_type::get_label();
514 auto t_disp = count_type::get_display_unit();
515 auto t_prec = count_type::get_precision();
516 auto t_width = count_type::get_width();
517 auto t_flags = count_type::get_format_flags();
518
// build the "<time> <unit> <label>, " prefix once; it is re-used inside the loop
519 sst.setf(t_flags);
520 sst << std::setw(t_width) << std::setprecision(t_prec) << t_value;
521 if(!t_disp.empty())
522 sst << " " << t_disp;
523 if(!t_label.empty())
524 sst << " " << t_label;
525 sst << ", ";
526
// precision and flags come from count_type, the width from this component
527 auto _prec = count_type::get_precision();
528 auto _width = this_type::get_width();
529 auto _flags = count_type::get_format_flags();
530
531 // output the roofline metric
532 auto _value = obj.get();
533 auto _label = obj.label_array();
534 auto _disp = obj.display_unit_array();
535
536#if defined(DEBUG)
537 if(settings::debug())
538 {
539 std::cout << "value: " << _value << std::endl;
540 std::cout << "label: " << _label << std::endl;
541 std::cout << "displ: " << _disp << std::endl;
542 }
543#endif
// NOTE(review): these asserts allow _value to be SHORTER than _label, yet the loop
// below runs to _label.size() and calls _value.at(i), which throws
// std::out_of_range in that case — confirm the sizes are always equal
544 assert(_value.size() <= _label.size());
545 assert(_value.size() <= _disp.size());
546
547 auto n = _label.size();
548 for(size_t i = 0; i < n; ++i)
549 {
550 std::stringstream ss_value;
551 std::stringstream ss_extra;
552 ss_value.setf(_flags);
553 ss_value << std::setw(_width) << std::setprecision(_prec) << _value.at(i);
// the label is only used as a fallback when the display unit is empty
554 if(!_disp.at(i).empty())
555 {
556 ss_extra << " " << _disp.at(i);
557 }
558 else if(!_label.at(i).empty())
559 {
560 ss_extra << " " << _label.at(i);
561 }
// NOTE(review): the time prefix (sst) is written on every iteration, so the
// runtime is repeated before each counter value — confirm this is intended
562 os << sst.str() << ss_value.str() << ss_extra.str();
563 if(i + 1 < n)
564 os << ", ";
565 }
566
567 return os;
568 }
569
570 //----------------------------------------------------------------------------------//
571 //
// Deserializes the component from an archive: reads "laps", "labels",
// "papi_vector", "value" and "accum" (in that order).
// \tparam Archive cereal-style input archive type
// \param ar       archive to read from; the version argument is unused
572 template <typename Archive>
573 void load(Archive& ar, const unsigned int)
574 {
// NOTE(review): _disp is computed but never used in this function
575 auto _disp = get_display();
// labels is a local: the "labels" node is consumed from the archive but the
// loaded strings are not stored anywhere afterwards
576 auto labels = label_array();
577
578 ar(cereal::make_nvp("laps", laps), cereal::make_nvp("labels", labels),
579 cereal::make_nvp("papi_vector", m_papi_vector));
580 ar(cereal::make_nvp("value", value));
581 ar(cereal::make_nvp("accum", accum));
582 }
583
584 //----------------------------------------------------------------------------------//
585 //
// Serializes the component to an archive: metadata ("laps", "display", "mode",
// "type", "labels", "papi_vector"), a "repr_data" node mapping each label to its
// value, and finally the raw "value" and "accum" data.
// \tparam Archive cereal-style output archive type (must provide setNextName,
//                 startNode and finishNode, which are called directly below)
// \param ar       archive to write to; the version argument is unused
586 template <typename Archive>
587 void save(Archive& ar, const unsigned int) const
588 {
589 auto _disp = get_display();
590 auto labels = label_array();
591
592 ar(cereal::make_nvp("laps", laps), cereal::make_nvp("display", _disp),
593 cereal::make_nvp("mode", get_mode_string()),
594 cereal::make_nvp("type", get_type_string()),
595 cereal::make_nvp("labels", labels),
596 cereal::make_nvp("papi_vector", m_papi_vector));
597
// emit a named sub-node holding one label -> value pair per counter
598 auto data = get();
599 ar.setNextName("repr_data");
600 ar.startNode();
601 auto litr = labels.begin();
602 auto ditr = data.begin();
// walks both ranges in lock-step and stops at the end of the shorter one
603 for(; litr != labels.end() && ditr != data.end(); ++litr, ++ditr)
604 ar(cereal::make_nvp(*litr, double(*ditr)));
605 ar.finishNode();
606
607 ar(cereal::make_nvp("value", value));
608 ar(cereal::make_nvp("accum", accum));
609 // ar(cereal::make_nvp("units", unit_array()));
610 // ar(cereal::make_nvp("display_units", display_unit_array()));
611 }
612
613 //----------------------------------------------------------------------------------//
614 // array of labels
615 //
// Returns the PAPI vector's labels followed by "Runtime" for the wall-clock entry,
// matching the element order produced by get().
616 TIMEMORY_NODISCARD strvec_t label_array() const
617 {
618 strvec_t arr = m_papi_vector->label_array();
619 arr.push_back("Runtime");
620 return arr;
621 }
622
623 //----------------------------------------------------------------------------------//
624 // array of descriptions
625 //
// Returns the PAPI vector's descriptions followed by "Runtime" for the wall-clock
// entry.
626 TIMEMORY_NODISCARD strvec_t description_array() const
627 {
628 strvec_t arr = m_papi_vector->description_array();
629 arr.push_back("Runtime");
630 return arr;
631 }
632
633 //----------------------------------------------------------------------------------//
634 //
// Returns the PAPI vector's display units followed by count_type's display unit
// for the trailing timing entry.
635 TIMEMORY_NODISCARD strvec_t display_unit_array() const
636 {
637 strvec_t arr = m_papi_vector->display_unit_array();
638 arr.push_back(count_type::get_display_unit());
639 return arr;
640 }
641
642 //----------------------------------------------------------------------------------//
643 // array of unit values
644 //
// Returns the PAPI vector's unit values followed by count_type's unit for the
// trailing timing entry.
645 TIMEMORY_NODISCARD std::vector<int64_t> unit_array() const
646 {
647 auto arr = m_papi_vector->unit_array();
648 arr.push_back(count_type::get_unit());
649 return arr;
650 }
651
652private:
653 //----------------------------------------------------------------------------------//
654 // these are needed after the global label array is destroyed
655 //
656 std::shared_ptr<papi_vector> m_papi_vector{ nullptr };
657 std::shared_ptr<wall_clock> m_wall_clock{ nullptr };
658
659public:
660 //----------------------------------------------------------------------------------//
661
// Cleanup hook; intentionally empty for this component.
662 static void cleanup() {}
663
664private:
665 //----------------------------------------------------------------------------------//
666
// Returns a mutable reference to the "already configured" flag. The flag is
// thread_local and starts out false, so each thread tracks its own configuration
// state (configure() sets it to true, event_mode() re-reads the mode while false).
667 static bool& is_configured()
668 {
669 static thread_local bool _instance = false;
670 return _instance;
671 }
672};
673
674//--------------------------------------------------------------------------------------//
675} // namespace component
676} // namespace tim
STL namespace.
the namespace provides overloads to output complex data types w/ streams
std::string display_type
Definition: kokkosp.cpp:39
roofline_mode
Definition: settings.cpp:1732
cpu_roofline_mode
Definition: settings.cpp:1734
std::string demangle(const char *_mangled_name, int *_status=nullptr)
Definition: demangle.hpp:47
tim::mpl::apply< std::string > string
Definition: macros.hpp:53
const std::string std::ostream * os
roofline_type_labels
Definition: settings.cpp:1742
typename impl::is_one_of< Tp, Types > is_one_of
check if type is in expansion
Definition: types.hpp:777
roofline_type_labels_cpu
Definition: settings.cpp:1744
decltype(auto) load()
void set_stopped()
store that stop has been called
storage< Tp, Value > storage_type
void set_started()
store that start has been called
Combines hardware counters and timers and executes the empirical roofline toolkit during application ...
strvec_t description_array() const
std::vector< int64_t > unit_array() const
std::function< value_type()> record_type
std::tuple< Types... > types_tuple
std::pair< array_type, double > value_type
display_unit_type display_unit()
std::vector< int > event_type
static ert_data_ptr_t get_ert_data()
static events_callback_t & get_events_callback()
replace this callback to add in custom HW counters
friend std::ostream & operator<<(std::ostream &os, const this_type &obj)
std::function< intvec_t(const MODE &)> events_callback_t
std::vector< double > get_display() const
static ert_config_t & get_finalizer()
static std::string description()
std::vector< long long > array_type
static std::string label()
this_type & operator+=(const this_type &rhs)
typename count_type::ratio_t ratio_t
void load(Archive &ar, const unsigned int)
std::shared_ptr< ert_data_t > ert_data_ptr_t
cpu_roofline & operator=(const cpu_roofline &)=default
std::vector< int > intvec_t
typename array_type::const_iterator const_iterator
static std::string get_type_string()
display_unit_type get_display_unit()
cpu_roofline(cpu_roofline &&rhs) noexcept=default
std::tuple< ert_callback_type< Types >... > ert_callback_t
typename trait::units< this_type >::display_type display_unit_type
static std::string get_mode_string()
std::tuple< ert_config_type< Types >... > ert_config_t
static void set_executor_callback(FuncT &&f)
std::vector< std::string > strvec_t
static void extra_serialization(Archive &ar)
typename array_type::iterator iterator
std::tuple< ert_executor_type< Types >... > ert_executor_t
std::vector< double > get() const
strvec_t display_unit_array() const
cpu_roofline & operator=(cpu_roofline &&) noexcept=default
static void global_finalize(storage_type *_store)
static event_type get_events()
static const short precision
typename trait::units< this_type >::type unit_type
cpu_roofline(const cpu_roofline &rhs)=default
static bool & use_predefined_enums()
set to false to suppress adding predefined enumerations
cpu_roofline< Types... > this_type
this_type & operator-=(const this_type &rhs)
void save(Archive &ar, const unsigned int) const
std::tuple< ert_counter_type< Types >... > ert_counter_t
A very lightweight storage class which provides nothing.
Definition: declaration.hpp:51
constexpr size_t size() const
Definition: declaration.hpp:54
static void add_event(int evt)
for variadic expansion to set the callback
static callback_type & get_callback()
static string_t join(SepT &&separator, Tuple &&__tup, index_sequence< Idx... >) noexcept
Definition: apply.hpp:408
This operation attempts to call a member function which the component provides to internally store wh...
Definition: types.hpp:469
This operation attempts to call a member function which the component provides to internally store wh...
Definition: types.hpp:502
#define PRINT_HERE(...)
Definition: macros.hpp:152