timemory  3.2.1
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework to creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
components.hpp
Go to the documentation of this file.
1 // MIT License
2 //
3 // Copyright (c) 2020, The Regents of the University of California,
4 // through Lawrence Berkeley National Laboratory (subject to receipt of any
5 // required approvals from the U.S. Dept. of Energy). All rights reserved.
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
8 // of this software and associated documentation files (the "Software"), to deal
9 // in the Software without restriction, including without limitation the rights
10 // to use, copy, modify, merge, publish, distribute, sublicense, and
11 // copies of the Software, and to permit persons to whom the Software is
12 // furnished to do so, subject to the following conditions:
13 //
14 // The above copyright notice and this permission notice shall be included in all
15 // copies or substantial portions of the Software.
16 //
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // SOFTWARE.
24 
25 /**
26  * \file timemory/components/cuda/components.hpp
27  * \brief Implementation of the cuda component(s)
28  */
29 
30 #pragma once
31 
32 #include <memory>
33 
35 #include "timemory/mpl/apply.hpp"
36 #include "timemory/mpl/types.hpp"
38 #include "timemory/units.hpp"
39 
40 #include "timemory/components/cuda/backends.hpp"
42 
43 #if defined(TIMEMORY_PYBIND11_SOURCE)
44 # include "pybind11/cast.h"
45 # include "pybind11/pybind11.h"
46 # include "pybind11/stl.h"
47 #endif
48 
49 //======================================================================================//
50 //
51 namespace tim
52 {
53 namespace component
54 {
55 //
56 //--------------------------------------------------------------------------------------//
57 // this component extracts the time spent in GPU kernels
58 //
59 /// \struct tim::component::cuda_event
60 /// \brief Records the time interval between two points in a CUDA stream. Less accurate
61 /// than 'cupti_activity' for kernel timing but does not require linking to the CUDA
62 /// driver.
63 ///
64 struct cuda_event : public base<cuda_event, float>
65 {
66  struct marker
67  {
68  bool valid = true;
69  bool synced = false;
70  bool running = false;
71  cuda::event_t first = cuda::event_t{};
72  cuda::event_t second = cuda::event_t{};
73 
74  marker() { valid = (cuda::event_create(first) && cuda::event_create(second)); }
75  ~marker() = default;
76 
77  void start(cuda::stream_t& stream)
78  {
79  if(!valid)
80  return;
81  synced = false;
82  running = true;
83  cuda::event_record(first, stream);
84  }
85 
86  void stop(cuda::stream_t& stream)
87  {
88  if(!valid || !running)
89  return;
90  cuda::event_record(second, stream);
91  running = false;
92  }
93 
94  float sync()
95  {
96  if(!valid)
97  return 0.0;
98  if(!synced)
99  cuda::event_sync(second);
100  synced = true;
101  return cuda::event_elapsed_time(first, second);
102  }
103  };
104 
105  using ratio_t = std::milli;
106  using value_type = float;
108  using marker_list_t = std::vector<marker>;
109 
110  static std::string label() { return "cuda_event"; }
112  {
113  return "Records the time interval between two points in a CUDA stream. Less "
114  "accurate than 'cupti_activity' for kernel timing";
115  }
116  static value_type record() { return 0.0f; }
117 
118  static uint64_t& get_batched_marker_size()
119  {
120  static uint64_t _instance = settings::cuda_event_batch_size();
121  return _instance;
122  }
123 
124 public:
125  TIMEMORY_DEFAULT_OBJECT(cuda_event)
126 
127  explicit cuda_event(cuda::stream_t _stream)
128  : m_stream(_stream)
129 
130  {}
131 
132 #if defined(TIMEMORY_PYBIND11_SOURCE)
133  // explicit cuda_event(pybind11::object _stream)
134  //: cuda_event(_stream.cast<cuda::stream_t>())
135  //{}
136 #endif
137 
138  TIMEMORY_NODISCARD float get_display() const
139  {
140  return static_cast<float>(load() / static_cast<float>(ratio_t::den) *
142  }
143 
144  TIMEMORY_NODISCARD float get() const
145  {
146  return static_cast<float>(load() / static_cast<float>(ratio_t::den) *
148  }
149 
150  void start()
151  {
152  m_global_synced = false;
153  m_global.start(m_stream);
154  }
155 
156  void stop()
157  {
158  for(uint64_t i = 0; i < m_num_markers; ++i)
159  m_markers[i].stop(m_stream);
160  if(m_current_marker == 0 && m_num_markers == 0)
161  m_global.stop(m_stream);
162  sync();
163  }
164 
165  void sync()
166  {
167  if(m_current_marker == 0 && m_num_markers == 0)
168  {
169  if(!m_global_synced)
170  {
171  float tmp = m_global.sync();
172  m_global_synced = true;
173  accum += tmp;
174  value = tmp;
175  }
176  }
177  else if(m_current_marker > m_synced_markers)
178  {
179  float tmp = 0.0;
180  for(uint64_t i = m_synced_markers; i < m_num_markers; ++i, ++m_synced_markers)
181  tmp += m_markers[i].sync();
182  m_markers_synced = true;
183  accum += tmp;
184  value = tmp;
185  }
186  }
187 
188  void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
189  auto get_stream() { return m_stream; }
190 
191  void mark_begin()
192  {
193  m_markers_synced = false;
194  m_current_marker = m_num_markers++;
195  if(m_current_marker >= m_markers.size())
196  append_marker_list(std::max<uint64_t>(m_marker_batch_size, 1));
197  m_markers[m_current_marker].start(m_stream);
198  }
199 
200  void mark_end() { m_markers[m_current_marker].stop(m_stream); }
201 
202  void mark_begin(cuda::stream_t _stream)
203  {
204  m_markers_synced = false;
205  m_current_marker = m_num_markers++;
206  if(m_current_marker >= m_markers.size())
207  append_marker_list(std::max<uint64_t>(m_marker_batch_size, 1));
208  m_markers[m_current_marker].start(_stream);
209  }
210 
211  void mark_end(cuda::stream_t _stream) { m_markers[m_current_marker].stop(_stream); }
212 
213 #if defined(TIMEMORY_PYBIND11_SOURCE)
214  // void mark_begin(pybind11::object obj) { mark_begin(obj.cast<cuda::stream_t>()); }
215  // void mark_end(pybind11::object obj) { mark_begin(obj.cast<cuda::stream_t>()); }
216 #endif
217 
218 protected:
219  void append_marker_list(const uint64_t nsize)
220  {
221  for(uint64_t i = 0; i < nsize; ++i)
222  m_markers.emplace_back(marker());
223  }
224 
225 private:
226  bool m_global_synced = false;
227  bool m_markers_synced = false;
228  uint64_t m_synced_markers = 0;
229  uint64_t m_current_marker = 0;
230  uint64_t m_num_markers = 0;
231  uint64_t m_marker_batch_size = get_batched_marker_size();
232  cuda::stream_t m_stream = 0;
233  marker m_global = {};
234  marker_list_t m_markers = {};
235 
236 public:
237 #if defined(TIMEMORY_PYBIND11_SOURCE)
238  //
239  /// this is called by python api
240  ///
241  /// Use this to add customizations to the python module. The instance
242  /// of the component is within in a variadic wrapper which is used
243  /// elsewhere to ensure that calling mark_begin(...) on a component
244  /// without that member function is not invalid
245  ///
246  template <template <typename...> class BundleT>
247  static void configure(project::python,
248  pybind11::class_<BundleT<cuda_event>>& _pyclass)
249  {
250  auto _sync = [](BundleT<cuda_event>* obj) {
251  obj->template get<cuda_event>()->sync();
252  };
253  _pyclass.def("sync", _sync, "Synchronize the event (blocking)");
254  }
255 #endif
256 };
257 //
258 //======================================================================================//
259 //
260 // controls the CUDA profiler
261 //
262 /// \struct tim::component::cuda_profiler
263 /// \brief Control switch for a CUDA profiler running on the application. Only the
264 /// first call to `start()` and the last call to `stop()` actually toggle the
265 /// state of the external CUDA profiler when component instances are nested.
266 ///
268 : public base<cuda_profiler, void>
269 , private policy::instance_tracker<cuda_profiler>
270 {
271  using value_type = void;
275 
276  static std::string label() { return "cuda_profiler"; }
278  {
279  return "Control switch for a CUDA profiler running on the application";
280  }
281 
282  enum class mode : short
283  {
284  nvp,
285  csv
286  };
287 
288  using config_type = std::tuple<std::string, std::string, mode>;
289  using initializer_type = std::function<config_type()>;
290 
292  {
293  static initializer_type _instance = []() {
294  return config_type("cuda_profiler.inp", "cuda_profiler.out", mode::nvp);
295  };
296  return _instance;
297  }
298 
299  static void global_init()
300  {
301 #if defined(TIMEMORY_USE_CUDA)
302  cudaProfilerStop();
303 #endif
304  }
305 
306  static void global_finalize()
307  {
308 #if defined(TIMEMORY_USE_CUDA)
309  cudaProfilerStop();
310 #endif
311  }
312 
313  static void configure()
314  {
315  auto _config = get_initializer()();
316  configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
317  }
318 
319  static void configure(const std::string& _infile, const std::string& _outfile,
320  mode _mode)
321  {
322  static std::atomic<int32_t> _once;
323  if(_once++ > 0)
324  return;
325 #if defined(TIMEMORY_USE_CUDA) && (CUDA_VERSION < 11000)
326  cudaProfilerInitialize(_infile.c_str(), _outfile.c_str(),
327  (_mode == mode::nvp) ? cudaKeyValuePair : cudaCSV);
328 #else
329  consume_parameters(_infile, _outfile, _mode);
330 #endif
331  }
332 
334 
335  void start()
336  {
337 #if defined(TIMEMORY_USE_CUDA)
339  if(m_tot == 0)
340  cudaProfilerStart();
341 #endif
342  }
343 
344  void stop()
345  {
346 #if defined(TIMEMORY_USE_CUDA)
348  if(m_tot == 0)
349  cudaProfilerStop();
350 #endif
351  }
352 
353 public:
354 #if defined(TIMEMORY_PYBIND11_SOURCE)
355  //
356  /// this is called by python api
357  ///
358  /// args --> pybind11::args --> pybind11::tuple
359  /// kwargs --> pybind11::kwargs --> pybind11::dict
360  ///
361  static void configure(project::python, pybind11::args _args, pybind11::kwargs _kwargs)
362  {
363  auto _config = get_initializer()();
364  if(_args.size() > 0)
365  std::get<0>(_config) = _args[0].cast<std::string>();
366  if(_args.size() > 1)
367  std::get<1>(_config) = _args[1].cast<std::string>();
368  if(_args.size() > 2)
369  {
370  auto _m = _args[2].cast<std::string>();
371  if(_m == "csv")
372  std::get<2>(_config) = mode::csv;
373  }
374  //
375  if(_kwargs)
376  {
377  for(auto itr : _kwargs)
378  {
379  if(itr.first.cast<std::string>().find("in") == 0)
380  std::get<0>(_config) = itr.second.cast<std::string>();
381  else if(itr.first.cast<std::string>().find("out") == 0)
382  std::get<1>(_config) = itr.second.cast<std::string>();
383  else
384  {
385  auto _m = itr.second.cast<std::string>();
386  if(_m == "csv")
387  std::get<2>(_config) = mode::csv;
388  }
389  }
390  }
391  configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
392  }
393 #endif
394 };
395 //
396 //======================================================================================//
397 // adds NVTX markers
398 //
399 /// \struct tim::component::nvtx_marker
400 /// \brief Inserts NVTX markers with the current timemory prefix. The default color
401 /// scheme is a round-robin of red, blue, green, yellow, purple, cyan, pink, and
402 /// light_green. These colors
403 ///
404 struct nvtx_marker : public base<nvtx_marker, void>
405 {
406  using value_type = void;
409 
410  static std::string label() { return "nvtx_marker"; }
412  {
413  return "Generates high-level region markers for CUDA profilers";
414  }
415  static value_type record() {}
416 
417  static bool& use_device_sync()
418  {
419  static bool _instance = settings::nvtx_marker_device_sync();
420  return _instance;
421  }
422 
423  static void thread_init() { nvtx::name_thread(threading::get_id()); }
424 
425  nvtx_marker() = default;
426 
427  /// construct with an specific color
428  explicit nvtx_marker(const nvtx::color::color_t& _color)
429  : m_color(_color)
430 
431  {}
432 
433  /// construct with an specific CUDA stream
434  explicit nvtx_marker(cuda::stream_t _stream)
435  : m_stream(_stream)
436 
437  {}
438 
439  /// construct with an specific color and CUDA stream
440  nvtx_marker(const nvtx::color::color_t& _color, cuda::stream_t _stream)
441  : m_color(_color)
442  , m_stream(_stream)
443 
444  {}
445 
446 #if defined(TIMEMORY_PYBIND11_SOURCE)
447  // explicit nvtx_marker(pybind11::object _stream)
448  //: nvtx_marker(_stream.cast<cuda::stream_t>())
449  //{}
450 
451  // nvtx_marker(const nvtx::color::color_t& _color, pybind11::object _stream)
452  //: nvtx_marker(_color, _stream.cast<cuda::stream_t>())
453  //{}
454 #endif
455 
456  /// start an nvtx range. Equivalent to `nvtxRangeStartEx`
457  void start() { m_range_id = nvtx::range_start(get_attribute()); }
458 
459  /// stop the nvtx range. Equivalent to `nvtxRangeEnd`. Depending on
460  /// `settings::nvtx_marker_device_sync()` this will either call
461  /// `cudaDeviceSynchronize()` or `cudaStreamSynchronize(m_stream)` before stopping the
462  /// range.
463  void stop()
464  {
465  if(use_device_sync())
466  {
467  cuda::device_sync();
468  }
469  else
470  {
471  cuda::stream_sync(m_stream);
472  }
473  nvtx::range_stop(m_range_id);
474  }
475 
476  /// asynchronously add a marker. Equivalent to `nvtxMarkA`
477  void mark_begin()
478  {
479  nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_begin_t", threading::get_id()));
480  }
481 
482  /// asynchronously add a marker. Equivalent to `nvtxMarkA`
483  void mark_end()
484  {
485  nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_end_t", threading::get_id()));
486  }
487 
488  /// asynchronously add a marker for a specific stream. Equivalent to `nvtxMarkA`
489  void mark_begin(cuda::stream_t _stream)
490  {
491  nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_begin_t", threading::get_id(), "_s",
492  get_stream_id(_stream)));
493  }
494 
495  /// asynchronously add a marker for a specific stream. Equivalent to `nvtxMarkA`
496  void mark_end(cuda::stream_t _stream)
497  {
498  nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_end_t", threading::get_id(), "_s",
499  get_stream_id(_stream)));
500  }
501 
502 #if defined(TIMEMORY_PYBIND11_SOURCE)
503  // void mark_begin(pybind11::object obj) { mark_begin(obj.cast<cuda::stream_t>()); }
504  // void mark_end(pybind11::object obj) { mark_begin(obj.cast<cuda::stream_t>()); }
505 #endif
506 
507  /// set the current CUDA stream
508  void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
509  /// set the current color
510  void set_color(nvtx::color::color_t _color) { m_color = _color; }
511  void set_prefix(const char* _prefix) { m_prefix = _prefix; }
512 
513  auto get_range_id() { return m_range_id; }
514  auto get_stream() { return m_stream; }
515  auto get_color() { return m_color; }
516 
517 private:
518  static int32_t get_stream_id(cuda::stream_t _stream)
519  {
520  using pair_t = std::pair<cuda::stream_t, int32_t>;
521  using map_t = std::map<cuda::stream_t, int32_t>;
522  using map_ptr_t = std::unique_ptr<map_t>;
523 
524  static thread_local map_ptr_t _instance = std::make_unique<map_t>();
525  if(_instance->find(_stream) == _instance->end())
526  _instance->insert(pair_t(_stream, _instance->size()));
527  return _instance->find(_stream)->second;
528  }
529 
530 private:
531  bool m_has_attribute = false;
532  nvtx::color::color_t m_color = 0;
533  nvtx::event_attributes_t m_attribute = {};
534  nvtx::range_id_t m_range_id = 0;
535  cuda::stream_t m_stream = 0;
536  const char* m_prefix = nullptr;
537 
538 private:
539  nvtx::event_attributes_t& get_attribute()
540  {
541  if(!m_has_attribute)
542  {
543  m_has_attribute = true;
544  if(settings::debug())
545  {
546  std::stringstream ss;
547  ss << "[nvtx_marker]> Creating NVTX marker with label: \"" << m_prefix
548  << "\" and color " << std::hex << m_color << "...";
549  std::cout << ss.str() << std::endl;
550  }
551  m_attribute = nvtx::init_marker(m_prefix, m_color);
552  }
553  return m_attribute;
554  }
555 
556 public:
557 #if defined(TIMEMORY_PYBIND11_SOURCE)
558  //
559  /// this is called by python api
560  ///
561  /// Use this to add customizations to the python module. The instance
562  /// of the component is within in a variadic wrapper which is used
563  /// elsewhere to ensure that calling mark_begin(...) on a component
564  /// without that member function is not invalid
565  ///
566  template <template <typename...> class BundleT>
567  static void configure(project::python,
568  pybind11::class_<BundleT<nvtx_marker>>& _pyclass)
569  {
570  _pyclass.def_property_static(
571  "use_device_sync", [](pybind11::object) { return use_device_sync(); },
572  [](pybind11::object, bool v) { use_device_sync() = v; },
573  "Configure CudaEvent to use cudaSynchronize() vs. cudaStreamSychronize(...)");
574 
575  // add nvtx colors
576  pybind11::enum_<nvtx::color::color_idx> _pyattr(_pyclass, "color", "NVTX colors");
577  _pyattr.value("red", nvtx::color::red_idx)
578  .value("blue", nvtx::color::blue_idx)
579  .value("green", nvtx::color::green_idx)
580  .value("yellow", nvtx::color::yellow_idx)
581  .value("purple", nvtx::color::purple_idx)
582  .value("cyan", nvtx::color::cyan_idx)
583  .value("pink", nvtx::color::pink_idx)
584  .value("light_green", nvtx::color::light_green_idx);
585  _pyattr.export_values();
586 
587  auto _set_color = [](BundleT<nvtx_marker>* obj, nvtx::color::color_t arg) {
588  obj->template get<nvtx_marker>()->set_color(arg);
589  };
590  auto _get_color = [](BundleT<nvtx_marker>* obj) {
591  return obj->template get<nvtx_marker>()->get_color();
592  };
593  _pyclass.def("set_color", _set_color, "Set the color");
594  _pyclass.def("get_color", _get_color, "Return the color");
595  }
596 #endif
597 };
598 //
599 //======================================================================================//
600 //
601 } // namespace component
602 } // namespace tim
603 //
604 //======================================================================================//
Declare the cuda component types.
void stop(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:368
void mark(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:439
void start(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:298
Inherit from this policy to add reference counting support. Useful if you want to turn a global setti...
Definition: types.hpp:367
data::stream stream
Definition: stream.hpp:982
Definition: kokkosp.cpp:38
char const std::string & _prefix
Definition: definition.hpp:59
void consume_parameters(ArgsT &&...) TIMEMORY_HIDDEN
Definition: types.hpp:285
tim::mpl::apply< std::string > string
Definition: macros.hpp:52
cuda_event_batch_size
Definition: settings.cpp:1413
nvtx_marker_device_sync
Definition: settings.cpp:1415
The declaration for the types for settings without definitions.
static int64_t get_unit()
void stop(cuda::stream_t &stream)
Definition: components.hpp:86
void start(cuda::stream_t &stream)
Definition: components.hpp:77
Records the time interval between two points in a CUDA stream. Less accurate than 'cupti_activity' fo...
Definition: components.hpp:65
std::vector< marker > marker_list_t
Definition: components.hpp:108
void mark_begin(cuda::stream_t _stream)
Definition: components.hpp:202
static uint64_t & get_batched_marker_size()
Definition: components.hpp:118
void append_marker_list(const uint64_t nsize)
Definition: components.hpp:219
static std::string label()
Definition: components.hpp:110
void set_stream(cuda::stream_t _stream)
Definition: components.hpp:188
static std::string description()
Definition: components.hpp:111
void mark_end(cuda::stream_t _stream)
Definition: components.hpp:211
static value_type record()
Definition: components.hpp:116
Control switch for a CUDA profiler running on the application. Only the first call to start() and the...
Definition: components.hpp:270
static std::string label()
Definition: components.hpp:276
static std::string description()
Definition: components.hpp:277
static void configure(const std::string &_infile, const std::string &_outfile, mode _mode)
Definition: components.hpp:319
static initializer_type & get_initializer()
Definition: components.hpp:291
std::function< config_type()> initializer_type
Definition: components.hpp:289
std::tuple< std::string, std::string, mode > config_type
Definition: components.hpp:288
Inserts NVTX markers with the current timemory prefix. The default color scheme is a round-robin of r...
Definition: components.hpp:405
void stop()
stop the nvtx range. Equivalent to nvtxRangeEnd. Depending on settings::nvtx_marker_device_sync() thi...
Definition: components.hpp:463
static bool & use_device_sync()
Definition: components.hpp:417
void mark_end()
asynchronously add a marker. Equivalent to nvtxMarkA
Definition: components.hpp:483
void set_prefix(const char *_prefix)
Definition: components.hpp:511
void mark_begin(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
Definition: components.hpp:489
void mark_begin()
asynchronously add a marker. Equivalent to nvtxMarkA
Definition: components.hpp:477
static std::string description()
Definition: components.hpp:411
void set_color(nvtx::color::color_t _color)
set the current color
Definition: components.hpp:510
nvtx_marker(const nvtx::color::color_t &_color)
construct with an specific color
Definition: components.hpp:428
static std::string label()
Definition: components.hpp:410
void mark_end(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
Definition: components.hpp:496
nvtx_marker(cuda::stream_t _stream)
construct with an specific CUDA stream
Definition: components.hpp:434
void start()
start an nvtx range. Equivalent to nvtxRangeStartEx
Definition: components.hpp:457
nvtx_marker(const nvtx::color::color_t &_color, cuda::stream_t _stream)
construct with an specific color and CUDA stream
Definition: components.hpp:440
static value_type record()
Definition: components.hpp:415
void set_stream(cuda::stream_t _stream)
set the current CUDA stream
Definition: components.hpp:508
#define TIMEMORY_JOIN(delim,...)
Definition: macros.hpp:89