timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework to creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
components.hpp
Go to the documentation of this file.
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25/**
26 * \file timemory/components/cuda/components.hpp
27 * \brief Implementation of the cuda component(s)
28 */
29
30#pragma once
31
32#include "timemory/backends/device.hpp"
34#include "timemory/components/cuda/backends.hpp"
39#include "timemory/units.hpp"
40
41#include <memory>
42
43#if defined(TIMEMORY_PYBIND11_SOURCE)
44# include "pybind11/cast.h"
45# include "pybind11/pybind11.h"
46# include "pybind11/stl.h"
47#endif
48
49//======================================================================================//
50//
51namespace tim
52{
53namespace component
54{
55//
56//--------------------------------------------------------------------------------------//
57// this component extracts the time spent in GPU kernels
58//
59/// \struct tim::component::cuda_event
60/// \brief Records the time interval between two points in a CUDA stream. Less accurate
61/// than 'cupti_activity' for kernel timing but does not require linking to the CUDA
62/// driver.
63///
64struct cuda_event : public base<cuda_event, float>
65{
66 struct marker
67 {
68 bool valid = true;
69 bool synced = false;
70 bool running = false;
71 cuda::event_t first = cuda::event_t{};
72 cuda::event_t second = cuda::event_t{};
73
74 marker() { valid = (cuda::event_create(first) && cuda::event_create(second)); }
75 ~marker() = default;
76
77 void start(cuda::stream_t& stream)
78 {
79 if(!valid || running)
80 return;
81 synced = false;
82 running = true;
83 cuda::event_record(first, stream);
84 }
85
86 void stop(cuda::stream_t& stream)
87 {
88 if(!valid || !running)
89 return;
90 cuda::event_record(second, stream);
91 running = false;
92 }
93
94 float sync()
95 {
96 if(!valid)
97 return 0.0;
98 if(!synced)
99 cuda::event_sync(second);
100 synced = true;
101 return cuda::event_elapsed_time(first, second) * units::msec;
102 }
103 };
104
105 using ratio_t = std::milli;
106 using value_type = float;
108 using marker_list_t = std::vector<marker>;
109
110 static std::string label() { return "cuda_event"; }
112 {
113 return "Records the time interval between two points in a CUDA stream. Less "
114 "accurate than 'cupti_activity' for kernel timing";
115 }
116 static value_type record() { return 0.0f; }
117
118 static uint64_t& get_batched_marker_size()
119 {
120 static uint64_t _instance = settings::cuda_event_batch_size();
121 return _instance;
122 }
123
125 {};
126
127public:
128 TIMEMORY_DEFAULT_OBJECT(cuda_event)
129
130 explicit cuda_event(cuda::stream_t _stream)
131 : m_stream(_stream)
132 {}
133
134 float get() const noexcept
135 {
136 return load() / static_cast<float>(base_type::get_unit());
137 }
138 float get_display() const noexcept { return get(); }
139
140 void store(explicit_streams_only, bool _v) { m_explicit_only = _v; }
141
142 void start()
143 {
144 if(!m_explicit_only || m_stream != cuda::default_stream_v)
145 {
146 m_global_synced = false;
147 m_global.start(m_stream);
148 }
149 }
150
151 void stop()
152 {
153 for(uint64_t i = 0; i < m_num_markers; ++i)
154 m_markers[i].stop(m_stream);
155 if(m_current_marker == 0 && m_num_markers == 0)
156 m_global.stop(m_stream);
157 sync();
158 }
159
160 void sync()
161 {
162 if(m_current_marker == 0 && m_num_markers == 0)
163 {
164 if(!m_global_synced)
165 {
166 float tmp = m_global.sync();
167 m_global_synced = true;
168 accum += tmp;
169 value = tmp;
170 }
171 }
172 else if(m_current_marker > m_synced_markers)
173 {
174 float tmp = 0.0;
175 for(uint64_t i = m_synced_markers; i < m_num_markers; ++i, ++m_synced_markers)
176 tmp += m_markers[i].sync();
177 m_markers_synced = true;
178 accum += tmp;
179 value = tmp;
180 }
181 }
182
183 void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
184 auto get_stream() { return m_stream; }
185
187 {
188 m_markers_synced = false;
189 m_current_marker = m_num_markers++;
190 if(m_current_marker >= m_markers.size())
191 append_marker_list(std::max<uint64_t>(m_marker_batch_size, 1));
192 m_markers[m_current_marker].start(m_stream);
193 }
194
195 void mark_end() { m_markers[m_current_marker].stop(m_stream); }
196
197 void mark_begin(cuda::stream_t _stream)
198 {
199 m_markers_synced = false;
200 m_current_marker = m_num_markers++;
201 if(m_current_marker >= m_markers.size())
202 append_marker_list(std::max<uint64_t>(m_marker_batch_size, 1));
203 m_markers[m_current_marker].start(_stream);
204 }
205
206 void mark_end(cuda::stream_t _stream) { m_markers[m_current_marker].stop(_stream); }
207
208protected:
209 void append_marker_list(const uint64_t nsize)
210 {
211 m_markers.reserve(m_markers.size() + nsize);
212 for(uint64_t i = 0; i < nsize; ++i)
213 m_markers.emplace_back(marker{});
214 }
215
216private:
217 bool m_global_synced = false;
218 bool m_markers_synced = false;
219 bool m_explicit_only = false;
220 uint64_t m_synced_markers = 0;
221 uint64_t m_current_marker = 0;
222 uint64_t m_num_markers = 0;
223 uint64_t m_marker_batch_size = get_batched_marker_size();
224 cuda::stream_t m_stream = cuda::default_stream_v;
225 marker m_global = {};
226 marker_list_t m_markers = {};
227
228public:
229#if defined(TIMEMORY_PYBIND11_SOURCE)
230 //
231 /// this is called by python api
232 ///
233 /// Use this to add customizations to the python module. The instance
234 /// of the component is within in a variadic wrapper which is used
235 /// elsewhere to ensure that calling mark_begin(...) on a component
236 /// without that member function is not invalid
237 ///
238 template <template <typename...> class BundleT>
239 static void configure(project::python,
240 pybind11::class_<BundleT<cuda_event>>& _pyclass)
241 {
242 auto _sync = [](BundleT<cuda_event>* obj) {
243 obj->template get<cuda_event>()->sync();
244 };
245 _pyclass.def("sync", _sync, "Synchronize the event (blocking)");
246 }
247#endif
248};
249//
250//======================================================================================//
251//
252// controls the CUDA profiler
253//
254/// \struct tim::component::cuda_profiler
255/// \brief Control switch for a CUDA profiler running on the application. Only the
256/// first call to `start()` and the last call to `stop()` actually toggle the
257/// state of the external CUDA profiler when component instances are nested.
258///
260: public base<cuda_profiler, void>
261, private policy::instance_tracker<cuda_profiler>
262{
263 using value_type = void;
267
268 static std::string label() { return "cuda_profiler"; }
270 {
271 return "Control switch for a CUDA profiler running on the application";
272 }
273
274 enum class mode : short
275 {
276 nvp,
277 csv
278 };
279
280 using config_type = std::tuple<std::string, std::string, mode>;
281 using initializer_type = std::function<config_type()>;
282
284 {
285 static initializer_type _instance = []() {
286 return config_type("cuda_profiler.inp", "cuda_profiler.out", mode::nvp);
287 };
288 return _instance;
289 }
290
291 static void global_init()
292 {
293#if defined(TIMEMORY_USE_CUDA)
294 cudaProfilerStop();
295#endif
296 }
297
298 static void global_finalize()
299 {
300#if defined(TIMEMORY_USE_CUDA)
301 cudaProfilerStop();
302#endif
303 }
304
305 static void configure()
306 {
307 auto _config = get_initializer()();
308 configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
309 }
310
311 static void configure(const std::string& _infile, const std::string& _outfile,
312 mode _mode)
313 {
314 static std::atomic<int32_t> _once;
315 if(_once++ > 0)
316 return;
317#if defined(TIMEMORY_USE_CUDA) && (CUDA_VERSION < 11000)
318 cudaProfilerInitialize(_infile.c_str(), _outfile.c_str(),
319 (_mode == mode::nvp) ? cudaKeyValuePair : cudaCSV);
320#else
321 consume_parameters(_infile, _outfile, _mode);
322#endif
323 }
324
326
327 void start()
328 {
329#if defined(TIMEMORY_USE_CUDA)
331 if(m_tot == 0)
332 cudaProfilerStart();
333#endif
334 }
335
336 void stop()
337 {
338#if defined(TIMEMORY_USE_CUDA)
340 if(m_tot == 0)
341 cudaProfilerStop();
342#endif
343 }
344
345public:
346#if defined(TIMEMORY_PYBIND11_SOURCE)
347 //
348 /// this is called by python api
349 ///
350 /// args --> pybind11::args --> pybind11::tuple
351 /// kwargs --> pybind11::kwargs --> pybind11::dict
352 ///
353 static void configure(project::python, pybind11::args _args, pybind11::kwargs _kwargs)
354 {
355 auto _config = get_initializer()();
356 if(_args.size() > 0)
357 std::get<0>(_config) = _args[0].cast<std::string>();
358 if(_args.size() > 1)
359 std::get<1>(_config) = _args[1].cast<std::string>();
360 if(_args.size() > 2)
361 {
362 auto _m = _args[2].cast<std::string>();
363 if(_m == "csv")
364 std::get<2>(_config) = mode::csv;
365 }
366 //
367 if(_kwargs)
368 {
369 for(auto itr : _kwargs)
370 {
371 if(itr.first.cast<std::string>().find("in") == 0)
372 std::get<0>(_config) = itr.second.cast<std::string>();
373 else if(itr.first.cast<std::string>().find("out") == 0)
374 std::get<1>(_config) = itr.second.cast<std::string>();
375 else
376 {
377 auto _m = itr.second.cast<std::string>();
378 if(_m == "csv")
379 std::get<2>(_config) = mode::csv;
380 }
381 }
382 }
383 configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
384 }
385#endif
386};
387//
388//======================================================================================//
389// adds NVTX markers
390//
391/// \struct tim::component::nvtx_marker
392/// \brief Inserts NVTX markers with the current timemory prefix. The default color
393/// scheme is a round-robin of red, blue, green, yellow, purple, cyan, pink, and
394/// light_green. These colors
395///
396struct nvtx_marker : public base<nvtx_marker, void>
397{
398 using value_type = void;
401
402 static std::string label() { return "nvtx_marker"; }
404 {
405 return "Generates high-level region markers for CUDA profilers";
406 }
407 static value_type record() {}
408
409 static bool& use_device_sync()
410 {
411 static bool _instance = settings::nvtx_marker_device_sync();
412 return _instance;
413 }
414
415 static void thread_init() { nvtx::name_thread(threading::get_id()); }
416
417 nvtx_marker() = default;
418
419 /// construct with an specific color
420 explicit nvtx_marker(const nvtx::color::color_t& _color)
421 : m_color(_color)
422 {}
423
424 /// construct with an specific CUDA stream
425 explicit nvtx_marker(cuda::stream_t _stream)
426 : m_stream(_stream)
427 {}
428
429 /// construct with an specific color and CUDA stream
430 nvtx_marker(const nvtx::color::color_t& _color, cuda::stream_t _stream)
431 : m_color(_color)
432 , m_stream(_stream)
433
434 {}
435
436#if defined(TIMEMORY_PYBIND11_SOURCE)
437 // explicit nvtx_marker(pybind11::object _stream)
438 //: nvtx_marker(_stream.cast<cuda::stream_t>())
439 //{}
440
441 // nvtx_marker(const nvtx::color::color_t& _color, pybind11::object _stream)
442 //: nvtx_marker(_color, _stream.cast<cuda::stream_t>())
443 //{}
444#endif
445
446 /// start an nvtx range. Equivalent to `nvtxRangeStartEx`
447 void start() { m_range_id = nvtx::range_start(get_attribute()); }
448
449 /// stop the nvtx range. Equivalent to `nvtxRangeEnd`. Depending on
450 /// `settings::nvtx_marker_device_sync()` this will either call
451 /// `cudaDeviceSynchronize()` or `cudaStreamSynchronize(m_stream)` before stopping the
452 /// range.
453 void stop()
454 {
455 if(m_device_sync)
456 {
457 cuda::device_sync();
458 }
459 else
460 {
461 cuda::stream_sync(m_stream);
462 }
463 nvtx::range_stop(m_range_id);
464 }
465
466 /// asynchronously add a marker. Equivalent to `nvtxMarkA`
468 {
469 nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_begin_t", threading::get_id()));
470 }
471
472 /// asynchronously add a marker. Equivalent to `nvtxMarkA`
473 void mark_end()
474 {
475 nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_end_t", threading::get_id()));
476 }
477
478 /// asynchronously add a marker for a specific stream. Equivalent to `nvtxMarkA`
479 void mark_begin(cuda::stream_t _stream)
480 {
481 nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_begin_t", threading::get_id(), "_s",
482 get_stream_id(_stream)));
483 }
484
485 /// asynchronously add a marker for a specific stream. Equivalent to `nvtxMarkA`
486 void mark_end(cuda::stream_t _stream)
487 {
488 nvtx::mark(TIMEMORY_JOIN("", m_prefix, "_end_t", threading::get_id(), "_s",
489 get_stream_id(_stream)));
490 }
491
492#if defined(TIMEMORY_PYBIND11_SOURCE)
493 // void mark_begin(pybind11::object obj) { mark_begin(obj.cast<cuda::stream_t>()); }
494 // void mark_end(pybind11::object obj) { mark_begin(obj.cast<cuda::stream_t>()); }
495#endif
496
497 /// set the current CUDA stream
498 void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
499 /// set the current color
500 void set_color(nvtx::color::color_t _color) { m_color = _color; }
501 void set_prefix(const char* _prefix) { m_prefix = _prefix; }
502
503 auto get_range_id() { return m_range_id; }
504 auto get_stream() { return m_stream; }
505 auto get_color() { return m_color; }
506
507private:
508 static int32_t get_stream_id(cuda::stream_t _stream)
509 {
510 using pair_t = std::pair<cuda::stream_t, int32_t>;
511 using map_t = std::map<cuda::stream_t, int32_t>;
512 using map_ptr_t = std::unique_ptr<map_t>;
513
514 static thread_local map_ptr_t _instance = std::make_unique<map_t>();
515 if(_instance->find(_stream) == _instance->end())
516 _instance->insert(pair_t(_stream, _instance->size()));
517 return _instance->find(_stream)->second;
518 }
519
520private:
521 bool m_has_attribute = false;
522 bool m_device_sync = use_device_sync();
523 nvtx::color::color_t m_color = 0;
524 nvtx::event_attributes_t m_attribute = {};
525 nvtx::range_id_t m_range_id = 0;
526 cuda::stream_t m_stream = 0;
527 const char* m_prefix = nullptr;
528
529private:
530 nvtx::event_attributes_t& get_attribute()
531 {
532 if(!m_has_attribute)
533 {
534 m_has_attribute = true;
535 if(settings::debug())
536 {
537 std::stringstream ss;
538 ss << "[nvtx_marker]> Creating NVTX marker with label: \"" << m_prefix
539 << "\" and color " << std::hex << m_color << "...";
540 std::cout << ss.str() << std::endl;
541 }
542 m_attribute = nvtx::init_marker(m_prefix, m_color);
543 }
544 return m_attribute;
545 }
546
547public:
548#if defined(TIMEMORY_PYBIND11_SOURCE)
549 //
550 /// this is called by python api
551 ///
552 /// Use this to add customizations to the python module. The instance
553 /// of the component is within in a variadic wrapper which is used
554 /// elsewhere to ensure that calling mark_begin(...) on a component
555 /// without that member function is not invalid
556 ///
557 template <template <typename...> class BundleT>
558 static void configure(project::python,
559 pybind11::class_<BundleT<nvtx_marker>>& _pyclass)
560 {
561 _pyclass.def_property_static(
562 "use_device_sync", [](pybind11::object) { return use_device_sync(); },
563 [](pybind11::object, bool v) { use_device_sync() = v; },
564 "Configure CudaEvent to use cudaSynchronize() vs. cudaStreamSychronize(...)");
565
566 // add nvtx colors
567 pybind11::enum_<nvtx::color::color_idx> _pyattr(_pyclass, "color", "NVTX colors");
568 _pyattr.value("red", nvtx::color::red_idx)
569 .value("blue", nvtx::color::blue_idx)
570 .value("green", nvtx::color::green_idx)
571 .value("yellow", nvtx::color::yellow_idx)
572 .value("purple", nvtx::color::purple_idx)
573 .value("cyan", nvtx::color::cyan_idx)
574 .value("pink", nvtx::color::pink_idx)
575 .value("light_green", nvtx::color::light_green_idx);
576 _pyattr.export_values();
577
578 auto _set_color = [](BundleT<nvtx_marker>* obj, nvtx::color::color_t arg) {
579 obj->template get<nvtx_marker>()->set_color(arg);
580 };
581 auto _get_color = [](BundleT<nvtx_marker>* obj) {
582 return obj->template get<nvtx_marker>()->get_color();
583 };
584 _pyclass.def("set_color", _set_color, "Set the color");
585 _pyclass.def("get_color", _get_color, "Return the color");
586 }
587#endif
588};
589//
590//======================================================================================//
591//
592//
593//======================================================================================//
594//
595} // namespace component
596} // namespace tim
597//
598//======================================================================================//
Declare the cuda component types.
void stop(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:386
void mark(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:457
void start(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:316
Inherit from this policy to add reference counting support. Useful if you want to turn a global setti...
Definition: types.hpp:406
data::stream stream
Definition: stream.hpp:982
Definition: kokkosp.cpp:39
std::array< char *, 4 > _args
char const std::string & _prefix
Definition: config.cpp:55
tim::mpl::apply< std::string > string
Definition: macros.hpp:53
cuda_event_batch_size
Definition: settings.cpp:1719
nvtx_marker_device_sync
Definition: settings.cpp:1721
void consume_parameters(ArgsT &&...)
Definition: types.hpp:285
static int64_t get_unit()
void stop(cuda::stream_t &stream)
Definition: components.hpp:86
void start(cuda::stream_t &stream)
Definition: components.hpp:77
Records the time interval between two points in a CUDA stream. Less accurate than 'cupti_activity' fo...
Definition: components.hpp:65
std::vector< marker > marker_list_t
Definition: components.hpp:108
float get() const noexcept
Definition: components.hpp:134
float get_display() const noexcept
Definition: components.hpp:138
void mark_begin(cuda::stream_t _stream)
Definition: components.hpp:197
void append_marker_list(const uint64_t nsize)
Definition: components.hpp:209
static std::string label()
Definition: components.hpp:110
void set_stream(cuda::stream_t _stream)
Definition: components.hpp:183
static std::string description()
Definition: components.hpp:111
void mark_end(cuda::stream_t _stream)
Definition: components.hpp:206
void store(explicit_streams_only, bool _v)
Definition: components.hpp:140
static value_type record()
Definition: components.hpp:116
static uint64_t & get_batched_marker_size()
Definition: components.hpp:118
Control switch for a CUDA profiler running on the application. Only the first call to start() and the...
Definition: components.hpp:262
static std::string label()
Definition: components.hpp:268
static initializer_type & get_initializer()
Definition: components.hpp:283
static std::string description()
Definition: components.hpp:269
static void configure(const std::string &_infile, const std::string &_outfile, mode _mode)
Definition: components.hpp:311
std::function< config_type()> initializer_type
Definition: components.hpp:281
std::tuple< std::string, std::string, mode > config_type
Definition: components.hpp:280
Inserts NVTX markers with the current timemory prefix. The default color scheme is a round-robin of r...
Definition: components.hpp:397
void stop()
stop the nvtx range. Equivalent to nvtxRangeEnd. Depending on settings::nvtx_marker_device_sync() thi...
Definition: components.hpp:453
void mark_end()
asynchronously add a marker. Equivalent to nvtxMarkA
Definition: components.hpp:473
void set_prefix(const char *_prefix)
Definition: components.hpp:501
void mark_begin(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
Definition: components.hpp:479
void mark_begin()
asynchronously add a marker. Equivalent to nvtxMarkA
Definition: components.hpp:467
static std::string description()
Definition: components.hpp:403
void set_color(nvtx::color::color_t _color)
set the current color
Definition: components.hpp:500
nvtx_marker(const nvtx::color::color_t &_color)
construct with an specific color
Definition: components.hpp:420
static std::string label()
Definition: components.hpp:402
void mark_end(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
Definition: components.hpp:486
nvtx_marker(cuda::stream_t _stream)
construct with an specific CUDA stream
Definition: components.hpp:425
void start()
start an nvtx range. Equivalent to nvtxRangeStartEx
Definition: components.hpp:447
static bool & use_device_sync()
Definition: components.hpp:409
nvtx_marker(const nvtx::color::color_t &_color, cuda::stream_t _stream)
construct with an specific color and CUDA stream
Definition: components.hpp:430
static value_type record()
Definition: components.hpp:407
void set_stream(cuda::stream_t _stream)
set the current CUDA stream
Definition: components.hpp:498
#define TIMEMORY_JOIN(delim,...)
Definition: macros.hpp:90