32#include "timemory/backends/device.hpp"
34#include "timemory/components/cuda/backends.hpp"
43#if defined(TIMEMORY_PYBIND11_SOURCE)
44# include "pybind11/cast.h"
45# include "pybind11/pybind11.h"
46# include "pybind11/stl.h"
71 cuda::event_t
first = cuda::event_t{};
72 cuda::event_t
second = cuda::event_t{};
101 return cuda::event_elapsed_time(
first,
second) * units::msec;
113 return "Records the time interval between two points in a CUDA stream. Less "
114 "accurate than 'cupti_activity' for kernel timing";
134 float get() const noexcept
144 if(!m_explicit_only || m_stream != cuda::default_stream_v)
146 m_global_synced =
false;
147 m_global.
start(m_stream);
153 for(uint64_t i = 0; i < m_num_markers; ++i)
154 m_markers[i].
stop(m_stream);
155 if(m_current_marker == 0 && m_num_markers == 0)
156 m_global.
stop(m_stream);
162 if(m_current_marker == 0 && m_num_markers == 0)
166 float tmp = m_global.
sync();
167 m_global_synced =
true;
172 else if(m_current_marker > m_synced_markers)
175 for(uint64_t i = m_synced_markers; i < m_num_markers; ++i, ++m_synced_markers)
176 tmp += m_markers[i].
sync();
177 m_markers_synced =
true;
// Stores _stream in m_stream. m_stream is the stream that the start()/stop()
// calls on m_global and m_markers receive when no explicit stream argument is
// supplied by the caller.
183 void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
188 m_markers_synced =
false;
189 m_current_marker = m_num_markers++;
190 if(m_current_marker >= m_markers.size())
192 m_markers[m_current_marker].start(m_stream);
// Stops the marker at index m_current_marker, using the member stream m_stream.
// NOTE(review): m_current_marker is presumably the index assigned by the most
// recent mark_begin (m_current_marker = m_num_markers++) -- confirm. There is no
// bounds check here; verify m_markers is non-empty if this can run before any
// mark_begin.
195 void mark_end() { m_markers[m_current_marker].stop(m_stream); }
199 m_markers_synced =
false;
200 m_current_marker = m_num_markers++;
201 if(m_current_marker >= m_markers.size())
203 m_markers[m_current_marker].start(_stream);
// Stops the marker at index m_current_marker on the caller-supplied _stream
// (the member m_stream is neither read nor modified).
// NOTE(review): presumably _stream should be the same stream given to the
// matching mark_begin(_stream) -- confirm against callers.
206 void mark_end(cuda::stream_t _stream) { m_markers[m_current_marker].stop(_stream); }
211 m_markers.reserve(m_markers.size() + nsize);
212 for(uint64_t i = 0; i < nsize; ++i)
213 m_markers.emplace_back(
marker{});
217 bool m_global_synced =
false;
218 bool m_markers_synced =
false;
219 bool m_explicit_only =
false;
220 uint64_t m_synced_markers = 0;
221 uint64_t m_current_marker = 0;
222 uint64_t m_num_markers = 0;
224 cuda::stream_t m_stream = cuda::default_stream_v;
225 marker m_global = {};
229#if defined(TIMEMORY_PYBIND11_SOURCE)
238 template <
template <
typename...>
class BundleT>
240 pybind11::class_<BundleT<cuda_event>>& _pyclass)
242 auto _sync = [](BundleT<cuda_event>* obj) {
243 obj->template get<cuda_event>()->sync();
245 _pyclass.def(
"sync", _sync,
"Synchronize the event (blocking)");
260:
public base<cuda_profiler, void>
271 return "Control switch for a CUDA profiler running on the application";
293#if defined(TIMEMORY_USE_CUDA)
300#if defined(TIMEMORY_USE_CUDA)
308 configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
314 static std::atomic<int32_t> _once;
317#if defined(TIMEMORY_USE_CUDA) && (CUDA_VERSION < 11000)
318 cudaProfilerInitialize(_infile.c_str(), _outfile.c_str(),
319 (_mode ==
mode::nvp) ? cudaKeyValuePair : cudaCSV);
329#if defined(TIMEMORY_USE_CUDA)
338#if defined(TIMEMORY_USE_CUDA)
346#if defined(TIMEMORY_PYBIND11_SOURCE)
353 static void configure(project::python, pybind11::args
_args, pybind11::kwargs _kwargs)
369 for(
auto itr : _kwargs)
372 std::get<0>(_config) = itr.second.cast<
std::string>();
373 else if(itr.first.cast<
std::string>().find(
"out") == 0)
374 std::get<1>(_config) = itr.second.cast<
std::string>();
383 configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
405 return "Generates high-level region markers for CUDA profilers";
// Per-thread initialization hook: forwards threading::get_id() to
// nvtx::name_thread.
// NOTE(review): presumably this labels the calling thread in the NVTX
// profiler output -- confirm in the nvtx backend.
415 static void thread_init() { nvtx::name_thread(threading::get_id()); }
430 nvtx_marker(
const nvtx::color::color_t& _color, cuda::stream_t _stream)
436#if defined(TIMEMORY_PYBIND11_SOURCE)
// Opens an NVTX range using the attributes returned by get_attribute() and
// keeps the resulting id in m_range_id, which is later handed to
// nvtx::range_stop to close the range.
447 void start() { m_range_id = nvtx::range_start(get_attribute()); }
461 cuda::stream_sync(m_stream);
463 nvtx::range_stop(m_range_id);
482 get_stream_id(_stream)));
489 get_stream_id(_stream)));
492#if defined(TIMEMORY_PYBIND11_SOURCE)
// Stores _stream in m_stream; m_stream is the stream passed to
// cuda::stream_sync before the range is stopped.
498 void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
// Stores _color in m_color; m_color is the color passed to nvtx::init_marker
// when the marker attribute is built.
// NOTE(review): get_attribute() appears to cache m_attribute behind
// m_has_attribute -- confirm whether a color set after the attribute has been
// created still takes effect.
500 void set_color(nvtx::color::color_t _color) { m_color = _color; }
508 static int32_t get_stream_id(cuda::stream_t _stream)
510 using pair_t = std::pair<cuda::stream_t, int32_t>;
511 using map_t = std::map<cuda::stream_t, int32_t>;
512 using map_ptr_t = std::unique_ptr<map_t>;
514 static thread_local map_ptr_t _instance = std::make_unique<map_t>();
515 if(_instance->find(_stream) == _instance->end())
516 _instance->insert(pair_t(_stream, _instance->size()));
517 return _instance->find(_stream)->second;
521 bool m_has_attribute =
false;
523 nvtx::color::color_t m_color = 0;
524 nvtx::event_attributes_t m_attribute = {};
525 nvtx::range_id_t m_range_id = 0;
526 cuda::stream_t m_stream = 0;
527 const char* m_prefix =
nullptr;
530 nvtx::event_attributes_t& get_attribute()
534 m_has_attribute =
true;
537 std::stringstream ss;
538 ss <<
"[nvtx_marker]> Creating NVTX marker with label: \"" << m_prefix
539 <<
"\" and color " << std::hex << m_color <<
"...";
540 std::cout << ss.str() << std::endl;
542 m_attribute = nvtx::init_marker(m_prefix, m_color);
548#if defined(TIMEMORY_PYBIND11_SOURCE)
557 template <
template <
typename...>
class BundleT>
559 pybind11::class_<BundleT<nvtx_marker>>& _pyclass)
561 _pyclass.def_property_static(
564 "Configure CudaEvent to use cudaSynchronize() vs. cudaStreamSychronize(...)");
567 pybind11::enum_<nvtx::color::color_idx> _pyattr(_pyclass,
"color",
"NVTX colors");
568 _pyattr.value(
"red", nvtx::color::red_idx)
569 .value(
"blue", nvtx::color::blue_idx)
570 .value(
"green", nvtx::color::green_idx)
571 .value(
"yellow", nvtx::color::yellow_idx)
572 .value(
"purple", nvtx::color::purple_idx)
573 .value(
"cyan", nvtx::color::cyan_idx)
574 .value(
"pink", nvtx::color::pink_idx)
575 .value(
"light_green", nvtx::color::light_green_idx);
576 _pyattr.export_values();
578 auto _set_color = [](BundleT<nvtx_marker>* obj, nvtx::color::color_t arg) {
579 obj->template get<nvtx_marker>()->set_color(arg);
581 auto _get_color = [](BundleT<nvtx_marker>* obj) {
582 return obj->template get<nvtx_marker>()->get_color();
584 _pyclass.def(
"set_color", _set_color,
"Set the color");
585 _pyclass.def(
"get_color", _get_color,
"Return the color");
Declare the cuda component types.
void stop(TupleT< Tp... > &obj, Args &&... args)
void mark(TupleT< Tp... > &obj, Args &&... args)
void start(TupleT< Tp... > &obj, Args &&... args)
Inherit from this policy to add reference counting support. Useful if you want to turn a global setti...
std::array< char *, 4 > _args
char const std::string & _prefix
tim::mpl::apply< std::string > string
void consume_parameters(ArgsT &&...)
static int64_t get_unit()
static void configure(Args &&...)
void stop(cuda::stream_t &stream)
void start(cuda::stream_t &stream)
Records the time interval between two points in a CUDA stream. Less accurate than 'cupti_activity' fo...
std::vector< marker > marker_list_t
float get() const noexcept
float get_display() const noexcept
void mark_begin(cuda::stream_t _stream)
void append_marker_list(const uint64_t nsize)
static std::string label()
void set_stream(cuda::stream_t _stream)
static std::string description()
void mark_end(cuda::stream_t _stream)
void store(explicit_streams_only, bool _v)
static value_type record()
static uint64_t & get_batched_marker_size()
Control switch for a CUDA profiler running on the application. Only the first call to start() and the...
static std::string label()
static void global_init()
static initializer_type & get_initializer()
static std::string description()
static void configure(const std::string &_infile, const std::string &_outfile, mode _mode)
std::function< config_type()> initializer_type
std::tuple< std::string, std::string, mode > config_type
static void global_finalize()
Inserts NVTX markers with the current timemory prefix. The default color scheme is a round-robin of r...
void stop()
stop the nvtx range. Equivalent to nvtxRangeEnd. Depending on settings::nvtx_marker_device_sync() thi...
void mark_end()
asynchronously add a marker. Equivalent to nvtxMarkA
void set_prefix(const char *_prefix)
void mark_begin(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
void mark_begin()
asynchronously add a marker. Equivalent to nvtxMarkA
static std::string description()
void set_color(nvtx::color::color_t _color)
set the current color
nvtx_marker(const nvtx::color::color_t &_color)
construct with a specific color
static std::string label()
void mark_end(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
nvtx_marker(cuda::stream_t _stream)
construct with a specific CUDA stream
void start()
start an nvtx range. Equivalent to nvtxRangeStartEx
static void thread_init()
static bool & use_device_sync()
nvtx_marker(const nvtx::color::color_t &_color, cuda::stream_t _stream)
construct with a specific color and CUDA stream
static value_type record()
void set_stream(cuda::stream_t _stream)
set the current CUDA stream
#define TIMEMORY_JOIN(delim,...)