40 #include "timemory/components/cuda/backends.hpp"
43 #if defined(TIMEMORY_PYBIND11_SOURCE)
44 # include "pybind11/cast.h"
45 # include "pybind11/pybind11.h"
46 # include "pybind11/stl.h"
71 cuda::event_t
first = cuda::event_t{};
72 cuda::event_t
second = cuda::event_t{};
113 return "Records the time interval between two points in a CUDA stream. Less "
114 "accurate than 'cupti_activity' for kernel timing";
132 #if defined(TIMEMORY_PYBIND11_SOURCE)
140 return static_cast<float>(
load() /
static_cast<float>(ratio_t::den) *
144 TIMEMORY_NODISCARD
float get()
const
146 return static_cast<float>(
load() /
static_cast<float>(ratio_t::den) *
152 m_global_synced =
false;
153 m_global.
start(m_stream);
158 for(uint64_t i = 0; i < m_num_markers; ++i)
159 m_markers[i].
stop(m_stream);
160 if(m_current_marker == 0 && m_num_markers == 0)
161 m_global.
stop(m_stream);
167 if(m_current_marker == 0 && m_num_markers == 0)
171 float tmp = m_global.
sync();
172 m_global_synced =
true;
177 else if(m_current_marker > m_synced_markers)
180 for(uint64_t i = m_synced_markers; i < m_num_markers; ++i, ++m_synced_markers)
181 tmp += m_markers[i].
sync();
182 m_markers_synced =
true;
188 void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
193 m_markers_synced =
false;
194 m_current_marker = m_num_markers++;
195 if(m_current_marker >= m_markers.size())
197 m_markers[m_current_marker].start(m_stream);
200 void mark_end() { m_markers[m_current_marker].stop(m_stream); }
204 m_markers_synced =
false;
205 m_current_marker = m_num_markers++;
206 if(m_current_marker >= m_markers.size())
208 m_markers[m_current_marker].start(_stream);
211 void mark_end(cuda::stream_t _stream) { m_markers[m_current_marker].stop(_stream); }
213 #if defined(TIMEMORY_PYBIND11_SOURCE)
221 for(uint64_t i = 0; i < nsize; ++i)
222 m_markers.emplace_back(
marker());
226 bool m_global_synced =
false;
227 bool m_markers_synced =
false;
228 uint64_t m_synced_markers = 0;
229 uint64_t m_current_marker = 0;
230 uint64_t m_num_markers = 0;
232 cuda::stream_t m_stream = 0;
233 marker m_global = {};
237 #if defined(TIMEMORY_PYBIND11_SOURCE)
246 template <
template <
typename...>
class BundleT>
248 pybind11::class_<BundleT<cuda_event>>& _pyclass)
250 auto _sync = [](BundleT<cuda_event>* obj) {
251 obj->template get<cuda_event>()->sync();
253 _pyclass.def(
"sync", _sync,
"Synchronize the event (blocking)");
268 :
public base<cuda_profiler, void>
279 return "Control switch for a CUDA profiler running on the application";
301 #if defined(TIMEMORY_USE_CUDA)
308 #if defined(TIMEMORY_USE_CUDA)
316 configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
322 static std::atomic<int32_t> _once;
325 #if defined(TIMEMORY_USE_CUDA) && (CUDA_VERSION < 11000)
326 cudaProfilerInitialize(_infile.c_str(), _outfile.c_str(),
327 (_mode ==
mode::nvp) ? cudaKeyValuePair : cudaCSV);
337 #if defined(TIMEMORY_USE_CUDA)
346 #if defined(TIMEMORY_USE_CUDA)
354 #if defined(TIMEMORY_PYBIND11_SOURCE)
361 static void configure(project::python, pybind11::args _args, pybind11::kwargs _kwargs)
365 std::get<0>(_config) = _args[0].cast<
std::string>();
367 std::get<1>(_config) = _args[1].cast<
std::string>();
377 for(
auto itr : _kwargs)
380 std::get<0>(_config) = itr.second.cast<
std::string>();
381 else if(itr.first.cast<
std::string>().find(
"out") == 0)
382 std::get<1>(_config) = itr.second.cast<
std::string>();
391 configure(std::get<0>(_config), std::get<1>(_config), std::get<2>(_config));
413 return "Generates high-level region markers for CUDA profilers";
423 static void thread_init() { nvtx::name_thread(threading::get_id()); }
440 nvtx_marker(
const nvtx::color::color_t& _color, cuda::stream_t _stream)
446 #if defined(TIMEMORY_PYBIND11_SOURCE)
457 void start() { m_range_id = nvtx::range_start(get_attribute()); }
471 cuda::stream_sync(m_stream);
473 nvtx::range_stop(m_range_id);
492 get_stream_id(_stream)));
499 get_stream_id(_stream)));
502 #if defined(TIMEMORY_PYBIND11_SOURCE)
508 void set_stream(cuda::stream_t _stream) { m_stream = _stream; }
510 void set_color(nvtx::color::color_t _color) { m_color = _color; }
518 static int32_t get_stream_id(cuda::stream_t _stream)
520 using pair_t = std::pair<cuda::stream_t, int32_t>;
521 using map_t = std::map<cuda::stream_t, int32_t>;
522 using map_ptr_t = std::unique_ptr<map_t>;
524 static thread_local map_ptr_t _instance = std::make_unique<map_t>();
525 if(_instance->find(_stream) == _instance->end())
526 _instance->insert(pair_t(_stream, _instance->size()));
527 return _instance->find(_stream)->second;
531 bool m_has_attribute =
false;
532 nvtx::color::color_t m_color = 0;
533 nvtx::event_attributes_t m_attribute = {};
534 nvtx::range_id_t m_range_id = 0;
535 cuda::stream_t m_stream = 0;
536 const char* m_prefix =
nullptr;
539 nvtx::event_attributes_t& get_attribute()
543 m_has_attribute =
true;
546 std::stringstream ss;
547 ss <<
"[nvtx_marker]> Creating NVTX marker with label: \"" << m_prefix
548 <<
"\" and color " << std::hex << m_color <<
"...";
549 std::cout << ss.str() << std::endl;
551 m_attribute = nvtx::init_marker(m_prefix, m_color);
557 #if defined(TIMEMORY_PYBIND11_SOURCE)
566 template <
template <
typename...>
class BundleT>
568 pybind11::class_<BundleT<nvtx_marker>>& _pyclass)
570 _pyclass.def_property_static(
573 "Configure CudaEvent to use cudaSynchronize() vs. cudaStreamSychronize(...)");
576 pybind11::enum_<nvtx::color::color_idx> _pyattr(_pyclass,
"color",
"NVTX colors");
577 _pyattr.value(
"red", nvtx::color::red_idx)
578 .value(
"blue", nvtx::color::blue_idx)
579 .value(
"green", nvtx::color::green_idx)
580 .value(
"yellow", nvtx::color::yellow_idx)
581 .value(
"purple", nvtx::color::purple_idx)
582 .value(
"cyan", nvtx::color::cyan_idx)
583 .value(
"pink", nvtx::color::pink_idx)
584 .value(
"light_green", nvtx::color::light_green_idx);
585 _pyattr.export_values();
587 auto _set_color = [](BundleT<nvtx_marker>* obj, nvtx::color::color_t arg) {
588 obj->template get<nvtx_marker>()->set_color(arg);
590 auto _get_color = [](BundleT<nvtx_marker>* obj) {
591 return obj->template get<nvtx_marker>()->get_color();
593 _pyclass.def(
"set_color", _set_color,
"Set the color");
594 _pyclass.def(
"get_color", _get_color,
"Return the color");
Declare the cuda component types.
void stop(TupleT< Tp... > &obj, Args &&... args)
void mark(TupleT< Tp... > &obj, Args &&... args)
void start(TupleT< Tp... > &obj, Args &&... args)
Inherit from this policy to add reference counting support. Useful if you want to turn a global setti...
char const std::string & _prefix
void consume_parameters(ArgsT &&...) TIMEMORY_HIDDEN
tim::mpl::apply< std::string > string
The declaration for the types for settings without definitions.
static int64_t get_unit()
static void configure(Args &&...)
void stop(cuda::stream_t &stream)
void start(cuda::stream_t &stream)
Records the time interval between two points in a CUDA stream. Less accurate than 'cupti_activity' fo...
std::vector< marker > marker_list_t
void mark_begin(cuda::stream_t _stream)
static uint64_t & get_batched_marker_size()
void append_marker_list(const uint64_t nsize)
static std::string label()
void set_stream(cuda::stream_t _stream)
float get_display() const
static std::string description()
void mark_end(cuda::stream_t _stream)
static value_type record()
Control switch for a CUDA profiler running on the application. Only the first call to start() and the...
static std::string label()
static void global_init()
static std::string description()
static void configure(const std::string &_infile, const std::string &_outfile, mode _mode)
static initializer_type & get_initializer()
std::function< config_type()> initializer_type
std::tuple< std::string, std::string, mode > config_type
static void global_finalize()
Inserts NVTX markers with the current timemory prefix. The default color scheme is a round-robin of r...
void stop()
stop the nvtx range. Equivalent to nvtxRangeEnd. Depending on settings::nvtx_marker_device_sync() thi...
static bool & use_device_sync()
void mark_end()
asynchronously add a marker. Equivalent to nvtxMarkA
void set_prefix(const char *_prefix)
void mark_begin(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
void mark_begin()
asynchronously add a marker. Equivalent to nvtxMarkA
static std::string description()
void set_color(nvtx::color::color_t _color)
set the current color
nvtx_marker(const nvtx::color::color_t &_color)
construct with a specific color
static std::string label()
void mark_end(cuda::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to nvtxMarkA
nvtx_marker(cuda::stream_t _stream)
construct with a specific CUDA stream
void start()
start an nvtx range. Equivalent to nvtxRangeStartEx
static void thread_init()
nvtx_marker(const nvtx::color::color_t &_color, cuda::stream_t _stream)
construct with a specific color and CUDA stream
static value_type record()
void set_stream(cuda::stream_t _stream)
set the current CUDA stream
#define TIMEMORY_JOIN(delim,...)