timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
components.hpp
Go to the documentation of this file.
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25/**
26 * \file timemory/components/hip/components.hpp
27 * \brief Implementation of the hip component(s)
28 */
29
30#pragma once
31
32#include "timemory/backends/device.hpp"
34#include "timemory/components/hip/backends.hpp"
39#include "timemory/units.hpp"
40
41#include <memory>
42
43#if defined(TIMEMORY_PYBIND11_SOURCE)
44# include "pybind11/cast.h"
45# include "pybind11/pybind11.h"
46# include "pybind11/stl.h"
47#endif
48
49//======================================================================================//
50//
51namespace tim
52{
53namespace component
54{
55//
56//--------------------------------------------------------------------------------------//
57// this component extracts the time spent in GPU kernels
58//
59/// \struct tim::component::hip_event
60/// \brief Records the time interval between two points in a HIP stream. Less accurate
61/// than 'roctracer' for kernel timing but does not require linking to the HIP
62/// driver.
63///
64struct hip_event : public base<hip_event, float>
65{
66 struct marker
67 {
68 bool valid = true;
69 bool synced = false;
70 bool running = false;
71 hip::event_t first = hip::event_t{};
72 hip::event_t second = hip::event_t{};
73
74 marker() { valid = (hip::event_create(first) && hip::event_create(second)); }
75 ~marker() = default;
76
77 void start(hip::stream_t& stream)
78 {
79 if(!valid || running)
80 return;
81 synced = false;
82 running = true;
83 hip::event_record(first, stream);
84 }
85
86 void stop(hip::stream_t& stream)
87 {
88 if(!valid || !running)
89 return;
90 hip::event_record(second, stream);
91 running = false;
92 }
93
94 float sync()
95 {
96 if(!valid)
97 return 0.0;
98 if(!synced)
99 hip::event_sync(second);
100 synced = true;
101 return hip::event_elapsed_time(first, second) * units::msec;
102 }
103 };
104
105 using value_type = float;
107 using marker_list_t = std::vector<marker>;
108
/// Short identifier of this component.
static std::string label()
{
    return std::string{ "hip_event" };
}
111 {
112 return "Records the time interval between two points in a HIP stream. Less "
113 "accurate than 'roctracer' for kernel timing";
114 }
115 static value_type record() { return 0.0f; }
116
117 static uint64_t& get_batched_marker_size()
118 {
119 static uint64_t _instance = settings::cuda_event_batch_size();
120 return _instance;
121 }
122
124 {};
125
126public:
127 TIMEMORY_DEFAULT_OBJECT(hip_event)
128
129 explicit hip_event(hip::stream_t _stream)
130 : m_stream(_stream)
131 {}
132
133 float get_display() const { return get(); }
134
135 float get() const { return load() / static_cast<float>(base_type::get_unit()); }
136
137 void store(explicit_streams_only, bool _v) { m_explicit_only = _v; }
138
139 void start()
140 {
141 if(!m_explicit_only || m_stream != hip::default_stream_v)
142 {
143 m_global_synced = false;
144 m_global.start(m_stream);
145 }
146 }
147
148 void stop()
149 {
150 for(uint64_t i = 0; i < m_num_markers; ++i)
151 m_markers[i].stop(m_stream);
152 if(m_current_marker == 0 && m_num_markers == 0)
153 m_global.stop(m_stream);
154 sync();
155 }
156
157 void sync()
158 {
159 if(m_current_marker == 0 && m_num_markers == 0)
160 {
161 if(!m_global_synced)
162 {
163 float tmp = m_global.sync();
164 m_global_synced = true;
165 accum += tmp;
166 value = tmp;
167 }
168 }
169 else if(m_current_marker > m_synced_markers)
170 {
171 float tmp = 0.0;
172 for(uint64_t i = m_synced_markers; i < m_num_markers; ++i, ++m_synced_markers)
173 tmp += m_markers[i].sync();
174 m_markers_synced = true;
175 accum += tmp;
176 value = tmp;
177 }
178 }
179
180 void set_stream(hip::stream_t _stream) { m_stream = _stream; }
181 auto get_stream() { return m_stream; }
182
184 {
185 m_markers_synced = false;
186 m_current_marker = m_num_markers++;
187 if(m_current_marker >= m_markers.size())
188 append_marker_list(std::max<uint64_t>(m_marker_batch_size, 1));
189 m_markers[m_current_marker].start(m_stream);
190 }
191
192 void mark_end() { m_markers[m_current_marker].stop(m_stream); }
193
194 void mark_begin(hip::stream_t _stream)
195 {
196 m_markers_synced = false;
197 m_current_marker = m_num_markers++;
198 if(m_current_marker >= m_markers.size())
199 append_marker_list(std::max<uint64_t>(m_marker_batch_size, 1));
200 m_markers[m_current_marker].start(_stream);
201 }
202
203 void mark_end(hip::stream_t _stream) { m_markers[m_current_marker].stop(_stream); }
204
205protected:
206 void append_marker_list(const uint64_t nsize)
207 {
208 m_markers.reserve(m_markers.size() + nsize);
209 for(uint64_t i = 0; i < nsize; ++i)
210 m_markers.emplace_back(marker{});
211 }
212
213private:
214 bool m_global_synced = false;
215 bool m_markers_synced = false;
216 bool m_explicit_only = false;
217 uint64_t m_synced_markers = 0;
218 uint64_t m_current_marker = 0;
219 uint64_t m_num_markers = 0;
220 uint64_t m_marker_batch_size = get_batched_marker_size();
221 hip::stream_t m_stream = hip::default_stream_v;
222 marker m_global = {};
223 marker_list_t m_markers = {};
224
225public:
226#if defined(TIMEMORY_PYBIND11_SOURCE)
227 //
228 /// this is called by python api
229 ///
230 /// Use this to add customizations to the python module. The instance
231 /// of the component is within a variadic wrapper which is used
232 /// elsewhere to ensure that calling mark_begin(...) on a component
233 /// without that member function is not invalid
234 ///
235 template <template <typename...> class BundleT>
236 static void configure(project::python, pybind11::class_<BundleT<hip_event>>& _pyclass)
237 {
238 auto _sync = [](BundleT<hip_event>* obj) {
239 obj->template get<hip_event>()->sync();
240 };
241 _pyclass.def("sync", _sync, "Synchronize the event (blocking)");
242 }
243#endif
244};
245//
246//======================================================================================//
247// adds ROCTX markers
248//
249/// \struct tim::component::roctx_marker
250/// \brief Inserts ROCTX markers with the current timemory prefix.
251///
252struct roctx_marker : public base<roctx_marker, void>
253{
254 using value_type = void;
257
/// Short identifier of this component.
static std::string label()
{
    return std::string{ "roctx_marker" };
}
260 {
261 return "Generates high-level region markers for HIP profilers";
262 }
263 static value_type record() {}
264
265 static bool& use_device_sync()
266 {
267 static bool _instance = settings::nvtx_marker_device_sync();
268 return _instance;
269 }
270
271 TIMEMORY_DEFAULT_OBJECT(roctx_marker)
272
273 /// construct with an specific HIP stream
274 explicit roctx_marker(hip::stream_t _stream)
275 : m_stream(_stream)
276 {}
277
278 /// start an roctx range. Equivalent to `roctxRangeStartEx`
279 void start() { m_range_id = roctx::range_start(m_prefix); }
280
281 /// stop the roctx range. Equivalent to `roctxRangeEnd`. Depending on
282 /// `settings::roctx_marker_device_sync()` this will either call
283 /// `hipDeviceSynchronize()` or `hipStreamSynchronize(m_stream)` before stopping the
284 /// range.
285 void stop()
286 {
287 if(use_device_sync())
288 {
289 hip::device_sync();
290 }
291 else
292 {
293 hip::stream_sync(m_stream);
294 }
295 roctx::range_stop(m_range_id);
296 }
297
298 /// asynchronously add a marker. Equivalent to `roctxMarkA`
300 {
301 roctx::mark(TIMEMORY_JOIN("", m_prefix, "_begin_t", threading::get_id()));
302 }
303
304 /// asynchronously add a marker. Equivalent to `roctxMarkA`
305 void mark_end()
306 {
307 roctx::mark(TIMEMORY_JOIN("", m_prefix, "_end_t", threading::get_id()));
308 }
309
310 /// asynchronously add a marker for a specific stream. Equivalent to `roctxMarkA`
311 void mark_begin(hip::stream_t _stream)
312 {
313 roctx::mark(TIMEMORY_JOIN("", m_prefix, "_begin_t", threading::get_id(), "_s",
314 get_stream_id(_stream)));
315 }
316
317 /// asynchronously add a marker for a specific stream. Equivalent to `roctxMarkA`
318 void mark_end(hip::stream_t _stream)
319 {
320 roctx::mark(TIMEMORY_JOIN("", m_prefix, "_end_t", threading::get_id(), "_s",
321 get_stream_id(_stream)));
322 }
323
324 /// set the current HIP stream
325 void set_stream(hip::stream_t _stream) { m_stream = _stream; }
326 /// set the label
327 void set_prefix(const char* _prefix) { m_prefix = _prefix; }
328
329 auto get_range_id() { return m_range_id; }
330 auto get_stream() { return m_stream; }
331
332private:
333 static int32_t get_stream_id(hip::stream_t _stream)
334 {
335 using pair_t = std::pair<hip::stream_t, int32_t>;
336 using map_t = std::map<hip::stream_t, int32_t>;
337 using map_ptr_t = std::unique_ptr<map_t>;
338
339 static thread_local map_ptr_t _instance = std::make_unique<map_t>();
340 if(_instance->find(_stream) == _instance->end())
341 _instance->insert(pair_t(_stream, _instance->size()));
342 return _instance->find(_stream)->second;
343 }
344
345private:
346 roctx::range_id_t m_range_id = 0;
347 hip::stream_t m_stream = 0;
348 const char* m_prefix = nullptr;
349
350public:
351#if defined(TIMEMORY_PYBIND11_SOURCE)
352 //
353 /// this is called by python api
354 ///
355 /// Use this to add customizations to the python module. The instance
356 /// of the component is within a variadic wrapper which is used
357 /// elsewhere to ensure that calling mark_begin(...) on a component
358 /// without that member function is not invalid
359 ///
360 template <template <typename...> class BundleT>
361 static void configure(project::python,
362 pybind11::class_<BundleT<roctx_marker>>& _pyclass)
363 {
364 _pyclass.def_property_static(
365 "use_device_sync", [](pybind11::object) { return use_device_sync(); },
366 [](pybind11::object, bool v) { use_device_sync() = v; },
367 "Configure CudaEvent to use hipSynchronize() vs. hipStreamSychronize(...)");
368 }
369#endif
370};
371//
372//======================================================================================//
373//
374//
375//======================================================================================//
376//
377} // namespace component
378} // namespace tim
379//
380//======================================================================================//
Declare the hip component types.
void mark(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:457
data::stream stream
Definition: stream.hpp:982
Definition: kokkosp.cpp:39
char const std::string & _prefix
Definition: config.cpp:55
tim::mpl::apply< std::string > string
Definition: macros.hpp:53
cuda_event_batch_size
Definition: settings.cpp:1719
nvtx_marker_device_sync
Definition: settings.cpp:1721
static int64_t get_unit()
void stop(hip::stream_t &stream)
Definition: components.hpp:86
void start(hip::stream_t &stream)
Definition: components.hpp:77
Records the time interval between two points in a HIP stream. Less accurate than 'cupti_activity' for...
Definition: components.hpp:65
void set_stream(hip::stream_t _stream)
Definition: components.hpp:180
std::vector< marker > marker_list_t
Definition: components.hpp:107
static uint64_t & get_batched_marker_size()
Definition: components.hpp:117
static std::string description()
Definition: components.hpp:110
void mark_end(hip::stream_t _stream)
Definition: components.hpp:203
void append_marker_list(const uint64_t nsize)
Definition: components.hpp:206
void mark_begin(hip::stream_t _stream)
Definition: components.hpp:194
static std::string label()
Definition: components.hpp:109
static value_type record()
Definition: components.hpp:115
void store(explicit_streams_only, bool _v)
Definition: components.hpp:137
Inserts ROCTX markers with the current timemory prefix.
Definition: components.hpp:253
void mark_begin(hip::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to roctxMarkA
Definition: components.hpp:311
void mark_begin()
asynchronously add a marker. Equivalent to roctxMarkA
Definition: components.hpp:299
void stop()
stop the roctx range. Equivalent to roctxRangeEnd. Depending on settings::roctx_marker_device_sync() ...
Definition: components.hpp:285
void mark_end()
asynchronously add a marker. Equivalent to roctxMarkA
Definition: components.hpp:305
void set_stream(hip::stream_t _stream)
set the current HIP stream
Definition: components.hpp:325
void set_prefix(const char *_prefix)
set the label
Definition: components.hpp:327
roctx_marker(hip::stream_t _stream)
construct with an specific HIP stream
Definition: components.hpp:274
void start()
start an roctx range. Equivalent to roctxRangeStartEx
Definition: components.hpp:279
static bool & use_device_sync()
Definition: components.hpp:265
static std::string description()
Definition: components.hpp:259
static std::string label()
Definition: components.hpp:258
void mark_end(hip::stream_t _stream)
asynchronously add a marker for a specific stream. Equivalent to roctxMarkA
Definition: components.hpp:318
static value_type record()
Definition: components.hpp:263
#define TIMEMORY_JOIN(delim,...)
Definition: macros.hpp:90