timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
cupti_profiler.hpp
Go to the documentation of this file.
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25/**
26 * \headerfile "timemory/components/cupti/cupti_profiler.hpp"
27 * Provides implementation of CUPTI Profiler API
28 *
29 */
30
31#pragma once
32
33#if defined(TIMEMORY_USE_CUPTI_NVPERF)
34
36# include "timemory/components/cupti/backends.hpp"
38# include "timemory/macros.hpp"
40
41# include <nvperf_cuda_host.h>
42# include <nvperf_host.h>
43# include <nvperf_target.h>
44//
45# include <cupti_profiler_target.h>
46# include <cupti_target.h>
47//
48# include <cstdint>
49# include <cuda.h>
50# include <fstream>
51# include <iostream>
52# include <string>
53# include <vector>
54//
55//======================================================================================//
56//
// TIMEMORY_CUPTI_API_CALL(...) forwards all of its arguments to TIMEMORY_CUPTI_CALL.
// The definition is skipped when the macro has already been defined.
57# if !defined(TIMEMORY_CUPTI_API_CALL)
58# define TIMEMORY_CUPTI_API_CALL(...) TIMEMORY_CUPTI_CALL(__VA_ARGS__)
59# endif
60//
61//======================================================================================//
62//
// TIMEMORY_NVPW_API_CALL(apiFuncCall) evaluates the given NVPW call once and stores
// its NVPA_Status. When the status differs from NVPA_STATUS_SUCCESS it prints the
// file, line, stringified call and status value to stderr and terminates the
// process via exit(-1). The definition is skipped when the macro already exists.
63# if !defined(TIMEMORY_NVPW_API_CALL)
64# define TIMEMORY_NVPW_API_CALL(apiFuncCall) \
65 do \
66 { \
67 NVPA_Status _status = apiFuncCall; \
68 if(_status != NVPA_STATUS_SUCCESS) \
69 { \
70 fprintf(stderr, "%s:%d: error: function %s failed with error %d.\n", \
71 __FILE__, __LINE__, #apiFuncCall, _status); \
72 exit(-1); \
73 } \
74 } while(0)
75# endif
76//
77//======================================================================================//
78//
// TIMEMORY_RETURN_IF_NVPW_ERROR(retval, actual) evaluates the NVPW call `actual`
// exactly once. When the result differs from NVPA_STATUS_SUCCESS it prints
// "FAILED: <stringified call>" to stderr and returns `retval` from the enclosing
// function.
//
// `actual` is parenthesized so that an argument containing operators is compared
// as a whole, and the definition is guarded like the other call-checking macros in
// this header so an earlier definition is not redefined.
#    if !defined(TIMEMORY_RETURN_IF_NVPW_ERROR)
#        define TIMEMORY_RETURN_IF_NVPW_ERROR(retval, actual)                           \
            do                                                                           \
            {                                                                            \
                if(NVPA_STATUS_SUCCESS != (actual))                                      \
                {                                                                        \
                    fprintf(stderr, "FAILED: %s\n", #actual);                            \
                    return retval;                                                       \
                }                                                                        \
            } while(0)
#    endif
88//
89//======================================================================================//
90//
/// \class ScopeExit
/// \brief Runs a stored callable exactly once when the owning object goes out of
/// scope.
///
/// The guard is move-only: moving from a guard disarms the source so the callable
/// is not executed a second time when the moved-from object is destroyed. Copying
/// is disabled because two live copies would each run the callable.
///
/// \tparam T Callable type invocable with no arguments
template <typename T>
class ScopeExit
{
public:
    /// Takes ownership of the callable to run on destruction
    ScopeExit(T _t)
    : t(static_cast<T&&>(_t))
    {}

    /// Transfers responsibility for running the callable to the new object
    ScopeExit(ScopeExit&& rhs)
    : t(static_cast<T&&>(rhs.t))
    , m_active(rhs.m_active)
    {
        rhs.m_active = false;
    }

    ScopeExit(const ScopeExit&) = delete;
    ScopeExit& operator=(const ScopeExit&) = delete;
    ScopeExit& operator=(ScopeExit&&) = delete;

    /// Invokes the callable unless this guard has been moved from
    ~ScopeExit()
    {
        if(m_active)
            t();
    }

    T t;

private:
    // false once this guard has been moved from
    bool m_active = true;
};
101//
102//======================================================================================//
103//
104template <typename T>
105ScopeExit<T>
106MoveScopeExit(T t)
107{
108 return ScopeExit<T>(t);
109}
110//
111//======================================================================================//
112//
// NV_ANONYMOUS_VARIABLE_DIRECT pastes `name` and `line` into a single identifier.
// NV_ANONYMOUS_VARIABLE_INDIRECT adds one level of expansion so that an argument
// such as __LINE__ is expanded before the paste happens.
//
// SCOPE_EXIT(func) declares a const ScopeExit guard named EXIT<line> whose callable
// is a by-copy capturing lambda with the body `func;`. `func` is therefore pasted as
// a statement: pass an expression such as `cleanup(x)`.
// NOTE(review): if `func` is itself a lambda expression, the statement only creates
// that lambda and never calls it, so nothing runs at scope exit.
113# define NV_ANONYMOUS_VARIABLE_DIRECT(name, line) name##line
114# define NV_ANONYMOUS_VARIABLE_INDIRECT(name, line) \
115 NV_ANONYMOUS_VARIABLE_DIRECT(name, line)
116# define SCOPE_EXIT(func) \
117 const auto NV_ANONYMOUS_VARIABLE_INDIRECT(EXIT, __LINE__) = \
118 MoveScopeExit([=]() { func; })
119//
120//======================================================================================//
121//
122namespace tim
123{
124namespace component
125{
126//--------------------------------------------------------------------------------------//
127//
128// CUPTI Profiler component
129//
130//--------------------------------------------------------------------------------------//
131/// \struct tim::component::cupti_profiler
132/// \brief Hardware counters via the CUpti profiling API. The profiling API is only
133/// available with newer NVIDIA hardware and CUDA 10+. This component currently has issues
134/// with nesting.
135///
136struct cupti_profiler : public base<cupti_profiler, std::map<std::string, double>>
137{
138protected:
139 struct MetricNameValue;
140
141public:
142 // required aliases
143 using value_type = std::map<std::string, double>;
144 using this_type = cupti_profiler;
145 using base_type = base<this_type, value_type>;
146 using data_type = std::vector<MetricNameValue>;
147
148 // component-specific aliases
149 using size_type = std::size_t;
150 using string_t = std::string;
151
152 static std::string label() { return "cupti_profiler"; }
153 static std::string description() { return "CUpti Profiler API"; }
154
155 static void configure(int device = 0);
156 static void finalize();
157
158 static void global_init() { configure(); }
159
160 static void global_finalize() { finalize(); }
161
162 TIMEMORY_DEFAULT_OBJECT(cupti_profiler)
163
164 value_type record()
165 {
166 auto& chipName = get_persistent_data().chipName;
167 auto& counterDataImage = get_persistent_data().counterDataImage;
168 auto& metricNames = get_persistent_data().metricNames;
169 data_type _data;
170 value_type _tmp;
171 GetMetricGpuValue(chipName, counterDataImage, metricNames, _data);
172 if(settings::verbose() > 0)
173 PRINT_HERE("METRIC_GPU_VALUE size: %li", (long int) _data.size());
174 for(const auto& itr : _data)
175 {
176 auto _prefix = itr.metricName + ".";
177 if(settings::verbose() > 0)
178 PRINT_HERE(" METRIC[%s] size: %li", itr.metricName.c_str(),
179 (long int) itr.rangeNameMetricValueMap.size());
180 for(const auto& vitr : itr.rangeNameMetricValueMap)
181 _tmp[_prefix + vitr.first] = vitr.second;
182 }
183 return _tmp;
184 }
185
186 void start()
187 {
188 auto _count = get_counter()++;
189 if(_count == 0)
190 {
191 // enable();
192 TIMEMORY_CUPTI_API_CALL(cuptiProfilerBeginPass(&beginPassParams));
193 TIMEMORY_CUPTI_API_CALL(cuptiProfilerEnableProfiling(&enableProfilingParams));
194 }
195
196 value = record();
197 TIMEMORY_CUPTI_API_CALL(cuptiProfilerPushRange(&pushRangeParams));
198 }
199
200 void stop()
201 {
202 using namespace tim::component::operators;
203 TIMEMORY_CUPTI_API_CALL(cuptiProfilerPopRange(&popRangeParams));
204 auto _count = --get_counter();
205 if(_count == 0)
206 {
207 TIMEMORY_CUPTI_API_CALL(
208 cuptiProfilerDisableProfiling(&disableProfilingParams));
209 TIMEMORY_CUPTI_API_CALL(cuptiProfilerEndPass(&endPassParams));
210 // disable();
211 }
212
213 cuda::stream_sync(0);
214 cuda::device_sync();
215
216 TIMEMORY_CUPTI_API_CALL(cuptiProfilerFlushCounterData(&flushCounterDataParams));
217
218 auto _tmp = record();
219 for(auto& itr : _tmp)
220 {
221 auto& vitr = value[itr.first];
222 vitr = (itr.second - vitr);
223 accum[itr.first] += vitr;
224 }
225 }
226
227 void set_prefix(const std::string& _prefix)
228 {
229 pushRangeParams.pRangeName = _prefix.c_str();
230 }
231
232 std::vector<double> get() const
233 {
234 std::vector<double> data;
235 for(const auto& itr : accum)
236 data.emplace_back(itr.second);
237 return data;
238 }
239
240 string_t get_display() const
241 {
242 auto _get_display = [&](std::ostream& os, const auto& obj) {
243 auto _label = obj.first;
244 auto _prec = base_type::get_precision();
245 auto _width = base_type::get_width();
246 auto _flags = base_type::get_format_flags();
247
248 std::stringstream ssv, ssi;
249 ssv.setf(_flags);
250 ssv << std::setw(_width) << std::setprecision(_prec) << obj.second;
251 if(!_label.empty())
252 ssi << " " << _label;
253 os << ssv.str() << ssi.str();
254 };
255
256 const auto& _data = load();
257 std::stringstream ss;
258 for(size_type i = 0; i < _data.size(); ++i)
259 {
260 auto itr = _data.begin();
261 std::advance(itr, i);
262 _get_display(ss, *itr);
263 if(i + 1 < _data.size())
264 ss << ", ";
265 }
266 return ss.str();
267 }
268
269 static std::vector<string_t> label_array()
270 {
271 auto ret = get_persistent_data().metricNames;
272 std::sort(ret.begin(), ret.end());
273 return ret;
274 }
275
276 static std::vector<string_t> description_array() { return label_array(); }
277
278 static std::vector<string_t> display_unit_array()
279 {
280 return std::vector<string_t>(get_persistent_data().metricNames.size(), "");
281 }
282
283 static std::vector<int64_t> unit_array()
284 {
285 return std::vector<int64_t>(get_persistent_data().metricNames.size(), 1);
286 }
287
288 template <typename Archive>
289 void serialize(Archive& ar, const unsigned int)
290 {
291 auto _get = [&](const value_type& _data) {
292 std::vector<double> values;
293 for(auto itr : _data)
294 values.push_back(itr.second);
295 return values;
296 };
297 std::vector<double> _disp = _get(accum);
298 std::vector<double> _value = _get(value);
299 std::vector<double> _accum = _get(accum);
300 ar(cereal::make_nvp("laps", laps), cereal::make_nvp("repr_data", _disp),
301 cereal::make_nvp("value", _value), cereal::make_nvp("accum", _accum),
302 cereal::make_nvp("display", _disp));
303 // ar(cereal::make_nvp("units", unit_array()),
304 // cereal::make_nvp("display_units", display_unit_array()));
305 }
306
307public:
308 this_type& operator+=(const this_type& rhs)
309 {
310 for(const auto& itr : rhs.value)
311 {
312 value[itr.first] += itr.second;
313 }
314
315 for(const auto& itr : rhs.accum)
316 {
317 accum[itr.first] += itr.second;
318 }
319
320 return *this;
321 }
322
323 this_type& operator-=(const this_type& rhs)
324 {
325 for(const auto& itr : rhs.value)
326 {
327 if(value.find(itr.first) != value.end())
328 value[itr.first] -= itr.second;
329 }
330
331 for(const auto& itr : rhs.accum)
332 {
333 if(accum.find(itr.first) != accum.end())
334 accum[itr.first] -= itr.second;
335 }
336
337 return *this;
338 }
339
340public:
341 static bool WriteBinaryFile(const char* pFileName, const std::vector<uint8_t>& data);
342 static bool ReadBinaryFile(const char* pFileName, std::vector<uint8_t>& image);
343
344 static std::set<std::string> ListSupportedChips();
345 static std::set<std::string> ListMetrics(const char* chipName, bool listSubMetrics);
346 static std::string GetHwUnit(const std::string& metricName);
347
348 static bool GetMetricGpuValue(std::string chipName,
349 std::vector<uint8_t> counterDataImage,
350 std::vector<std::string> metricNames,
351 std::vector<MetricNameValue>& metricNameValueMap);
352
353 static bool PrintMetricValues(std::string chipName,
354 std::vector<uint8_t> counterDataImage,
355 std::vector<std::string> metricNames);
356
357protected:
358 static bool create_counter_data_image(std::vector<uint8_t>& counterDataImage,
359 std::vector<uint8_t>& counterDataScratchBuffer,
360 std::vector<uint8_t>& counterDataImagePrefix);
361
362 static bool enable();
363 static bool disable();
364
365 static bool GetConfigImage(std::string chipName, std::vector<std::string> metricNames,
366 std::vector<uint8_t>& configImage);
367
368 static bool GetCounterDataPrefixImage(std::string chipName,
369 std::vector<std::string> metricNames,
370 std::vector<uint8_t>& counterDataImagePrefix);
371
372 static bool GetRawMetricRequests(
373 NVPA_MetricsContext* pMetricsContext, std::vector<std::string> metricNames,
374 std::vector<NVPA_RawMetricRequest>& rawMetricRequests,
375 std::vector<std::string>& temp);
376
377 static bool ParseMetricNameString(const std::string& metricName, std::string* reqName,
378 bool* isolated, bool* keepInstances);
379
380protected:
381 std::string* m_prefix = nullptr;
382
383 CUpti_Profiler_PushRange_Params pushRangeParams = {
384 CUpti_Profiler_PushRange_Params_STRUCT_SIZE
385 };
386 CUpti_Profiler_PopRange_Params popRangeParams = {
387 CUpti_Profiler_PopRange_Params_STRUCT_SIZE
388 };
389 CUpti_Profiler_BeginPass_Params beginPassParams = {
390 CUpti_Profiler_BeginPass_Params_STRUCT_SIZE
391 };
392 CUpti_Profiler_EndPass_Params endPassParams = {
393 CUpti_Profiler_EndPass_Params_STRUCT_SIZE
394 };
395 CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = {
396 CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE
397 };
398 CUpti_Profiler_EnableProfiling_Params enableProfilingParams = {
399 CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE
400 };
401 CUpti_Profiler_DisableProfiling_Params disableProfilingParams = {
402 CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE
403 };
404
405protected:
406 struct MetricNameValue
407 {
408 std::string metricName;
409 int numRanges;
410 std::vector<std::pair<std::string, double>> rangeNameMetricValueMap;
411 };
412
413 struct persistent_data
414 {
415 std::atomic<int64_t> instCounter;
416 CUdevice cuDevice;
417 CUcontext cuContext;
418 bool enabled = false;
419 int deviceCount = 1;
420 int deviceNum = 0;
421 int numRanges = 64;
422 int computeCapabilityMajor = 0;
423 int computeCapabilityMinor = 0;
424 CUpti_ProfilerReplayMode profilerReplayMode = CUPTI_UserReplay;
425 CUpti_ProfilerRange profilerRange = CUPTI_UserRange;
426 std::string chipName = "";
427 std::string CounterDataFileName = "SimpleCupti.counterdata";
428 std::string CounterDataSBFileName = "SimpleCupti.counterdataSB";
429 std::vector<uint8_t> counterDataImagePrefix;
430 std::vector<uint8_t> configImage;
431 std::vector<uint8_t> counterDataImage;
432 std::vector<uint8_t> counterDataScratchBuffer;
433 std::vector<std::string> metricNames;
434 };
435
436 static persistent_data& get_persistent_data()
437 {
438 static persistent_data _instance;
439 return _instance;
440 }
441
442 static std::atomic<int64_t>& get_counter()
443 {
444 return get_persistent_data().instCounter;
445 }
446};
447//
448//--------------------------------------------------------------------------------------//
449//
450inline bool
451cupti_profiler::create_counter_data_image(std::vector<uint8_t>& counterDataImage,
452 std::vector<uint8_t>& counterDataScratchBuffer,
453 std::vector<uint8_t>& counterDataImagePrefix)
454{
455 auto& numRanges = get_persistent_data().numRanges;
456
457 CUpti_Profiler_CounterDataImageOptions counterDataImageOptions;
458 counterDataImageOptions.pCounterDataPrefix = &counterDataImagePrefix[0];
459 counterDataImageOptions.counterDataPrefixSize = counterDataImagePrefix.size();
460 counterDataImageOptions.maxNumRanges = numRanges;
461 counterDataImageOptions.maxNumRangeTreeNodes = numRanges;
462 counterDataImageOptions.maxRangeNameLength = 64;
463
464 CUpti_Profiler_CounterDataImage_CalculateSize_Params calculateSizeParams = {
465 CUpti_Profiler_CounterDataImage_CalculateSize_Params_STRUCT_SIZE
466 };
467
468 calculateSizeParams.pOptions = &counterDataImageOptions;
469 calculateSizeParams.sizeofCounterDataImageOptions =
470 CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
471
472 TIMEMORY_CUPTI_API_CALL(
473 cuptiProfilerCounterDataImageCalculateSize(&calculateSizeParams));
474
475 CUpti_Profiler_CounterDataImage_Initialize_Params initializeParams = {
476 CUpti_Profiler_CounterDataImage_Initialize_Params_STRUCT_SIZE
477 };
478
479 initializeParams.sizeofCounterDataImageOptions =
480 CUpti_Profiler_CounterDataImageOptions_STRUCT_SIZE;
481 initializeParams.pOptions = &counterDataImageOptions;
482 initializeParams.counterDataImageSize = calculateSizeParams.counterDataImageSize;
483
484 counterDataImage.resize(calculateSizeParams.counterDataImageSize);
485 initializeParams.pCounterDataImage = &counterDataImage[0];
486
487 TIMEMORY_CUPTI_API_CALL(cuptiProfilerCounterDataImageInitialize(&initializeParams));
488
489 CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params
490 scratchBufferSizeParams = {
491 CUpti_Profiler_CounterDataImage_CalculateScratchBufferSize_Params_STRUCT_SIZE
492 };
493
494 scratchBufferSizeParams.counterDataImageSize =
495 calculateSizeParams.counterDataImageSize;
496 scratchBufferSizeParams.pCounterDataImage = initializeParams.pCounterDataImage;
497
498 TIMEMORY_CUPTI_API_CALL(cuptiProfilerCounterDataImageCalculateScratchBufferSize(
499 &scratchBufferSizeParams));
500
501 counterDataScratchBuffer.resize(scratchBufferSizeParams.counterDataScratchBufferSize);
502
503 CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params
504 initScratchBufferParams = {
505 CUpti_Profiler_CounterDataImage_InitializeScratchBuffer_Params_STRUCT_SIZE
506 };
507
508 initScratchBufferParams.counterDataImageSize =
509 calculateSizeParams.counterDataImageSize;
510 initScratchBufferParams.pCounterDataImage = initializeParams.pCounterDataImage;
511 initScratchBufferParams.counterDataScratchBufferSize =
512 scratchBufferSizeParams.counterDataScratchBufferSize;
513 initScratchBufferParams.pCounterDataScratchBuffer = &counterDataScratchBuffer[0];
514
515 TIMEMORY_CUPTI_API_CALL(
516 cuptiProfilerCounterDataImageInitializeScratchBuffer(&initScratchBufferParams));
517
518 return true;
519}
520//
521//--------------------------------------------------------------------------------------//
522//
// Sets up the shared profiler state for CUDA device `device`.
//
// Steps performed, in order: query the device count and the compute capability of
// the selected device, initialize the CUPTI profiler, look up the chip name,
// initialize the NVPW host library, build the config image and the counter data
// prefix image for the selected metrics, create the counter data image and scratch
// buffer, create a CUDA context, begin a profiler session and apply the config.
//
// The function returns early, without setting the `enabled` flag, when no device
// is found, when the compute capability major version is below 7, when no metric
// names are available, or when one of the images cannot be created. `enabled` is
// set to true only after every step has completed.
523inline void
524cupti_profiler::configure(int device)
525{
526 auto& cuDevice = get_persistent_data().cuDevice;
527 auto& metricNames = get_persistent_data().metricNames;
528 auto& counterDataImagePrefix = get_persistent_data().counterDataImagePrefix;
529 auto& configImage = get_persistent_data().configImage;
530 auto& counterDataImage = get_persistent_data().counterDataImage;
531 auto& counterDataScratchBuffer = get_persistent_data().counterDataScratchBuffer;
532 auto& profilerReplayMode = get_persistent_data().profilerReplayMode;
533 auto& profilerRange = get_persistent_data().profilerRange;
534 auto& deviceCount = get_persistent_data().deviceCount;
535 auto& deviceNum = get_persistent_data().deviceNum;
536 auto& computeCapabilityMajor = get_persistent_data().computeCapabilityMajor;
537 auto& computeCapabilityMinor = get_persistent_data().computeCapabilityMinor;
538 auto& chipName = get_persistent_data().chipName;
539
540 // printf("Usage: %s [device_num] [metric_names comma separated]\n", argv[0]);
541
// NOTE(review): this listing skips source line 542 here; confirm against the header.
543 TIMEMORY_CUDA_DRIVER_API_CALL(cuDeviceGetCount(&deviceCount));
544
545 if(deviceCount == 0)
546 {
547 fprintf(stderr, "There is no device supporting CUDA.\n");
548 return;
549 }
550
551 deviceNum = device;
552 printf("CUDA Device Number: %d\n", deviceNum);
553
554 TIMEMORY_CUDA_DRIVER_API_CALL(cuDeviceGet(&cuDevice, deviceNum));
555 TIMEMORY_CUDA_DRIVER_API_CALL(cuDeviceGetAttribute(
556 &computeCapabilityMajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cuDevice));
557 TIMEMORY_CUDA_DRIVER_API_CALL(cuDeviceGetAttribute(
558 &computeCapabilityMinor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cuDevice));
559
560 printf("Compute Capability of Device: %d.%d\n", computeCapabilityMajor,
561 computeCapabilityMinor);
562
// devices with a compute capability major version below 7 are rejected
563 if(computeCapabilityMajor < 7)
564 {
565 printf("Sample unsupported on Device with compute capability < 7.0\n");
566 return;
567 }
568
569 // Get the names of the metrics to collect
// NOTE(review): the right-hand side of this assignment (source line 571) is absent
// from this listing; confirm the expression against the header.
570 metricNames =
572
573 CUpti_Profiler_Initialize_Params profilerInitializeParams = {
574 CUpti_Profiler_Initialize_Params_STRUCT_SIZE
575 };
576 TIMEMORY_CUPTI_API_CALL(cuptiProfilerInitialize(&profilerInitializeParams));
577
578 /* Get chip name for the cuda device */
579 CUpti_Device_GetChipName_Params getChipNameParams = {
580 CUpti_Device_GetChipName_Params_STRUCT_SIZE
581 };
582
583 getChipNameParams.deviceIndex = deviceNum;
584
585 TIMEMORY_CUPTI_API_CALL(cuptiDeviceGetChipName(&getChipNameParams));
586 chipName = getChipNameParams.pChipName;
587
588 /* Generate configuration for metrics, this can also be done offline*/
589 NVPW_InitializeHost_Params initializeHostParams = {
590 NVPW_InitializeHost_Params_STRUCT_SIZE
591 };
592
593 TIMEMORY_NVPW_API_CALL(NVPW_InitializeHost(&initializeHostParams));
594
// both images are derived from the chip name and the metric names
595 if(metricNames.size())
596 {
597 if(!GetConfigImage(chipName, metricNames, configImage))
598 {
599 std::cerr << "Failed to create configImage" << std::endl;
600 return;
601 }
602 if(!GetCounterDataPrefixImage(chipName, metricNames, counterDataImagePrefix))
603 {
604 std::cerr << "Failed to create counterDataImagePrefix" << std::endl;
605 return;
606 }
607 }
608 else
609 {
610 std::cerr << "No metrics provided to profile" << std::endl;
611 return;
612 }
613
614 if(!create_counter_data_image(counterDataImage, counterDataScratchBuffer,
615 counterDataImagePrefix))
616 {
617 std::cerr << "Failed to create counterDataImage" << std::endl;
618 return;
619 }
620
621 auto& enabled = get_persistent_data().enabled;
622 auto& numRanges = get_persistent_data().numRanges;
623 auto& cuContext = get_persistent_data().cuContext;
624
625 CUpti_Profiler_BeginSession_Params beginSessionParams = {
626 CUpti_Profiler_BeginSession_Params_STRUCT_SIZE
627 };
628
629 CUpti_Profiler_SetConfig_Params setConfigParams = {
630 CUpti_Profiler_SetConfig_Params_STRUCT_SIZE
631 };
632
// the session below is bound to this newly created context
633 TIMEMORY_CUDA_DRIVER_API_CALL(cuCtxCreate(&cuContext, 0, cuDevice));
634
635 beginSessionParams.ctx = cuContext;
636 beginSessionParams.counterDataImageSize = counterDataImage.size();
637 beginSessionParams.pCounterDataImage = &counterDataImage[0];
638 beginSessionParams.counterDataScratchBufferSize = counterDataScratchBuffer.size();
639 beginSessionParams.pCounterDataScratchBuffer = &counterDataScratchBuffer[0];
640 beginSessionParams.range = profilerRange;
641 beginSessionParams.replayMode = profilerReplayMode;
642 beginSessionParams.maxRangesPerPass = numRanges;
643 beginSessionParams.maxLaunchesPerPass = numRanges;
644
645 setConfigParams.pConfig = &configImage[0];
646 setConfigParams.configSize = configImage.size();
647 setConfigParams.passIndex = 0;
648 setConfigParams.minNestingLevel = 1;
649 setConfigParams.numNestingLevels = 1;
650
651 TIMEMORY_CUPTI_API_CALL(cuptiProfilerBeginSession(&beginSessionParams));
652 TIMEMORY_CUPTI_API_CALL(cuptiProfilerSetConfig(&setConfigParams));
653
// reached only when every step above completed without an early return
654 enabled = true;
655}
656//
657//--------------------------------------------------------------------------------------//
658//
659inline bool
660cupti_profiler::enable()
661{
662 auto& enabled = get_persistent_data().enabled;
663 if(!enabled)
664 configure();
665
666 auto& cuContext = get_persistent_data().cuContext;
667 auto& numRanges = get_persistent_data().numRanges;
668 auto& cuDevice = get_persistent_data().cuDevice;
669 auto& metricNames = get_persistent_data().metricNames;
670 auto& counterDataImagePrefix = get_persistent_data().counterDataImagePrefix;
671 auto& configImage = get_persistent_data().configImage;
672 auto& counterDataImage = get_persistent_data().counterDataImage;
673 auto& counterDataScratchBuffer = get_persistent_data().counterDataScratchBuffer;
674 auto& profilerReplayMode = get_persistent_data().profilerReplayMode;
675 auto& profilerRange = get_persistent_data().profilerRange;
676 auto& deviceCount = get_persistent_data().deviceCount;
677 auto& deviceNum = get_persistent_data().deviceNum;
678 auto& computeCapabilityMajor = get_persistent_data().computeCapabilityMajor;
679 auto& computeCapabilityMinor = get_persistent_data().computeCapabilityMinor;
680 auto& chipName = get_persistent_data().chipName;
681
682 CUpti_Profiler_EnableProfiling_Params enableProfilingParams = {
683 CUpti_Profiler_EnableProfiling_Params_STRUCT_SIZE
684 };
685
686 CUpti_Profiler_BeginPass_Params beginPassParams = {
687 CUpti_Profiler_BeginPass_Params_STRUCT_SIZE
688 };
689
690 TIMEMORY_CUPTI_API_CALL(cuptiProfilerBeginPass(&beginPassParams));
691 TIMEMORY_CUPTI_API_CALL(cuptiProfilerEnableProfiling(&enableProfilingParams));
692
693 return true;
694}
695//
696//--------------------------------------------------------------------------------------//
697//
// Tears down the profiler session and reports the collected data.
//
// In order: unsets the profiler config, ends the session, destroys the CUDA
// context stored in the persistent data, de-initializes the CUPTI profiler, writes
// the counter data image and the scratch buffer to the files named by
// CounterDataFileName / CounterDataSBFileName, and finally evaluates and prints the
// metric values.
// NOTE(review): the declarator line (source line 699) is absent from this listing;
// the class declares `static void finalize()`, which presumably belongs here.
698inline void
700{
701 auto& chipName = get_persistent_data().chipName;
702 auto& counterDataImage = get_persistent_data().counterDataImage;
703 auto& counterDataScratchBuffer = get_persistent_data().counterDataScratchBuffer;
704 auto& CounterDataFileName = get_persistent_data().CounterDataFileName;
705 auto& CounterDataSBFileName = get_persistent_data().CounterDataSBFileName;
706 auto& metricNames = get_persistent_data().metricNames;
707 auto& cuContext = get_persistent_data().cuContext;
708
709 CUpti_Profiler_UnsetConfig_Params unsetConfigParams = {
710 CUpti_Profiler_UnsetConfig_Params_STRUCT_SIZE
711 };
712
713 CUpti_Profiler_EndSession_Params endSessionParams = {
714 CUpti_Profiler_EndSession_Params_STRUCT_SIZE
715 };
716
717 CUpti_Profiler_DeInitialize_Params profilerDeInitializeParams = {
718 CUpti_Profiler_DeInitialize_Params_STRUCT_SIZE
719 };
720
// the session is closed before the context it was started on is destroyed
721 TIMEMORY_CUPTI_API_CALL(cuptiProfilerUnsetConfig(&unsetConfigParams));
722 TIMEMORY_CUPTI_API_CALL(cuptiProfilerEndSession(&endSessionParams));
723 TIMEMORY_CUDA_DRIVER_API_CALL(cuCtxDestroy(cuContext));
724 TIMEMORY_CUPTI_API_CALL(cuptiProfilerDeInitialize(&profilerDeInitializeParams));
725
726 /* Dump counterDataImage in file */
// the boolean results of both writes are not inspected
727 WriteBinaryFile(CounterDataFileName.c_str(), counterDataImage);
728 WriteBinaryFile(CounterDataSBFileName.c_str(), counterDataScratchBuffer);
729
730 /* Evaluation of metrics collected in counterDataImage, this can also be done
731 * offline*/
732 PrintMetricValues(chipName, counterDataImage, metricNames);
733}
734//
735//--------------------------------------------------------------------------------------//
736//
737inline bool
738cupti_profiler::disable()
739{
740 auto& enabled = get_persistent_data().enabled;
741 auto& cuContext = get_persistent_data().cuContext;
742
743 if(!enabled)
744 return false;
745
746 CUpti_Profiler_DisableProfiling_Params disableProfilingParams = {
747 CUpti_Profiler_DisableProfiling_Params_STRUCT_SIZE
748 };
749
750 CUpti_Profiler_EndPass_Params endPassParams = {
751 CUpti_Profiler_EndPass_Params_STRUCT_SIZE
752 };
753
754 CUpti_Profiler_FlushCounterData_Params flushCounterDataParams = {
755 CUpti_Profiler_FlushCounterData_Params_STRUCT_SIZE
756 };
757
758 TIMEMORY_CUPTI_API_CALL(cuptiProfilerDisableProfiling(&disableProfilingParams));
759 TIMEMORY_CUPTI_API_CALL(cuptiProfilerEndPass(&endPassParams));
760 TIMEMORY_CUPTI_API_CALL(cuptiProfilerFlushCounterData(&flushCounterDataParams));
761
762 return true;
763}
764//
765//--------------------------------------------------------------------------------------//
766//
767inline std::string
768cupti_profiler::GetHwUnit(const std::string& metricName)
769{
770 return metricName.substr(0, metricName.find("__", 0));
771}
772//
773//--------------------------------------------------------------------------------------//
774//
775inline bool
776cupti_profiler::GetMetricGpuValue(std::string chipName,
777 std::vector<uint8_t> counterDataImage,
778 std::vector<std::string> metricNames,
779 std::vector<MetricNameValue>& metricNameValueMap)
780{
781 if(!counterDataImage.size())
782 {
783 std::cout << "Counter Data Image is empty!\n";
784 return false;
785 }
786
787 NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = {
788 NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE
789 };
790
791 metricsContextCreateParams.pChipName = chipName.c_str();
792 TIMEMORY_RETURN_IF_NVPW_ERROR(
793 false, NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams));
794
795 NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = {
796 NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE
797 };
798
799 metricsContextDestroyParams.pMetricsContext =
800 metricsContextCreateParams.pMetricsContext;
801
802 SCOPE_EXIT([&]() {
803 NVPW_MetricsContext_Destroy(
804 (NVPW_MetricsContext_Destroy_Params*) &metricsContextDestroyParams);
805 });
806
807 NVPW_CounterData_GetNumRanges_Params getNumRangesParams = {
808 NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE
809 };
810
811 getNumRangesParams.pCounterDataImage = &counterDataImage[0];
812 TIMEMORY_RETURN_IF_NVPW_ERROR(false,
813 NVPW_CounterData_GetNumRanges(&getNumRangesParams));
814
815 std::vector<std::string> reqName;
816 reqName.resize(metricNames.size());
817
818 bool isolated = true;
819 bool keepInstances = true;
820 std::vector<const char*> metricNamePtrs;
821 metricNameValueMap.resize(metricNames.size());
822
823 for(size_t metricIndex = 0; metricIndex < metricNames.size(); ++metricIndex)
824 {
825 ParseMetricNameString(metricNames[metricIndex], &reqName[metricIndex], &isolated,
826 &keepInstances);
827 metricNamePtrs.push_back(reqName[metricIndex].c_str());
828 metricNameValueMap[metricIndex].metricName = metricNames[metricIndex];
829 metricNameValueMap[metricIndex].numRanges = getNumRangesParams.numRanges;
830 }
831
832 for(size_t rangeIndex = 0; rangeIndex < getNumRangesParams.numRanges; ++rangeIndex)
833 {
834 std::vector<const char*> descriptionPtrs;
835
836 NVPW_Profiler_CounterData_GetRangeDescriptions_Params getRangeDescParams = {
837 NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE
838 };
839 getRangeDescParams.pCounterDataImage = &counterDataImage[0];
840 getRangeDescParams.rangeIndex = rangeIndex;
841 TIMEMORY_RETURN_IF_NVPW_ERROR(
842 false, NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams));
843 descriptionPtrs.resize(getRangeDescParams.numDescriptions);
844
845 getRangeDescParams.ppDescriptions = &descriptionPtrs[0];
846 TIMEMORY_RETURN_IF_NVPW_ERROR(
847 false, NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams));
848
849 std::string rangeName;
850 for(size_t descriptionIndex = 0;
851 descriptionIndex < getRangeDescParams.numDescriptions; ++descriptionIndex)
852 {
853 if(descriptionIndex)
854 {
855 rangeName += "/";
856 }
857 rangeName += descriptionPtrs[descriptionIndex];
858 }
859
860 std::vector<double> gpuValues;
861 gpuValues.resize(metricNames.size());
862 NVPW_MetricsContext_SetCounterData_Params setCounterDataParams = {
863 NVPW_MetricsContext_SetCounterData_Params_STRUCT_SIZE
864 };
865 setCounterDataParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
866 setCounterDataParams.pCounterDataImage = &counterDataImage[0];
867 setCounterDataParams.isolated = true;
868 setCounterDataParams.rangeIndex = rangeIndex;
869 NVPW_MetricsContext_SetCounterData(&setCounterDataParams);
870
871 NVPW_MetricsContext_EvaluateToGpuValues_Params evalToGpuParams = {
872 NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE
873 };
874 evalToGpuParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
875 evalToGpuParams.numMetrics = metricNamePtrs.size();
876 evalToGpuParams.ppMetricNames = &metricNamePtrs[0];
877 evalToGpuParams.pMetricValues = &gpuValues[0];
878 NVPW_MetricsContext_EvaluateToGpuValues(&evalToGpuParams);
879 for(size_t metricIndex = 0; metricIndex < metricNames.size(); ++metricIndex)
880 {
881 metricNameValueMap[metricIndex].rangeNameMetricValueMap.push_back(
882 std::make_pair(rangeName, gpuValues[metricIndex]));
883 }
884 }
885
886 return true;
887}
888//
889//--------------------------------------------------------------------------------------//
890//
891inline bool
892cupti_profiler::PrintMetricValues(std::string chipName,
893 std::vector<uint8_t> counterDataImage,
894 std::vector<std::string> metricNames)
895{
896 if(!counterDataImage.size())
897 {
898 std::cout << "Counter Data Image is empty!\n";
899 return false;
900 }
901
902 NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = {
903 NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE
904 };
905 metricsContextCreateParams.pChipName = chipName.c_str();
906 TIMEMORY_RETURN_IF_NVPW_ERROR(
907 false, NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams));
908
909 NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = {
910 NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE
911 };
912 metricsContextDestroyParams.pMetricsContext =
913 metricsContextCreateParams.pMetricsContext;
914 SCOPE_EXIT([&]() {
915 NVPW_MetricsContext_Destroy(
916 (NVPW_MetricsContext_Destroy_Params*) &metricsContextDestroyParams);
917 });
918
919 NVPW_CounterData_GetNumRanges_Params getNumRangesParams = {
920 NVPW_CounterData_GetNumRanges_Params_STRUCT_SIZE
921 };
922 getNumRangesParams.pCounterDataImage = &counterDataImage[0];
923 TIMEMORY_RETURN_IF_NVPW_ERROR(false,
924 NVPW_CounterData_GetNumRanges(&getNumRangesParams));
925
926 std::vector<std::string> reqName;
927 reqName.resize(metricNames.size());
928 bool isolated = true;
929 bool keepInstances = true;
930 std::vector<const char*> metricNamePtrs;
931 for(size_t metricIndex = 0; metricIndex < metricNames.size(); ++metricIndex)
932 {
933 ParseMetricNameString(metricNames[metricIndex], &reqName[metricIndex], &isolated,
934 &keepInstances);
935 metricNamePtrs.push_back(reqName[metricIndex].c_str());
936 }
937
938 for(size_t rangeIndex = 0; rangeIndex < getNumRangesParams.numRanges; ++rangeIndex)
939 {
940 std::vector<const char*> descriptionPtrs;
941
942 NVPW_Profiler_CounterData_GetRangeDescriptions_Params getRangeDescParams = {
943 NVPW_Profiler_CounterData_GetRangeDescriptions_Params_STRUCT_SIZE
944 };
945 getRangeDescParams.pCounterDataImage = &counterDataImage[0];
946 getRangeDescParams.rangeIndex = rangeIndex;
947 TIMEMORY_RETURN_IF_NVPW_ERROR(
948 false, NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams));
949
950 descriptionPtrs.resize(getRangeDescParams.numDescriptions);
951
952 getRangeDescParams.ppDescriptions = &descriptionPtrs[0];
953 TIMEMORY_RETURN_IF_NVPW_ERROR(
954 false, NVPW_Profiler_CounterData_GetRangeDescriptions(&getRangeDescParams));
955
956 std::string rangeName;
957 for(size_t descriptionIndex = 0;
958 descriptionIndex < getRangeDescParams.numDescriptions; ++descriptionIndex)
959 {
960 if(descriptionIndex)
961 {
962 rangeName += "/";
963 }
964 rangeName += descriptionPtrs[descriptionIndex];
965 }
966
967 std::vector<double> gpuValues;
968 gpuValues.resize(metricNames.size());
969
970 NVPW_MetricsContext_SetCounterData_Params setCounterDataParams = {
971 NVPW_MetricsContext_SetCounterData_Params_STRUCT_SIZE
972 };
973 setCounterDataParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
974 setCounterDataParams.pCounterDataImage = &counterDataImage[0];
975 setCounterDataParams.isolated = true;
976 setCounterDataParams.rangeIndex = rangeIndex;
977 NVPW_MetricsContext_SetCounterData(&setCounterDataParams);
978
979 NVPW_MetricsContext_EvaluateToGpuValues_Params evalToGpuParams = {
980 NVPW_MetricsContext_EvaluateToGpuValues_Params_STRUCT_SIZE
981 };
982 evalToGpuParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
983 evalToGpuParams.numMetrics = metricNamePtrs.size();
984 evalToGpuParams.ppMetricNames = &metricNamePtrs[0];
985 evalToGpuParams.pMetricValues = &gpuValues[0];
986 NVPW_MetricsContext_EvaluateToGpuValues(&evalToGpuParams);
987
988 for(size_t metricIndex = 0; metricIndex < metricNames.size(); ++metricIndex)
989 {
990 std::cout << "rangeName: " << rangeName
991 << "\tmetricName: " << metricNames[metricIndex]
992 << "\tgpuValue: " << gpuValues[metricIndex] << std::endl;
993 }
994 }
995 return true;
996}
997//
998//--------------------------------------------------------------------------------------//
999//
1000inline bool
1001cupti_profiler::GetRawMetricRequests(
1002 NVPA_MetricsContext* pMetricsContext, std::vector<std::string> metricNames,
1003 std::vector<NVPA_RawMetricRequest>& rawMetricRequests, std::vector<std::string>& temp)
1004{
1005 std::string reqName;
1006 bool isolated = true;
1007 bool keepInstances = true;
1008
1009 for(auto& metricName : metricNames)
1010 {
1011 ParseMetricNameString(metricName, &reqName, &isolated, &keepInstances);
1012 /* Bug in collection with collection of metrics without instances, keep it to
1013 * true*/
1014 keepInstances = true;
1015 NVPW_MetricsContext_GetMetricProperties_Begin_Params
1016 getMetricPropertiesBeginParams = {
1017 NVPW_MetricsContext_GetMetricProperties_Begin_Params_STRUCT_SIZE
1018 };
1019 getMetricPropertiesBeginParams.pMetricsContext = pMetricsContext;
1020 getMetricPropertiesBeginParams.pMetricName = reqName.c_str();
1021
1022 TIMEMORY_RETURN_IF_NVPW_ERROR(false,
1023 NVPW_MetricsContext_GetMetricProperties_Begin(
1024 &getMetricPropertiesBeginParams));
1025
1026 for(const char** ppMetricDependencies =
1027 getMetricPropertiesBeginParams.ppRawMetricDependencies;
1028 *ppMetricDependencies; ++ppMetricDependencies)
1029 {
1030 temp.push_back(*ppMetricDependencies);
1031 }
1032 NVPW_MetricsContext_GetMetricProperties_End_Params
1033 getMetricPropertiesEndParams = {
1034 NVPW_MetricsContext_GetMetricProperties_End_Params_STRUCT_SIZE
1035 };
1036 getMetricPropertiesEndParams.pMetricsContext = pMetricsContext;
1037 TIMEMORY_RETURN_IF_NVPW_ERROR(false, NVPW_MetricsContext_GetMetricProperties_End(
1038 &getMetricPropertiesEndParams));
1039 }
1040
1041 consume_parameters(isolated);
1042
1043 for(auto& rawMetricName : temp)
1044 {
1045 NVPA_RawMetricRequest metricRequest = { NVPA_RAW_METRIC_REQUEST_STRUCT_SIZE };
1046 metricRequest.pMetricName = rawMetricName.c_str();
1047 metricRequest.isolated = isolated;
1048 metricRequest.keepInstances = keepInstances;
1049 rawMetricRequests.push_back(metricRequest);
1050 }
1051
1052 return true;
1053}
1054//
1055//--------------------------------------------------------------------------------------//
1056//
1057inline bool
1058cupti_profiler::GetConfigImage(std::string chipName, std::vector<std::string> metricNames,
1059 std::vector<uint8_t>& configImage)
1060{
1061 NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = {
1062 NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE
1063 };
1064 metricsContextCreateParams.pChipName = chipName.c_str();
1065 TIMEMORY_RETURN_IF_NVPW_ERROR(
1066 false, NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams));
1067
1068 NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = {
1069 NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE
1070 };
1071 metricsContextDestroyParams.pMetricsContext =
1072 metricsContextCreateParams.pMetricsContext;
1073 SCOPE_EXIT([&]() {
1074 NVPW_MetricsContext_Destroy(
1075 (NVPW_MetricsContext_Destroy_Params*) &metricsContextDestroyParams);
1076 });
1077
1078 std::vector<NVPA_RawMetricRequest> rawMetricRequests;
1079 std::vector<std::string> temp;
1080 GetRawMetricRequests(metricsContextCreateParams.pMetricsContext, metricNames,
1081 rawMetricRequests, temp);
1082
1083 NVPA_RawMetricsConfigOptions metricsConfigOptions = {
1084 NVPA_RAW_METRICS_CONFIG_OPTIONS_STRUCT_SIZE
1085 };
1086 metricsConfigOptions.activityKind = NVPA_ACTIVITY_KIND_PROFILER;
1087 metricsConfigOptions.pChipName = chipName.c_str();
1088 NVPA_RawMetricsConfig* pRawMetricsConfig;
1089 TIMEMORY_RETURN_IF_NVPW_ERROR(
1090 false, NVPA_RawMetricsConfig_Create(&metricsConfigOptions, &pRawMetricsConfig));
1091
1092 NVPW_RawMetricsConfig_Destroy_Params rawMetricsConfigDestroyParams = {
1093 NVPW_RawMetricsConfig_Destroy_Params_STRUCT_SIZE
1094 };
1095 rawMetricsConfigDestroyParams.pRawMetricsConfig = pRawMetricsConfig;
1096 SCOPE_EXIT([&]() {
1097 NVPW_RawMetricsConfig_Destroy(
1098 (NVPW_RawMetricsConfig_Destroy_Params*) &rawMetricsConfigDestroyParams);
1099 });
1100
1101 NVPW_RawMetricsConfig_BeginPassGroup_Params beginPassGroupParams = {
1102 NVPW_RawMetricsConfig_BeginPassGroup_Params_STRUCT_SIZE
1103 };
1104 beginPassGroupParams.pRawMetricsConfig = pRawMetricsConfig;
1105 TIMEMORY_RETURN_IF_NVPW_ERROR(
1106 false, NVPW_RawMetricsConfig_BeginPassGroup(&beginPassGroupParams));
1107
1108 NVPW_RawMetricsConfig_AddMetrics_Params addMetricsParams = {
1109 NVPW_RawMetricsConfig_AddMetrics_Params_STRUCT_SIZE
1110 };
1111 addMetricsParams.pRawMetricsConfig = pRawMetricsConfig;
1112 addMetricsParams.pRawMetricRequests = &rawMetricRequests[0];
1113 addMetricsParams.numMetricRequests = rawMetricRequests.size();
1114 TIMEMORY_RETURN_IF_NVPW_ERROR(false,
1115 NVPW_RawMetricsConfig_AddMetrics(&addMetricsParams));
1116
1117 NVPW_RawMetricsConfig_EndPassGroup_Params endPassGroupParams = {
1118 NVPW_RawMetricsConfig_EndPassGroup_Params_STRUCT_SIZE
1119 };
1120 endPassGroupParams.pRawMetricsConfig = pRawMetricsConfig;
1121 TIMEMORY_RETURN_IF_NVPW_ERROR(
1122 false, NVPW_RawMetricsConfig_EndPassGroup(&endPassGroupParams));
1123
1124 NVPW_RawMetricsConfig_GenerateConfigImage_Params generateConfigImageParams = {
1125 NVPW_RawMetricsConfig_GenerateConfigImage_Params_STRUCT_SIZE
1126 };
1127 generateConfigImageParams.pRawMetricsConfig = pRawMetricsConfig;
1128 TIMEMORY_RETURN_IF_NVPW_ERROR(
1129 false, NVPW_RawMetricsConfig_GenerateConfigImage(&generateConfigImageParams));
1130
1131 NVPW_RawMetricsConfig_GetConfigImage_Params getConfigImageParams = {
1132 NVPW_RawMetricsConfig_GetConfigImage_Params_STRUCT_SIZE
1133 };
1134 getConfigImageParams.pRawMetricsConfig = pRawMetricsConfig;
1135 getConfigImageParams.bytesAllocated = 0;
1136 getConfigImageParams.pBuffer = NULL;
1137 TIMEMORY_RETURN_IF_NVPW_ERROR(
1138 false, NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams));
1139
1140 configImage.resize(getConfigImageParams.bytesCopied);
1141
1142 getConfigImageParams.bytesAllocated = configImage.size();
1143 getConfigImageParams.pBuffer = &configImage[0];
1144 TIMEMORY_RETURN_IF_NVPW_ERROR(
1145 false, NVPW_RawMetricsConfig_GetConfigImage(&getConfigImageParams));
1146
1147 return true;
1148}
1149//
1150//--------------------------------------------------------------------------------------//
1151//
1152inline bool
1153cupti_profiler::GetCounterDataPrefixImage(std::string chipName,
1154 std::vector<std::string> metricNames,
1155 std::vector<uint8_t>& counterDataImagePrefix)
1156{
1157 NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = {
1158 NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE
1159 };
1160
1161 metricsContextCreateParams.pChipName = chipName.c_str();
1162
1163 TIMEMORY_RETURN_IF_NVPW_ERROR(
1164 false, NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams));
1165
1166 NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = {
1167 NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE
1168 };
1169
1170 metricsContextDestroyParams.pMetricsContext =
1171 metricsContextCreateParams.pMetricsContext;
1172
1173 SCOPE_EXIT([&]() {
1174 NVPW_MetricsContext_Destroy(
1175 (NVPW_MetricsContext_Destroy_Params*) &metricsContextDestroyParams);
1176 });
1177
1178 std::vector<NVPA_RawMetricRequest> rawMetricRequests;
1179 std::vector<std::string> temp;
1180 GetRawMetricRequests(metricsContextCreateParams.pMetricsContext, metricNames,
1181 rawMetricRequests, temp);
1182
1183 NVPW_CounterDataBuilder_Create_Params counterDataBuilderCreateParams = {
1184 NVPW_CounterDataBuilder_Create_Params_STRUCT_SIZE
1185 };
1186
1187 counterDataBuilderCreateParams.pChipName = chipName.c_str();
1188
1189 TIMEMORY_RETURN_IF_NVPW_ERROR(
1190 false, NVPW_CounterDataBuilder_Create(&counterDataBuilderCreateParams));
1191
1192 NVPW_CounterDataBuilder_Destroy_Params counterDataBuilderDestroyParams = {
1193 NVPW_CounterDataBuilder_Destroy_Params_STRUCT_SIZE
1194 };
1195
1196 counterDataBuilderDestroyParams.pCounterDataBuilder =
1197 counterDataBuilderCreateParams.pCounterDataBuilder;
1198
1199 SCOPE_EXIT([&]() {
1200 NVPW_CounterDataBuilder_Destroy(
1201 (NVPW_CounterDataBuilder_Destroy_Params*) &counterDataBuilderDestroyParams);
1202 });
1203
1204 NVPW_CounterDataBuilder_AddMetrics_Params addMetricsParams = {
1205 NVPW_CounterDataBuilder_AddMetrics_Params_STRUCT_SIZE
1206 };
1207
1208 addMetricsParams.pCounterDataBuilder =
1209 counterDataBuilderCreateParams.pCounterDataBuilder;
1210 addMetricsParams.pRawMetricRequests = &rawMetricRequests[0];
1211 addMetricsParams.numMetricRequests = rawMetricRequests.size();
1212 TIMEMORY_RETURN_IF_NVPW_ERROR(false,
1213 NVPW_CounterDataBuilder_AddMetrics(&addMetricsParams));
1214
1215 // size_t counterDataPrefixSize = 0;
1216 NVPW_CounterDataBuilder_GetCounterDataPrefix_Params getCounterDataPrefixParams = {
1217 NVPW_CounterDataBuilder_GetCounterDataPrefix_Params_STRUCT_SIZE
1218 };
1219
1220 getCounterDataPrefixParams.pCounterDataBuilder =
1221 counterDataBuilderCreateParams.pCounterDataBuilder;
1222
1223 getCounterDataPrefixParams.bytesAllocated = 0;
1224 getCounterDataPrefixParams.pBuffer = NULL;
1225 TIMEMORY_RETURN_IF_NVPW_ERROR(
1226 false, NVPW_CounterDataBuilder_GetCounterDataPrefix(&getCounterDataPrefixParams));
1227
1228 counterDataImagePrefix.resize(getCounterDataPrefixParams.bytesCopied);
1229
1230 getCounterDataPrefixParams.bytesAllocated = counterDataImagePrefix.size();
1231 getCounterDataPrefixParams.pBuffer = &counterDataImagePrefix[0];
1232 TIMEMORY_RETURN_IF_NVPW_ERROR(
1233 false, NVPW_CounterDataBuilder_GetCounterDataPrefix(&getCounterDataPrefixParams));
1234
1235 return true;
1236}
1237//
1238//--------------------------------------------------------------------------------------//
1239//
1240inline std::set<std::string>
1241cupti_profiler::ListSupportedChips()
1242{
1243 std::set<std::string> _ret;
1244
1245 NVPW_GetSupportedChipNames_Params getSupportedChipNames = {
1246 NVPW_GetSupportedChipNames_Params_STRUCT_SIZE
1247 };
1248 TIMEMORY_RETURN_IF_NVPW_ERROR(_ret,
1249 NVPW_GetSupportedChipNames(&getSupportedChipNames));
1250
1251 if(settings::verbose() > 2 || settings::debug())
1252 {
1253 std::cout << "\n Number of supported chips : "
1254 << getSupportedChipNames.numChipNames;
1255 std::cout << "\n List of supported chips : \n";
1256 }
1257
1258 for(size_t i = 0; i < getSupportedChipNames.numChipNames; i++)
1259 {
1260 _ret.insert(getSupportedChipNames.ppChipNames[i]);
1261 if(settings::verbose() > 2 || settings::debug())
1262 std::cout << " " << getSupportedChipNames.ppChipNames[i] << "\n";
1263 }
1264
1265 return _ret;
1266}
1267//
1268//--------------------------------------------------------------------------------------//
1269//
1270inline std::set<std::string>
1271cupti_profiler::ListMetrics(const char* chip, bool listSubMetrics)
1272{
1273 std::set<std::string> _ret;
1274
1275 NVPW_CUDA_MetricsContext_Create_Params metricsContextCreateParams = {
1276 NVPW_CUDA_MetricsContext_Create_Params_STRUCT_SIZE
1277 };
1278
1279 metricsContextCreateParams.pChipName = chip;
1280
1281 TIMEMORY_RETURN_IF_NVPW_ERROR(
1282 _ret, NVPW_CUDA_MetricsContext_Create(&metricsContextCreateParams));
1283
1284 NVPW_MetricsContext_Destroy_Params metricsContextDestroyParams = {
1285 NVPW_MetricsContext_Destroy_Params_STRUCT_SIZE
1286 };
1287
1288 metricsContextDestroyParams.pMetricsContext =
1289 metricsContextCreateParams.pMetricsContext;
1290
1291 SCOPE_EXIT([&]() {
1292 NVPW_MetricsContext_Destroy(
1293 (NVPW_MetricsContext_Destroy_Params*) &metricsContextDestroyParams);
1294 });
1295
1296 NVPW_MetricsContext_GetMetricNames_Begin_Params getMetricNameBeginParams = {
1297 NVPW_MetricsContext_GetMetricNames_Begin_Params_STRUCT_SIZE
1298 };
1299
1300 getMetricNameBeginParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
1301 getMetricNameBeginParams.hidePeakSubMetrics = !listSubMetrics;
1302 getMetricNameBeginParams.hidePerCycleSubMetrics = !listSubMetrics;
1303 getMetricNameBeginParams.hidePctOfPeakSubMetrics = !listSubMetrics;
1304
1305 TIMEMORY_RETURN_IF_NVPW_ERROR(
1306 _ret, NVPW_MetricsContext_GetMetricNames_Begin(&getMetricNameBeginParams));
1307
1308 NVPW_MetricsContext_GetMetricNames_End_Params getMetricNameEndParams = {
1309 NVPW_MetricsContext_GetMetricNames_End_Params_STRUCT_SIZE
1310 };
1311
1312 getMetricNameEndParams.pMetricsContext = metricsContextCreateParams.pMetricsContext;
1313
1314 SCOPE_EXIT([&]() {
1315 NVPW_MetricsContext_GetMetricNames_End(
1316 (NVPW_MetricsContext_GetMetricNames_End_Params*) &getMetricNameEndParams);
1317 });
1318
1319 if(settings::verbose() > 2 || settings::debug())
1320 std::cout << getMetricNameBeginParams.numMetrics
1321 << " metrics in total on the chip\n Metrics List : \n";
1322
1323 for(size_t i = 0; i < getMetricNameBeginParams.numMetrics; i++)
1324 {
1325 _ret.insert(getMetricNameBeginParams.ppMetricNames[i]);
1326 if(settings::verbose() > 2 || settings::debug())
1327 std::cout << getMetricNameBeginParams.ppMetricNames[i] << "\n";
1328 }
1329
1330 return _ret;
1331}
1332//
1333//--------------------------------------------------------------------------------------//
1334//
1335inline bool
1336cupti_profiler::ParseMetricNameString(const std::string& metricName, std::string* reqName,
1337 bool* isolated, bool* keepInstances)
1338{
1339 std::string& name = *reqName;
1340 name = metricName;
1341 if(name.empty())
1342 {
1343 return false;
1344 }
1345
1346 // boost program_options sometimes inserts a \n between the metric name and a '&'
1347 // at the end
1348 size_t pos = name.find('\n');
1349 if(pos != std::string::npos)
1350 {
1351 name.erase(pos, 1);
1352 }
1353
1354 // trim whitespace
1355 while(name.back() == ' ')
1356 {
1357 name.pop_back();
1358 if(name.empty())
1359 {
1360 return false;
1361 }
1362 }
1363
1364 *keepInstances = false;
1365 if(name.back() == '+')
1366 {
1367 *keepInstances = true;
1368 name.pop_back();
1369 if(name.empty())
1370 {
1371 return false;
1372 }
1373 }
1374
1375 *isolated = true;
1376 if(name.back() == '$')
1377 {
1378 name.pop_back();
1379 if(name.empty())
1380 {
1381 return false;
1382 }
1383 }
1384 else if(name.back() == '&')
1385 {
1386 *isolated = false;
1387 name.pop_back();
1388 if(name.empty())
1389 {
1390 return false;
1391 }
1392 }
1393
1394 return true;
1395}
1396//
1397//--------------------------------------------------------------------------------------//
1398//
1399inline bool
1400cupti_profiler::WriteBinaryFile(const char* pFileName, const std::vector<uint8_t>& data)
1401{
1402 FILE* fp = fopen(pFileName, "wb");
1403 if(fp)
1404 {
1405 if(data.size())
1406 {
1407 fwrite(&data[0], 1, data.size(), fp);
1408 }
1409 fclose(fp);
1410 }
1411 else
1412 {
1413 std::cout << "Failed to open " << pFileName << "\n";
1414 fclose(fp);
1415 return false;
1416 }
1417 return true;
1418}
1419//
1420//--------------------------------------------------------------------------------------//
1421//
1422inline bool
1423cupti_profiler::ReadBinaryFile(const char* pFileName, std::vector<uint8_t>& image)
1424{
1425 FILE* fp = fopen(pFileName, "rb");
1426 if(!fp)
1427 {
1428 std::cout << "Failed to open " << pFileName << "\n";
1429 return false;
1430 }
1431
1432 fseek(fp, 0, SEEK_END);
1433 const long fileLength = ftell(fp);
1434 fseek(fp, 0, SEEK_SET);
1435 if(!fileLength)
1436 {
1437 std::cout << pFileName << " has zero length\n";
1438 fclose(fp);
1439 return false;
1440 }
1441
1442 image.resize((size_t) fileLength);
1443 auto ret = fread(&image[0], 1, image.size(), fp);
1444 fclose(fp);
1445 return (ret != image.size()) ? false : true;
1446}
1447//
1448//--------------------------------------------------------------------------------------//
1449//
1450} // namespace component
1451} // namespace tim
1452
1453#endif
std::string string_t
Definition: library.cpp:57
#define TIMEMORY_CUDA_DRIVER_API_CALL(...)
Definition: macros.hpp:394
void load(Archive &ar, tim::node::graph< Tp > &d)
Definition: node.hpp:520
void serialize(std::string fname, exec_data< Counter > &obj)
Definition: counter.hpp:325
return false
Definition: definition.hpp:326
const hash_alias_ptr_t hash_value_t std::string *& _ret
Definition: definition.hpp:300
void set_prefix(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:774
void record(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:634
void stop(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:386
void start(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:316
std::bitset< scope_count > data_type
Definition: types.hpp:399
std::array< Tp, N > & operator+=(std::array< Tp, N > &, Other &&)
std::array< Tp, N > & operator-=(std::array< Tp, N > &, const std::array< Tp, N > &)
Definition: stl.hpp:96
Definition: kokkosp.cpp:39
cupti_events
Definition: settings.cpp:1727
char const std::string & _prefix
Definition: config.cpp:55
void configure(std::initializer_list< EnumT > components, Args &&... args)
Definition: configure.hpp:50
cupti_metrics
Definition: settings.cpp:1729
tim::mpl::apply< std::string > string
Definition: macros.hpp:53
size_t pos
Definition: config.cpp:102
tim::popen::TIMEMORY_PIPE * fp
const std::string std::ostream * os
void finalize()
Definition: types.hpp:119
auto get(const auto_bundle< Tag, Types... > &_obj)
description("A generic option for any setting. Each argument MUST be passed in " "form: 'NAME=VALUE'. E.g. --timemory-args " "\"papi_events=PAPI_TOT_INS,PAPI_TOT_CYC\" text_output=off") .action([&](parser_t &p)
Definition: config.cpp:312
ContainerT delimit(const std::string &line, const std::string &delimiters="\"',;: ", PredicateT &&predicate=[](const std::string &s) -> std::string { return s;})
Definition: delimit.hpp:68
void consume_parameters(ArgsT &&...)
Definition: types.hpp:285
#define PRINT_HERE(...)
Definition: macros.hpp:152