25#if defined(TIMEMORY_USE_CUPTI_PCSAMPLING)
29# if !defined(TIMEMORY_CUPTI_HEADER_MODE)
// Builds a pcdata record from a CUPTI PC-sampling buffer: copies the summary
// counters, gathers the per-PC entries into `samples`, then passes the buffer to
// free_pcsampling_data.
// NOTE(review): this listing carries fused line numbers and appears to have dropped
// lines (braces and some statements), so only the visible statements are described.
42pcdata::pcdata(CUpti_PCSamplingData&& _data)
43: totalNumPcs{ _data.totalNumPcs }
44, remainingNumPcs{ _data.remainingNumPcs }
45, rangeId{ _data.rangeId }
// visit each of the totalNumPcs entries in the buffer
48 for(
size_t i = 0; i < totalNumPcs; ++i)
// entries whose stallReason pointer is null are tested here; the statement guarded
// by this test is not visible in this listing
50 if(!_data.pPcData[i].stallReason)
52 pcsample _sample{ _data.pPcData[i] };
53 auto itr = samples.find(_sample);
// insert only when no equivalent sample is stored yet; what happens for an existing
// sample is not visible in this listing
54 if(itr == samples.end())
55 samples.insert(std::move(_sample));
// the buffer is handed to free_pcsampling_data once its entries have been processed
59 component::cupti_pcsampling::free_pcsampling_data(_data);
67cupti_pcsampling::get_pcsampling_data(
size_t numStallReasons,
size_t numPcsToCollect)
70 CUpti_PCSamplingData pcSamplingData = {};
71 pcSamplingData.size =
sizeof(CUpti_PCSamplingData);
72 pcSamplingData.collectNumPcs = numPcsToCollect;
73 pcSamplingData.pPcData =
74 TIMEMORY_CUPTI_CALLOC(CUpti_PCSamplingPCData, pcSamplingData.collectNumPcs);
75 for(
size_t i = 0; i < pcSamplingData.collectNumPcs; ++i)
76 pcSamplingData.pPcData[i].stallReason =
77 TIMEMORY_CUPTI_CALLOC(CUpti_PCSamplingStallReason, numStallReasons);
78 return pcSamplingData;
// Releases the storage referenced by a CUpti_PCSamplingData: first each entry's
// stallReason array, then the pPcData array itself.
// The struct is received by value, so the nullptr assignments below only change this
// local copy, not the caller's struct.
// NOTE(review): the loop indexes pPcData[i] before the `if(pcSamplingData.pPcData)`
// test further down; if pPcData can be null while collectNumPcs > 0 that is a null
// dereference — confirm (some lines appear to be missing from this listing).
82cupti_pcsampling::free_pcsampling_data(CUpti_PCSamplingData pcSamplingData)
// per-entry stall-reason arrays
85 for(
size_t i = 0; i < pcSamplingData.collectNumPcs; i++)
87 if(pcSamplingData.pPcData[i].stallReason)
89 free(pcSamplingData.pPcData[i].stallReason);
90 pcSamplingData.pPcData[i].stallReason =
nullptr;
// the array of PC records itself
94 if(pcSamplingData.pPcData)
96 free(pcSamplingData.pPcData);
97 pcSamplingData.pPcData =
nullptr;
/// @brief Translate a PC-sampling stall-reason index into its name.
///
/// Performs a linear search of @p pStallReasonIndex for
/// @p pcSamplingStallReasonIndex and returns the entry at the same position in
/// @p pStallReasons.
///
/// NOTE(review): the return-type line is absent from this listing; `const char*` is
/// assumed because both visible return statements yield C strings — confirm against
/// the declaration.
///
/// @param stallReasonCount            number of entries in both arrays
/// @param pcSamplingStallReasonIndex  index value to look up
/// @param pStallReasonIndex           array of stall-reason index values
/// @param pStallReasons               array of stall-reason names, parallel to
///                                    @p pStallReasonIndex
/// @return the matching name, or "ERROR_STALL_REASON_INDEX_NOT_FOUND" when the
///         index is absent, either array is null, or the matching name is null
const char*
getStallReason(const uint32_t& stallReasonCount,
               const uint32_t& pcSamplingStallReasonIndex, uint32_t* pStallReasonIndex,
               char** pStallReasons)
{
    const char* const _not_found = "ERROR_STALL_REASON_INDEX_NOT_FOUND";

    // guard against unallocated lookup tables instead of dereferencing null
    if(pStallReasonIndex == nullptr || pStallReasons == nullptr)
        return _not_found;

    for(uint32_t i = 0; i < stallReasonCount; ++i)
    {
        if(pStallReasonIndex[i] == pcSamplingStallReasonIndex)
        {
            // a null name would be undefined behavior once streamed by the caller
            return (pStallReasons[i] != nullptr) ? pStallReasons[i] : _not_found;
        }
    }
    return _not_found;
}
// Writes a human-readable dump of a PC-sampling buffer to std::cout: a banner, the
// buffer-level counters, then one line per PC record followed by one line per stall
// reason of that record.
// pStallReasonIndex / pStallReasons (stallReasonCount entries each) are forwarded to
// getStallReason to turn a stall-reason index into a name.
// NOTE(review): this listing appears to have dropped several lines of the original
// (for example parts of the stream expressions), so the exact output format cannot
// be fully confirmed from what is visible.
117printPCSamplingData(CUpti_PCSamplingData* pPcSamplingData,
118 const uint32_t& stallReasonCount, uint32_t* pStallReasonIndex,
119 char** pStallReasons)
// banner
121 std::cout <<
"----- PC sampling data for range defined by cuptiPCSamplingStart() and "
122 "cuptiPCSamplingStop() -----"
// buffer-level counters: remainingNumPcs, rangeId, totalSamples, droppedSamples
124 std::cout <<
"Number of PCs remaining to be collected: "
125 << pPcSamplingData->remainingNumPcs <<
", ";
126 std::cout <<
"range id: " << pPcSamplingData->rangeId <<
", ";
127 std::cout <<
"total samples: " << pPcSamplingData->totalSamples <<
", ";
128 std::cout <<
"dropped samples: " << pPcSamplingData->droppedSamples << std::endl;
// one record per collected PC (totalNumPcs of them)
129 for(
size_t i = 0; i < pPcSamplingData->totalNumPcs; i++)
// pcOffset is printed in hex; std::dec restores decimal for the remaining fields
131 std::cout <<
"\tpcOffset : 0x" << std::hex << pPcSamplingData->pPcData[i].pcOffset
132 <<
", stallReasonCount: " << std::dec
133 << pPcSamplingData->pPcData[i].stallReasonCount <<
", functionName: "
134 <<
demangle(pPcSamplingData->pPcData[i].functionName);
// per-record stall reasons: name via getStallReason, then the sample count
135 for(
size_t j = 0; j < pPcSamplingData->pPcData[i].stallReasonCount; j++)
137 std::cout <<
"\n\t\tstallReason: "
138 << getStallReason(stallReasonCount,
139 pPcSamplingData->pPcData[i]
141 .pcSamplingStallReasonIndex,
142 pStallReasonIndex, pStallReasons)
144 << pPcSamplingData->pPcData[i].stallReason[j].samples;
146 std::cout << std::endl;
// closing rule
148 std::cout <<
"-----------------------------------------------------------------------"
149 "---------------------------"
// Sets up CUPTI PC sampling for a CUDA context and returns the resulting state as a
// tuple: the context, the enable / stall-reason / configuration / start / stop
// parameter structs, the sampling buffer, the attribute list, the number of stall
// reasons in use and the number of PCs to collect.
// NOTE(review): this listing carries fused line numbers and has dropped lines
// (braces, device/context setup, some right-hand sides), so comments below only
// describe the statements that are visible.
154 cupti_pcsampling::configure()
// diagnostic for the no-device path; the check itself is not visible in this listing
167 fprintf(stderr,
"There is no device supporting CUDA.\n");
// region_totals is re-read from the settings only when it is currently true
// NOTE(review): confirm that gating the refresh on the current value is intended
172 if(get_configuration_data().region_totals)
173 get_configuration_data().region_totals =
174 settings::instance()->get<
bool>(
"cupti_pcsampling_region_totals");
// device ordinal comes from the "cupti_device" setting
176 deviceNum = settings::instance()->get<
int>(
"cupti_device");
// report the device; the last message is for devices below compute capability 7.0
184 printf(
"Device Name: %s\n", prop.name);
185 printf(
"Device compute capability: %d.%d\n", prop.major, prop.minor);
188 printf(
"Component is unavailable on this device, supported on devices with "
189 "compute capability 7.0 and higher\n");
// register the cubin CRC callback with CUPTI
204 TIMEMORY_CUPTI_API_CALL(
205 cuptiRegisterComputeCrcCallback(&cupti::pcsample::compute_cubin_crc));
// enable PC sampling on the context
209 CUpti_PCSamplingEnableParams pcSamplingEnableParams = {};
210 pcSamplingEnableParams.size = CUpti_PCSamplingEnableParamsSize;
211 pcSamplingEnableParams.ctx = cuCtx;
212 TIMEMORY_CUPTI_API_CALL(cuptiPCSamplingEnable(&pcSamplingEnableParams));
// query how many stall reasons the context reports
216 size_t numStallReasons = 0;
217 CUpti_PCSamplingGetNumStallReasonsParams numStallReasonsParams = {};
218 numStallReasonsParams.size = CUpti_PCSamplingGetNumStallReasonsParamsSize;
219 numStallReasonsParams.ctx = cuCtx;
220 numStallReasonsParams.numStallReasons = &numStallReasons;
221 TIMEMORY_CUPTI_API_CALL(cuptiPCSamplingGetNumStallReasons(&numStallReasonsParams));
// size the cupti::pcstall index/name/enabled arrays and bind references to them
225 cupti::pcstall::allocate_arrays(numStallReasons);
226 uint32_t*& pStallReasonIndex = cupti::pcstall::get_index_array();
227 char**& pStallReasons = cupti::pcstall::get_name_array();
228 bool*& pStallReasonsEnabled = cupti::pcstall::get_bool_array();
// fill the index and name arrays
// NOTE(review): unlike the neighboring CUPTI calls, the result of
// cuptiPCSamplingGetStallReasons is not passed through TIMEMORY_CUPTI_API_CALL, so a
// failure here would go unnoticed — confirm whether that is deliberate
230 CUpti_PCSamplingGetStallReasonsParams stallReasonsParams = {};
231 stallReasonsParams.size = CUpti_PCSamplingGetStallReasonsParamsSize;
232 stallReasonsParams.ctx = cuCtx;
233 stallReasonsParams.numStallReasons = numStallReasons;
234 stallReasonsParams.stallReasonIndex = pStallReasonIndex;
235 stallReasonsParams.stallReasons = pStallReasons;
236 cuptiPCSamplingGetStallReasons(&stallReasonsParams);
// default: every reported stall reason is enabled
238 size_t stallReasonCount = numStallReasons;
239 for(
size_t i = 0; i < numStallReasons; ++i)
240 pStallReasonsEnabled[i] =
true;
// optional user filter, split into tokens by delimit()
241 auto _stall_reasons =
242 delimit(settings::instance()->get<std::string>(
"cupti_pcsampling_stall_reasons"));
// a non-empty filter with fewer tokens than reported reasons: shrink the count,
// disable everything, then re-enable each name that matches a token as a regex
// anchored at the end of the name
// NOTE(review): stallReasonCount becomes the token count, not the number of names
// that actually matched — confirm these always agree
// NOTE(review): std::regex(itr + "$") is rebuilt for every (token, name) pair
243 if(!_stall_reasons.empty() && _stall_reasons.size() < stallReasonCount)
245 stallReasonCount = _stall_reasons.size();
247 for(
size_t i = 0; i < numStallReasons; ++i)
248 pStallReasonsEnabled[i] =
false;
250 for(
const auto& itr : _stall_reasons)
252 for(
size_t i = 0; i < numStallReasons; ++i)
254 if(std::regex_search(pStallReasons[i], std::regex(itr +
"$")))
255 pStallReasonsEnabled[i] =
true;
// swap enabled entries (flag, index and name together) into position _idx; the
// declaration and update of _idx are not visible in this listing
260 for(
size_t i = 0; i < numStallReasons; ++i)
262 if(pStallReasonsEnabled[i] && _idx != i)
264 std::swap(pStallReasonsEnabled[i], pStallReasonsEnabled[_idx]);
265 std::swap(pStallReasonIndex[i], pStallReasonIndex[_idx]);
266 std::swap(pStallReasons[i], pStallReasons[_idx]);
// diagnostics: runtime vs. compile-time stall-reason counts, then the first
// stallReasonCount names with their indices
273 printf(
"[runtime]> numStallReasons = %lu\n", (
unsigned long) numStallReasons);
274 printf(
"[compile]> numStallReasons = %lu\n",
275 (
unsigned long) cupti::pcsample::stall_reasons_size);
276 for(
size_t i = 0; i < stallReasonCount; ++i)
277 printf(
"%s index: %lu\n", pStallReasons[i],
278 (
long unsigned) pStallReasonIndex[i]);
// NOTE(review): this check runs after the arrays have already been filled and used
280 assert(numStallReasons <= cupti::pcsample::stall_reasons_size);
// allocate the sampling buffer for "cupti_pcsampling_num_collect" PCs
285 size_t _num_collect =
286 settings::instance()->get<
size_t>(
"cupti_pcsampling_num_collect");
287 CUpti_PCSamplingData pcSamplingData =
288 get_pcsampling_data(stallReasonCount, _num_collect);
// attribute list submitted to CUPTI further down
294 std::vector<CUpti_PCSamplingConfigurationInfo> pcSamplingConfigurationInfo{};
// attribute: sampling period, from the "cupti_pcsampling_period" setting
297 auto _period = settings::instance()->get<
int>(
"cupti_pcsampling_period");
298 CUpti_PCSamplingConfigurationInfo samplingPeriod = {};
299 samplingPeriod.attributeType =
300 CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD;
301 samplingPeriod.attributeData.samplingPeriodData.samplingPeriod = _period;
302 pcSamplingConfigurationInfo.push_back(samplingPeriod);
// attribute: the stall reasons to collect (count plus index array)
305 CUpti_PCSamplingConfigurationInfo stallReason = {};
306 stallReason.attributeType = CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON;
307 stallReason.attributeData.stallReasonData.stallReasonCount = stallReasonCount;
308 stallReason.attributeData.stallReasonData.pStallReasonIndex = pStallReasonIndex;
309 pcSamplingConfigurationInfo.push_back(stallReason);
// attribute: scratch buffer size (the assigned value is not visible in this listing)
312 CUpti_PCSamplingConfigurationInfo scratchBufferSize = {};
313 scratchBufferSize.attributeType =
314 CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE;
315 scratchBufferSize.attributeData.scratchBufferSizeData.scratchBufferSize =
317 pcSamplingConfigurationInfo.push_back(scratchBufferSize);
// attribute: collection mode — kernel-serialized when "cupti_pcsampling_serialized"
// is non-zero, continuous otherwise
320 auto _serialized = settings::instance()->get<
int>(
"cupti_pcsampling_serialized");
321 CUpti_PCSamplingConfigurationInfo collectionMode = {};
322 collectionMode.attributeType =
323 CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE;
324 collectionMode.attributeData.collectionModeData.collectionMode =
325 (_serialized) ? CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED
326 : CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS;
327 pcSamplingConfigurationInfo.push_back(collectionMode);
// attribute: start/stop control (the assigned value is not visible in this listing)
330 CUpti_PCSamplingConfigurationInfo enableStartStop = {};
331 enableStartStop.attributeType =
332 CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL;
333 enableStartStop.attributeData.enableStartStopControlData.enableStartStopControl =
335 pcSamplingConfigurationInfo.push_back(enableStartStop);
// attribute: parsed output data format
338 CUpti_PCSamplingConfigurationInfo outputDataFormat = {};
339 outputDataFormat.attributeType =
340 CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT;
341 outputDataFormat.attributeData.outputDataFormatData.outputDataFormat =
342 CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED;
343 pcSamplingConfigurationInfo.push_back(outputDataFormat);
// attribute: sampling data buffer, pointed at pcSamplingData.pPcData
346 CUpti_PCSamplingConfigurationInfo samplingDataBuffer = {};
347 samplingDataBuffer.attributeType =
348 CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER;
349 samplingDataBuffer.attributeData.samplingDataBufferData.samplingDataBuffer =
350 (
void*) pcSamplingData.pPcData;
351 pcSamplingConfigurationInfo.push_back(samplingDataBuffer);
// submit all attributes in a single call
354 CUpti_PCSamplingConfigurationInfoParams pcSamplingConfigurationInfoParams = {};
355 pcSamplingConfigurationInfoParams.size = CUpti_PCSamplingConfigurationInfoParamsSize;
356 pcSamplingConfigurationInfoParams.ctx = cuCtx;
357 pcSamplingConfigurationInfoParams.numAttributes = pcSamplingConfigurationInfo.size();
358 pcSamplingConfigurationInfoParams.pPCSamplingConfigurationInfo =
359 pcSamplingConfigurationInfo.data();
360 TIMEMORY_CUPTI_API_CALL(
361 cuptiPCSamplingSetConfigurationAttribute(&pcSamplingConfigurationInfoParams));
// check the per-attribute status of every submitted attribute
363 for(
auto itr : pcSamplingConfigurationInfo)
364 TIMEMORY_CUPTI_API_CALL(itr.attributeStatus);
// start/stop parameter structs are prepared here and returned to the caller
367 CUpti_PCSamplingStartParams pcSamplingStartParams = {};
368 pcSamplingStartParams.size = CUpti_PCSamplingStartParamsSize;
369 pcSamplingStartParams.ctx = cuCtx;
373 CUpti_PCSamplingStopParams pcSamplingStopParams = {};
374 pcSamplingStopParams.size = CUpti_PCSamplingStopParamsSize;
375 pcSamplingStopParams.ctx = cuCtx;
// bundle everything; only the attribute vector is moved, the rest is copied
378 return std::make_tuple(cuCtx, pcSamplingEnableParams, numStallReasonsParams,
379 stallReasonsParams, pcSamplingData,
380 std::move(pcSamplingConfigurationInfo),
381 pcSamplingConfigurationInfoParams, pcSamplingStartParams,
382 pcSamplingStopParams, stallReasonCount, _num_collect);
// NOTE(review): fragment — the enclosing function header is not visible in this
// listing.
// Runs configure() and stores its result in the configuration's `data`, setting
// `enabled` to true in the same assignment.
388 auto& _cfg = get_configuration_data();
390 std::tie(_cfg.enabled, _cfg.data) = std::make_tuple(
true,
configure());
// NOTE(review): fragment — the enclosing function header is not visible in this
// listing.
// Marks the configuration as disabled, then disables PC sampling on the stored
// context via cuptiPCSamplingDisable.
396 auto& _cfg = get_configuration_data();
399 _cfg.enabled =
false;
400 CUcontext cuCtx = _cfg.context();
402 CUpti_PCSamplingDisableParams pcSamplingDisableParams = {};
403 pcSamplingDisableParams.size = CUpti_PCSamplingDisableParamsSize;
404 pcSamplingDisableParams.ctx = cuCtx;
405 TIMEMORY_CUPTI_API_CALL(cuptiPCSamplingDisable(&pcSamplingDisableParams));
// NOTE(review): fragment — the enclosing function header is not visible in this
// listing (sample() below calls record(), which presumably is this function).
// Allocates a fresh sampling buffer, asks CUPTI to fill it, prints it, and returns
// it wrapped in data_type.
414 auto& _cfg = get_configuration_data();
// buffer sized from the stored stall-reason count and PC collection count
418 auto pcSamplingData =
419 get_pcsampling_data(_cfg.num_stall_reasons(), _cfg.num_collect_pcs());
// fetch the collected samples into the buffer
421 CUpti_PCSamplingGetDataParams pcSamplingGetDataParams = {};
422 pcSamplingGetDataParams.size = CUpti_PCSamplingGetDataParamsSize;
423 pcSamplingGetDataParams.ctx = _cfg.context();
424 pcSamplingGetDataParams.pcSamplingData =
static_cast<void*
>(&pcSamplingData);
425 TIMEMORY_CUPTI_API_CALL(cuptiPCSamplingGetData(&pcSamplingGetDataParams));
// NOTE(review): the dump to std::cout is unconditional in the visible code
427 printPCSamplingData(&pcSamplingData, _cfg.num_stall_reasons(),
428 cupti::pcstall::get_index_array(),
429 cupti::pcstall::get_name_array());
// the buffer is moved into the returned data_type
430 return data_type{ std::move(pcSamplingData) };
// Takes one reading via record() and, for every sample in it, creates a
// component_tuple<cupti_pcsampling> labelled with the sample's name() and stores the
// sample into that bundle.
// NOTE(review): lines appear to be missing from this listing between the bundle
// construction and the store, and after the loop.
434cupti_pcsampling::sample()
436 cupti::pcdata _data =
record();
437 for(
auto&& itr : _data.samples)
439 component_tuple<cupti_pcsampling> _bundle{ itr.name() };
441 _bundle.store(std::move(itr));
// NOTE(review): fragment(s) — the enclosing function header(s) and both loop bodies
// are not visible in this listing; these lines may belong to more than one function.
// When region_totals is set, the entries of get_stack() are visited.
450 if(get_configuration_data().region_totals)
451 for(
auto& itr : get_stack())
// `value` takes ownership of _data, then the stack is visited again under the same
// region_totals condition
458 value = std::move(_data);
459 if(get_configuration_data().region_totals)
460 for(
auto& itr : get_stack())
// NOTE(review): fragment — the enclosing function header is not visible in this
// listing.
// Starts PC sampling with the stored start parameters, then registers this instance
// in get_stack() when region_totals is set.
471 TIMEMORY_CUPTI_API_CALL(
472 cuptiPCSamplingStart(&get_configuration_data().start_params()));
474 if(get_configuration_data().region_totals)
475 get_stack().insert(
this);
// NOTE(review): fragment — the enclosing function header is not visible in this
// listing.
// Removes this instance from get_stack() when region_totals is set, then stops PC
// sampling with the stored stop parameters.
482 if(get_configuration_data().region_totals)
483 get_stack().erase(
this);
488 TIMEMORY_CUPTI_API_CALL(
489 cuptiPCSamplingStop(&get_configuration_data().stop_params()));
// Forwards to base_type::set_started(); no other statement is visible in this
// listing.
495cupti_pcsampling::set_started()
497 base_type::set_started();
// Clears the running flag via set_is_running(false).
// NOTE(review): lines appear to be missing from this listing before that call.
501cupti_pcsampling::set_stopped()
505 set_is_running(
false);
// Prepares a std::stringstream using the base type's precision, width and format
// flags.
// NOTE(review): what is written to the stream and what is returned is not visible in
// this listing.
509 cupti_pcsampling::get_display()
const
511 std::stringstream ss;
512 ss.precision(base_type::get_precision());
513 ss.width(base_type::get_width());
514 ss.setf(base_type::get_format_flags());
// NOTE(review): fragment — the enclosing function header and the definitions of _n
// and _val are not visible in this listing.
// Collects the `samples` field of each of the _n stall entries of _val into a vector
// of int64_t (each value is read as uint32_t and widened).
523 std::vector<int64_t> _data{};
526 for(
size_t i = 0; i < _n; ++i)
531 uint32_t _v = _val.stalls[i].samples;
532 _data.emplace_back(
static_cast<int64_t
>(_v));
// Builds a vector holding cupti::pcstall::name(i) for i in [0, _n).
// NOTE(review): the definition of _n and the return statement are not visible in
// this listing.
539 cupti_pcsampling::label_array()
542 std::vector<std::string> _data{};
544 for(
size_t i = 0; i < _n; ++i)
547 _data.push_back(
std::string{ cupti::pcstall::name(i) });
#define TIMEMORY_CUPTI_INLINE
#define TIMEMORY_CUDA_RUNTIME_API_CALL(...)
#define TIMEMORY_CUDA_DRIVER_API_CALL(...)
void load(Archive &ar, tim::node::graph< Tp > &d)
void store(TupleT< Tp... > &obj, Args &&... args)
void record(TupleT< Tp... > &obj, Args &&... args)
void stop(TupleT< Tp... > &obj, Args &&... args)
void start(TupleT< Tp... > &obj, Args &&... args)
constexpr auto get_size(const Tp &, std::tuple<>) -> size_t
std::bitset< scope_count > data_type
void configure(std::initializer_list< EnumT > components, Args &&... args)
void initialize(CompList< CompTypes... > &obj, std::initializer_list< EnumT > components)
std::string demangle(const char *_mangled_name, int *_status=nullptr)
tim::mpl::apply< std::string > string
auto get(const auto_bundle< Tag, Types... > &_obj)
ContainerT delimit(const std::string &line, const std::string &delimiters="\"',;: ", PredicateT &&predicate=[](const std::string &s) -> std::string { return s;})
Definition for various functions for store in operations.