timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
tim::ert Namespace Reference

Classes

class  aligned_allocator
 
struct  callback
 for variadic expansion to set the callback
 
struct  configuration
 
class  counter
 
class  exec_data
 
struct  exec_params
 
struct  executor
 
struct  executor< device::gpu, Tp, CounterT >
 
class  thread_barrier
 

Functions

template<typename DeviceT , typename CounterT , typename Tp , typename... Types, typename DataType = exec_data<CounterT>, typename DataPtr = std::shared_ptr<DataType>, typename std::enable_if<(sizeof...(Types)==0), int >::type = 0>
std::shared_ptr< DataType > execute (std::shared_ptr< DataType > _data=std::make_shared< DataType >())
 
template<typename Counter >
void serialize (std::string fname, exec_data< Counter > &obj)
 
template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_cpu_t< DeviceT > = 0>
void initialize_buffer (Tp *A, const Tp &value, const Intp &nsize)
 
template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_gpu_t< DeviceT > = 0>
void initialize_buffer (Tp *A, Tp value, Intp nsize)
 
template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_cpu_t< DeviceT > = 0>
void ops_kernel (Intp ntrials, Intp nsize, Tp *A, OpsFuncT &&ops_func, StoreFuncT &&store_func)
 
template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_gpu_t< DeviceT > = 0, enable_if_t<!std::is_same< Tp, gpu::fp16_t >::value > = 0>
void ops_kernel (Intp ntrials, Intp nsize, Tp *A, OpsFuncT &&ops_func, StoreFuncT &&store_func)
 
template<size_t Nops, size_t... Nextra, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nextra)==0, int > = 0>
bool ops_main (counter< DeviceT, Tp, CounterT > &_counter, OpsFuncT &&ops_func, StoreFuncT &&store_func)
 This is the "main" function for ERT.
 
template<size_t... Nops, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nops)==0, int > = 0>
bool ops_main (counter< DeviceT, Tp, CounterT > &, OpsFuncT &&, StoreFuncT &&)
 This is invoked when TIMEMORY_USER_ERT_FLOPS is empty.
 

Function Documentation

◆ execute()

template<typename DeviceT , typename CounterT , typename Tp , typename... Types, typename DataType = exec_data<CounterT>, typename DataPtr = std::shared_ptr<DataType>, typename std::enable_if<(sizeof...(Types)==0), int >::type = 0>
std::shared_ptr< DataType > tim::ert::execute ( std::shared_ptr< DataType >  _data = std::make_shared<DataType>())

Definition at line 650 of file configuration.hpp.

{
    using ConfigType = configuration<DeviceT, Tp, CounterT>;
    using ExecType   = executor<DeviceT, Tp, CounterT>;

    ConfigType _config{};
    ExecType(_config, _data);

    return _data;
}

Referenced by tim::sampling::sampler< CompT< Types... >, N, SigIds... >::configure().
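As a quick orientation, the sketch below (an assumption for illustration: the umbrella header path and the wall_clock component choice are not prescribed by this page) runs the default ERT configuration on the CPU and serializes the collected data:

#include "timemory/timemory.hpp"  // assumed umbrella header

using counter_t = tim::component::wall_clock;

int main()
{
    // execute<DeviceT, CounterT, Tp>() default-constructs a
    // configuration<device::cpu, float, wall_clock>, runs an executor
    // over it, and returns the shared exec_data (see above)
    auto data = tim::ert::execute<tim::device::cpu, counter_t, float>();
    // write the collected roofline records to a JSON file
    tim::ert::serialize("ert_results", *data);
    return 0;
}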

◆ initialize_buffer() [1/2]

template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_cpu_t< DeviceT > = 0>
void tim::ert::initialize_buffer ( Tp *  A,
const Tp &  value,
const Intp &  nsize 
)

Definition at line 292 of file data.hpp.

{
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
    for(auto i = range.begin(); i < range.end(); i += range.stride())
        A[i] = value;
}
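
On the CPU, the grid-strided range effectively degenerates to a unit-stride loop over [0, nsize), so this overload behaves like std::fill_n. A minimal, hypothetical usage sketch:

#include <vector>

std::vector<double> buf(1024, 0.0);
// fill the buffer with ones; equivalent on the CPU to
// std::fill_n(buf.data(), buf.size(), 1.0)
tim::ert::initialize_buffer<tim::device::cpu>(buf.data(), 1.0, (int32_t) buf.size());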

◆ initialize_buffer() [2/2]

template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_gpu_t< DeviceT > = 0>
void tim::ert::initialize_buffer ( Tp *  A,
Tp  value,
Intp  nsize 
)

Definition at line 308 of file data.hpp.

{
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
    for(auto i = range.begin(); i < range.end(); i += range.stride())
        A[i] = value;
}
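
The GPU overload takes value and nsize by value because kernel arguments must be copied to the device; host references would not be valid in device code. Launching it follows the same device::launch pattern used in ops_main() below; a hedged sketch (the parameter values and the null buffer are placeholders):

using Device  = tim::device::gpu;
int32_t nsize = 1 << 20;
// grid size 0 (default) lets the launcher derive it from nsize and the block size
tim::device::params<Device> dev_params(0, 512, 0, Device::default_stream);
float* dbuf = nullptr;  // stands in for a real device allocation
tim::device::launch(nsize, dev_params,
                    tim::ert::initialize_buffer<Device, float, int32_t>,
                    dbuf, 0.0f, nsize);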

◆ ops_kernel() [1/2]

template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_cpu_t< DeviceT > = 0>
void tim::ert::ops_kernel ( Intp  ntrials,
Intp  nsize,
Tp *  A,
OpsFuncT &&  ops_func,
StoreFuncT &&  store_func 
)

Definition at line 68 of file kernels.hpp.

{
    // divide by two here because macros halve, e.g. ERT_FLOP == 4 means 2 calls
    constexpr size_t NUM_REP = Nrep / 2;
    constexpr size_t MOD_REP = Nrep % 2;
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);

    Tp alpha = static_cast<Tp>(0.5);
    for(Intp j = 0; j < ntrials; ++j)
    {
        for(auto i = range.begin(); i < range.end(); i += range.stride())
        {
            Tp beta = static_cast<Tp>(0.8);
            apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
                                                            alpha);
            store_func(A[i], beta);
        }
        alpha *= static_cast<Tp>(1.0 - 1.0e-8);
    }
}
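
The two functors determine what the kernel measures: each apply<void>::unroll step calls ops_func(beta, A[i], alpha), and store_func(A[i], beta) writes the accumulator back. A sketch of plausible functors (illustrative, not verbatim from timemory); a fused multiply-add counts as two FLOPs per call, which is why Nrep is halved above:

// one fused multiply-add per call => 2 FLOPs, so ERT_FLOP == 4 unrolls
// into NUM_REP + MOD_REP == 2 calls per element
auto ops_func   = [](double& a, const double& b, const double& c) { a = b + (c * a); };
// write the accumulator back so the compiler cannot elide the work
auto store_func = [](double& a, const double& b) { a = b; };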

◆ ops_kernel() [2/2]

template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_gpu_t< DeviceT > = 0, enable_if_t<!std::is_same< Tp, gpu::fp16_t >::value > = 0>
void tim::ert::ops_kernel ( Intp  ntrials,
Intp  nsize,
Tp *  A,
OpsFuncT &&  ops_func,
StoreFuncT &&  store_func 
)

Definition at line 99 of file kernels.hpp.

{
    // divide by two here because macros halve, e.g. ERT_FLOP == 4 means 2 calls
    constexpr size_t NUM_REP = Nrep / 2;
    constexpr size_t MOD_REP = Nrep % 2;
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);

    Tp alpha = static_cast<Tp>(0.5);
    for(Intp j = 0; j < ntrials; ++j)
    {
        for(auto i = range.begin(); i < range.end(); i += range.stride())
        {
            Tp beta = static_cast<Tp>(0.8);
            apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
                                                            alpha);
            store_func(A[i], beta);
        }
        alpha *= static_cast<Tp>(1.0 - 1.0e-8);
    }
}

◆ ops_main() [1/2]

template<size_t... Nops, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nops)==0, int > = 0>
bool tim::ert::ops_main ( counter< DeviceT, Tp, CounterT > &  ,
OpsFuncT &&  ,
StoreFuncT &&   
)

This is invoked when TIMEMORY_USER_ERT_FLOPS is empty.

Definition at line 419 of file kernels.hpp.

{
    return false;
}

◆ ops_main() [2/2]

template<size_t Nops, size_t... Nextra, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nextra)==0, int > = 0>
bool tim::ert::ops_main ( counter< DeviceT, Tp, CounterT > &  _counter,
OpsFuncT &&  ops_func,
StoreFuncT &&  store_func 
)

This is the "main" function for ERT.

This invokes the "main" function for ERT for all the desired "FLOPs" that are unrolled in the kernel.

Definition at line 159 of file kernels.hpp.

{
    if(_counter.skip(Nops))
        return false;

    using stream_list_t   = std::vector<gpu::stream_t>;
    using thread_list_t   = std::vector<std::thread>;
    using device_params_t = device::params<DeviceT>;
    using Intp            = int32_t;
    using ull             = long long unsigned;

    constexpr bool is_gpu = std::is_same<DeviceT, device::gpu>::value;

    if(settings::verbose() > 0 || settings::debug())
        printf("[%s] Executing %li ops...\n", __FUNCTION__, (long int) Nops);

    if(_counter.bytes_per_element == 0)
    {
        fprintf(stderr, "[%s:%i]> bytes-per-element is not set!\n", __FUNCTION__,
                __LINE__);
    }

    if(_counter.memory_accesses_per_element == 0)
    {
        fprintf(stderr, "[%s:%i]> memory-accesses-per-element is not set!\n",
                __FUNCTION__, __LINE__);
    }

    // list of streams
    stream_list_t streams;
    // generate async streams if multiple streams were requested
    if(_counter.params.nstreams > 1)
    {
        // fill with implicit stream
        streams.resize(_counter.params.nstreams, 0);
        for(auto& itr : streams)
            gpu::stream_create(itr);
    }

    auto _opfunc = [&](uint64_t tid, thread_barrier* fbarrier, thread_barrier* lbarrier) {
        threading::affinity::set();
        using opmutex_t = std::mutex;
        using oplock_t  = std::unique_lock<opmutex_t>;
        static opmutex_t opmutex;
        {
            oplock_t _lock(opmutex);
            // execute the callback
            _counter.configure(tid);
        }
        // allocate buffer
        auto     buf = _counter.get_buffer();
        uint64_t n   = _counter.params.working_set_min;
        // cache this
        const uint64_t nstreams = std::max<uint64_t>(_counter.params.nstreams, 1);
        // create the launch parameters (ignored on CPU)
        //
        // if grid_size is zero (default), the launch command will calculate a grid-size
        // as follows:
        //
        //      grid_size = ((data_size + block_size - 1) / block_size)
        //
        device_params_t dev_params(_counter.params.grid_size, _counter.params.block_size,
                                   _counter.params.shmem_size, DeviceT::default_stream);
        //
        if(n > _counter.nsize)
        {
            fprintf(stderr,
                    "[%s@'%s':%i]> Warning! ERT not running any trials because working "
                    "set min > nsize: %llu > %llu\n",
                    TIMEMORY_ERROR_FUNCTION_MACRO, __FILE__, __LINE__, (ull) n,
                    (ull) _counter.nsize);
        }

        while(n <= _counter.nsize)
        {
            // working set - nsize
            uint64_t ntrials = _counter.nsize / n;
            if(ntrials < 1)
                ntrials = 1;

            if(settings::debug() && tid == 0)
            {
                printf("[tim::ert::ops_main<%llu>]> number of trials: %llu, n = %llu, "
                       "nsize = %llu\n",
                       (ull) Nops, (ull) ntrials, (ull) n, (ull) _counter.nsize);
            }

            auto _itr_params = _counter.params;

            if(is_gpu)
            {
                // make sure all streams are synced
                for(auto& itr : streams)
                    gpu::stream_sync(itr);

                // sync the streams
                if(nstreams < 2)
                    gpu::device_sync();
            }

            // wait until the master thread notifies us to proceed
            // if(fbarrier)
            //     fbarrier->notify_wait();
            if(fbarrier)
                fbarrier->spin_wait();

            // get instance of object measuring something during the calculation
            CounterT ct = _counter.get_counter();
            // start the timer or anything else being recorded
            ct.start();

            // only do this more complicated mess if we need to
            if(nstreams > 1)
            {
                auto nchunk  = n / nstreams;
                auto nmodulo = n % nstreams;
                for(uint64_t i = 0; i < nstreams; ++i)
                {
                    // calculate the size of the subchunk
                    int32_t _n      = nchunk + ((i + 1 == nstreams) ? nmodulo : 0);
                    auto    _params = dev_params;  // copy of the parameters
                    device::launch(
                        _n, streams.at(i % streams.size()), _params,
                        ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
                        ntrials, _n, buf + (i * nchunk), std::forward<OpsFuncT>(ops_func),
                        std::forward<StoreFuncT>(store_func));
                    _itr_params.grid_size =
                        (i == 0) ? _params.grid
                                 : std::max<int64_t>(_itr_params.grid_size, _params.grid);
                }
            }
            else
            {
                device::launch(n, dev_params,
                               ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
                               ntrials, n, buf, std::forward<OpsFuncT>(ops_func),
                               std::forward<StoreFuncT>(store_func));

                _itr_params.grid_size = dev_params.grid;
            }

            if(is_gpu)
            {
                for(auto& itr : streams)
                    gpu::stream_sync(itr);

                // sync the streams
                if(nstreams < 2)
                    gpu::device_sync();
            }

            // wait until the master thread notifies us to proceed
            // if(lbarrier)
            //     lbarrier->notify_wait();
            if(lbarrier)
                lbarrier->spin_wait();

            // stop the timer or anything else being recorded
            ct.stop();

            // store the result
            if(tid == 0)
            {
                // ensure there is not a data race if more than one thread somehow
                // has a tid of 0
                oplock_t _lock(opmutex);
                _counter.record(ct, n, ntrials, Nops, _itr_params);
            }

            n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
        }

        if(is_gpu)
            gpu::device_sync();

        _counter.destroy_buffer(buf);
    };

    // guard against multiple threads trying to call ERT for some reason
    static std::mutex            _mtx;
    std::unique_lock<std::mutex> _lock(_mtx);

    dmp::barrier();  // synchronize MPI processes

    if(is_gpu)
        gpu::device_sync();

    if(_counter.params.nthreads > 1)
    {
        // create synchronization barriers for the threads
        thread_barrier fbarrier{ _counter.params.nthreads };
        thread_barrier lbarrier{ _counter.params.nthreads };

        // list of threads
        thread_list_t threads{};
        // create the threads
        for(uint64_t i = 0; i < _counter.params.nthreads; ++i)
            threads.emplace_back(_opfunc, i, &fbarrier, &lbarrier);

        /*
        uint64_t n = _counter.params.working_set_min;
        while(n <= _counter.nsize)
        {
            // wait until all threads have also called notify_wait() then release
            // barrier to start
            fbarrier.notify_wait();
            // wait until all threads have also called notify_wait() then release
            // barrier to finish
            lbarrier.notify_wait();
            n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
        }*/

        // wait for threads to finish
        for(auto& itr : threads)
            itr.join();
    }
    else
    {
        _opfunc(0, nullptr, nullptr);
    }

    if(is_gpu)
        gpu::device_sync();

    dmp::barrier();  // synchronize MPI processes

    // code was executed
    return true;
}

References tim::ert::exec_params::block_size, tim::ert::counter< DeviceT, Tp, Counter >::bytes_per_element, tim::ert::counter< DeviceT, Tp, Counter >::configure(), tim::debug, tim::ert::counter< DeviceT, Tp, Counter >::destroy_buffer(), tim::ert::counter< DeviceT, Tp, Counter >::get_buffer(), tim::ert::counter< DeviceT, Tp, Counter >::get_counter(), tim::ert::exec_params::grid_size, tim::ert::counter< DeviceT, Tp, Counter >::memory_accesses_per_element, tim::ert::counter< DeviceT, Tp, Counter >::nsize, tim::ert::exec_params::nstreams, tim::ert::exec_params::nthreads, tim::ert::counter< DeviceT, Tp, Counter >::params, tim::ert::counter< DeviceT, Tp, Counter >::record(), tim::ert::exec_params::shmem_size, tim::ert::counter< DeviceT, Tp, Counter >::skip(), TIMEMORY_ERROR_FUNCTION_MACRO, tim::verbose, and tim::ert::exec_params::working_set_min.

Referenced by tim::ert::executor< DeviceT, Tp, CounterT >::execute().
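
Callers typically expand a compile-time list of FLOP counts into successive ops_main<Nops> instantiations, one per unroll factor. A hedged C++17 sketch of that dispatch pattern (run_all_unrolls is a hypothetical helper, not part of timemory):

template <size_t... FlopCounts, typename DeviceT, typename Tp, typename CounterT,
          typename OpsFuncT, typename StoreFuncT>
void run_all_unrolls(tim::ert::counter<DeviceT, Tp, CounterT>& c, OpsFuncT&& ops,
                     StoreFuncT&& store)
{
    // each instantiation benchmarks one unrolled kernel; counter::skip()
    // filters out FLOP counts disabled at runtime
    (tim::ert::ops_main<FlopCounts>(c, ops, store), ...);
}

// e.g. run_all_unrolls<1, 4, 16, 64, 256>(counter, ops_func, store_func);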

◆ serialize()

template<typename Counter >
void tim::ert::serialize ( std::string  fname,
exec_data< Counter > &  obj 
)
inline

Definition at line 325 of file counter.hpp.

{
    using exec_data_vec_t = std::vector<exec_data<Counter>>;

    int dmp_rank = dmp::rank();
    int dmp_size = dmp::size();

    exec_data_vec_t results(dmp_size);
    if(dmp::is_initialized())
    {
        dmp::barrier();

#if defined(TIMEMORY_USE_MPI) || defined(TIMEMORY_USE_UPCXX)
        auto space = cereal::JSONOutputArchive::Options::IndentChar::space;

        //------------------------------------------------------------------------------//
        //  Used to convert a result to a serialization
        //
        auto send_serialize = [&](const exec_data<Counter>& src) {
            std::stringstream ss;
            {
                cereal::JSONOutputArchive::Options opt(16, space, 0);
                cereal::JSONOutputArchive          oa(ss, opt);
                oa(cereal::make_nvp("data", src));
            }
            return ss.str();
        };

        //------------------------------------------------------------------------------//
        //  Used to convert the serialization to a result
        //
        auto recv_serialize = [&](const std::string& src) {
            exec_data<Counter> ret;
            std::stringstream  ss;
            ss << src;
            {
                cereal::JSONInputArchive ia(ss);
                ia(cereal::make_nvp("data", ret));
            }
            return ret;
        };
#endif

#if defined(TIMEMORY_USE_MPI)

        auto str_ret = send_serialize(obj);

        if(dmp_rank == 0)
        {
            for(int i = 1; i < dmp_size; ++i)
            {
                std::string str;
                mpi::recv(str, i, 0, mpi::comm_world_v);
                results[i] = recv_serialize(str);
            }
            results[dmp_rank] = std::move(obj);
        }
        else
        {
            mpi::send(str_ret, 0, 0, mpi::comm_world_v);
        }

#elif defined(TIMEMORY_USE_UPCXX)

        //------------------------------------------------------------------------------//
        //  Function executed on remote node
        //
        auto remote_serialize = [=]() { return send_serialize(obj); };

        //------------------------------------------------------------------------------//
        //  Combine on master rank
        //
        if(dmp_rank == 0)
        {
            for(int i = 1; i < dmp_size; ++i)
            {
                upcxx::future<std::string> fut = upcxx::rpc(i, remote_serialize);
                while(!fut.ready())
                    upcxx::progress();
                fut.wait();
                results[i] = recv_serialize(fut.result());
            }
            results[dmp_rank] = std::move(obj);
        }

#endif
    }
    else
    {
        results.clear();
        results.resize(1);
        results.at(0) = std::move(obj);
    }

    if(dmp_rank == 0)
    {
        fname = settings::compose_output_filename(fname, ".json");
        printf("[%i]> Outputting '%s'...\n", dmp_rank, fname.c_str());
        std::ofstream ofs{};
        if(filepath::open(ofs, fname))
        {
            // ensure json write final block during destruction before the file is closed
            using policy_type = policy::output_archive_t<Counter>;
            auto oa           = policy_type::get(ofs);
            oa->setNextName("timemory");
            oa->startNode();
            oa->setNextName("ranks");
            oa->startNode();
            oa->makeArray();
            for(uint64_t i = 0; i < results.size(); ++i)
            {
                oa->startNode();
                (*oa)(cereal::make_nvp("rank", i),
                      cereal::make_nvp("roofline", results.at(i)));
                oa->finishNode();
            }
            oa->finishNode();
            oa->finishNode();
        }
        if(ofs)
            ofs << std::endl;
        ofs.close();
    }
}

References tim::settings::compose_output_filename(), tim::get(), and tim::filepath::open().
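
For orientation, a hypothetical single-rank call and the resulting layout (inferred from the setNextName/startNode sequence above; record contents elided):

// settings::compose_output_filename() appends ".json" plus any configured
// output prefix, so this writes e.g. <output-path>/ert_results.json shaped
// roughly like:
//
//   { "timemory": { "ranks": [ { "rank": 0, "roofline": { ... } } ] } }
//
tim::ert::exec_data<tim::component::wall_clock> data{};
tim::ert::serialize("ert_results", data);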