#include "timemory/backends/device.hpp"
#include "timemory/backends/dmp.hpp"
#include "timemory/backends/gpu.hpp"
#include "timemory/backends/threading.hpp"
#include "timemory/components/cuda/backends.hpp"
//--------------------------------------------------------------------------------------//
//
//      CPU -- multiple trials
//
//--------------------------------------------------------------------------------------//

template <size_t Nrep, typename DeviceT, typename Intp, typename Tp, typename OpsFuncT,
          typename StoreFuncT, device::enable_if_cpu_t<DeviceT> = 0>
TIMEMORY_GLOBAL_FUNCTION void
ops_kernel(Intp ntrials, Intp nsize, Tp* A, OpsFuncT&& ops_func, StoreFuncT&& store_func)
{
    // each ops_func call performs two operations, so unroll Nrep / 2 times
    constexpr size_t NUM_REP = Nrep / 2;
    constexpr size_t MOD_REP = Nrep % 2;
    auto             range   = device::grid_strided_range<DeviceT, 0, Intp>(nsize);

    Tp alpha = static_cast<Tp>(0.5);
    for(Intp j = 0; j < ntrials; ++j)
    {
        for(auto i = range.begin(); i < range.end(); i += range.stride())
        {
            Tp beta = static_cast<Tp>(0.8);
            mpl::apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
                                                                 alpha);
            store_func(A[i], beta);
        }
        alpha *= static_cast<Tp>(1.0 - 1.0e-8);
    }
}
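//--------------------------------------------------------------------------------------//
//
//  Illustrative sketch (not part of the original header): one plausible shape for the
//  "ops_func" and "store_func" callables consumed by ops_kernel. The lambda bodies,
//  the helper name, and the direct call (valid in a CPU-only build) are assumptions.
//
inline void
example_cpu_ops_kernel_usage()
{
    // the unrolled call is ops_func(beta, A[i], alpha): a fused multiply-add here
    auto fma_func   = [](float& a, const float& b, const float& c) { a = b * c + a; };
    auto store_func = [](float& a, const float& b) { a = b; };

    std::vector<float> data(1000, 1.0f);
    // Nrep = 8 ops per element per trial -> unrolled into (8 / 2 + 8 % 2) = 4 calls
    ops_kernel<8, device::cpu, int32_t>(10, (int32_t) data.size(), data.data(), fma_func,
                                        store_func);
}
//--------------------------------------------------------------------------------------//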
//--------------------------------------------------------------------------------------//
//
//      GPU -- multiple trials
//
//--------------------------------------------------------------------------------------//

template <size_t Nrep, typename DeviceT, typename Intp, typename Tp, typename OpsFuncT,
          typename StoreFuncT, device::enable_if_gpu_t<DeviceT> = 0,
          enable_if_t<!std::is_same<Tp, gpu::fp16_t>::value, int> = 0>
TIMEMORY_GLOBAL_FUNCTION void
ops_kernel(Intp ntrials, Intp nsize, Tp* A, OpsFuncT&& ops_func, StoreFuncT&& store_func)
{
    // each ops_func call performs two operations, so unroll Nrep / 2 times
    constexpr size_t NUM_REP = Nrep / 2;
    constexpr size_t MOD_REP = Nrep % 2;
    auto             range   = device::grid_strided_range<DeviceT, 0, Intp>(nsize);

    Tp alpha = static_cast<Tp>(0.5);
    for(Intp j = 0; j < ntrials; ++j)
    {
        for(auto i = range.begin(); i < range.end(); i += range.stride())
        {
            Tp beta = static_cast<Tp>(0.8);
            mpl::apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
                                                                 alpha);
            store_func(A[i], beta);
        }
        alpha *= static_cast<Tp>(1.0 - 1.0e-8);
    }
}
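//--------------------------------------------------------------------------------------//
//
//  For reference, a minimal sketch (not from this header) of the raw CUDA grid-stride
//  loop that device::grid_strided_range<DeviceT, 0, Intp> abstracts: begin() maps to
//  the global thread index and stride() to the total number of threads in the grid.
//
#if defined(__CUDACC__)
__global__ void
example_grid_stride(int32_t nsize, float* A)  // hypothetical kernel, name assumed
{
    for(int32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < nsize;
        i += gridDim.x * blockDim.x)
    {
        A[i] += 1.0f;  // every element is visited exactly once per pass
    }
}
#endif
//--------------------------------------------------------------------------------------//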
//--------------------------------------------------------------------------------------//
//
//      GPU -- multiple trials -- packed half-precision
//
//--------------------------------------------------------------------------------------//

template <size_t Nrep, typename DeviceT, typename Intp, typename Tp, typename OpsFuncT,
          typename StoreFuncT, device::enable_if_gpu_t<DeviceT> = 0,
          enable_if_t<std::is_same<Tp, gpu::fp16_t>::value, int> = 0>
TIMEMORY_GLOBAL_FUNCTION void
ops_kernel(Intp ntrials, Intp nsize, Tp* A, OpsFuncT&& ops_func, StoreFuncT&& store_func)
{
    // divide by four instead of two because fp16_t is a packed (2-wide) type: each
    // ops_func call performs two operations on each of the two lanes
    constexpr size_t NUM_REP = Nrep / 4;
    constexpr size_t MOD_REP = Nrep % 4;
    auto             range   = device::grid_strided_range<DeviceT, 0, int32_t>(nsize);

    Tp alpha = { 0.5, 0.5 };
    for(int32_t j = 0; j < ntrials; ++j)
    {
        for(auto i = range.begin(); i < range.end(); i += range.stride())
        {
            Tp beta = { 0.8, 0.8 };
            mpl::apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
                                                                 alpha);
            store_func(A[i], beta);
        }
        alpha *= { 1.0 - 1.0e-8, 1.0 - 1.0e-8 };
    }
}
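//--------------------------------------------------------------------------------------//
//
//  Why Nrep / 4 above: with a packed half-precision type, each ops_func call applies a
//  two-op pattern (e.g. multiply + add) to both 16-bit lanes at once, so a single call
//  accounts for 4 of the requested Nrep operations. A minimal sketch, assuming CUDA's
//  __half2 as the packed type (the function name is hypothetical):
//
#if defined(__CUDACC__)
#    include <cuda_fp16.h>
__device__ void
example_packed_fma(__half2& a, const __half2& b, const __half2& c)
{
    a = __hfma2(b, c, a);  // 2 lanes x (1 multiply + 1 add) = 4 operations per call
}
#endif
//--------------------------------------------------------------------------------------//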
//--------------------------------------------------------------------------------------//
//
//      This is the "main" function for ERT
//
//--------------------------------------------------------------------------------------//

template <size_t Nops, size_t... Nextra, typename DeviceT, typename Tp, typename CounterT,
          typename OpsFuncT, typename StoreFuncT,
          enable_if_t<sizeof...(Nextra) == 0, int> = 0>
bool
ops_main(counter<DeviceT, Tp, CounterT>& _counter, OpsFuncT&& ops_func,
         StoreFuncT&& store_func)
{
    if(_counter.skip(Nops))
        return false;

    using stream_list_t   = std::vector<gpu::stream_t>;
    using thread_list_t   = std::vector<std::thread>;
    using device_params_t = device::params<DeviceT>;
    using Intp            = int32_t;
    using ull             = long long unsigned;

    constexpr bool is_gpu = std::is_same<DeviceT, device::gpu>::value;

    if(settings::verbose() > 0 || settings::debug())
        printf("[%s] Executing %li ops...\n", __FUNCTION__, (long int) Nops);

    if(_counter.bytes_per_element == 0)
        fprintf(stderr, "[%s:%i]> bytes-per-element is not set!\n", __FUNCTION__,
                __LINE__);

    if(_counter.memory_accesses_per_element == 0)
        fprintf(stderr, "[%s:%i]> memory-accesses-per-element is not set!\n",
                __FUNCTION__, __LINE__);

    // create the list of GPU streams (when multiple streams were requested)
    stream_list_t streams;
    if(is_gpu && _counter.params.nstreams > 1)
    {
        streams.resize(_counter.params.nstreams);
        for(auto& itr : streams)
            gpu::stream_create(itr);
    }
    // per-thread function: executes the kernels over a growing working-set size
    auto _opfunc = [&](uint64_t tid, thread_barrier* fbarrier, thread_barrier* lbarrier) {
        threading::affinity::set();
        using opmutex_t = std::mutex;
        using oplock_t  = std::unique_lock<opmutex_t>;
        static opmutex_t opmutex;
        {
            oplock_t _lock(opmutex);
            _counter.configure(tid);
        }
        // allocate the buffer and initialize the working-set size
        auto           buf      = _counter.get_buffer();
        uint64_t       n        = _counter.params.working_set_min;
        const uint64_t nstreams = std::max<uint64_t>(_counter.params.nstreams, 1);
        // launch parameters (grid, block, shared memory, stream); ignored on the CPU
        device_params_t dev_params(_counter.params.grid_size, _counter.params.block_size,
                                   _counter.params.shmem_size, 0);
        if(n > _counter.nsize)
        {
            fprintf(stderr,
                    "[%s@'%s':%i]> Warning! ERT not running any trials because working "
                    "set min > nsize: %llu > %llu\n",
                    TIMEMORY_ERROR_FUNCTION_MACRO, __FILE__, __LINE__, (ull) n,
                    (ull) _counter.nsize);
        }

        while(n <= _counter.nsize)
        {
            // number of trials for the current working-set size
            uint64_t ntrials = _counter.nsize / n;

            if(settings::debug())
            {
                printf("[tim::ert::ops_main<%llu>]> number of trials: %llu, n = %llu, "
                       "nsize = %llu\n",
                       (ull) Nops, (ull) ntrials, (ull) n, (ull) _counter.nsize);
            }

            auto _itr_params = _counter.params;

            if(is_gpu)
            {
                // ensure all streams are synced before the timed region
                for(auto& itr : streams)
                    gpu::stream_sync(itr);
            }

            // wait until all threads are ready, then start recording
            if(fbarrier)
                fbarrier->spin_wait();
            CounterT ct = _counter.get_counter();
            ct.start();
            // split the work across streams when more than one was requested
            if(nstreams > 1)
            {
                auto nchunk  = n / nstreams;
                auto nmodulo = n % nstreams;
                for(uint64_t i = 0; i < nstreams; ++i)
                {
                    // the last stream picks up the remainder elements
                    int32_t _n      = nchunk + ((i + 1 == nstreams) ? nmodulo : 0);
                    auto    _params = dev_params;
                    device::launch(
                        _n, streams.at(i % streams.size()), _params,
                        ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
                        ntrials, _n, buf + (i * nchunk), std::forward<OpsFuncT>(ops_func),
                        std::forward<StoreFuncT>(store_func));
                    _itr_params.grid_size =
                        (i == 0) ? _params.grid
                                 : std::max<int64_t>(_itr_params.grid_size, _params.grid);
                }
            }
            else
            {
                device::launch(n, dev_params,
                               ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
                               ntrials, n, buf, std::forward<OpsFuncT>(ops_func),
                               std::forward<StoreFuncT>(store_func));
                _itr_params.grid_size = dev_params.grid;
            }

            if(is_gpu)
            {
                for(auto& itr : streams)
                    gpu::stream_sync(itr);
            }

            // wait for the other threads, then stop and store the measurement
            if(lbarrier)
                lbarrier->spin_wait();
            ct.stop();
            {
                oplock_t _lock(opmutex);
                _counter.record(ct, n, ntrials, Nops, _itr_params);
            }

            // grow the working set by ~10% (forcing progress by at least one)
            n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
        }

        _counter.destroy_buffer(buf);
    };
    // serialize concurrent invocations
    static std::mutex            _mtx;
    std::unique_lock<std::mutex> _lock(_mtx);
    if(_counter.params.nthreads > 1)
    {
        // worker threads share start/stop barriers with one another
        thread_barrier fbarrier{ _counter.params.nthreads };
        thread_barrier lbarrier{ _counter.params.nthreads };
        thread_list_t  threads{};
        for(uint64_t i = 0; i < _counter.params.nthreads; ++i)
            threads.emplace_back(_opfunc, i, &fbarrier, &lbarrier);
        for(auto& itr : threads)
            itr.join();
    }
    else
    {
        // single-threaded execution does not need the barriers
        _opfunc(0, nullptr, nullptr);
    }
    return true;
}
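//--------------------------------------------------------------------------------------//
//
//  The measurement loop above sweeps the working-set size n geometrically, growing it
//  by roughly 10% per iteration; the ternary is intended to force progress by at least
//  one element when the increment would otherwise be lost. A standalone sketch with
//  assumed values (the helper name is hypothetical):
//
inline void
example_working_set_progression()
{
    uint64_t n = 64, nsize = 1000;
    while(n <= nsize)
    {
        // visits 64, 70, 77, 84, 92, 101, ... (integer truncation of 1.1 * n)
        n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
    }
}
//--------------------------------------------------------------------------------------//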
//--------------------------------------------------------------------------------------//
//
//      Invokes ops_main for each of the specified "Nops" values
//
//--------------------------------------------------------------------------------------//

template <size_t Nops, size_t... Nextra, typename DeviceT, typename Tp, typename CounterT,
          typename OpsFuncT, typename StoreFuncT,
          enable_if_t<(sizeof...(Nextra) > 0), int> = 0>
bool
ops_main(counter<DeviceT, Tp, CounterT>& _counter, OpsFuncT&& ops_func,
         StoreFuncT&& store_func)
{
    bool ret = false;
    // execute the first parameter
    ret |= ops_main<Nops>(std::ref(_counter).get(), ops_func, store_func);
    // recurse on the remaining parameters
    ret |= ops_main<Nextra...>(std::ref(_counter).get(), ops_func, store_func);
    return ret;
}
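//--------------------------------------------------------------------------------------//
//
//  Hypothetical usage sketch (the wrapper name is assumed, not part of this header):
//  the variadic overload expands recursively, so a single call sweeps several unroll
//  factors, e.g. ops_main<1, 2, 4, 8> runs ops_main<1>, ops_main<2>, ops_main<4>, and
//  ops_main<8> in sequence and reports whether any of them actually executed.
//
template <typename DeviceT, typename Tp, typename CounterT, typename OpsT, typename StoreT>
bool
example_sweep_ops(counter<DeviceT, Tp, CounterT>& _counter, OpsT&& ops, StoreT&& store)
{
    return ops_main<1, 2, 4, 8>(_counter, std::forward<OpsT>(ops),
                                std::forward<StoreT>(store));
}
//--------------------------------------------------------------------------------------//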
//--------------------------------------------------------------------------------------//
//
//      Termination overload for an empty set of "Nops" values
//
//--------------------------------------------------------------------------------------//

template <size_t... Nops, typename DeviceT, typename Tp, typename CounterT,
          typename OpsFuncT, typename StoreFuncT,
          enable_if_t<sizeof...(Nops) == 0, int> = 0>
bool
ops_main(counter<DeviceT, Tp, CounterT>&, OpsFuncT&&, StoreFuncT&&)
{
    return false;
}