timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it provides a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
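As a point of reference for the description above, here is a minimal sketch of that template API (not part of kernels.hpp; it assumes only the umbrella timemory.hpp header and the stock wall_clock/peak_rss components): a user-composed bundle is what the API treats as a "tool".

// Minimal sketch of the template API: compose a "tool" from existing components.
#include "timemory/timemory.hpp"

int main(int argc, char** argv)
{
    tim::timemory_init(argc, argv);

    // any set of components can be measured together through one interface
    using bundle_t =
        tim::component_tuple<tim::component::wall_clock, tim::component::peak_rss>;

    bundle_t prof{ "example_region" };
    prof.start();
    // ... code to be measured ...
    prof.stop();

    tim::timemory_finalize();
    return 0;
}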
kernels.hpp
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25/** \file timemory/ert/kernels.hpp
26 * \headerfile timemory/ert/kernels.hpp "timemory/ert/kernels.hpp"
27 * Provides the compute kernels executed by the empirical roofline toolkit (ERT)
28 *
29 */
30
31#pragma once
32
33#include "timemory/backends/device.hpp"
34#include "timemory/backends/dmp.hpp"
35#include "timemory/backends/gpu.hpp"
36#include "timemory/backends/threading.hpp"
37#include "timemory/components/cuda/backends.hpp"
39#include "timemory/ert/data.hpp"
44
45#include <cstdint>
46#include <functional>
47#include <future>
48#include <iomanip>
49#include <sstream>
50#include <string>
51#include <tuple>
52#include <type_traits>
53#include <utility>
54
55namespace tim
56{
57namespace ert
58{
59//--------------------------------------------------------------------------------------//
60//
61// CPU -- multiple trial
62//
63//--------------------------------------------------------------------------------------//
64
65template <size_t Nrep, typename DeviceT, typename Intp, typename Tp, typename OpsFuncT,
66 typename StoreFuncT, device::enable_if_cpu_t<DeviceT> = 0>
67void
68ops_kernel(Intp ntrials, Intp nsize, Tp* A, OpsFuncT&& ops_func, StoreFuncT&& store_func)
69{
70 // divide by two here because macros halve, e.g. ERT_FLOP == 4 means 2 calls
71 constexpr size_t NUM_REP = Nrep / 2;
72 constexpr size_t MOD_REP = Nrep % 2;
73 auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
74
75 Tp alpha = static_cast<Tp>(0.5);
76 for(Intp j = 0; j < ntrials; ++j)
77 {
78 for(auto i = range.begin(); i < range.end(); i += range.stride())
79 {
80 Tp beta = static_cast<Tp>(0.8);
81 mpl::apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
82 alpha);
83 store_func(A[i], beta);
84 }
85 alpha *= static_cast<Tp>(1.0 - 1.0e-8);
86 }
87}
88
89//--------------------------------------------------------------------------------------//
90//
91// GPU -- multiple trial
92//
93//--------------------------------------------------------------------------------------//
94
95template <size_t Nrep, typename DeviceT, typename Intp, typename Tp, typename OpsFuncT,
96 typename StoreFuncT, device::enable_if_gpu_t<DeviceT> = 0,
97 enable_if_t<!std::is_same<Tp, gpu::fp16_t>::value, int> = 0>
98TIMEMORY_GLOBAL_FUNCTION void
99ops_kernel(Intp ntrials, Intp nsize, Tp* A, OpsFuncT&& ops_func, StoreFuncT&& store_func)
100{
101 // divide by two here because macros halve, e.g. ERT_FLOP == 4 means 2 calls
102 constexpr size_t NUM_REP = Nrep / 2;
103 constexpr size_t MOD_REP = Nrep % 2;
104 auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
105
106 Tp alpha = static_cast<Tp>(0.5);
107 for(Intp j = 0; j < ntrials; ++j)
108 {
109 for(auto i = range.begin(); i < range.end(); i += range.stride())
110 {
111 Tp beta = static_cast<Tp>(0.8);
112 mpl::apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
113 alpha);
114 store_func(A[i], beta);
115 }
116 alpha *= static_cast<Tp>(1.0 - 1.0e-8);
117 }
118}
119
120//--------------------------------------------------------------------------------------//
121//
122// GPU -- multiple trial -- packed (2) half-precision
123//
124//--------------------------------------------------------------------------------------//
125
126template <size_t Nrep, typename DeviceT, typename Intp, typename Tp, typename OpsFuncT,
127 typename StoreFuncT, device::enable_if_gpu_t<DeviceT> = 0,
128 enable_if_t<std::is_same<Tp, gpu::fp16_t>::value, int> = 0>
129TIMEMORY_GLOBAL_FUNCTION void
130ops_kernel(Intp ntrials, Intp nsize, Tp* A, OpsFuncT&& ops_func, StoreFuncT&& store_func)
131{
132 // divide by four instead of two here because fp16_t is a packed operation
133 constexpr size_t NUM_REP = Nrep / 4;
134 constexpr size_t MOD_REP = Nrep % 4;
135 auto range = device::grid_strided_range<DeviceT, 0, int32_t>(nsize);
136
137 Tp alpha = { 0.5, 0.5 };
138 for(int32_t j = 0; j < ntrials; ++j)
139 {
140 for(auto i = range.begin(); i < range.end(); i += range.stride())
141 {
142 Tp beta = { 0.8, 0.8 };
143 mpl::apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
144 alpha);
145 store_func(A[i], beta);
146 }
147 alpha *= { 1.0 - 1.0e-8, 1.0 - 1.0e-8 };
148 }
149}
150
151//--------------------------------------------------------------------------------------//
152///
153/// This is the "main" function for ERT
154///
155template <size_t Nops, size_t... Nextra, typename DeviceT, typename Tp, typename CounterT,
156 typename OpsFuncT, typename StoreFuncT,
157 enable_if_t<sizeof...(Nextra) == 0, int> = 0>
158bool
159ops_main(counter<DeviceT, Tp, CounterT>& _counter, OpsFuncT&& ops_func,
160 StoreFuncT&& store_func)
161{
162 if(_counter.skip(Nops))
163 return false;
164
165 using stream_list_t = std::vector<gpu::stream_t>;
166 using thread_list_t = std::vector<std::thread>;
167 using device_params_t = device::params<DeviceT>;
168 using Intp = int32_t;
169 using ull = long long unsigned;
170
171 constexpr bool is_gpu = std::is_same<DeviceT, device::gpu>::value;
172
174 printf("[%s] Executing %li ops...\n", __FUNCTION__, (long int) Nops);
175
176 if(_counter.bytes_per_element == 0)
177 {
178 fprintf(stderr, "[%s:%i]> bytes-per-element is not set!\n", __FUNCTION__,
179 __LINE__);
180 }
181
182 if(_counter.memory_accesses_per_element == 0)
183 {
184 fprintf(stderr, "[%s:%i]> memory-accesses-per-element is not set!\n",
185 __FUNCTION__, __LINE__);
186 }
187
188 // list of streams
189 stream_list_t streams;
190 // generate async streams if multiple streams were requested
191 if(_counter.params.nstreams > 1)
192 {
193 // fill with implicit stream
194 streams.resize(_counter.params.nstreams, 0);
195 for(auto& itr : streams)
196 gpu::stream_create(itr);
197 }
198
199 auto _opfunc = [&](uint64_t tid, thread_barrier* fbarrier, thread_barrier* lbarrier) {
200 threading::affinity::set();
201 using opmutex_t = std::mutex;
202 using oplock_t = std::unique_lock<opmutex_t>;
203 static opmutex_t opmutex;
204 {
205 oplock_t _lock(opmutex);
206 // execute the callback
207 _counter.configure(tid);
208 }
209 // allocate buffer
210 auto buf = _counter.get_buffer();
211 uint64_t n = _counter.params.working_set_min;
212 // cache this
213 const uint64_t nstreams = std::max<uint64_t>(_counter.params.nstreams, 1);
214 // create the launch parameters (ignored on CPU)
215 //
216 // if grid_size is zero (default), the launch command will calculate a grid-size
217 // as follows:
218 //
219 // grid_size = ((data_size + block_size - 1) / block_size)
220 //
221 device_params_t dev_params(_counter.params.grid_size, _counter.params.block_size,
222 _counter.params.shmem_size, DeviceT::default_stream);
223 //
224 if(n > _counter.nsize)
225 {
226 fprintf(stderr,
227 "[%s@'%s':%i]> Warning! ERT not running any trials because working "
228 "set min > nsize: %llu > %llu\n",
229 TIMEMORY_ERROR_FUNCTION_MACRO, __FILE__, __LINE__, (ull) n,
230 (ull) _counter.nsize);
231 }
232
233 while(n <= _counter.nsize)
234 {
235 // working set - nsize
236 uint64_t ntrials = _counter.nsize / n;
237 if(ntrials < 1)
238 ntrials = 1;
239
240 if(settings::debug() && tid == 0)
241 {
242 printf("[tim::ert::ops_main<%llu>]> number of trials: %llu, n = %llu, "
243 "nsize "
244 "= %llu\n",
245 (ull) Nops, (ull) ntrials, (ull) n, (ull) _counter.nsize);
246 }
247
248 auto _itr_params = _counter.params;
249
250 if(is_gpu)
251 {
252 // make sure all streams are synced
253 for(auto& itr : streams)
254 gpu::stream_sync(itr);
255
256 // sync the streams
257 if(nstreams < 2)
258 gpu::device_sync();
259 }
260
261 // wait until the master thread signals to proceed
262 // if(fbarrier)
263 // fbarrier->notify_wait();
264 if(fbarrier)
265 fbarrier->spin_wait();
266
267 // get instance of object measuring something during the calculation
268 CounterT ct = _counter.get_counter();
269 // start the timer or anything else being recorded
270 ct.start();
271
272 // only do this more complicated mess if we need to
273 if(nstreams > 1)
274 {
275 auto nchunk = n / nstreams;
276 auto nmodulo = n % nstreams;
277 for(uint64_t i = 0; i < nstreams; ++i)
278 {
279 // calculate the size of the subchunk
280 int32_t _n = nchunk + ((i + 1 == nstreams) ? nmodulo : 0);
281 auto _params = dev_params; // copy of the parameters
282 device::launch(
283 _n, streams.at(i % streams.size()), _params,
284 ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
285 ntrials, _n, buf + (i * nchunk), std::forward<OpsFuncT>(ops_func),
286 std::forward<StoreFuncT>(store_func));
287 _itr_params.grid_size =
288 (i == 0) ? _params.grid
289 : std::max<int64_t>(_itr_params.grid_size, _params.grid);
290 }
291 }
292 else
293 {
294 device::launch(n, dev_params,
295 ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
296 ntrials, n, buf, std::forward<OpsFuncT>(ops_func),
297 std::forward<StoreFuncT>(store_func));
298
299 _itr_params.grid_size = dev_params.grid;
300 }
301
302 if(is_gpu)
303 {
304 for(auto& itr : streams)
305 gpu::stream_sync(itr);
306
307 // sync the streams
308 if(nstreams < 2)
309 gpu::device_sync();
310 }
311
312 // wait until the master thread signals to proceed
313 // if(lbarrier)
314 // lbarrier->notify_wait();
315 if(lbarrier)
316 lbarrier->spin_wait();
317
318 // stop the timer or anything else being recorded
319 ct.stop();
320
321 // store the result
322 if(tid == 0)
323 {
324 // ensure there is not a data race if more than one thread somehow
325 // has a tid of 0
326 oplock_t _lock(opmutex);
327 _counter.record(ct, n, ntrials, Nops, _itr_params);
328 }
329
330 n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
331 }
332
333 if(is_gpu)
334 gpu::device_sync();
335
336 _counter.destroy_buffer(buf);
337 };
338
339 // guard against multiple threads trying to call ERT for some reason
340 static std::mutex _mtx;
341 std::unique_lock<std::mutex> _lock(_mtx);
342
343 dmp::barrier(); // synchronize MPI processes
344
345 if(is_gpu)
346 gpu::device_sync();
347
348 if(_counter.params.nthreads > 1)
349 {
350 // create synchronization barriers for the threads
351 thread_barrier fbarrier{ _counter.params.nthreads };
352 thread_barrier lbarrier{ _counter.params.nthreads };
353
354 // list of threads
355 thread_list_t threads{};
356 // create the threads
357 for(uint64_t i = 0; i < _counter.params.nthreads; ++i)
358 threads.emplace_back(_opfunc, i, &fbarrier, &lbarrier);
359
360 /*
361 uint64_t n = _counter.params.working_set_min;
362 while(n <= _counter.nsize)
363 {
364 // wait until all threads have also called notify_wait() then release
365 // barrier to start
366 fbarrier.notify_wait();
367 // wait until all threads have also called notify_wait() then release
368 // barrier to finish
369 lbarrier.notify_wait();
370 n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
371 }*/
372
373 // wait for threads to finish
374 for(auto& itr : threads)
375 itr.join();
376 }
377 else
378 {
379 _opfunc(0, nullptr, nullptr);
380 }
381
382 if(is_gpu)
383 gpu::device_sync();
384
385 dmp::barrier(); // synchronize MPI processes
386
387 // code was executed
388 return true;
389}
390
391//--------------------------------------------------------------------------------------//
392///
393/// This invokes the "main" function for ERT for all the desired "FLOPs" that
394/// are unrolled in the kernel
395///
396template <size_t Nops, size_t... Nextra, typename DeviceT, typename Tp, typename CounterT,
397 typename OpsFuncT, typename StoreFuncT,
398 enable_if_t<(sizeof...(Nextra) > 0), int> = 0>
399bool
400ops_main(counter<DeviceT, Tp, CounterT>& _counter, OpsFuncT&& ops_func,
401 StoreFuncT&& store_func)
402{
403 bool ret = false;
404 // execute a single parameter
405 ret |= ops_main<Nops>(std::ref(_counter).get(), ops_func, store_func);
406 // continue the recursive loop
407 ret |= ops_main<Nextra...>(std::ref(_counter).get(), ops_func, store_func);
408 return ret;
409}
410
411//--------------------------------------------------------------------------------------//
412///
413/// This is invoked when TIMEMORY_USER_ERT_FLOPS is empty
414///
415template <size_t... Nops, typename DeviceT, typename Tp, typename CounterT,
416 typename OpsFuncT, typename StoreFuncT,
417 enable_if_t<sizeof...(Nops) == 0, int> = 0>
418bool
419ops_main(counter<DeviceT, Tp, CounterT>&, OpsFuncT&&, StoreFuncT&&)
420{
421 return false;
422}
423
424//--------------------------------------------------------------------------------------//
425
426} // namespace ert
427} // namespace tim
#define TIMEMORY_GLOBAL_FUNCTION
Definition: attributes.hpp:182
Up * get_buffer()
allocate a buffer for the ERT calculation; uses this function if device is CPU or device is GPU and ty...
Definition: counter.hpp:133
void configure(uint64_t tid)
Definition: counter.hpp:184
bool skip(size_t _Nops)
Definition: counter.hpp:288
void record(counter_type &_counter, int n, int trials, uint64_t nops, const exec_params &_itrp)
Definition: counter.hpp:196
counter_type get_counter() const
Definition: counter.hpp:190
int memory_accesses_per_element
Definition: counter.hpp:296
exec_params params
Definition: counter.hpp:294
void destroy_buffer(Tp *buffer)
Definition: counter.hpp:178
uint64_t nsize
Definition: counter.hpp:298
bool ops_main(counter< DeviceT, Tp, CounterT > &_counter, OpsFuncT &&ops_func, StoreFuncT &&store_func)
This is the "main" function for ERT.
Definition: kernels.hpp:159
void ops_kernel(Intp ntrials, Intp nsize, Tp *A, OpsFuncT &&ops_func, StoreFuncT &&store_func)
Definition: kernels.hpp:68
Definition: kokkosp.cpp:39
typename std::enable_if< B, T >::type enable_if_t
Alias template for enable_if.
Definition: types.hpp:190
auto get(const auto_bundle< Tag, Types... > &_obj)
#define TIMEMORY_ERROR_FUNCTION_MACRO
Definition: macros.hpp:229
uint64_t working_set_min
Definition: data.hpp:114
uint64_t nstreams
Definition: data.hpp:119
uint64_t nthreads
Definition: data.hpp:116
uint64_t grid_size
Definition: data.hpp:120
uint64_t shmem_size
Definition: data.hpp:122
uint64_t block_size
Definition: data.hpp:121
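For orientation, the sketch below shows roughly how a caller might drive ops_main() on the CPU. It is an illustration, not timemory's own configuration code: the functor shapes follow directly from what ops_kernel() expects (ops_func(beta, A[i], alpha) and store_func(A[i], beta)), while the counter is assumed to have been constructed elsewhere with its exec_params, bytes_per_element, and memory_accesses_per_element already set, and the unroll counts <1, 4, 16> are arbitrary examples.

// Hedged sketch of invoking tim::ert::ops_main for a CPU roofline sweep.
// The include paths and the unroll counts are assumptions for illustration;
// only the functor signatures are dictated by ops_kernel above.
#include "timemory/ert/counter.hpp"
#include "timemory/ert/kernels.hpp"

template <typename CounterT>
bool
run_cpu_roofline(tim::ert::counter<tim::device::cpu, double, CounterT>& _counter)
{
    // per-element operation, applied NUM_REP times per trial inside ops_kernel;
    // called as ops_func(beta, A[i], alpha), so 'a' is the running value
    auto fma_func = [](double& a, const double& b, const double& c) { a = a * b + c; };

    // writes the accumulated value back to the buffer: store_func(A[i], beta)
    auto store_func = [](double& a, const double& b) { a = b; };

    // each value in <1, 4, 16> instantiates ops_kernel<Nops, ...> and runs the
    // full working-set sweep; returns true if any configuration executed
    return tim::ert::ops_main<1, 4, 16>(_counter, fma_func, store_func);
}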