timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
counter.hpp
Go to the documentation of this file.
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25/** \file timemory/ert/counter.hpp
26 * \headerfile timemory/ert/counter.hpp "timemory/ert/counter.hpp"
27 * Provides the counter (e.g. timer, hardware counters) used when executing ERT
28 *
29 */
30
31#pragma once
32
33#include "timemory/backends/device.hpp"
34#include "timemory/backends/dmp.hpp"
35#include "timemory/backends/hip.hpp"
36#include "timemory/components/cuda/backends.hpp"
41#include "timemory/ert/data.hpp"
46#include "timemory/tpls/cereal/archives.hpp"
49
50#include <array>
51#include <atomic>
52#include <cstddef>
53#include <cstdint>
54#include <functional>
55#include <numeric>
56#include <ostream>
57#include <sstream>
58#include <stdexcept>
59#include <string>
60#include <unordered_set>
61#include <vector>
62
63namespace tim
64{
65namespace ert
66{
67using namespace memory;
68
69//--------------------------------------------------------------------------------------//
70// measure floating-point or integer operations
71//
72template <typename DeviceT, typename Tp, typename Counter>
74{
75public:
// Type aliases used by the counter class.
// NOTE(review): listing lines 79, 80 and 82 (further aliases such as this_type,
// ert_data_t and data_type, which are referenced below) are not shown in this listing.
//
// mutex/lock types: a recursive mutex wrapped by std::unique_lock
76 using mutex_t = std::recursive_mutex;
77 using lock_t = std::unique_lock<mutex_t>;
// the measurement type supplied as the Counter template parameter
78 using counter_type = Counter;
// signature of the configuration hook invoked by configure(): (uint64_t, this counter)
81 using callback_type = std::function<void(uint64_t, this_type&)>;
// shared-ownership handle to the collected ERT data
83 using data_ptr_t = std::shared_ptr<ert_data_t>;
// cast target for the "%llu" printf arguments used when printing nsize
84 using ull = unsigned long long;
// set of operation counts registered through add_skip_ops()
85 using skip_ops_t = std::unordered_set<size_t>;
86
87public:
88 //----------------------------------------------------------------------------------//
89 // default construction
90 //
// All special member functions are compiler-generated. Because `data` is a
// std::shared_ptr, a copy of a counter refers to the same ert_data_t object as
// the counter it was copied from.
91 counter() = default;
92 ~counter() = default;
93 counter(const counter&) = default;
94 counter(counter&&) noexcept = default;
95 counter& operator=(const counter&) = default;
96 counter& operator=(counter&&) noexcept = default;
97
98 //----------------------------------------------------------------------------------//
99 // standard creation
100 //
// Construct a counter from execution parameters and a shared data container.
//   _params    : execution parameters, copied into `params`
//   _exec_data : shared data pointer, moved into `data`
//   _align     : requested alignment, defaults to 8 * sizeof(Tp)
// The configure callback keeps its default (a no-op lambda). compute_internal()
// is called afterwards to derive `nsize` from the stored values.
101 explicit counter(const exec_params& _params, data_ptr_t _exec_data,
102 uint64_t _align = 8 * sizeof(Tp))
103 : params(_params)
104 , align(_align)
105 , data(std::move(_exec_data))
106 {
107 compute_internal();
108 }
109
110 //----------------------------------------------------------------------------------//
111 // overload how to create the counter with a callback function
112 //
// Construct a counter with a user-supplied configure callback.
//   _params    : execution parameters, copied into `params`
//   _func      : callback later invoked by configure(); moved into configure_callback
//   _exec_data : shared data pointer, moved into `data`
//   _align     : requested alignment, defaults to 8 * sizeof(Tp)
// compute_internal() is called afterwards to derive `nsize` from the stored values.
// NOTE(review): an empty std::function passed as _func is stored as-is; calling
// configure() would then throw std::bad_function_call.
113 counter(const exec_params& _params, callback_type _func, data_ptr_t _exec_data,
114 uint64_t _align = 8 * sizeof(Tp))
115 : params(_params)
116 , align(_align)
117 , data(std::move(_exec_data))
118 , configure_callback(std::move(_func))
119 {
120 compute_internal();
121 }
122
123public:
124 //----------------------------------------------------------------------------------//
125 /// allocate a buffer for the ERT calculation
126 /// uses this function if device is CPU or device is GPU and type is not half2
127 ///
// Overload enabled when the device is the CPU, or the device is the GPU and the
// element type is not gpu::fp16_t.
// NOTE(review): listing line 133 (the signature) is not shown here; the
// documentation index gives it as `Up* get_buffer()`.
// Returns a pointer to `nsize` elements obtained from allocate_aligned; the class
// provides destroy_buffer() which calls free_aligned — presumably the matching
// release call, confirm against callers.
128 template <typename Up = Tp, typename Dev = DeviceT,
129 typename std::enable_if<(std::is_same<Dev, device::cpu>::value ||
130 (std::is_same<Dev, device::gpu>::value &&
131 !std::is_same<Up, gpu::fp16_t>::value)),
132 int>::type = 0>
134 {
135 // raise the alignment to at least 8 * sizeof(Up) and recompute nsize
136 align = std::max<uint64_t>(align, 8 * sizeof(Up));
137 compute_internal();
138
139 if(settings::debug())
140 printf("[%s]> nsize = %llu\n", __FUNCTION__, (ull) nsize);
// allocate nsize elements with the (possibly raised) alignment
141 Up* buffer = allocate_aligned<Up, DeviceT>(nsize, align);
142 if(settings::debug())
143 printf("[%s]> buffer = %p\n", __FUNCTION__, (void*) buffer);
// NOTE(review): the meaning of the (0, 512, 0, 0) arguments is defined by
// device::params, which is not visible here — confirm before changing.
144 device::params<DeviceT> _params(0, 512, 0, 0);
// run initialize_buffer over the buffer with the value Up{ 1 } (uint64_t index type)
145 device::launch(nsize, _params, initialize_buffer<DeviceT, Up, uint64_t>, buffer,
146 Up{ 1 }, nsize);
147 return buffer;
148 }
149
150 //----------------------------------------------------------------------------------//
151 /// allocate a buffer for the ERT calculation
152 /// uses this function if device is GPU and type is half2
153 ///
// Overload enabled only when the element type is gpu::fp16_t and the device is the GPU.
// NOTE(review): listing line 158 (the signature) is not shown here; by symmetry
// with the other overload it is presumably `Up* get_buffer()` — confirm.
// Differs from the other overload in the index type passed to initialize_buffer
// (uint32_t) and in the initial value (Up{ 1, 1 }).
154 template <typename Up = Tp, typename Dev = DeviceT,
155 typename std::enable_if<(std::is_same<Up, gpu::fp16_t>::value &&
156 std::is_same<Dev, device::gpu>::value),
157 int>::type = 0>
159 {
160 // raise the alignment to at least 8 * sizeof(Up) and recompute nsize
161 align = std::max<uint64_t>(align, 8 * sizeof(Up));
162 compute_internal();
163
164 if(settings::debug())
165 printf("[%s]> nsize = %llu\n", __FUNCTION__, (ull) nsize);
// allocate nsize elements with the (possibly raised) alignment
166 Up* buffer = allocate_aligned<Up, DeviceT>(nsize, align);
167 if(settings::debug())
168 printf("[%s]> buffer = %p\n", __FUNCTION__, (void*) buffer);
// NOTE(review): the meaning of the (0, 512, 0, 0) arguments is defined by
// device::params, which is not visible here — confirm before changing.
169 device::params<DeviceT> _params(0, 512, 0, 0);
// run initialize_buffer over the buffer with the two-component value Up{ 1, 1 }
170 device::launch(nsize, _params, initialize_buffer<DeviceT, Up, uint32_t>, buffer,
171 Up{ 1, 1 }, nsize);
172 return buffer;
173 }
174
175 //----------------------------------------------------------------------------------//
176 // destroy associated buffer
177 //
// Forwards `buffer` to free_aligned<Tp, DeviceT>. Note that the pointer type is
// the class-level Tp, not the Up template parameter accepted by get_buffer().
178 void destroy_buffer(Tp* buffer) { free_aligned<Tp, DeviceT>(buffer); }
179
180 //----------------------------------------------------------------------------------//
181 // execute the callback that may customize the thread before returning the object
182 // that provides the measurement
183 //
// Invokes the stored configure_callback with `tid` and a reference to this counter.
// The default callback is a no-op lambda; if an empty std::function was installed,
// this call throws std::bad_function_call.
184 void configure(uint64_t tid) { configure_callback(tid, *this); }
185
186 //----------------------------------------------------------------------------------//
187 // execute the callback that may customize the thread before returning the object
188 // that provides the measurement
189 //
191
192 //----------------------------------------------------------------------------------//
193 // record the data from a thread/process. Extra exec_params (_itrp) should contain
194 // the computed grid size for serialization
195 //
196 void record(counter_type& _counter, int n, int trials, uint64_t nops,
197 const exec_params& _itrp)
198 {
199 uint64_t working_set_size = n * params.nthreads * params.nproc;
200 uint64_t working_set = working_set_size * bytes_per_element;
201 uint64_t total_bytes = trials * working_set * memory_accesses_per_element;
202 uint64_t total_ops = trials * working_set_size * nops;
203
204 std::stringstream ss;
205 ss << label;
206 if(label.length() == 0)
207 {
208 if(nops > 1)
209 {
210 ss << "vector_op";
211 }
212 else
213 {
214 ss << "scalar_op";
215 }
216 }
217
218 auto _label = tim::demangle<Tp>();
219 data_type _data(ss.str(), working_set, trials, total_bytes, total_ops, nops,
220 _counter, DeviceT::name(), _label, _itrp);
221
222#if !defined(TIMEMORY_WINDOWS)
223 // using namespace tim::stl::ostream;
224 // if(settings::verbose() > 1 || settings::debug())
225 // std::cout << "[RECORD]> " << _data << std::endl;
226#endif
227
228 static std::mutex _mutex;
229 // std::unique_lock<std::mutex> _lock(_mutex);
230 _mutex.lock();
231 *data += _data;
232 _mutex.unlock();
233 }
234
235 //----------------------------------------------------------------------------------//
236 //
// Replace the configure callback with `_f`, perfectly forwarded into the stored
// std::function. `_f` must be assignable to callback_type, i.e. callable as
// void(uint64_t, this_type&).
237 template <typename FuncT>
238 void set_callback(FuncT&& _f)
239 {
240 configure_callback = std::forward<FuncT>(_f);
241 }
242
243 //----------------------------------------------------------------------------------//
244 // provide ability to write to JSON/XML
245 //
246 template <typename Archive>
247 void serialize(Archive& ar, const unsigned int)
248 {
249 if(!data.get()) // for input
250 data = std::make_shared<ert_data_t>();
251 ar(cereal::make_nvp("params", params), cereal::make_nvp("data", *data));
252 }
253
254 //----------------------------------------------------------------------------------//
255 // write to stream
256 //
257 friend std::ostream& operator<<(std::ostream& os, const counter& obj)
258 {
259 std::stringstream ss;
260 ss << obj.params << ", "
261 << "bytes_per_element = " << obj.bytes_per_element << ", "
262 << "memory_accesses_per_element = " << obj.memory_accesses_per_element << ", "
263 << "alignment = " << obj.align << ", "
264 << "nsize = " << obj.nsize << ", "
265 << "label = " << obj.label << ", "
266 << "data entries = " << ((obj.data.get()) ? obj.data->size() : 0);
267 os << ss.str();
268 return os;
269 }
270
271 //----------------------------------------------------------------------------------//
272 // Get the data pointer
273 //
// Accessors for the shared data pointer. The non-const overload returns a mutable
// reference to the std::shared_ptr itself, so callers can re-seat it.
274 data_ptr_t& get_data() { return data; }
275 const data_ptr_t& get_data() const { return data; }
276
277 //----------------------------------------------------------------------------------//
278 // Skip the flop counts
279 //
280 void add_skip_ops(size_t _Nops) { skip_ops.insert(_Nops); }
281
282 void add_skip_ops(std::initializer_list<size_t> _args)
283 {
284 for(const auto& itr : _args)
285 skip_ops.insert(itr);
286 }
287
288 bool skip(size_t _Nops) { return (skip_ops.count(_Nops) > 0); }
289
290public:
291 //----------------------------------------------------------------------------------//
292 // public data members, modify as needed
293 //
// NOTE(review): listing lines 294, 296 and 301 are not shown here; the
// documentation index lists them as `exec_params params`,
// `int memory_accesses_per_element` and `skip_ops_t skip_ops`.
//
// multiplier applied to the working-set size in record()
295 int bytes_per_element = 0; // NOLINT
// alignment used by get_buffer() and compute_internal(); the constructors
// overwrite this default with their _align argument (default 8 * sizeof(Tp))
297 uint64_t align = sizeof(Tp); // NOLINT
// number of elements allocated by get_buffer(); set by compute_internal()
298 uint64_t nsize = 0; // NOLINT
// shared container that record() appends to
299 data_ptr_t data = std::make_shared<ert_data_t>(); // NOLINT
// label stored with each recorded entry; record() substitutes
// "vector_op"/"scalar_op" when it is empty
300 std::string label = ""; // NOLINT
302
303private:
// hook invoked by configure(); a no-op unless replaced via the callback
// constructor or set_callback()
304 callback_type configure_callback = [](uint64_t, this_type&) {};
305
306private:
307 //----------------------------------------------------------------------------------//
308 // compute the data size
309 //
// Recompute `nsize` (and force a single stream on CPU devices).
// NOTE(review): listing line 314 is not shown here; it presumably assigns the
// initial value of nsize that the statements below refine — confirm against
// the original source.
310 void compute_internal()
311 {
// CPU devices always use exactly one stream
312 if(device::is_cpu<DeviceT>::value)
313 params.nstreams = 1;
// clear the low bits selected by (align - 1); this rounds nsize down to a
// multiple of align only when align is a power of two
315 nsize = nsize & (~(align - 1));
// divide by the element size (treats the value so far as a byte count)
316 nsize = nsize / sizeof(Tp);
// never report fewer than one element
317 nsize = std::max<uint64_t>(nsize, 1);
318 }
319};
320
321//--------------------------------------------------------------------------------------//
322
// Collect the results of every rank and have rank 0 write them to a JSON file.
// NOTE(review): listing line 325 (the signature) is not shown here; the
// documentation index gives it as
// `serialize(std::string fname, exec_data<Counter>& obj)`.
//   fname : output tag, expanded via settings::compose_output_filename(fname, ".json")
//   obj   : this rank's results; it is moved from when stored into `results`
// With MPI, non-zero ranks send their JSON-serialized data to rank 0. With UPC++,
// rank 0 requests the serialized data from each rank through an RPC. When the
// distributed backend is not initialized, `results` holds only `obj`.
323template <typename Counter>
324inline void
326{
327 using exec_data_vec_t = std::vector<exec_data<Counter>>;
328
329 int dmp_rank = dmp::rank();
330 int dmp_size = dmp::size();
331
// one slot per rank; only filled in on rank 0
332 exec_data_vec_t results(dmp_size);
333 if(dmp::is_initialized())
334 {
335 dmp::barrier();
336
337#if defined(TIMEMORY_USE_MPI) || defined(TIMEMORY_USE_UPCXX)
338 auto space = cereal::JSONOutputArchive::Options::IndentChar::space;
339
340 //------------------------------------------------------------------------------//
341 // Used to convert a result to a serialization
342 //
343 auto send_serialize = [&](const exec_data<Counter>& src) {
344 std::stringstream ss;
345 {
// inner scope: the archive is destroyed (finishing the JSON) before ss.str()
346 cereal::JSONOutputArchive::Options opt(16, space, 0);
347 cereal::JSONOutputArchive oa(ss, opt);
348 oa(cereal::make_nvp("data", src));
349 }
350 return ss.str();
351 };
352
353 //------------------------------------------------------------------------------//
354 // Used to convert the serialization to a result
355 //
// NOTE(review): listing line 357 is not shown here; it presumably declares the
// local `ret` that is filled in and returned below — confirm.
356 auto recv_serialize = [&](const std::string& src) {
358 std::stringstream ss;
359 ss << src;
360 {
361 cereal::JSONInputArchive ia(ss);
362 ia(cereal::make_nvp("data", ret));
363 }
364 return ret;
365 };
366#endif
367
368#if defined(TIMEMORY_USE_MPI)
369
// every rank serializes its own data, although only non-zero ranks send it
370 auto str_ret = send_serialize(obj);
371
372 if(dmp_rank == 0)
373 {
// receive from the other ranks one at a time, in rank order
374 for(int i = 1; i < dmp_size; ++i)
375 {
376 std::string str;
377 mpi::recv(str, i, 0, mpi::comm_world_v);
378 results[i] = recv_serialize(str);
379 }
380 results[dmp_rank] = std::move(obj);
381 }
382 else
383 {
384 mpi::send(str_ret, 0, 0, mpi::comm_world_v);
385 }
386
387#elif defined(TIMEMORY_USE_UPCXX)
388
389 //------------------------------------------------------------------------------//
390 // Function executed on remote node
391 //
392 auto remote_serialize = [=]() { return send_serialize(obj); };
393
394 //------------------------------------------------------------------------------//
395 // Combine on master rank
396 //
397 if(dmp_rank == 0)
398 {
399 for(int i = 1; i < dmp_size; ++i)
400 {
401 upcxx::future<std::string> fut = upcxx::rpc(i, remote_serialize);
// drive UPC++ progress until the RPC result is available
402 while(!fut.ready())
403 upcxx::progress();
404 fut.wait();
405 results[i] = recv_serialize(fut.result());
406 }
407 results[dmp_rank] = std::move(obj);
408 }
409
410#endif
// NOTE(review): if the backend reports initialized but neither TIMEMORY_USE_MPI
// nor TIMEMORY_USE_UPCXX is defined, only the barrier runs and `obj` is never
// stored in `results` — confirm this combination cannot occur.
411 }
412 else
413 {
// no distributed backend: the output contains this process only
414 results.clear();
415 results.resize(1);
416 results.at(0) = std::move(obj);
417 }
418
// only rank 0 writes the output file
419 if(dmp_rank == 0)
420 {
421 fname = settings::compose_output_filename(fname, ".json");
422 printf("[%i]> Outputting '%s'...\n", dmp_rank, fname.c_str());
423 std::ofstream ofs{};
424 if(filepath::open(ofs, fname))
425 {
426 // ensure json write final block during destruction before the file is closed
427 using policy_type = policy::output_archive_t<Counter>;
428 auto oa = policy_type::get(ofs);
// layout: "timemory" -> "ranks" -> array of { "rank", "roofline" } entries
429 oa->setNextName("timemory");
430 oa->startNode();
431 oa->setNextName("ranks");
432 oa->startNode();
433 oa->makeArray();
434 for(uint64_t i = 0; i < results.size(); ++i)
435 {
436 oa->startNode();
437 (*oa)(cereal::make_nvp("rank", i),
438 cereal::make_nvp("roofline", results.at(i)));
439 oa->finishNode();
440 }
441 oa->finishNode();
442 oa->finishNode();
443 }
// terminate the file with a newline if the stream is still in a good state
444 if(ofs)
445 ofs << std::endl;
446 ofs.close();
447 }
448}
449
450//--------------------------------------------------------------------------------------//
451
452} // namespace ert
453} // namespace tim
typename ert_data_t::value_type data_type
Definition: counter.hpp:82
counter(const counter &)=default
std::recursive_mutex mutex_t
Definition: counter.hpp:76
data_ptr_t & get_data()
Definition: counter.hpp:274
~counter()=default
counter(counter &&) noexcept=default
unsigned long long ull
Definition: counter.hpp:84
void add_skip_ops(size_t _Nops)
Definition: counter.hpp:280
skip_ops_t skip_ops
Definition: counter.hpp:301
void add_skip_ops(std::initializer_list< size_t > _args)
Definition: counter.hpp:282
Up * get_buffer()
allocate a buffer for the ERT calculation uses this function if device is CPU or device is GPU and ty...
Definition: counter.hpp:133
void configure(uint64_t tid)
Definition: counter.hpp:184
std::string label
Definition: counter.hpp:300
data_ptr_t data
Definition: counter.hpp:299
bool skip(size_t _Nops)
Definition: counter.hpp:288
std::shared_ptr< ert_data_t > data_ptr_t
Definition: counter.hpp:83
void record(counter_type &_counter, int n, int trials, uint64_t nops, const exec_params &_itrp)
Definition: counter.hpp:196
Counter counter_type
Definition: counter.hpp:78
void set_callback(FuncT &&_f)
Definition: counter.hpp:238
const data_ptr_t & get_data() const
Definition: counter.hpp:275
std::function< void(uint64_t, this_type &)> callback_type
Definition: counter.hpp:81
counter_type get_counter() const
Definition: counter.hpp:190
uint64_t align
Definition: counter.hpp:297
void serialize(Archive &ar, const unsigned int)
Definition: counter.hpp:247
int memory_accesses_per_element
Definition: counter.hpp:296
exec_params params
Definition: counter.hpp:294
counter(const exec_params &_params, callback_type _func, data_ptr_t _exec_data, uint64_t _align=8 *sizeof(Tp))
Definition: counter.hpp:113
std::unordered_set< size_t > skip_ops_t
Definition: counter.hpp:85
std::unique_lock< mutex_t > lock_t
Definition: counter.hpp:77
friend std::ostream & operator<<(std::ostream &os, const counter &obj)
Definition: counter.hpp:257
void destroy_buffer(Tp *buffer)
Definition: counter.hpp:178
uint64_t nsize
Definition: counter.hpp:298
std::tuple< std::string, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, Tp, std::string, std::string, exec_params > value_type
Definition: data.hpp:133
STL namespace.
void serialize(std::string fname, exec_data< Counter > &obj)
Definition: counter.hpp:325
bool open(std::ofstream &_ofs, std::string _fpath, Args &&... _args)
Definition: filepath.hpp:207
Definition: kokkosp.cpp:39
std::array< char *, 4 > _args
tim::mpl::apply< std::string > string
Definition: macros.hpp:53
const std::string std::ostream * os
auto get(const auto_bundle< Tag, Types... > &_obj)
uint64_t nstreams
Definition: data.hpp:119
uint64_t memory_max
Definition: data.hpp:115
uint64_t nthreads
Definition: data.hpp:116
Provides a static get() function which return a shared pointer to an instance of the given archive fo...
Definition: policy.hpp:136
static string_t compose_output_filename(string_t _tag, string_t _ext, bool _use_suffix=use_output_suffix(), int32_t _suffix=default_process_suffix(), bool _make_dir=false, std::string _explicit={})
Definition: settings.cpp:322