timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
memory_allocations.hpp
Go to the documentation of this file.
1// MIT License
2//
3// Copyright (c) 2020, The Regents of the University of California,
4// through Lawrence Berkeley National Laboratory (subject to receipt of any
5// required approvals from the U.S. Dept. of Energy). All rights reserved.
6//
7// Permission is hereby granted, free of charge, to any person obtaining a copy
8// of this software and associated documentation files (the "Software"), to deal
9// in the Software without restriction, including without limitation the rights
10// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11// copies of the Software, and to permit persons to whom the Software is
12// furnished to do so, subject to the following conditions:
13//
14// The above copyright notice and this permission notice shall be included in all
15// copies or substantial portions of the Software.
16//
17// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23// SOFTWARE.
24
25#pragma once
26
27#include "timemory/api.hpp"
28#include "timemory/backends/gpu.hpp"
36#include "timemory/types.hpp"
39
40#include <memory>
41#include <string>
42
43#if defined(__GNUC__) && (__GNUC__ >= 6)
44# pragma GCC diagnostic push
45# pragma GCC diagnostic ignored "-Wignored-attributes"
46#endif
47
48namespace tim
49{
50namespace component
51{
52//
54: base<malloc_gotcha, double>
56{
// NOTE(review): this listing is a documentation extraction; hyperlinked source
// lines were dropped (e.g. the struct declaration on original line 53) --
// verify against the upstream timemory header.
// data_size = number of GOTCHA wrapper slots: 3 host functions, plus 6 GPU
// allocation functions when GPU support is enabled (see configure<Tp>() below)
57#if defined(TIMEMORY_USE_GPU)
58 static constexpr size_t data_size = 9;
59#else
60 static constexpr size_t data_size = 3;
61#endif
62
// the measurement value is a byte count stored as a double
63 using value_type = double;
// NOTE(review): original lines 64-66 were dropped here (presumably the
// this_type/base_type/storage_type aliases -- `this_type` and `base_type`
// are referenced later in this struct) -- TODO confirm against upstream
67 using string_hash = std::hash<std::string>;
68
// required static functions
/// short identifier used for storage/output labeling
static std::string label() { return "malloc_gotcha"; }
/// human-readable description of what the component wraps.
/// NOTE(review): the signature line (original line 71) was dropped by the
/// doc extraction; restored as the conventional label()/description() pair.
static std::string description()
{
#if defined(TIMEMORY_USE_CUDA)
    return "GOTCHA wrapper for memory allocation functions: malloc, calloc, free, "
           "cudaMalloc, cudaMallocHost, cudaMallocManaged, cudaHostAlloc, cudaFree, "
           "cudaFreeHost";
#elif defined(TIMEMORY_USE_HIP)
    return "GOTCHA wrapper for memory allocation functions: malloc, calloc, free, "
           "hipMalloc, hipMallocHost, hipMallocManaged, hipHostAlloc, hipFree, "
           "hipFreeHost";
#else
    return "GOTCHA wrapper for memory allocation functions: malloc, calloc, free";
#endif
}
85
// bring the base class measurement members into scope
86 using base_type::accum;
// NOTE(review): original lines 87-88 were dropped by the extraction (likely
// `using base_type::set_started;` / `using base_type::set_stopped;`, which
// appear in the member index at the bottom of the page) -- TODO confirm
89 using base_type::value;
90
// template aliases used by configure<Tp>()/tear_down<Tp>(); the alias bodies
// on original lines 93, 95-96, and 99-100 were dropped because they were
// hyperlinks. Per the member index: `gotcha_component_type = push_back_t<Tp,
// this_type>` and `component_type = push_back_t<Tp, gotcha_type<Tp>>`;
// the `gotcha_type` alias body is not shown -- TODO confirm upstream.
91 template <typename Tp>
93
94 template <typename Tp>
97
98 template <typename Tp>
100
101 static void global_finalize()
102 {
103 for(auto& itr : get_cleanup_list())
104 itr();
105 get_cleanup_list().clear();
106 }
107
108public:
// install the GOTCHA wrappers for the allocation functions (definition below,
// only when TIMEMORY_USE_GOTCHA is enabled); also registers a cleanup
// callback that invokes tear_down<Tp>() at global finalization
109 template <typename Tp>
110 static void configure();
111
// disable the wrappers installed by configure<Tp>()
112 template <typename Tp>
113 static void tear_down();
114
115public:
// default ctor/dtor/copy/move (project convenience macro)
116 TIMEMORY_DEFAULT_OBJECT(malloc_gotcha)
117
118public:
119 void start() { value = 0; }
120
121 void stop()
122 {
123 // value should be updated via audit in-between start() and stop()
124 accum += value;
125 }
126
127 TIMEMORY_NODISCARD double get() const { return accum / base_type::get_unit(); }
128
129 TIMEMORY_NODISCARD double get_display() const { return get(); }
130
132
133 /// nbytes is passed to malloc
134 void audit(audit::incoming, size_t nbytes)
135 {
136 DEBUG_PRINT_HERE("%s(%i)", m_prefix, (int) nbytes);
137 // malloc
138 value = (nbytes);
139 DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
140 }
141
142 /// nmemb and size is passed to calloc
143 void audit(audit::incoming, size_t nmemb, size_t size)
144 {
145 DEBUG_PRINT_HERE("%s(%i, %i)", m_prefix, (int) nmemb, (int) size);
146 // calloc
147 value = (nmemb * size);
148 DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
149 }
150
151 /// void* is returned from malloc and calloc
152 void audit(audit::outgoing, void* ptr)
153 {
154 DEBUG_PRINT_HERE("%s(%p)", m_prefix, ptr);
155 if(ptr)
156 {
157 get_allocation_map()[ptr] = value;
158 DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
159 }
160 }
161
162 /// void* is passed to free
// on free(): look up the pointer recorded at allocation time so that the
// freed size can be attributed; unknown pointers are reported, not counted
163 void audit(audit::incoming, void* ptr)
164 {
165 DEBUG_PRINT_HERE("%s(%p)", m_prefix, ptr);
166 auto itr = get_allocation_map().find(ptr);
167 if(itr != get_allocation_map().end())
168 {
// restore the byte count stored by audit(audit::outgoing, void*), then
// drop the map entry since the allocation is gone
169 value = itr->second;
170 DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
171 get_allocation_map().erase(itr);
172 }
173 else
174 {
// NOTE(review): original line 175 -- the condition guarding this message,
// presumably a settings::debug()/settings::verbose() check like the one
// visible in the GPU branch below -- was dropped by the doc extraction;
// TODO confirm against the upstream header
176 {
177 printf("[%s]> free of unknown pointer size: %p\n",
178 this_type::get_label().c_str(), ptr);
179 }
180 }
181 }
182
183 //----------------------------------------------------------------------------------//
184
185#if defined(TIMEMORY_USE_GPU)
186
187 //----------------------------------------------------------------------------------//
188 // cudaMalloc, cudaMallocHost
189 // hipMalloc, hipMallocHost
// GPU allocators fill an out-parameter rather than returning the pointer, so
// the address of the out-parameter is stashed here and dereferenced in the
// outgoing audit once the call has populated it
190 void audit(audit::incoming, void** devPtr, size_t size)
191 {
192 DEBUG_PRINT_HERE("%s(void**, %lu)", m_prefix, (unsigned long) size);
193 // malloc
194 value = (size);
195 m_last_addr = devPtr;
196 DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
197 }
198
199 //----------------------------------------------------------------------------------//
200 // cudaHostAlloc / cudaMallocManaged
201 // hipHostAlloc / hipMallocManaged
202 void audit(audit::incoming, void** hostPtr, size_t size, unsigned int flags)
203 {
204 DEBUG_PRINT_HERE("%s(void**, %lu)", m_prefix, (unsigned long) size);
205 value = (size);
206 m_last_addr = hostPtr;
// flags intentionally unused; consume_parameters silences the warning
207 DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
208 consume_parameters(flags);
209 }
210
211 //----------------------------------------------------------------------------------//
212 // cudaMalloc and cudaHostAlloc
213 // hipMalloc and hipHostAlloc
// outgoing audit runs after the wrapped call: read the pointer the GPU
// runtime wrote through the stashed out-parameter and record its size
214 void audit(audit::outgoing, gpu::error_t err)
215 {
216 if(m_last_addr)
217 {
// dereference the stashed void** to obtain the allocated pointer
218 void* ptr = (void*) ((char**) (m_last_addr)[0]);
219 get_allocation_map()[ptr] = value;
// the entry is recorded even on failure; warn so the user knows the
// bookkeeping may be off
220 if(err != gpu::success_v && (settings::debug() || settings::verbose() > 1))
221 {
222 PRINT_HERE("%s did not return success, values may be corrupted",
223 m_prefix);
224 }
225 }
226 }
227
228#endif
229
230 //----------------------------------------------------------------------------------//
231
232 void set_prefix(const char* _prefix) { m_prefix = _prefix; }
233
234 //----------------------------------------------------------------------------------//
235
237 {
238 value += rhs.value;
239 accum += rhs.accum;
240 return *this;
241 }
242
243 //----------------------------------------------------------------------------------//
244
246 {
247 value -= rhs.value;
248 accum -= rhs.accum;
249 return *this;
250 }
251
252private:
253 using alloc_map_t = std::unordered_map<void*, size_t>;
254 using clean_list_t = std::vector<std::function<void()>>;
255
256 static clean_list_t& get_cleanup_list()
257 {
258 static clean_list_t _instance{};
259 return _instance;
260 }
261
262 static alloc_map_t& get_allocation_map()
263 {
264 static thread_local alloc_map_t _instance{};
265 return _instance;
266 }
267
268private:
269 const char* m_prefix = nullptr;
270#if defined(TIMEMORY_USE_GPU)
271 void** m_last_addr = nullptr;
272#endif
273};
274//
275//--------------------------------------------------------------------------------------//
276//
277#if defined(TIMEMORY_USE_GOTCHA)
278//
279template <typename Tp>
280inline void
282{
283 // static_assert(!std::is_same<Type, malloc_gotcha>::value,
284 // "Error! Cannot configure with self as the type!");
285
286 using tuple_t = push_back_t<Tp, this_type>;
287 using local_gotcha_type = gotcha<data_size, tuple_t, type_list<this_type>>;
288
289 local_gotcha_type::get_default_ready() = false;
290 local_gotcha_type::get_initializer() = []() {
291 local_gotcha_type::template configure<0, void*, size_t>("malloc");
292 local_gotcha_type::template configure<1, void*, size_t, size_t>("calloc");
293 local_gotcha_type::template configure<2, void, void*>("free");
294 // TIMEMORY_C_GOTCHA(local_gotcha_type, 0, malloc);
295 // TIMEMORY_C_GOTCHA(local_gotcha_type, 1, calloc);
296 // TIMEMORY_C_GOTCHA(local_gotcha_type, 2, free);
297# if defined(TIMEMORY_USE_CUDA)
298 local_gotcha_type::template configure<3, cudaError_t, void**, size_t>(
299 "cudaMalloc");
300 local_gotcha_type::template configure<4, cudaError_t, void**, size_t>(
301 "cudaMallocHost");
302 local_gotcha_type::template configure<5, cudaError_t, void**, size_t,
303 unsigned int>("cudaMallocManaged");
304 local_gotcha_type::template configure<6, cudaError_t, void**, size_t,
305 unsigned int>("cudaHostAlloc");
306 local_gotcha_type::template configure<7, cudaError_t, void*>("cudaFree");
307 local_gotcha_type::template configure<8, cudaError_t, void*>("cudaFreeHost");
308# elif defined(TIMEMORY_USE_HIP)
309 local_gotcha_type::template configure<3, hipError_t, void**, size_t>("hipMalloc");
310 local_gotcha_type::template configure<4, hipError_t, void**, size_t>(
311 "hipMallocHost");
312 local_gotcha_type::template configure<5, hipError_t, void**, size_t,
313 unsigned int>("hipMallocManaged");
314 local_gotcha_type::template configure<6, hipError_t, void**, size_t,
315 unsigned int>("hipHostAlloc");
316 local_gotcha_type::template configure<7, hipError_t, void*>("hipFree");
317 local_gotcha_type::template configure<8, hipError_t, void*>("hipFreeHost");
318# endif
319 };
320
321 get_cleanup_list().emplace_back([]() { malloc_gotcha::tear_down<Tp>(); });
322}
323//
324template <typename Tp>
325inline void
327{
328 // static_assert(!std::is_same<Type, malloc_gotcha>::value,
329 // "Error! Cannot configure with self as the type!");
330
331 using tuple_t = push_back_t<Tp, this_type>;
332 using local_gotcha_type = gotcha<data_size, tuple_t, type_list<this_type>>;
333
334 local_gotcha_type::get_default_ready() = false;
335 local_gotcha_type::get_initializer() = []() {};
336 local_gotcha_type::disable();
337}
338//
339#endif
340//
341/// \struct tim::component::memory_allocations
342/// \brief This component wraps malloc, calloc, free, CUDA/HIP malloc/free via
343/// GOTCHA and tracks the number of bytes requested/freed in each call.
344/// This component is useful for detecting the locations where memory re-use
345/// would provide a performance benefit.
346///
348: base<memory_allocations, void>
350, private policy::instance_tracker<memory_allocations, true>
351{
352 using value_type = void;
356
359 using data_pointer_t = std::unique_ptr<malloc_bundle_t>;
360
361 static std::string label() { return "memory_allocations"; }
363 {
364 return "Number of bytes allocated/freed instead of peak/current memory usage: "
365 "free(malloc(10)) + free(malloc(10)) would use 10 bytes but this would "
366 "report 20 bytes";
367 }
368
369 static void global_init() { malloc_gotcha::configure<component_tuple_t<>>(); }
370 static void global_finalize() { malloc_gotcha::tear_down<component_tuple_t<>>(); }
371
372 void start()
373 {
374 auto _cnt = tracker_type::start();
375 if(_cnt.first == 0 && _cnt.second == 0 && !get_data())
376 {
377 get_data() = std::make_unique<malloc_bundle_t>();
378 get_data()->start();
379 }
380 }
381
382 void stop()
383 {
384 auto _cnt = tracker_type::stop();
385 if(_cnt.first == 0 && _cnt.second == 0 && get_data())
386 {
387 get_data()->stop();
388 get_data().reset(nullptr);
389 }
390 }
391
392private:
393 static data_pointer_t& get_data()
394 {
395 static auto _instance = data_pointer_t{};
396 return _instance;
397 }
398};
399//
400} // namespace component
401} // namespace tim
402
403#if defined(__GNUC__) && (__GNUC__ >= 6)
404# pragma GCC diagnostic pop
405#endif
Implementation of the gotcha component(s)
Definition for various functions for construct in operations.
return _hash_map end()
void stop(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:386
void start(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:316
Inherit from this policy to add reference counting support. Useful if you want to turn a global setti...
Definition: types.hpp:406
Definition: kokkosp.cpp:39
char const std::string & _prefix
Definition: config.cpp:55
convert_t< mpl::available_t< concat< T... > >, component_tuple<> > component_tuple_t
Definition: available.hpp:340
typename mpl::push_back< Tuple, T >::type push_back_t
Definition: types.hpp:1087
tim::mpl::apply< std::string > string
Definition: macros.hpp:53
void consume_parameters(ArgsT &&...)
Definition: types.hpp:285
lightweight tuple-alternative for meta-programming logic
Definition: types.hpp:233
Used by component audit member function to designate the parameters being passed are incoming (e....
Definition: types.hpp:780
Used by component audit member function to designate the parameters being passed are outgoing (e....
Definition: types.hpp:790
static int64_t get_unit()
void set_stopped()
store that stop has been called
storage< Tp, Value > storage_type
void set_started()
store that start has been called
A very lightweight storage class which provides nothing.
Definition: declaration.hpp:51
The gotcha component rewrites the global offset table such that calling the wrapped function actually...
Definition: components.hpp:179
void set_prefix(const char *_prefix)
void audit(audit::outgoing, void *ptr)
void* is returned from malloc and calloc
push_back_t< Tp, gotcha_type< Tp > > component_type
void audit(audit::incoming, size_t nbytes)
nbytes is passed to malloc
void audit(audit::incoming, void *ptr)
void* is passed to free
std::hash< std::string > string_hash
push_back_t< Tp, this_type > gotcha_component_type
this_type & operator-=(const this_type &rhs)
void audit(audit::incoming, size_t nmemb, size_t size)
nmemb and size is passed to calloc
this_type & operator+=(const this_type &rhs)
static constexpr size_t data_size
This component wraps malloc, calloc, free, CUDA/HIP malloc/free via GOTCHA and tracks the number of b...
std::unique_ptr< malloc_bundle_t > data_pointer_t
typename malloc_gotcha::gotcha_type< component_tuple_t<> > malloc_gotcha_t
component_tuple_t< malloc_gotcha_t > malloc_bundle_t
#define DEBUG_PRINT_HERE(...)
Definition: macros.hpp:168
#define PRINT_HERE(...)
Definition: macros.hpp:152