timemory  3.2.1
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
memory_allocations.hpp
Go to the documentation of this file.
1 // MIT License
2 //
3 // Copyright (c) 2020, The Regents of the University of California,
4 // through Lawrence Berkeley National Laboratory (subject to receipt of any
5 // required approvals from the U.S. Dept. of Energy). All rights reserved.
6 //
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
8 // of this software and associated documentation files (the "Software"), to deal
9 // in the Software without restriction, including without limitation the rights
10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 // copies of the Software, and to permit persons to whom the Software is
12 // furnished to do so, subject to the following conditions:
13 //
14 // The above copyright notice and this permission notice shall be included in all
15 // copies or substantial portions of the Software.
16 //
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 // SOFTWARE.
24 
25 #pragma once
26 
27 #include "timemory/api.hpp"
32 #include "timemory/mpl/policy.hpp"
33 #include "timemory/mpl/types.hpp"
35 #include "timemory/types.hpp"
38 
39 #include <memory>
40 #include <string>
41 
42 #if defined(__GNUC__) && (__GNUC__ >= 6)
43 # pragma GCC diagnostic push
44 # pragma GCC diagnostic ignored "-Wignored-attributes"
45 #endif
46 
47 namespace tim
48 {
49 namespace component
50 {
51 //
53 : base<malloc_gotcha, double>
55 {
56 #if defined(TIMEMORY_USE_CUDA)
57  static constexpr size_t data_size = 9;
58 #else
59  static constexpr size_t data_size = 3;
60 #endif
61 
62  using value_type = double;
66  using string_hash = std::hash<std::string>;
67 
68  // required static functions
69  static std::string label() { return "malloc_gotcha"; }
71  {
72 #if defined(TIMEMORY_USE_CUDA)
73  return "GOTCHA wrapper for memory allocation functions: malloc, calloc, free, "
74  "cudaMalloc, cudaMallocHost, cudaMallocManaged, cudaHostAlloc, cudaFree, "
75  "cudaFreeHost";
76 #else
77  return "GOTCHA wrapper for memory allocation functions: malloc, calloc, free";
78 #endif
79  }
80 
81  using base_type::accum;
84  using base_type::value;
85 
86  template <typename Tp>
88 
89  template <typename Tp>
90  using gotcha_type =
92 
93  template <typename Tp>
95 
96  static void global_finalize()
97  {
98  for(auto& itr : get_cleanup_list())
99  itr();
100  get_cleanup_list().clear();
101  }
102 
103 public:
104  template <typename Tp>
105  static void configure();
106 
107  template <typename Tp>
108  static void tear_down();
109 
110 public:
111  TIMEMORY_DEFAULT_OBJECT(malloc_gotcha)
112 
113 public:
114  void start() { value = 0; }
115 
116  void stop()
117  {
118  // value should be updated via audit in-between start() and stop()
119  accum += value;
120  }
121 
122  TIMEMORY_NODISCARD double get() const { return accum / base_type::get_unit(); }
123 
124  TIMEMORY_NODISCARD double get_display() const { return get(); }
125 
126  void set_prefix();
127 
128  /// nbytes is passed to malloc
129  void audit(audit::incoming, size_t nbytes)
130  {
131  DEBUG_PRINT_HERE("%s(%i)", m_prefix, (int) nbytes);
132  // malloc
133  value = (nbytes);
134  DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
135  }
136 
137  /// nmemb and size is passed to calloc
138  void audit(audit::incoming, size_t nmemb, size_t size)
139  {
140  DEBUG_PRINT_HERE("%s(%i, %i)", m_prefix, (int) nmemb, (int) size);
141  // calloc
142  value = (nmemb * size);
143  DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
144  }
145 
146  /// void* is returned from malloc and calloc
147  void audit(audit::outgoing, void* ptr)
148  {
149  DEBUG_PRINT_HERE("%s(%p)", m_prefix, ptr);
150  if(ptr)
151  {
152  get_allocation_map()[ptr] = value;
153  DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
154  }
155  }
156 
157  /// void* is passed to free
158  void audit(audit::incoming, void* ptr)
159  {
160  DEBUG_PRINT_HERE("%s(%p)", m_prefix, ptr);
161  auto itr = get_allocation_map().find(ptr);
162  if(itr != get_allocation_map().end())
163  {
164  value = itr->second;
165  DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
166  get_allocation_map().erase(itr);
167  }
168  else
169  {
170  if(settings::verbose() > 1 || settings::debug())
171  {
172  printf("[%s]> free of unknown pointer size: %p\n",
173  this_type::get_label().c_str(), ptr);
174  }
175  }
176  }
177 
178  //----------------------------------------------------------------------------------//
179 
180 #if defined(TIMEMORY_USE_CUDA)
181 
182  //----------------------------------------------------------------------------------//
183  // cudaMalloc, cudaMallocHost
184  void audit(audit::incoming, void** devPtr, size_t size)
185  {
186  DEBUG_PRINT_HERE("%s(void**, %lu)", m_prefix, (unsigned long) size);
187  // malloc
188  value = (size);
189  m_last_addr = devPtr;
190  DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
191  }
192 
193  //----------------------------------------------------------------------------------//
194  // cudaHostAlloc / cudaMallocManaged
195  void audit(audit::incoming, void** hostPtr, size_t size, unsigned int flags)
196  {
197  DEBUG_PRINT_HERE("%s(void**, %lu)", m_prefix, (unsigned long) size);
198  value = (size);
199  m_last_addr = hostPtr;
200  DEBUG_PRINT_HERE("value: %12.8f, accum: %12.8f", value, accum);
201  consume_parameters(flags);
202  }
203 
204  //----------------------------------------------------------------------------------//
205  // cudaMalloc and cudaHostAlloc
206  void audit(audit::outgoing, cuda::error_t err)
207  {
208  if(m_last_addr)
209  {
210  void* ptr = (void*) ((char**) (m_last_addr)[0]);
211  get_allocation_map()[ptr] = value;
212  if(err != cuda::success_v && (settings::debug() || settings::verbose() > 1))
213  {
214  PRINT_HERE("%s did not return cudaSuccess, values may be corrupted",
215  m_prefix);
216  }
217  }
218  }
219 
220 #endif
221 
222  //----------------------------------------------------------------------------------//
223 
224  void set_prefix(const char* _prefix) { m_prefix = _prefix; }
225 
226  //----------------------------------------------------------------------------------//
227 
229  {
230  value += rhs.value;
231  accum += rhs.accum;
232  return *this;
233  }
234 
235  //----------------------------------------------------------------------------------//
236 
238  {
239  value -= rhs.value;
240  accum -= rhs.accum;
241  return *this;
242  }
243 
244 private:
245  using alloc_map_t = std::unordered_map<void*, size_t>;
246  using clean_list_t = std::vector<std::function<void()>>;
247 
248  static clean_list_t& get_cleanup_list()
249  {
250  static clean_list_t _instance{};
251  return _instance;
252  }
253 
254  static alloc_map_t& get_allocation_map()
255  {
256  static thread_local alloc_map_t _instance{};
257  return _instance;
258  }
259 
260 private:
261  const char* m_prefix = nullptr;
262 #if defined(TIMEMORY_USE_CUDA)
263  void** m_last_addr = nullptr;
264 #endif
265 };
266 //
267 //--------------------------------------------------------------------------------------//
268 //
269 #if defined(TIMEMORY_USE_GOTCHA)
270 //
271 template <typename Tp>
272 inline void
274 {
275  // static_assert(!std::is_same<Type, malloc_gotcha>::value,
276  // "Error! Cannot configure with self as the type!");
277 
278  using tuple_t = push_back_t<Tp, this_type>;
279  using local_gotcha_type = gotcha<data_size, tuple_t, type_list<this_type>>;
280 
281  local_gotcha_type::get_default_ready() = false;
282  local_gotcha_type::get_initializer() = []() {
283  local_gotcha_type::template configure<0, void*, size_t>("malloc");
284  local_gotcha_type::template configure<1, void*, size_t, size_t>("calloc");
285  local_gotcha_type::template configure<2, void, void*>("free");
286  // TIMEMORY_C_GOTCHA(local_gotcha_type, 0, malloc);
287  // TIMEMORY_C_GOTCHA(local_gotcha_type, 1, calloc);
288  // TIMEMORY_C_GOTCHA(local_gotcha_type, 2, free);
289 # if defined(TIMEMORY_USE_CUDA)
290  local_gotcha_type::template configure<3, cudaError_t, void**, size_t>(
291  "cudaMalloc");
292  local_gotcha_type::template configure<4, cudaError_t, void**, size_t>(
293  "cudaMallocHost");
294  local_gotcha_type::template configure<5, cudaError_t, void**, size_t,
295  unsigned int>("cudaMallocManaged");
296  local_gotcha_type::template configure<6, cudaError_t, void**, size_t,
297  unsigned int>("cudaHostAlloc");
298  local_gotcha_type::template configure<7, cudaError_t, void*>("cudaFree");
299  local_gotcha_type::template configure<8, cudaError_t, void*>("cudaFreeHost");
300 # endif
301  };
302 
303  get_cleanup_list().emplace_back([]() { malloc_gotcha::tear_down<Tp>(); });
304 }
305 //
306 template <typename Tp>
307 inline void
309 {
310  // static_assert(!std::is_same<Type, malloc_gotcha>::value,
311  // "Error! Cannot configure with self as the type!");
312 
313  using tuple_t = push_back_t<Tp, this_type>;
314  using local_gotcha_type = gotcha<data_size, tuple_t, type_list<this_type>>;
315 
316  local_gotcha_type::get_default_ready() = false;
317  local_gotcha_type::get_initializer() = []() {};
318  local_gotcha_type::disable();
319 }
320 //
321 #endif
322 //
323 /// \struct tim::component::memory_allocations
324 /// \brief This component wraps malloc, calloc, free, cudaMalloc, cudaFree via
325 /// GOTCHA and tracks the number of bytes requested/freed in each call.
326 /// This component is useful for detecting the locations where memory re-use
327 /// would provide a performance benefit.
328 ///
330 : base<memory_allocations, void>
332 , private policy::instance_tracker<memory_allocations, true>
333 {
334  using value_type = void;
338 
341  using data_pointer_t = std::unique_ptr<malloc_bundle_t>;
342 
343  static std::string label() { return "memory_allocations"; }
345  {
346  return "Number of bytes allocated/freed instead of peak/current memory usage: "
347  "free(malloc(10)) + free(malloc(10)) would use 10 bytes but this would "
348  "report 20 bytes";
349  }
350 
351  static void global_init() { malloc_gotcha::configure<component_tuple_t<>>(); }
352  static void global_finalize() { malloc_gotcha::tear_down<component_tuple_t<>>(); }
353 
354  void start()
355  {
356  auto _cnt = tracker_type::start();
357  if(_cnt.first == 0 && _cnt.second == 0 && !get_data())
358  {
359  get_data() = std::make_unique<malloc_bundle_t>();
360  get_data()->start();
361  }
362  }
363 
364  void stop()
365  {
366  auto _cnt = tracker_type::stop();
367  if(_cnt.first == 0 && _cnt.second == 0 && get_data())
368  {
369  get_data()->stop();
370  get_data().reset(nullptr);
371  }
372  }
373 
374 private:
375  static data_pointer_t& get_data()
376  {
377  static auto _instance = data_pointer_t{};
378  return _instance;
379  }
380 };
381 //
382 } // namespace component
383 } // namespace tim
384 
385 #if defined(__GNUC__) && (__GNUC__ >= 6)
386 # pragma GCC diagnostic pop
387 #endif
Implementation of the gotcha component(s)
Definition for various functions for construct in operations.
void stop(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:368
void start(TupleT< Tp... > &obj, Args &&... args)
Definition: functional.cpp:298
Inherit from this policy to add reference counting support. Useful if you want to turn a global setti...
Definition: types.hpp:367
Definition: kokkosp.cpp:38
char const std::string & _prefix
Definition: definition.hpp:59
convert_t< mpl::available_t< concat< T... > >, component_tuple<> > component_tuple_t
Definition: available.hpp:340
void consume_parameters(ArgsT &&...) TIMEMORY_HIDDEN
Definition: types.hpp:285
typename mpl::push_back< Tuple, T >::type push_back_t
Definition: types.hpp:1048
tim::mpl::apply< std::string > string
Definition: macros.hpp:52
lightweight tuple-alternative for meta-programming logic
Definition: types.hpp:233
Used by component audit member function to designate the parameters being passed are incoming (e....
Definition: types.hpp:780
Used by component audit member function to designate the parameters being passed are outgoing (e....
Definition: types.hpp:790
static int64_t get_unit()
void set_stopped()
store that stop has been called
storage< Tp, Value > storage_type
Definition: declaration.hpp:90
void set_started()
store that start has been called
The gotcha component rewrites the global offset table such that calling the wrapped function actually...
Definition: components.hpp:178
void set_prefix(const char *_prefix)
void audit(audit::outgoing, void *ptr)
void* is returned from malloc and calloc
this_type & operator+=(const this_type &rhs)
push_back_t< Tp, gotcha_type< Tp > > component_type
void audit(audit::incoming, size_t nbytes)
nbytes is passed to malloc
void audit(audit::incoming, void *ptr)
void* is passed to free
this_type & operator-=(const this_type &rhs)
typename base_type::storage_type storage_type
std::hash< std::string > string_hash
push_back_t< Tp, this_type > gotcha_component_type
void audit(audit::incoming, size_t nmemb, size_t size)
nmemb and size is passed to calloc
static constexpr size_t data_size
This component wraps malloc, calloc, free, cudaMalloc, cudaFree via GOTCHA and tracks the number of b...
std::unique_ptr< malloc_bundle_t > data_pointer_t
typename malloc_gotcha::gotcha_type< component_tuple_t<> > malloc_gotcha_t
component_tuple_t< malloc_gotcha_t > malloc_bundle_t
#define DEBUG_PRINT_HERE(...)
Definition: macros.hpp:163
#define PRINT_HERE(...)
Definition: macros.hpp:147