33#include "timemory/backends/device.hpp"
34#include "timemory/backends/dmp.hpp"
35#include "timemory/backends/hip.hpp"
36#include "timemory/components/cuda/backends.hpp"
46#include "timemory/tpls/cereal/archives.hpp"
60#include <unordered_set>
67using namespace memory;
72template <
typename DeviceT,
typename Tp,
typename Counter>
77 using lock_t = std::unique_lock<mutex_t>;
84 using ull =
unsigned long long;
102 uint64_t _align = 8 * sizeof(Tp))
114 uint64_t _align = 8 *
sizeof(Tp))
118 , configure_callback(
std::move(_func))
128 template <
typename Up = Tp,
typename Dev = DeviceT,
129 typename std::enable_if<(std::is_same<Dev, device::cpu>::value ||
130 (std::is_same<Dev, device::gpu>::value &&
131 !std::is_same<Up, gpu::fp16_t>::value)),
136 align = std::max<uint64_t>(
align, 8 *
sizeof(Up));
140 printf(
"[%s]> nsize = %llu\n", __FUNCTION__, (
ull)
nsize);
141 Up* buffer = allocate_aligned<Up, DeviceT>(
nsize,
align);
143 printf(
"[%s]> buffer = %p\n", __FUNCTION__, (
void*) buffer);
144 device::params<DeviceT> _params(0, 512, 0, 0);
145 device::launch(
nsize, _params, initialize_buffer<DeviceT, Up, uint64_t>, buffer,
154 template <
typename Up = Tp,
typename Dev = DeviceT,
155 typename std::enable_if<(std::is_same<Up, gpu::fp16_t>::value &&
156 std::is_same<Dev, device::gpu>::value),
161 align = std::max<uint64_t>(
align, 8 *
sizeof(Up));
165 printf(
"[%s]> nsize = %llu\n", __FUNCTION__, (
ull)
nsize);
166 Up* buffer = allocate_aligned<Up, DeviceT>(
nsize,
align);
168 printf(
"[%s]> buffer = %p\n", __FUNCTION__, (
void*) buffer);
169 device::params<DeviceT> _params(0, 512, 0, 0);
170 device::launch(
nsize, _params, initialize_buffer<DeviceT, Up, uint32_t>, buffer,
// Invokes the stored configure_callback, forwarding `tid` and a reference to
// this counter object (`*this`) so the callback can set up this instance.
// NOTE(review): `tid` is presumably a thread identifier — confirm against callers.
184 void configure(uint64_t tid) { configure_callback(tid, *
this); }
202 uint64_t total_ops = trials * working_set_size * nops;
204 std::stringstream ss;
206 if(
label.length() == 0)
218 auto _label = tim::demangle<Tp>();
219 data_type _data(ss.str(), working_set, trials, total_bytes, total_ops, nops,
220 _counter, DeviceT::name(), _label, _itrp);
222#if !defined(TIMEMORY_WINDOWS)
228 static std::mutex _mutex;
237 template <
typename FuncT>
240 configure_callback = std::forward<FuncT>(_f);
246 template <
typename Archive>
250 data = std::make_shared<ert_data_t>();
251 ar(cereal::make_nvp(
"params",
params), cereal::make_nvp(
"data", *
data));
259 std::stringstream ss;
263 <<
"alignment = " << obj.
align <<
", "
264 <<
"nsize = " << obj.
nsize <<
", "
265 <<
"label = " << obj.
label <<
", "
266 <<
"data entries = " << ((obj.
data.get()) ? obj.
data->size() : 0);
284 for(
const auto& itr :
_args)
310 void compute_internal()
312 if(device::is_cpu<DeviceT>::value)
323template <
typename Counter>
327 using exec_data_vec_t = std::vector<exec_data<Counter>>;
329 int dmp_rank = dmp::rank();
330 int dmp_size = dmp::size();
332 exec_data_vec_t results(dmp_size);
333 if(dmp::is_initialized())
337#if defined(TIMEMORY_USE_MPI) || defined(TIMEMORY_USE_UPCXX)
338 auto space = cereal::JSONOutputArchive::Options::IndentChar::space;
344 std::stringstream ss;
346 cereal::JSONOutputArchive::Options opt(16, space, 0);
347 cereal::JSONOutputArchive oa(ss, opt);
348 oa(cereal::make_nvp(
"data", src));
356 auto recv_serialize = [&](
const std::string& src) {
358 std::stringstream ss;
361 cereal::JSONInputArchive ia(ss);
362 ia(cereal::make_nvp(
"data", ret));
368#if defined(TIMEMORY_USE_MPI)
370 auto str_ret = send_serialize(obj);
374 for(
int i = 1; i < dmp_size; ++i)
377 mpi::recv(str, i, 0, mpi::comm_world_v);
378 results[i] = recv_serialize(str);
380 results[dmp_rank] = std::move(obj);
384 mpi::send(str_ret, 0, 0, mpi::comm_world_v);
387#elif defined(TIMEMORY_USE_UPCXX)
392 auto remote_serialize = [=]() {
return send_serialize(obj); };
399 for(
int i = 1; i < dmp_size; ++i)
401 upcxx::future<std::string> fut = upcxx::rpc(i, remote_serialize);
405 results[i] = recv_serialize(fut.result());
407 results[dmp_rank] = std::move(obj);
416 results.at(0) = std::move(obj);
422 printf(
"[%i]> Outputting '%s'...\n", dmp_rank, fname.c_str());
429 oa->setNextName(
"timemory");
431 oa->setNextName(
"ranks");
434 for(uint64_t i = 0; i < results.size(); ++i)
437 (*oa)(cereal::make_nvp(
"rank", i),
438 cereal::make_nvp(
"roofline", results.at(i)));
typename ert_data_t::value_type data_type
counter(const counter &)=default
std::recursive_mutex mutex_t
counter(counter &&) noexcept=default
void add_skip_ops(size_t _Nops)
void add_skip_ops(std::initializer_list< size_t > _args)
Up * get_buffer()
allocate a buffer for the ERT calculation; uses this function if device is CPU or device is GPU and ty...
void configure(uint64_t tid)
std::shared_ptr< ert_data_t > data_ptr_t
void record(counter_type &_counter, int n, int trials, uint64_t nops, const exec_params &_itrp)
void set_callback(FuncT &&_f)
const data_ptr_t & get_data() const
std::function< void(uint64_t, this_type &)> callback_type
counter_type get_counter() const
void serialize(Archive &ar, const unsigned int)
int memory_accesses_per_element
counter(const exec_params &_params, callback_type _func, data_ptr_t _exec_data, uint64_t _align=8 *sizeof(Tp))
std::unordered_set< size_t > skip_ops_t
std::unique_lock< mutex_t > lock_t
friend std::ostream & operator<<(std::ostream &os, const counter &obj)
void destroy_buffer(Tp *buffer)
std::tuple< std::string, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, Tp, std::string, std::string, exec_params > value_type
void serialize(std::string fname, exec_data< Counter > &obj)
bool open(std::ofstream &_ofs, std::string _fpath, Args &&... _args)
std::array< char *, 4 > _args
tim::mpl::apply< std::string > string
const std::string std::ostream * os
auto get(const auto_bundle< Tag, Types... > &_obj)
Provides a static get() function which returns a shared pointer to an instance of the given archive fo...
static string_t compose_output_filename(string_t _tag, string_t _ext, bool _use_suffix=use_output_suffix(), int32_t _suffix=default_process_suffix(), bool _make_dir=false, std::string _explicit={})