33#include "timemory/backends/device.hpp"
34#include "timemory/backends/dmp.hpp"
35#include "timemory/components/cuda/backends.hpp"
42#include "timemory/tpls/cereal/cereal.hpp"
61using namespace std::placeholders;
69 uint64_t mem_max = 8 * cache_size::get_max(),
70 uint64_t _nthread = 1, uint64_t _nstream = 1,
71 uint64_t _grid_size = 0, uint64_t _block_size = 32)
91 cereal::make_nvp(
"nthreads",
nthreads), cereal::make_nvp(
"nrank",
nrank),
92 cereal::make_nvp(
"nproc",
nproc), cereal::make_nvp(
"nstreams",
nstreams),
100 std::stringstream ss;
103 <<
"nthreads = " << obj.
nthreads <<
", "
104 <<
"nrank = " << obj.
nrank <<
", "
105 <<
"nproc = " << obj.
nproc <<
", "
106 <<
"nstreams = " << obj.
nstreams <<
", "
107 <<
"grid_size = " << obj.
grid_size <<
", "
128template <
typename Tp>
134 using labels_type = std::array<string_t, std::tuple_size<value_type>::value>;
169 m_values.resize(m_values.size() + 1);
170 m_values.back() =
entry;
182 for(
const auto& itr : rhs.m_values)
183 m_values.push_back(itr);
192 std::stringstream ss;
193 for(
const auto& itr : obj.m_values)
195 ss << std::setw(24) << std::get<0>(itr) <<
" (device: " << std::get<7>(itr)
196 <<
", dtype = " << std::get<8>(itr) <<
"): ";
197 obj.write<1>(ss, itr,
", ", 10);
198 obj.write<2>(ss, itr,
", ", 6);
199 obj.write<3>(ss, itr,
", ", 12);
200 obj.write<4>(ss, itr,
", ", 12);
201 obj.write<5>(ss, itr,
", ", 12);
202 obj.write<6>(ss, itr,
"\n", 12);
210 template <
typename Archive>
211 void save(Archive& ar,
const unsigned int)
const
213 constexpr auto sz = std::tuple_size<value_type>::value;
214 ar(cereal::make_nvp(
"entries", m_values.size()));
216 ar.setNextName(
"ert");
219 for(
const auto& itr : m_values)
230 template <
typename Archive>
231 void load(Archive& ar,
const unsigned int)
233 constexpr auto sz = std::tuple_size<value_type>::value;
235 ar(cereal::make_nvp(
"entries", _size));
236 m_values.resize(_size);
238 ar.setNextName(
"ert");
240 for(
auto& itr : m_values)
250 labels_type m_labels = { {
"label",
"working-set",
"trials",
"total-bytes",
251 "total-ops",
"ops-per-set",
"counter",
"device",
"dtype",
260 int32_t _width)
const
262 os << std::setw(10) << std::get<N>(m_labels) <<
" = " << std::setw(_width)
263 << std::get<N>(ret) << _trailing;
268 template <
typename Archive,
size_t... Idx>
269 void _save(Archive& ar,
const value_type& _tuple, index_sequence<Idx...>)
const
271 ar(cereal::make_nvp(std::get<Idx>(m_labels), std::get<Idx>(_tuple))...);
276 template <
typename Archive,
size_t... Idx>
277 void _load(Archive& ar,
value_type& _tuple, index_sequence<Idx...>)
279 ar(cereal::make_nvp(std::get<Idx>(m_labels), std::get<Idx>(_tuple))...);
289template <
typename DeviceT,
typename Tp,
typename Intp = int32_t,
290 device::enable_if_cpu_t<DeviceT> = 0>
294 auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
295 for(
auto i = range.begin(); i < range.end(); i += range.stride())
305template <
typename DeviceT,
typename Tp,
typename Intp = int32_t,
306 device::enable_if_gpu_t<DeviceT> = 0>
310 auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
311 for(
auto i = range.begin(); i < range.end(); i += range.stride())
#define TIMEMORY_GLOBAL_FUNCTION
const_iterator end() const
std::vector< value_type > value_array
void load(Archive &ar, const unsigned int)
exec_data(exec_data &&) noexcept=default
std::tuple< std::string, uint64_t, uint64_t, uint64_t, uint64_t, uint64_t, Tp, std::string, std::string, exec_params > value_type
friend std::ostream & operator<<(std::ostream &os, const exec_data &obj)
void set_labels(const labels_type &_labels)
typename value_array::iterator iterator
void save(Archive &ar, const unsigned int) const
exec_data & operator+=(const value_type &entry)
const_iterator begin() const
labels_type get_labels() const
exec_data & operator+=(const exec_data &rhs)
exec_data(const exec_data &)=delete
typename value_array::size_type size_type
typename value_array::const_iterator const_iterator
std::array< string_t, std::tuple_size< value_type >::value > labels_type
void initialize_buffer(Tp *A, const Tp &value, const Intp &nsize)
std::make_integer_sequence< size_t, Num > make_index_sequence
Alias template make_index_sequence.
tim::mpl::apply< std::string > string
const std::string std::ostream * os
exec_params(exec_params &&) noexcept=default
void serialize(Archive &ar, const unsigned int)
exec_params(const exec_params &)=default
exec_params(uint64_t _work_set=16, uint64_t mem_max=8 *cache_size::get_max(), uint64_t _nthread=1, uint64_t _nstream=1, uint64_t _grid_size=0, uint64_t _block_size=32)
friend std::ostream & operator<<(std::ostream &os, const exec_params &obj)
typename typename typename