timemory 3.3.0
Modular C++ Toolkit for Performance Analysis and Logging. Profiling API and Tools for C, C++, CUDA, Fortran, and Python. The C++ template API is essentially a framework for creating tools: it is designed to provide a unifying interface for recording various performance measurements alongside data logging and interfaces to other tools.
tim::ert Namespace Reference

Classes

class  aligned_allocator
 
struct  callback
 for variadic expansion to set the callback
 
struct  configuration
 
class  counter
 
class  exec_data
 
struct  exec_params
 
struct  executor
 
struct  executor< device::gpu, Tp, CounterT >
 
class  thread_barrier
 

Functions

template<typename DeviceT , typename CounterT , typename Tp , typename... Types, typename DataType = exec_data<CounterT>, typename DataPtr = std::shared_ptr<DataType>, typename std::enable_if<(sizeof...(Types)==0), int >::type = 0>
std::shared_ptr< DataType > execute (std::shared_ptr< DataType > _data=std::make_shared< DataType >())
 
template<typename Counter >
void serialize (std::string fname, exec_data< Counter > &obj)
 
template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_cpu_t< DeviceT > = 0>
void initialize_buffer (Tp *A, const Tp &value, const Intp &nsize)
 
template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_gpu_t< DeviceT > = 0>
void initialize_buffer (Tp *A, Tp value, Intp nsize)
 
template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_cpu_t< DeviceT > = 0>
void ops_kernel (Intp ntrials, Intp nsize, Tp *A, OpsFuncT &&ops_func, StoreFuncT &&store_func)
 
template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_gpu_t< DeviceT > = 0, enable_if_t<!std::is_same< Tp, gpu::fp16_t >::value > = 0>
void ops_kernel (Intp ntrials, Intp nsize, Tp *A, OpsFuncT &&ops_func, StoreFuncT &&store_func)
 
template<size_t Nops, size_t... Nextra, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nextra)==0, int > = 0>
bool ops_main (counter< DeviceT, Tp, CounterT > &_counter, OpsFuncT &&ops_func, StoreFuncT &&store_func)
 This is the "main" function for ERT.
 
template<size_t... Nops, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nops)==0, int > = 0>
bool ops_main (counter< DeviceT, Tp, CounterT > &, OpsFuncT &&, StoreFuncT &&)
 This is invoked when TIMEMORY_USER_ERT_FLOPS is empty.
 

Function Documentation

◆ execute()

template<typename DeviceT , typename CounterT , typename Tp , typename... Types, typename DataType = exec_data<CounterT>, typename DataPtr = std::shared_ptr<DataType>, typename std::enable_if<(sizeof...(Types)==0), int >::type = 0>
std::shared_ptr< DataType > tim::ert::execute ( std::shared_ptr< DataType >  _data = std::make_shared<DataType>())

Definition at line 650 of file configuration.hpp.

{
    using ConfigType = configuration<DeviceT, Tp, CounterT>;
    using ExecType   = executor<DeviceT, Tp, CounterT>;

    ConfigType _config{};
    ExecType(_config, _data);

    return _data;
}

Referenced by tim::sampling::sampler< CompT< Types... >, N, SigIds... >::configure().
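As a quick orientation, the sketch below (an assumption for illustration: the umbrella header path and the wall_clock component choice are not prescribed by this page) runs the default ERT configuration on the CPU and serializes the collected data:

#include "timemory/timemory.hpp"  // assumed umbrella header

using counter_t = tim::component::wall_clock;

int main()
{
    // execute<DeviceT, CounterT, Tp>() default-constructs a
    // configuration<device::cpu, float, wall_clock>, runs an executor
    // over it, and returns the shared exec_data (see above)
    auto data = tim::ert::execute<tim::device::cpu, counter_t, float>();
    // write the collected roofline records to a JSON file
    tim::ert::serialize("ert_results", *data);
    return 0;
}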

◆ initialize_buffer() [1/2]

template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_cpu_t< DeviceT > = 0>
void tim::ert::initialize_buffer ( Tp *  A,
const Tp &  value,
const Intp &  nsize 
)

Definition at line 292 of file data.hpp.

{
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
    for(auto i = range.begin(); i < range.end(); i += range.stride())
        A[i] = value;
}
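
On the CPU, the grid-strided range effectively degenerates to a unit-stride loop over [0, nsize), so this overload behaves like std::fill_n. A minimal, hypothetical usage sketch:

#include <vector>

std::vector<double> buf(1024, 0.0);
// fill the buffer with ones; equivalent on the CPU to
// std::fill_n(buf.data(), buf.size(), 1.0)
tim::ert::initialize_buffer<tim::device::cpu>(buf.data(), 1.0, (int32_t) buf.size());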

◆ initialize_buffer() [2/2]

template<typename DeviceT , typename Tp , typename Intp = int32_t, device::enable_if_gpu_t< DeviceT > = 0>
void tim::ert::initialize_buffer ( Tp *  A,
Tp  value,
Intp  nsize 
)

Definition at line 308 of file data.hpp.

{
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);
    for(auto i = range.begin(); i < range.end(); i += range.stride())
        A[i] = value;
}
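
The GPU overload takes value and nsize by value because kernel arguments must be copied to the device; host references would not be valid in device code. Launching it follows the same device::launch pattern used in ops_main() below; a hedged sketch (the parameter values and the null buffer are placeholders):

using Device  = tim::device::gpu;
int32_t nsize = 1 << 20;
// grid size 0 (default) lets the launcher derive it from nsize and the block size
tim::device::params<Device> dev_params(0, 512, 0, Device::default_stream);
float* dbuf = nullptr;  // stands in for a real device allocation
tim::device::launch(nsize, dev_params,
                    tim::ert::initialize_buffer<Device, float, int32_t>,
                    dbuf, 0.0f, nsize);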

◆ ops_kernel() [1/2]

template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_cpu_t< DeviceT > = 0>
void tim::ert::ops_kernel ( Intp  ntrials,
Intp  nsize,
Tp *  A,
OpsFuncT &&  ops_func,
StoreFuncT &&  store_func 
)

Definition at line 68 of file kernels.hpp.

{
    // divide by two here because macros halve, e.g. ERT_FLOP == 4 means 2 calls
    constexpr size_t NUM_REP = Nrep / 2;
    constexpr size_t MOD_REP = Nrep % 2;
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);

    Tp alpha = static_cast<Tp>(0.5);
    for(Intp j = 0; j < ntrials; ++j)
    {
        for(auto i = range.begin(); i < range.end(); i += range.stride())
        {
            Tp beta = static_cast<Tp>(0.8);
            apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
                                                            alpha);
            store_func(A[i], beta);
        }
        alpha *= static_cast<Tp>(1.0 - 1.0e-8);
    }
}
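
The two functors determine what the kernel measures: each apply<void>::unroll step calls ops_func(beta, A[i], alpha), and store_func(A[i], beta) writes the accumulator back. A sketch of plausible functors (illustrative, not verbatim from timemory); a fused multiply-add counts as two FLOPs per call, which is why Nrep is halved above:

// one fused multiply-add per call => 2 FLOPs, so ERT_FLOP == 4 unrolls
// into NUM_REP + MOD_REP == 2 calls per element
auto ops_func   = [](double& a, const double& b, const double& c) { a = b + (c * a); };
// write the accumulator back so the compiler cannot elide the work
auto store_func = [](double& a, const double& b) { a = b; };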

◆ ops_kernel() [2/2]

template<size_t Nrep, typename DeviceT , typename Intp , typename Tp , typename OpsFuncT , typename StoreFuncT , device::enable_if_gpu_t< DeviceT > = 0, enable_if_t<!std::is_same< Tp, gpu::fp16_t >::value > = 0>
void tim::ert::ops_kernel ( Intp  ntrials,
Intp  nsize,
Tp *  A,
OpsFuncT &&  ops_func,
StoreFuncT &&  store_func 
)

Definition at line 99 of file kernels.hpp.

{
    // divide by two here because macros halve, e.g. ERT_FLOP == 4 means 2 calls
    constexpr size_t NUM_REP = Nrep / 2;
    constexpr size_t MOD_REP = Nrep % 2;
    auto range = device::grid_strided_range<DeviceT, 0, Intp>(nsize);

    Tp alpha = static_cast<Tp>(0.5);
    for(Intp j = 0; j < ntrials; ++j)
    {
        for(auto i = range.begin(); i < range.end(); i += range.stride())
        {
            Tp beta = static_cast<Tp>(0.8);
            apply<void>::unroll<NUM_REP + MOD_REP, DeviceT>(ops_func, beta, A[i],
                                                            alpha);
            store_func(A[i], beta);
        }
        alpha *= static_cast<Tp>(1.0 - 1.0e-8);
    }
}

◆ ops_main() [1/2]

template<size_t... Nops, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nops)==0, int > = 0>
bool tim::ert::ops_main ( counter< DeviceT, Tp, CounterT > &  ,
OpsFuncT &&  ,
StoreFuncT &&   
)

This is invoked when TIMEMORY_USER_ERT_FLOPS is empty.

Definition at line 419 of file kernels.hpp.

{
    return false;
}

◆ ops_main() [2/2]

template<size_t Nops, size_t... Nextra, typename DeviceT , typename Tp , typename CounterT , typename OpsFuncT , typename StoreFuncT , enable_if_t< sizeof...(Nextra)==0, int > = 0>
bool tim::ert::ops_main ( counter< DeviceT, Tp, CounterT > &  _counter,
OpsFuncT &&  ops_func,
StoreFuncT &&  store_func 
)

This is the "main" function for ERT.

This invokes the "main" function for ERT for all the desired "FLOPs" that are unrolled in the kernel.

Definition at line 159 of file kernels.hpp.

{
    if(_counter.skip(Nops))
        return false;

    using stream_list_t   = std::vector<gpu::stream_t>;
    using thread_list_t   = std::vector<std::thread>;
    using device_params_t = device::params<DeviceT>;
    using Intp            = int32_t;
    using ull             = long long unsigned;

    constexpr bool is_gpu = std::is_same<DeviceT, device::gpu>::value;

    if(settings::verbose() > 0 || settings::debug())
        printf("[%s] Executing %li ops...\n", __FUNCTION__, (long int) Nops);

    if(_counter.bytes_per_element == 0)
    {
        fprintf(stderr, "[%s:%i]> bytes-per-element is not set!\n", __FUNCTION__,
                __LINE__);
    }

    if(_counter.memory_accesses_per_element == 0)
    {
        fprintf(stderr, "[%s:%i]> memory-accesses-per-element is not set!\n",
                __FUNCTION__, __LINE__);
    }

    // list of streams
    stream_list_t streams;
    // generate async streams if multiple streams were requested
    if(_counter.params.nstreams > 1)
    {
        // fill with implicit stream
        streams.resize(_counter.params.nstreams, 0);
        for(auto& itr : streams)
            gpu::stream_create(itr);
    }

    auto _opfunc = [&](uint64_t tid, thread_barrier* fbarrier, thread_barrier* lbarrier) {
        threading::affinity::set();
        using opmutex_t = std::mutex;
        using oplock_t  = std::unique_lock<opmutex_t>;
        static opmutex_t opmutex;
        {
            oplock_t _lock(opmutex);
            // execute the callback
            _counter.configure(tid);
        }
        // allocate buffer
        auto     buf = _counter.get_buffer();
        uint64_t n   = _counter.params.working_set_min;
        // cache this
        const uint64_t nstreams = std::max<uint64_t>(_counter.params.nstreams, 1);
        // create the launch parameters (ignored on CPU)
        //
        // if grid_size is zero (default), the launch command will calculate a grid-size
        // as follows:
        //
        //      grid_size = ((data_size + block_size - 1) / block_size)
        //
        device_params_t dev_params(_counter.params.grid_size, _counter.params.block_size,
                                   _counter.params.shmem_size, DeviceT::default_stream);
        //
        if(n > _counter.nsize)
        {
            fprintf(stderr,
                    "[%s@'%s':%i]> Warning! ERT not running any trials because working "
                    "set min > nsize: %llu > %llu\n",
                    TIMEMORY_ERROR_FUNCTION_MACRO, __FILE__, __LINE__, (ull) n,
                    (ull) _counter.nsize);
        }

        while(n <= _counter.nsize)
        {
            // working set - nsize
            uint64_t ntrials = _counter.nsize / n;
            if(ntrials < 1)
                ntrials = 1;

            if(settings::debug() && tid == 0)
            {
                printf("[tim::ert::ops_main<%llu>]> number of trials: %llu, n = %llu, "
                       "nsize = %llu\n",
                       (ull) Nops, (ull) ntrials, (ull) n, (ull) _counter.nsize);
            }

            auto _itr_params = _counter.params;

            if(is_gpu)
            {
                // make sure all streams are synced
                for(auto& itr : streams)
                    gpu::stream_sync(itr);

                // sync the streams
                if(nstreams < 2)
                    gpu::device_sync();
            }

            // wait until the master thread notifies us to proceed
            // if(fbarrier)
            //     fbarrier->notify_wait();
            if(fbarrier)
                fbarrier->spin_wait();

            // get instance of object measuring something during the calculation
            CounterT ct = _counter.get_counter();
            // start the timer or anything else being recorded
            ct.start();

            // only do this more complicated mess if we need to
            if(nstreams > 1)
            {
                auto nchunk  = n / nstreams;
                auto nmodulo = n % nstreams;
                for(uint64_t i = 0; i < nstreams; ++i)
                {
                    // calculate the size of the subchunk
                    int32_t _n      = nchunk + ((i + 1 == nstreams) ? nmodulo : 0);
                    auto    _params = dev_params;  // copy of the parameters
                    device::launch(
                        _n, streams.at(i % streams.size()), _params,
                        ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
                        ntrials, _n, buf + (i * nchunk), std::forward<OpsFuncT>(ops_func),
                        std::forward<StoreFuncT>(store_func));
                    _itr_params.grid_size =
                        (i == 0) ? _params.grid
                                 : std::max<int64_t>(_itr_params.grid_size, _params.grid);
                }
            }
            else
            {
                device::launch(n, dev_params,
                               ops_kernel<Nops, DeviceT, Intp, Tp, OpsFuncT, StoreFuncT>,
                               ntrials, n, buf, std::forward<OpsFuncT>(ops_func),
                               std::forward<StoreFuncT>(store_func));

                _itr_params.grid_size = dev_params.grid;
            }

            if(is_gpu)
            {
                for(auto& itr : streams)
                    gpu::stream_sync(itr);

                // sync the streams
                if(nstreams < 2)
                    gpu::device_sync();
            }

            // wait until the master thread notifies us to proceed
            // if(lbarrier)
            //     lbarrier->notify_wait();
            if(lbarrier)
                lbarrier->spin_wait();

            // stop the timer or anything else being recorded
            ct.stop();

            // store the result
            if(tid == 0)
            {
                // ensure there is not a data race if more than one thread somehow
                // has a tid of 0
                oplock_t _lock(opmutex);
                _counter.record(ct, n, ntrials, Nops, _itr_params);
            }

            n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
        }

        if(is_gpu)
            gpu::device_sync();

        _counter.destroy_buffer(buf);
    };

    // guard against multiple threads trying to call ERT for some reason
    static std::mutex            _mtx;
    std::unique_lock<std::mutex> _lock(_mtx);

    dmp::barrier();  // synchronize MPI processes

    if(is_gpu)
        gpu::device_sync();

    if(_counter.params.nthreads > 1)
    {
        // create synchronization barriers for the threads
        thread_barrier fbarrier{ _counter.params.nthreads };
        thread_barrier lbarrier{ _counter.params.nthreads };

        // list of threads
        thread_list_t threads{};
        // create the threads
        for(uint64_t i = 0; i < _counter.params.nthreads; ++i)
            threads.emplace_back(_opfunc, i, &fbarrier, &lbarrier);

        /*
        uint64_t n = _counter.params.working_set_min;
        while(n <= _counter.nsize)
        {
            // wait until all threads have also called notify_wait() then release
            // barrier to start
            fbarrier.notify_wait();
            // wait until all threads have also called notify_wait() then release
            // barrier to finish
            lbarrier.notify_wait();
            n = ((1.1 * n) == n) ? (n + 1) : (1.1 * n);
        }*/

        // wait for threads to finish
        for(auto& itr : threads)
            itr.join();
    }
    else
    {
        _opfunc(0, nullptr, nullptr);
    }

    if(is_gpu)
        gpu::device_sync();

    dmp::barrier();  // synchronize MPI processes

    // code was executed
    return true;
}

References tim::ert::exec_params::block_size, tim::ert::counter< DeviceT, Tp, Counter >::bytes_per_element, tim::ert::counter< DeviceT, Tp, Counter >::configure(), tim::debug, tim::ert::counter< DeviceT, Tp, Counter >::destroy_buffer(), tim::ert::counter< DeviceT, Tp, Counter >::get_buffer(), tim::ert::counter< DeviceT, Tp, Counter >::get_counter(), tim::ert::exec_params::grid_size, tim::ert::counter< DeviceT, Tp, Counter >::memory_accesses_per_element, tim::ert::counter< DeviceT, Tp, Counter >::nsize, tim::ert::exec_params::nstreams, tim::ert::exec_params::nthreads, tim::ert::counter< DeviceT, Tp, Counter >::params, tim::ert::counter< DeviceT, Tp, Counter >::record(), tim::ert::exec_params::shmem_size, tim::ert::counter< DeviceT, Tp, Counter >::skip(), TIMEMORY_ERROR_FUNCTION_MACRO, tim::verbose, and tim::ert::exec_params::working_set_min.

Referenced by tim::ert::executor< DeviceT, Tp, CounterT >::execute().
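
Callers typically expand a compile-time list of FLOP counts into successive ops_main<Nops> instantiations, one per unroll factor. A hedged C++17 sketch of that dispatch pattern (run_all_unrolls is a hypothetical helper, not part of timemory):

template <size_t... FlopCounts, typename DeviceT, typename Tp, typename CounterT,
          typename OpsFuncT, typename StoreFuncT>
void run_all_unrolls(tim::ert::counter<DeviceT, Tp, CounterT>& c, OpsFuncT&& ops,
                     StoreFuncT&& store)
{
    // each instantiation benchmarks one unrolled kernel; counter::skip()
    // filters out FLOP counts disabled at runtime
    (tim::ert::ops_main<FlopCounts>(c, ops, store), ...);
}

// e.g. run_all_unrolls<1, 4, 16, 64, 256>(counter, ops_func, store_func);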

◆ serialize()

template<typename Counter >
void tim::ert::serialize ( std::string  fname,
exec_data< Counter > &  obj 
)
inline

Definition at line 325 of file counter.hpp.

{
    using exec_data_vec_t = std::vector<exec_data<Counter>>;

    int dmp_rank = dmp::rank();
    int dmp_size = dmp::size();

    exec_data_vec_t results(dmp_size);
    if(dmp::is_initialized())
    {
        dmp::barrier();

#if defined(TIMEMORY_USE_MPI) || defined(TIMEMORY_USE_UPCXX)
        auto space = cereal::JSONOutputArchive::Options::IndentChar::space;

        //------------------------------------------------------------------------------//
        //  Used to convert a result to a serialization
        //
        auto send_serialize = [&](const exec_data<Counter>& src) {
            std::stringstream ss;
            {
                cereal::JSONOutputArchive::Options opt(16, space, 0);
                cereal::JSONOutputArchive          oa(ss, opt);
                oa(cereal::make_nvp("data", src));
            }
            return ss.str();
        };

        //------------------------------------------------------------------------------//
        //  Used to convert the serialization to a result
        //
        auto recv_serialize = [&](const std::string& src) {
            exec_data<Counter> ret;
            std::stringstream  ss;
            ss << src;
            {
                cereal::JSONInputArchive ia(ss);
                ia(cereal::make_nvp("data", ret));
            }
            return ret;
        };
#endif

#if defined(TIMEMORY_USE_MPI)

        auto str_ret = send_serialize(obj);

        if(dmp_rank == 0)
        {
            for(int i = 1; i < dmp_size; ++i)
            {
                std::string str;
                mpi::recv(str, i, 0, mpi::comm_world_v);
                results[i] = recv_serialize(str);
            }
            results[dmp_rank] = std::move(obj);
        }
        else
        {
            mpi::send(str_ret, 0, 0, mpi::comm_world_v);
        }

#elif defined(TIMEMORY_USE_UPCXX)

        //------------------------------------------------------------------------------//
        //  Function executed on remote node
        //
        auto remote_serialize = [=]() { return send_serialize(obj); };

        //------------------------------------------------------------------------------//
        //  Combine on master rank
        //
        if(dmp_rank == 0)
        {
            for(int i = 1; i < dmp_size; ++i)
            {
                upcxx::future<std::string> fut = upcxx::rpc(i, remote_serialize);
                while(!fut.ready())
                    upcxx::progress();
                fut.wait();
                results[i] = recv_serialize(fut.result());
            }
            results[dmp_rank] = std::move(obj);
        }

#endif
    }
    else
    {
        results.clear();
        results.resize(1);
        results.at(0) = std::move(obj);
    }

    if(dmp_rank == 0)
    {
        fname = settings::compose_output_filename(fname, ".json");
        printf("[%i]> Outputting '%s'...\n", dmp_rank, fname.c_str());
        std::ofstream ofs{};
        if(filepath::open(ofs, fname))
        {
            // ensure json write final block during destruction before the file is closed
            using policy_type = policy::output_archive_t<Counter>;
            auto oa           = policy_type::get(ofs);
            oa->setNextName("timemory");
            oa->startNode();
            oa->setNextName("ranks");
            oa->startNode();
            oa->makeArray();
            for(uint64_t i = 0; i < results.size(); ++i)
            {
                oa->startNode();
                (*oa)(cereal::make_nvp("rank", i),
                      cereal::make_nvp("roofline", results.at(i)));
                oa->finishNode();
            }
            oa->finishNode();
            oa->finishNode();
        }
        if(ofs)
            ofs << std::endl;
        ofs.close();
    }
}

References tim::settings::compose_output_filename(), tim::get(), and tim::filepath::open().
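
For orientation, a hypothetical single-rank call and the resulting layout (inferred from the setNextName/startNode sequence above; record contents elided):

// settings::compose_output_filename() appends ".json" plus any configured
// output prefix, so this writes e.g. <output-path>/ert_results.json shaped
// roughly like:
//
//   { "timemory": { "ranks": [ { "rank": 0, "roofline": { ... } } ] } }
//
tim::ert::exec_data<tim::component::wall_clock> data{};
tim::ert::serialize("ert_results", data);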