.svn/pristine/5c/5cdf9c8e70d434c7620bc53c5fa7c41cac8041c7.svn-base

//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP

#include <iterator>

#include <boost/compute/utility/source.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

/// \internal
/// body reduction inside a warp
template<typename T,bool isNvidiaDevice>
struct ReduceBody
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
             "   barrier(CLK_LOCAL_MEM_FENCE);\n"  <<
             "   uint mask = (i << 1) - 1;\n"      <<
             "   if((lid & mask) == 0){\n"         <<
             "       scratch[lid] += scratch[lid+i];\n" <<
             "   }\n" <<
            "}\n";
        return k.str();
    }
};

/// \internal
/// body reduction inside a warp
/// for nvidia device we can use the "unsafe"
/// memory optimisation
template<typename T>
struct ReduceBody<T,true>
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        // we use TPB to compile only useful instruction
        // local reduction when size is greater than warp size
        k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "if(TPB >= 1024){\n" <<
            "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 512){\n" <<
            "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 256){\n" <<
            "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 128){\n" <<
            "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<

        // warp reduction
        "if(lid < 32){\n" <<
            // volatile this way we don't need any barrier
            "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
            "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
            "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
            "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
            "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
            "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
            "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
        "}\n";
        return k.str();
    }
};

template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}

template<class T>
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}

template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB]; \n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // discrimination on vendor name
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
         "if(lid == 0){\n" <<
         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
        }
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
Initial Commit 2018-02-08 21:28:33 -05:00			`//---------------------------------------------------------------------------//`
			`// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>`
			`//`
			`// Distributed under the Boost Software License, Version 1.0`
			`// See accompanying file LICENSE_1_0.txt or copy at`
			`// http://www.boost.org/LICENSE_1_0.txt`
			`//`
			`// See http://boostorg.github.com/compute for more information.`
			`//---------------------------------------------------------------------------//`

			`#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP`
			`#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP`

			`#include <iterator>`

			`#include <boost/compute/utility/source.hpp>`
			`#include <boost/compute/program.hpp>`
			`#include <boost/compute/command_queue.hpp>`
			`#include <boost/compute/detail/vendor.hpp>`
			`#include <boost/compute/detail/parameter_cache.hpp>`
			`#include <boost/compute/detail/work_size.hpp>`
			`#include <boost/compute/detail/meta_kernel.hpp>`
			`#include <boost/compute/type_traits/type_name.hpp>`
			`#include <boost/compute/utility/program_cache.hpp>`

			`namespace boost {`
			`namespace compute {`
			`namespace detail {`

			`/// \internal`
			`/// body reduction inside a warp`
			`template<typename T,bool isNvidiaDevice>`
			`struct ReduceBody`
			`{`
			`static std::string body()`
			`{`
			`std::stringstream k;`
			`// local reduction`
			`k << "for(int i = 1; i < TPB; i <<= 1){\n" <<`
			`" barrier(CLK_LOCAL_MEM_FENCE);\n" <<`
			`" uint mask = (i << 1) - 1;\n" <<`
			`" if((lid & mask) == 0){\n" <<`
			`" scratch[lid] += scratch[lid+i];\n" <<`
			`" }\n" <<`
			`"}\n";`
			`return k.str();`
			`}`
			`};`

			`/// \internal`
			`/// body reduction inside a warp`
			`/// for nvidia device we can use the "unsafe"`
			`/// memory optimisation`
			`template<typename T>`
			`struct ReduceBody<T,true>`
			`{`
			`static std::string body()`
			`{`
			`std::stringstream k;`
			`// local reduction`
			`// we use TPB to compile only useful instruction`
			`// local reduction when size is greater than warp size`
			`k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<`
			`"if(TPB >= 1024){\n" <<`
			`"if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<`
			`"if(TPB >= 512){\n" <<`
			`"if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<`
			`"if(TPB >= 256){\n" <<`
			`"if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<`
			`"if(TPB >= 128){\n" <<`
			`"if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<`

			`// warp reduction`
			`"if(lid < 32){\n" <<`
			`// volatile this way we don't need any barrier`
			`"volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<`
			`"if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<`
			`"if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<`
			`"if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<`
			`"if(TPB >= 8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<`
			`"if(TPB >= 4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<`
			`"if(TPB >= 2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<`
			`"}\n";`
			`return k.str();`
			`}`
			`};`

			`template<class InputIterator, class Function>`
			`inline void initial_reduce(InputIterator first,`
			`InputIterator last,`
			`buffer result,`
			`const Function &function,`
			`kernel &reduce_kernel,`
			`const uint_ vpt,`
			`const uint_ tpb,`
			`command_queue &queue)`
			`{`
			`(void) function;`
			`(void) reduce_kernel;`

			`typedef typename std::iterator_traits<InputIterator>::value_type Arg;`
			`typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;`

			`size_t count = std::distance(first, last);`
			`detail::meta_kernel k("initial_reduce");`
			`k.add_set_arg<const uint_>("count", uint_(count));`
			`size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");`

			`k <<`
			`k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<`
			`k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<`

			`"__local " << type_name<T>() << " scratch[TPB];\n" <<`

			`// private reduction`
			`k.decl<T>("sum") << " = 0;\n" <<`
			`"for(uint i = 0; i < VPT; i++){\n" <<`
			`" if(offset + lid + i*TPB < count){\n" <<`
			`" sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<`
			`" }\n" <<`
			`"}\n" <<`

			`"scratch[lid] = sum;\n" <<`

			`// local reduction`
			`ReduceBody<T,false>::body() <<`

			`// write sum to output`
			`"if(lid == 0){\n" <<`
			`" output[get_group_id(0)] = scratch[0];\n" <<`
			`"}\n";`

			`const context &context = queue.get_context();`
			`std::stringstream options;`
			`options << "-DVPT=" << vpt << " -DTPB=" << tpb;`
			`kernel generic_reduce_kernel = k.compile(context, options.str());`
			`generic_reduce_kernel.set_arg(output_arg, result);`

			`size_t work_size = calculate_work_size(count, vpt, tpb);`

			`queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);`
			`}`

			`template<class T>`
			`inline void initial_reduce(const buffer_iterator<T> &first,`
			`const buffer_iterator<T> &last,`
			`const buffer &result,`
			`const plus<T> &function,`
			`kernel &reduce_kernel,`
			`const uint_ vpt,`
			`const uint_ tpb,`
			`command_queue &queue)`
			`{`
			`(void) function;`

			`size_t count = std::distance(first, last);`

			`reduce_kernel.set_arg(0, first.get_buffer());`
			`reduce_kernel.set_arg(1, uint_(first.get_index()));`
			`reduce_kernel.set_arg(2, uint_(count));`
			`reduce_kernel.set_arg(3, result);`
			`reduce_kernel.set_arg(4, uint_(0));`

			`size_t work_size = calculate_work_size(count, vpt, tpb);`

			`queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);`
			`}`

			`template<class InputIterator, class T, class Function>`
			`inline void reduce_on_gpu(InputIterator first,`
			`InputIterator last,`
			`buffer_iterator<T> result,`
			`Function function,`
			`command_queue &queue)`
			`{`
			`const device &device = queue.get_device();`
			`const context &context = queue.get_context();`

			`detail::meta_kernel k("reduce");`
			`k.add_arg<const T*>(memory_object::global_memory, "input");`
			`k.add_arg<const uint_>("offset");`
			`k.add_arg<const uint_>("count");`
			`k.add_arg<T*>(memory_object::global_memory, "output");`
			`k.add_arg<const uint_>("output_offset");`

			`k <<`
			`k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<`
			`"__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<`
			`k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<`

			`"__local " << type_name<T>() << " scratch[TPB];\n" <<`
			`// private reduction`
			`k.decl<T>("sum") << " = 0;\n" <<`
			`"for(uint i = 0; i < VPT; i++){\n" <<`
			`" if(block_offset + lid + i*TPB < count){\n" <<`
			`" sum = sum + block[lid+i*TPB]; \n" <<`
			`" }\n" <<`
			`"}\n" <<`

			`"scratch[lid] = sum;\n";`

			`// discrimination on vendor name`
			`if(is_nvidia_device(device))`
			`k << ReduceBody<T,true>::body();`
			`else`
			`k << ReduceBody<T,false>::body();`

			`k <<`
			`// write sum to output`
			`"if(lid == 0){\n" <<`
			`" output[output_offset + get_group_id(0)] = scratch[0];\n" <<`
			`"}\n";`

			`std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();`

			`// load parameters`
			`boost::shared_ptr<parameter_cache> parameters =`
			`detail::parameter_cache::get_global_cache(device);`

			`uint_ vpt = parameters->get(cache_key, "vpt", 8);`
			`uint_ tpb = parameters->get(cache_key, "tpb", 128);`

			`// reduce program compiler flags`
			`std::stringstream options;`
			`options << "-DT=" << type_name<T>()`
			`<< " -DVPT=" << vpt`
			`<< " -DTPB=" << tpb;`

			`// load program`
			`boost::shared_ptr<program_cache> cache =`
			`program_cache::get_global_cache(context);`

			`program reduce_program = cache->get_or_build(`
			`cache_key, options.str(), k.source(), context`
			`);`

			`// create reduce kernel`
			`kernel reduce_kernel(reduce_program, "reduce");`

			`size_t count = std::distance(first, last);`

			`// first pass, reduce from input to ping`
			`buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));`
			`initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);`

			`// update count after initial reduce`
			`count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));`

			`// middle pass(es), reduce between ping and pong`
			`const buffer *input_buffer = &ping;`
			`buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));`
			`const buffer *output_buffer = &pong;`
			`if(count > vpt * tpb){`
			`while(count > vpt * tpb){`
			`reduce_kernel.set_arg(0, *input_buffer);`
			`reduce_kernel.set_arg(1, uint_(0));`
			`reduce_kernel.set_arg(2, uint_(count));`
			`reduce_kernel.set_arg(3, *output_buffer);`
			`reduce_kernel.set_arg(4, uint_(0));`

			`size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));`
			`if(work_size % tpb != 0){`
			`work_size += tpb - work_size % tpb;`
			`}`
			`queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);`

			`std::swap(input_buffer, output_buffer);`
			`count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));`
			`}`
			`}`

			`// final pass, reduce from ping/pong to result`
			`reduce_kernel.set_arg(0, *input_buffer);`
			`reduce_kernel.set_arg(1, uint_(0));`
			`reduce_kernel.set_arg(2, uint_(count));`
			`reduce_kernel.set_arg(3, result.get_buffer());`
			`reduce_kernel.set_arg(4, uint_(result.get_index()));`

			`queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);`
			`}`

			`} // end detail namespace`
			`} // end compute namespace`
			`} // end boost namespace`

			`#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP`