287 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			287 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| 
								 | 
							
								//---------------------------------------------------------------------------//
							 | 
						||
| 
								 | 
							
								// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
							 | 
						||
| 
								 | 
							
								//
							 | 
						||
| 
								 | 
							
								// Distributed under the Boost Software License, Version 1.0
							 | 
						||
| 
								 | 
							
								// See accompanying file LICENSE_1_0.txt or copy at
							 | 
						||
| 
								 | 
							
								// http://www.boost.org/LICENSE_1_0.txt
							 | 
						||
| 
								 | 
							
								//
							 | 
						||
| 
								 | 
							
								// See http://boostorg.github.com/compute for more information.
							 | 
						||
| 
								 | 
							
								//---------------------------------------------------------------------------//
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
							 | 
						||
| 
								 | 
							
								#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include <iterator>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include <boost/compute/utility/source.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/program.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/command_queue.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/vendor.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/parameter_cache.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/work_size.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/meta_kernel.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/type_traits/type_name.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/utility/program_cache.hpp>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								namespace boost {
							 | 
						||
| 
								 | 
							
								namespace compute {
							 | 
						||
| 
								 | 
							
								namespace detail {
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/// \internal
							 | 
						||
| 
								 | 
							
								/// body reduction inside a warp
							 | 
						||
| 
								 | 
							
								template<typename T,bool isNvidiaDevice>
							 | 
						||
| 
								 | 
							
								struct ReduceBody
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    static std::string body()
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								        std::stringstream k;
							 | 
						||
| 
								 | 
							
								        // local reduction
							 | 
						||
| 
								 | 
							
								        k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
							 | 
						||
| 
								 | 
							
								             "   barrier(CLK_LOCAL_MEM_FENCE);\n"  <<
							 | 
						||
| 
								 | 
							
								             "   uint mask = (i << 1) - 1;\n"      <<
							 | 
						||
| 
								 | 
							
								             "   if((lid & mask) == 0){\n"         <<
							 | 
						||
| 
								 | 
							
								             "       scratch[lid] += scratch[lid+i];\n" <<
							 | 
						||
| 
								 | 
							
								             "   }\n" <<
							 | 
						||
| 
								 | 
							
								            "}\n";
							 | 
						||
| 
								 | 
							
								        return k.str();
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/// \internal
							 | 
						||
| 
								 | 
							
								/// body reduction inside a warp
							 | 
						||
| 
								 | 
							
								/// for nvidia device we can use the "unsafe"
							 | 
						||
| 
								 | 
							
								/// memory optimisation
							 | 
						||
| 
								 | 
							
								template<typename T>
							 | 
						||
| 
								 | 
							
								struct ReduceBody<T,true>
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    static std::string body()
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								        std::stringstream k;
							 | 
						||
| 
								 | 
							
								        // local reduction
							 | 
						||
| 
								 | 
							
								        // we use TPB to compile only useful instruction
							 | 
						||
| 
								 | 
							
								        // local reduction when size is greater than warp size
							 | 
						||
| 
								 | 
							
								        k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
							 | 
						||
| 
								 | 
							
								        "if(TPB >= 1024){\n" <<
							 | 
						||
| 
								 | 
							
								            "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
							 | 
						||
| 
								 | 
							
								         "if(TPB >= 512){\n" <<
							 | 
						||
| 
								 | 
							
								            "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
							 | 
						||
| 
								 | 
							
								         "if(TPB >= 256){\n" <<
							 | 
						||
| 
								 | 
							
								            "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
							 | 
						||
| 
								 | 
							
								         "if(TPB >= 128){\n" <<
							 | 
						||
| 
								 | 
							
								            "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        // warp reduction
							 | 
						||
| 
								 | 
							
								        "if(lid < 32){\n" <<
							 | 
						||
| 
								 | 
							
								            // volatile this way we don't need any barrier
							 | 
						||
| 
								 | 
							
								            "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
							 | 
						||
| 
								 | 
							
								            "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
							 | 
						||
| 
								 | 
							
								            "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
							 | 
						||
| 
								 | 
							
								            "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
							 | 
						||
| 
								 | 
							
								            "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
							 | 
						||
| 
								 | 
							
								            "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
							 | 
						||
| 
								 | 
							
								            "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
							 | 
						||
| 
								 | 
							
								        "}\n";
							 | 
						||
| 
								 | 
							
								        return k.str();
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								};
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class InputIterator, class Function>
							 | 
						||
| 
								 | 
							
								inline void initial_reduce(InputIterator first,
							 | 
						||
| 
								 | 
							
								                           InputIterator last,
							 | 
						||
| 
								 | 
							
								                           buffer result,
							 | 
						||
| 
								 | 
							
								                           const Function &function,
							 | 
						||
| 
								 | 
							
								                           kernel &reduce_kernel,
							 | 
						||
| 
								 | 
							
								                           const uint_ vpt,
							 | 
						||
| 
								 | 
							
								                           const uint_ tpb,
							 | 
						||
| 
								 | 
							
								                           command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    (void) function;
							 | 
						||
| 
								 | 
							
								    (void) reduce_kernel;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
							 | 
						||
| 
								 | 
							
								    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    size_t count = std::distance(first, last);
							 | 
						||
| 
								 | 
							
								    detail::meta_kernel k("initial_reduce");
							 | 
						||
| 
								 | 
							
								    k.add_set_arg<const uint_>("count", uint_(count));
							 | 
						||
| 
								 | 
							
								    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
							 | 
						||
| 
								 | 
							
								        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        "__local " << type_name<T>() << " scratch[TPB];\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        // private reduction
							 | 
						||
| 
								 | 
							
								        k.decl<T>("sum") << " = 0;\n" <<
							 | 
						||
| 
								 | 
							
								        "for(uint i = 0; i < VPT; i++){\n" <<
							 | 
						||
| 
								 | 
							
								        "    if(offset + lid + i*TPB < count){\n" <<
							 | 
						||
| 
								 | 
							
								        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								        "    }\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        "scratch[lid] = sum;\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        // local reduction
							 | 
						||
| 
								 | 
							
								        ReduceBody<T,false>::body() <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        // write sum to output
							 | 
						||
| 
								 | 
							
								        "if(lid == 0){\n" <<
							 | 
						||
| 
								 | 
							
								        "    output[get_group_id(0)] = scratch[0];\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const context &context = queue.get_context();
							 | 
						||
| 
								 | 
							
								    std::stringstream options;
							 | 
						||
| 
								 | 
							
								    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
							 | 
						||
| 
								 | 
							
								    kernel generic_reduce_kernel = k.compile(context, options.str());
							 | 
						||
| 
								 | 
							
								    generic_reduce_kernel.set_arg(output_arg, result);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    size_t work_size = calculate_work_size(count, vpt, tpb);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class T>
							 | 
						||
| 
								 | 
							
								inline void initial_reduce(const buffer_iterator<T> &first,
							 | 
						||
| 
								 | 
							
								                           const buffer_iterator<T> &last,
							 | 
						||
| 
								 | 
							
								                           const buffer &result,
							 | 
						||
| 
								 | 
							
								                           const plus<T> &function,
							 | 
						||
| 
								 | 
							
								                           kernel &reduce_kernel,
							 | 
						||
| 
								 | 
							
								                           const uint_ vpt,
							 | 
						||
| 
								 | 
							
								                           const uint_ tpb,
							 | 
						||
| 
								 | 
							
								                           command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    (void) function;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    size_t count = std::distance(first, last);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(0, first.get_buffer());
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(1, uint_(first.get_index()));
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(2, uint_(count));
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(3, result);
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(4, uint_(0));
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    size_t work_size = calculate_work_size(count, vpt, tpb);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class InputIterator, class T, class Function>
							 | 
						||
| 
								 | 
							
								inline void reduce_on_gpu(InputIterator first,
							 | 
						||
| 
								 | 
							
								                          InputIterator last,
							 | 
						||
| 
								 | 
							
								                          buffer_iterator<T> result,
							 | 
						||
| 
								 | 
							
								                          Function function,
							 | 
						||
| 
								 | 
							
								                          command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    const device &device = queue.get_device();
							 | 
						||
| 
								 | 
							
								    const context &context = queue.get_context();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    detail::meta_kernel k("reduce");
							 | 
						||
| 
								 | 
							
								    k.add_arg<const T*>(memory_object::global_memory, "input");
							 | 
						||
| 
								 | 
							
								    k.add_arg<const uint_>("offset");
							 | 
						||
| 
								 | 
							
								    k.add_arg<const uint_>("count");
							 | 
						||
| 
								 | 
							
								    k.add_arg<T*>(memory_object::global_memory, "output");
							 | 
						||
| 
								 | 
							
								    k.add_arg<const uint_>("output_offset");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
							 | 
						||
| 
								 | 
							
								        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
							 | 
						||
| 
								 | 
							
								        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        "__local " << type_name<T>() << " scratch[TPB];\n" <<
							 | 
						||
| 
								 | 
							
								        // private reduction
							 | 
						||
| 
								 | 
							
								        k.decl<T>("sum") << " = 0;\n" <<
							 | 
						||
| 
								 | 
							
								        "for(uint i = 0; i < VPT; i++){\n" <<
							 | 
						||
| 
								 | 
							
								        "    if(block_offset + lid + i*TPB < count){\n" <<
							 | 
						||
| 
								 | 
							
								        "        sum = sum + block[lid+i*TPB]; \n" <<
							 | 
						||
| 
								 | 
							
								        "    }\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        "scratch[lid] = sum;\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // discrimination on vendor name
							 | 
						||
| 
								 | 
							
								    if(is_nvidia_device(device))
							 | 
						||
| 
								 | 
							
								        k << ReduceBody<T,true>::body();
							 | 
						||
| 
								 | 
							
								    else
							 | 
						||
| 
								 | 
							
								        k << ReduceBody<T,false>::body();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								        // write sum to output
							 | 
						||
| 
								 | 
							
								         "if(lid == 0){\n" <<
							 | 
						||
| 
								 | 
							
								         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
							 | 
						||
| 
								 | 
							
								         "}\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // load parameters
							 | 
						||
| 
								 | 
							
								    boost::shared_ptr<parameter_cache> parameters =
							 | 
						||
| 
								 | 
							
								        detail::parameter_cache::get_global_cache(device);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    uint_ vpt = parameters->get(cache_key, "vpt", 8);
							 | 
						||
| 
								 | 
							
								    uint_ tpb = parameters->get(cache_key, "tpb", 128);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // reduce program compiler flags
							 | 
						||
| 
								 | 
							
								    std::stringstream options;
							 | 
						||
| 
								 | 
							
								    options << "-DT=" << type_name<T>()
							 | 
						||
| 
								 | 
							
								            << " -DVPT=" << vpt
							 | 
						||
| 
								 | 
							
								            << " -DTPB=" << tpb;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // load program
							 | 
						||
| 
								 | 
							
								    boost::shared_ptr<program_cache> cache =
							 | 
						||
| 
								 | 
							
								        program_cache::get_global_cache(context);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    program reduce_program = cache->get_or_build(
							 | 
						||
| 
								 | 
							
								        cache_key, options.str(), k.source(), context
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // create reduce kernel
							 | 
						||
| 
								 | 
							
								    kernel reduce_kernel(reduce_program, "reduce");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    size_t count = std::distance(first, last);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // first pass, reduce from input to ping
							 | 
						||
| 
								 | 
							
								    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
							 | 
						||
| 
								 | 
							
								    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // update count after initial reduce
							 | 
						||
| 
								 | 
							
								    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // middle pass(es), reduce between ping and pong
							 | 
						||
| 
								 | 
							
								    const buffer *input_buffer = &ping;
							 | 
						||
| 
								 | 
							
								    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
							 | 
						||
| 
								 | 
							
								    const buffer *output_buffer = &pong;
							 | 
						||
| 
								 | 
							
								    if(count > vpt * tpb){
							 | 
						||
| 
								 | 
							
								        while(count > vpt * tpb){
							 | 
						||
| 
								 | 
							
								            reduce_kernel.set_arg(0, *input_buffer);
							 | 
						||
| 
								 | 
							
								            reduce_kernel.set_arg(1, uint_(0));
							 | 
						||
| 
								 | 
							
								            reduce_kernel.set_arg(2, uint_(count));
							 | 
						||
| 
								 | 
							
								            reduce_kernel.set_arg(3, *output_buffer);
							 | 
						||
| 
								 | 
							
								            reduce_kernel.set_arg(4, uint_(0));
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
							 | 
						||
| 
								 | 
							
								            if(work_size % tpb != 0){
							 | 
						||
| 
								 | 
							
								                work_size += tpb - work_size % tpb;
							 | 
						||
| 
								 | 
							
								            }
							 | 
						||
| 
								 | 
							
								            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            std::swap(input_buffer, output_buffer);
							 | 
						||
| 
								 | 
							
								            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // final pass, reduce from ping/pong to result
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(0, *input_buffer);
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(1, uint_(0));
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(2, uint_(count));
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(3, result.get_buffer());
							 | 
						||
| 
								 | 
							
								    reduce_kernel.set_arg(4, uint_(result.get_index()));
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								} // end detail namespace
							 | 
						||
| 
								 | 
							
								} // end compute namespace
							 | 
						||
| 
								 | 
							
								} // end boost namespace
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
							 |