//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP

#include <iterator>

#include <boost/compute/utility/source.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/detail/vendor.hpp>
#include <boost/compute/detail/parameter_cache.hpp>
#include <boost/compute/detail/work_size.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/utility/program_cache.hpp>
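// This header implements reduce() on GPU devices. Each work-group of TPB
// ("threads per block") work-items accumulates VPT ("values per thread")
// inputs per work-item into a private sum, combines the sums through local
// memory and writes one partial result per group. The partial results are
// then reduced again, ping-ponging between two temporary buffers, until a
// single value remains. VPT and TPB are tuning parameters read from the
// device parameter cache.
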
namespace boost {
namespace compute {
namespace detail {

/// \internal
/// generic work-group reduction body: a tree reduction in local memory
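/// (each pass doubles the stride i: work-items whose local id is a multiple
/// of 2*i add in the element i slots away, with a barrier separating passes)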
template<typename T,bool isNvidiaDevice>
struct ReduceBody
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        k << "for(int i = 1; i < TPB; i <<= 1){\n" <<
             "   barrier(CLK_LOCAL_MEM_FENCE);\n"  <<
             "   uint mask = (i << 1) - 1;\n"      <<
             "   if((lid & mask) == 0){\n"         <<
             "       scratch[lid] += scratch[lid+i];\n" <<
             "   }\n" <<
            "}\n";
        return k.str();
    }
};

/// \internal
/// work-group reduction body for NVIDIA devices: once only one warp is
/// still active, the "unsafe" warp-synchronous memory optimisation can be
/// used
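/// (when at most 32 work-items remain they belong to a single warp and run
/// in lock-step, so the final steps need no barriers; the scratch pointer is
/// marked volatile so intermediate stores are not cached in registers)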
template<typename T>
struct ReduceBody<T,true>
{
    static std::string body()
    {
        std::stringstream k;
        // local reduction
        // TPB is a preprocessor constant, so only the useful steps are compiled
        // reduce through local memory while more than one warp is active
        k << "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
        "if(TPB >= 1024){\n" <<
            "if(lid < 512) { sum += scratch[lid + 512]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 512){\n" <<
            "if(lid < 256) { sum += scratch[lid + 256]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 256){\n" <<
            "if(lid < 128) { sum += scratch[lid + 128]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);}\n" <<
         "if(TPB >= 128){\n" <<
            "if(lid < 64) { sum += scratch[lid + 64]; scratch[lid] = sum;} barrier(CLK_LOCAL_MEM_FENCE);} \n" <<

        // warp reduction
        "if(lid < 32){\n" <<
            // volatile: this way we don't need any barrier
            "volatile __local " << type_name<T>() << " *lmem = scratch;\n" <<
            "if(TPB >= 64) { lmem[lid] = sum = sum + lmem[lid+32];} \n" <<
            "if(TPB >= 32) { lmem[lid] = sum = sum + lmem[lid+16];} \n" <<
            "if(TPB >= 16) { lmem[lid] = sum = sum + lmem[lid+ 8];} \n" <<
            "if(TPB >=  8) { lmem[lid] = sum = sum + lmem[lid+ 4];} \n" <<
            "if(TPB >=  4) { lmem[lid] = sum = sum + lmem[lid+ 2];} \n" <<
            "if(TPB >=  2) { lmem[lid] = sum = sum + lmem[lid+ 1];} \n" <<
        "}\n";
        return k.str();
    }
};

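/// \internal
/// First reduction pass for a generic input iterator: the iterator
/// expression is inlined into a meta_kernel compiled on the fly (the
/// precompiled reduce_kernel is unused here), and one partial sum per
/// work-group is written to `result`.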
template<class InputIterator, class Function>
inline void initial_reduce(InputIterator first,
                           InputIterator last,
                           buffer result,
                           const Function &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;
    (void) reduce_kernel;

    typedef typename std::iterator_traits<InputIterator>::value_type Arg;
    typedef typename boost::tr1_result_of<Function(Arg, Arg)>::type T;

    size_t count = std::distance(first, last);
    detail::meta_kernel k("initial_reduce");
    k.add_set_arg<const uint_>("count", uint_(count));
    size_t output_arg = k.add_arg<T *>(memory_object::global_memory, "output");

    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<

        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(offset + lid + i*TPB < count){\n" <<
        "        sum = sum + " << first[k.var<uint_>("offset+lid+i*TPB")] << ";\n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n" <<

        // local reduction
        ReduceBody<T,false>::body() <<

        // write sum to output
        "if(lid == 0){\n" <<
        "    output[get_group_id(0)] = scratch[0];\n" <<
        "}\n";

    const context &context = queue.get_context();
    std::stringstream options;
    options << "-DVPT=" << vpt << " -DTPB=" << tpb;
    kernel generic_reduce_kernel = k.compile(context, options.str());
    generic_reduce_kernel.set_arg(output_arg, result);

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(generic_reduce_kernel, 0, work_size, tpb);
}

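/// \internal
/// First reduction pass for plain buffer iterators with plus<T>: the cached
/// "reduce" kernel can be reused directly, so no additional kernel has to be
/// compiled for this pass.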
template<class T>
inline void initial_reduce(const buffer_iterator<T> &first,
                           const buffer_iterator<T> &last,
                           const buffer &result,
                           const plus<T> &function,
                           kernel &reduce_kernel,
                           const uint_ vpt,
                           const uint_ tpb,
                           command_queue &queue)
{
    (void) function;

    size_t count = std::distance(first, last);

    reduce_kernel.set_arg(0, first.get_buffer());
    reduce_kernel.set_arg(1, uint_(first.get_index()));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result);
    reduce_kernel.set_arg(4, uint_(0));

    size_t work_size = calculate_work_size(count, vpt, tpb);

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);
}

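/// \internal
/// Reduces [first, last) with `function` and writes the single result to
/// `result`. The "reduce" kernel is compiled once per value type and cached;
/// the vpt/tpb tuning parameters come from the device parameter cache. The
/// first pass writes one partial sum per work-group into a temporary buffer,
/// further passes ping-pong between two temporaries while more than vpt*tpb
/// values remain, and a final single work-group pass writes the result.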
template<class InputIterator, class T, class Function>
inline void reduce_on_gpu(InputIterator first,
                          InputIterator last,
                          buffer_iterator<T> result,
                          Function function,
                          command_queue &queue)
{
    const device &device = queue.get_device();
    const context &context = queue.get_context();

    detail::meta_kernel k("reduce");
    k.add_arg<const T*>(memory_object::global_memory, "input");
    k.add_arg<const uint_>("offset");
    k.add_arg<const uint_>("count");
    k.add_arg<T*>(memory_object::global_memory, "output");
    k.add_arg<const uint_>("output_offset");

    k <<
        k.decl<const uint_>("block_offset") << " = get_group_id(0) * VPT * TPB;\n" <<
        "__global const " << type_name<T>() << " *block = input + offset + block_offset;\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<

        "__local " << type_name<T>() << " scratch[TPB];\n" <<
        // private reduction
        k.decl<T>("sum") << " = 0;\n" <<
        "for(uint i = 0; i < VPT; i++){\n" <<
        "    if(block_offset + lid + i*TPB < count){\n" <<
        "        sum = sum + block[lid+i*TPB]; \n" <<
        "    }\n" <<
        "}\n" <<

        "scratch[lid] = sum;\n";

    // choose the local reduction body based on the device vendor
    if(is_nvidia_device(device))
        k << ReduceBody<T,true>::body();
    else
        k << ReduceBody<T,false>::body();

    k <<
        // write sum to output
         "if(lid == 0){\n" <<
         "    output[output_offset + get_group_id(0)] = scratch[0];\n" <<
         "}\n";

    std::string cache_key = std::string("__boost_reduce_on_gpu_") + type_name<T>();

    // load parameters
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    uint_ vpt = parameters->get(cache_key, "vpt", 8);
    uint_ tpb = parameters->get(cache_key, "tpb", 128);

    // reduce program compiler flags
    std::stringstream options;
    options << "-DT=" << type_name<T>()
            << " -DVPT=" << vpt
            << " -DTPB=" << tpb;

    // load program
    boost::shared_ptr<program_cache> cache =
        program_cache::get_global_cache(context);

    program reduce_program = cache->get_or_build(
        cache_key, options.str(), k.source(), context
    );

    // create reduce kernel
    kernel reduce_kernel(reduce_program, "reduce");

    size_t count = std::distance(first, last);

    // first pass, reduce from input to ping
    buffer ping(context, std::ceil(float(count) / vpt / tpb) * sizeof(T));
    initial_reduce(first, last, ping, function, reduce_kernel, vpt, tpb, queue);

    // update count after initial reduce
    count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));

    // middle pass(es), reduce between ping and pong
    const buffer *input_buffer = &ping;
    buffer pong(context, static_cast<size_t>(count / vpt / tpb * sizeof(T)));
    const buffer *output_buffer = &pong;
    if(count > vpt * tpb){
        while(count > vpt * tpb){
            reduce_kernel.set_arg(0, *input_buffer);
            reduce_kernel.set_arg(1, uint_(0));
            reduce_kernel.set_arg(2, uint_(count));
            reduce_kernel.set_arg(3, *output_buffer);
            reduce_kernel.set_arg(4, uint_(0));

            size_t work_size = static_cast<size_t>(std::ceil(float(count) / vpt));
            if(work_size % tpb != 0){
                work_size += tpb - work_size % tpb;
            }
            queue.enqueue_1d_range_kernel(reduce_kernel, 0, work_size, tpb);

            std::swap(input_buffer, output_buffer);
            count = static_cast<size_t>(std::ceil(float(count) / vpt / tpb));
        }
    }

    // final pass, reduce from ping/pong to result
    reduce_kernel.set_arg(0, *input_buffer);
    reduce_kernel.set_arg(1, uint_(0));
    reduce_kernel.set_arg(2, uint_(count));
    reduce_kernel.set_arg(3, result.get_buffer());
    reduce_kernel.set_arg(4, uint_(result.get_index()));

    queue.enqueue_1d_range_kernel(reduce_kernel, 0, tpb, tpb);
}

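// A minimal usage sketch: boost::compute::reduce() typically ends up here
// for plus<> reductions on GPU devices, but a direct call looks roughly like
// the following (assuming a valid context and command_queue):
//
//   boost::compute::vector<int> data(1 << 20, 1, queue);   // 2^20 ones
//   boost::compute::vector<int> sum(1, context);
//   detail::reduce_on_gpu(
//       data.begin(), data.end(), sum.begin(),
//       boost::compute::plus<int>(), queue
//   );
//   int host_sum = 0;
//   boost::compute::copy_n(sum.begin(), 1, &host_sum, queue);
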
} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_REDUCE_ON_GPU_HPP