137 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			137 lines
		
	
	
		
			4.9 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | //---------------------------------------------------------------------------// | ||
|  | // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> | ||
|  | // | ||
|  | // Distributed under the Boost Software License, Version 1.0 | ||
|  | // See accompanying file LICENSE_1_0.txt or copy at | ||
|  | // http://www.boost.org/LICENSE_1_0.txt | ||
|  | // | ||
|  | // See http://boostorg.github.com/compute for more information. | ||
|  | //---------------------------------------------------------------------------// | ||
|  | 
 | ||
|  | #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP | ||
|  | #define BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP | ||
|  | 
 | ||
|  | #include <iterator> | ||
|  | 
 | ||
|  | #include <boost/utility/result_of.hpp> | ||
|  | 
 | ||
|  | #include <boost/compute/command_queue.hpp> | ||
|  | #include <boost/compute/container/vector.hpp> | ||
|  | #include <boost/compute/detail/iterator_range_size.hpp> | ||
|  | #include <boost/compute/memory/local_buffer.hpp> | ||
|  | 
 | ||
|  | namespace boost { | ||
|  | namespace compute { | ||
|  | namespace detail { | ||
|  | 
 | ||
|  | template<class Iterator, class BinaryFunction> | ||
|  | inline void inplace_reduce(Iterator first, | ||
|  |                            Iterator last, | ||
|  |                            BinaryFunction function, | ||
|  |                            command_queue &queue) | ||
|  | { | ||
|  |     typedef typename | ||
|  |         std::iterator_traits<Iterator>::value_type | ||
|  |         value_type; | ||
|  | 
 | ||
|  |     size_t input_size = iterator_range_size(first, last); | ||
|  |     if(input_size < 2){ | ||
|  |         return; | ||
|  |     } | ||
|  | 
 | ||
|  |     const context &context = queue.get_context(); | ||
|  | 
 | ||
|  |     size_t block_size = 64; | ||
|  |     size_t values_per_thread = 8; | ||
|  |     size_t block_count = input_size / (block_size * values_per_thread); | ||
|  |     if(block_count * block_size * values_per_thread != input_size) | ||
|  |         block_count++; | ||
|  | 
 | ||
|  |     vector<value_type> output(block_count, context); | ||
|  | 
 | ||
|  |     meta_kernel k("inplace_reduce"); | ||
|  |     size_t input_arg = k.add_arg<value_type *>(memory_object::global_memory, "input"); | ||
|  |     size_t input_size_arg = k.add_arg<const uint_>("input_size"); | ||
|  |     size_t output_arg = k.add_arg<value_type *>(memory_object::global_memory, "output"); | ||
|  |     size_t scratch_arg = k.add_arg<value_type *>(memory_object::local_memory, "scratch"); | ||
|  |     k << | ||
|  |         "const uint gid = get_global_id(0);\n" << | ||
|  |         "const uint lid = get_local_id(0);\n" << | ||
|  |         "const uint values_per_thread =\n" | ||
|  |             << uint_(values_per_thread) << ";\n" << | ||
|  | 
 | ||
|  |         // thread reduce | ||
|  |         "const uint index = gid * values_per_thread;\n" << | ||
|  |         "if(index < input_size){\n" << | ||
|  |             k.decl<value_type>("sum") << " = input[index];\n" << | ||
|  |             "for(uint i = 1;\n" << | ||
|  |                  "i < values_per_thread && (index + i) < input_size;\n" << | ||
|  |                  "i++){\n" << | ||
|  |             "    sum = " << | ||
|  |                      function(k.var<value_type>("sum"), | ||
|  |                               k.var<value_type>("input[index+i]")) << ";\n" << | ||
|  |             "}\n" << | ||
|  |             "scratch[lid] = sum;\n" << | ||
|  |         "}\n" << | ||
|  | 
 | ||
|  |         // local reduce | ||
|  |         "for(uint i = 1; i < get_local_size(0); i <<= 1){\n" << | ||
|  |         "    barrier(CLK_LOCAL_MEM_FENCE);\n" << | ||
|  |         "    uint mask = (i << 1) - 1;\n" << | ||
|  |         "    uint next_index = (gid + i) * values_per_thread;\n" | ||
|  |         "    if((lid & mask) == 0 && next_index < input_size){\n" << | ||
|  |         "        scratch[lid] = " << | ||
|  |                      function(k.var<value_type>("scratch[lid]"), | ||
|  |                               k.var<value_type>("scratch[lid+i]")) << ";\n" << | ||
|  |         "    }\n" << | ||
|  |         "}\n" << | ||
|  | 
 | ||
|  |         // write output for block | ||
|  |         "if(lid == 0){\n" << | ||
|  |         "    output[get_group_id(0)] = scratch[0];\n" << | ||
|  |         "}\n" | ||
|  |         ; | ||
|  | 
 | ||
|  |     const buffer *input_buffer = &first.get_buffer(); | ||
|  |     const buffer *output_buffer = &output.get_buffer(); | ||
|  | 
 | ||
|  |     kernel kernel = k.compile(context); | ||
|  | 
 | ||
|  |     while(input_size > 1){ | ||
|  |         kernel.set_arg(input_arg, *input_buffer); | ||
|  |         kernel.set_arg(input_size_arg, static_cast<uint_>(input_size)); | ||
|  |         kernel.set_arg(output_arg, *output_buffer); | ||
|  |         kernel.set_arg(scratch_arg, local_buffer<value_type>(block_size)); | ||
|  | 
 | ||
|  |         queue.enqueue_1d_range_kernel(kernel, | ||
|  |                                       0, | ||
|  |                                       block_count * block_size, | ||
|  |                                       block_size); | ||
|  | 
 | ||
|  |         input_size = | ||
|  |             static_cast<size_t>( | ||
|  |                 std::ceil(float(input_size) / (block_size * values_per_thread) | ||
|  |             ) | ||
|  |         ); | ||
|  | 
 | ||
|  |         block_count = input_size / (block_size * values_per_thread); | ||
|  |         if(block_count * block_size * values_per_thread != input_size) | ||
|  |             block_count++; | ||
|  | 
 | ||
|  |         std::swap(input_buffer, output_buffer); | ||
|  |     } | ||
|  | 
 | ||
|  |     if(input_buffer != &first.get_buffer()){ | ||
|  |         ::boost::compute::copy(output.begin(), | ||
|  |                                output.begin() + 1, | ||
|  |                                first, | ||
|  |                                queue); | ||
|  |     } | ||
|  | } | ||
|  | 
 | ||
|  | } // end detail namespace | ||
|  | } // end compute namespace | ||
|  | } // end boost namespace | ||
|  | 
 | ||
|  | #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_INPLACE_REDUCE_HPP |