444 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			444 lines
		
	
	
		
			19 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| 
								 | 
							
								//---------------------------------------------------------------------------//
							 | 
						||
| 
								 | 
							
								// Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com>
							 | 
						||
| 
								 | 
							
								//
							 | 
						||
| 
								 | 
							
								// Distributed under the Boost Software License, Version 1.0
							 | 
						||
| 
								 | 
							
								// See accompanying file LICENSE_1_0.txt or copy at
							 | 
						||
| 
								 | 
							
								// http://www.boost.org/LICENSE_1_0.txt
							 | 
						||
| 
								 | 
							
								//
							 | 
						||
| 
								 | 
							
								// See http://boostorg.github.com/compute for more information.
							 | 
						||
| 
								 | 
							
								//---------------------------------------------------------------------------//
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
							 | 
						||
| 
								 | 
							
								#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include <algorithm>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include <boost/compute/types.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/command_queue.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/algorithm/copy.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/allocator/pinned_allocator.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/container/vector.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/meta_kernel.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/iterator_range_size.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/parameter_cache.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/memory/local_buffer.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/type_traits/type_name.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/utility/program_cache.hpp>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								namespace boost {
							 | 
						||
| 
								 | 
							
								namespace compute {
							 | 
						||
| 
								 | 
							
								namespace detail {
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class InputIterator>
							 | 
						||
| 
								 | 
							
								bool find_extrema_with_reduce_requirements_met(InputIterator first,
							 | 
						||
| 
								 | 
							
								                                               InputIterator last,
							 | 
						||
| 
								 | 
							
								                                               command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const device &device = queue.get_device();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // device must have dedicated local memory storage
							 | 
						||
| 
								 | 
							
								    // otherwise reduction would be highly inefficient
							 | 
						||
| 
								 | 
							
								    if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL)
							 | 
						||
| 
								 | 
							
								    {
							 | 
						||
| 
								 | 
							
								        return false;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
							 | 
						||
| 
								 | 
							
								    // local memory size in bytes (per compute unit)
							 | 
						||
| 
								 | 
							
								    const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    std::string cache_key = std::string("__boost_find_extrema_reduce_")
							 | 
						||
| 
								 | 
							
								        + type_name<input_type>();
							 | 
						||
| 
								 | 
							
								    // load parameters
							 | 
						||
| 
								 | 
							
								    boost::shared_ptr<parameter_cache> parameters =
							 | 
						||
| 
								 | 
							
								        detail::parameter_cache::get_global_cache(device);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // Get preferred work group size
							 | 
						||
| 
								 | 
							
								    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    work_group_size = (std::min)(max_work_group_size, work_group_size);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // local memory size needed to perform parallel reduction
							 | 
						||
| 
								 | 
							
								    size_t required_local_mem_size = 0;
							 | 
						||
| 
								 | 
							
								    // indices size
							 | 
						||
| 
								 | 
							
								    required_local_mem_size += sizeof(uint_) * work_group_size;
							 | 
						||
| 
								 | 
							
								    // values size
							 | 
						||
| 
								 | 
							
								    required_local_mem_size += sizeof(input_type) * work_group_size;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // at least 4 work groups per compute unit otherwise reduction
							 | 
						||
| 
								 | 
							
								    // would be highly inefficient
							 | 
						||
| 
								 | 
							
								    return ((required_local_mem_size * 4) <= local_mem_size);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								/// \internal_
							 | 
						||
| 
								 | 
							
								/// Algorithm finds the first extremum in given range, i.e., with the lowest
							 | 
						||
| 
								 | 
							
								/// index.
							 | 
						||
| 
								 | 
							
								///
							 | 
						||
| 
								 | 
							
								/// If \p use_input_idx is false, it's assumed that input data is ordered by
							 | 
						||
| 
								 | 
							
								/// increasing index and \p input_idx is not used in the algorithm.
							 | 
						||
| 
								 | 
							
								template<class InputIterator, class ResultIterator, class Compare>
							 | 
						||
| 
								 | 
							
								inline void find_extrema_with_reduce(InputIterator input,
							 | 
						||
| 
								 | 
							
								                                     vector<uint_>::iterator input_idx,
							 | 
						||
| 
								 | 
							
								                                     size_t count,
							 | 
						||
| 
								 | 
							
								                                     ResultIterator result,
							 | 
						||
| 
								 | 
							
								                                     vector<uint_>::iterator result_idx,
							 | 
						||
| 
								 | 
							
								                                     size_t work_groups_no,
							 | 
						||
| 
								 | 
							
								                                     size_t work_group_size,
							 | 
						||
| 
								 | 
							
								                                     Compare compare,
							 | 
						||
| 
								 | 
							
								                                     const bool find_minimum,
							 | 
						||
| 
								 | 
							
								                                     const bool use_input_idx,
							 | 
						||
| 
								 | 
							
								                                     command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const context &context = queue.get_context();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    meta_kernel k("find_extrema_reduce");
							 | 
						||
| 
								 | 
							
								    size_t count_arg = k.add_arg<uint_>("count");
							 | 
						||
| 
								 | 
							
								    size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
							 | 
						||
| 
								 | 
							
								    size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								        // Work item global id
							 | 
						||
| 
								 | 
							
								        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        // Index of element that will be read from input buffer
							 | 
						||
| 
								 | 
							
								        k.decl<uint_>("idx") << " = gid;\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        k.decl<input_type>("acc") << ";\n" <<
							 | 
						||
| 
								 | 
							
								        k.decl<uint_>("acc_idx") << ";\n" <<
							 | 
						||
| 
								 | 
							
								        "if(gid < count) {\n" <<
							 | 
						||
| 
								 | 
							
								            // Real index of currently best element
							 | 
						||
| 
								 | 
							
								            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
							 | 
						||
| 
								 | 
							
								            k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "#else\n" <<
							 | 
						||
| 
								 | 
							
								            k.var<uint_>("acc_idx") << " = idx;\n" <<
							 | 
						||
| 
								 | 
							
								            "#endif\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            // Init accumulator with first[get_global_id(0)]
							 | 
						||
| 
								 | 
							
								            "acc = " << input[k.var<uint_>("idx")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "idx += get_global_size(0);\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        k.decl<bool>("compare_result") << ";\n" <<
							 | 
						||
| 
								 | 
							
								        k.decl<bool>("equal") << ";\n\n" <<
							 | 
						||
| 
								 | 
							
								        "while( idx < count ){\n" <<
							 | 
						||
| 
								 | 
							
								            // Next element
							 | 
						||
| 
								 | 
							
								            k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
							 | 
						||
| 
								 | 
							
								            k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "#endif\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            // Comparison between currently best element (acc) and next element
							 | 
						||
| 
								 | 
							
								            "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
							 | 
						||
| 
								 | 
							
								            "compare_result = " << compare(k.var<input_type>("next"),
							 | 
						||
| 
								 | 
							
								                                           k.var<input_type>("acc")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
							 | 
						||
| 
								 | 
							
								            "equal = !compare_result && !" <<
							 | 
						||
| 
								 | 
							
								                compare(k.var<input_type>("acc"),
							 | 
						||
| 
								 | 
							
								                        k.var<input_type>("next")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "# endif\n" <<
							 | 
						||
| 
								 | 
							
								            "#else\n" <<
							 | 
						||
| 
								 | 
							
								            "compare_result = " << compare(k.var<input_type>("acc"),
							 | 
						||
| 
								 | 
							
								                                           k.var<input_type>("next")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
							 | 
						||
| 
								 | 
							
								            "equal = !compare_result && !" <<
							 | 
						||
| 
								 | 
							
								                compare(k.var<input_type>("next"),
							 | 
						||
| 
								 | 
							
								                        k.var<input_type>("acc")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "# endif\n" <<
							 | 
						||
| 
								 | 
							
								            "#endif\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								            // save the winner
							 | 
						||
| 
								 | 
							
								            "acc = compare_result ? acc : next;\n" <<
							 | 
						||
| 
								 | 
							
								            "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" <<
							 | 
						||
| 
								 | 
							
								            "acc_idx = compare_result ? " <<
							 | 
						||
| 
								 | 
							
								                "acc_idx : " <<
							 | 
						||
| 
								 | 
							
								                "(equal ? min(acc_idx, next_idx) : next_idx);\n" <<
							 | 
						||
| 
								 | 
							
								            "#else\n" <<
							 | 
						||
| 
								 | 
							
								            "acc_idx = compare_result ? acc_idx : idx;\n" <<
							 | 
						||
| 
								 | 
							
								            "#endif\n" <<
							 | 
						||
| 
								 | 
							
								            "idx += get_global_size(0);\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        // Work item local id
							 | 
						||
| 
								 | 
							
								        k.decl<const uint_>("lid") << " = get_local_id(0);\n" <<
							 | 
						||
| 
								 | 
							
								        "block[lid] = acc;\n" <<
							 | 
						||
| 
								 | 
							
								        "block_idx[lid] = acc_idx;\n" <<
							 | 
						||
| 
								 | 
							
								        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        k.decl<uint_>("group_offset") <<
							 | 
						||
| 
								 | 
							
								            " = count - (get_local_size(0) * get_group_id(0));\n\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								        "#pragma unroll\n"
							 | 
						||
| 
								 | 
							
								        "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " <<
							 | 
						||
| 
								 | 
							
								             "offset = offset / 2) {\n" <<
							 | 
						||
| 
								 | 
							
								             "if((lid < offset) && ((lid + offset) < group_offset)) { \n" <<
							 | 
						||
| 
								 | 
							
								                 k.decl<input_type>("mine") << " = block[lid];\n" <<
							 | 
						||
| 
								 | 
							
								                 k.decl<input_type>("other") << " = block[lid+offset];\n" <<
							 | 
						||
| 
								 | 
							
								                 "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" <<
							 | 
						||
| 
								 | 
							
								                 "compare_result = " << compare(k.var<input_type>("other"),
							 | 
						||
| 
								 | 
							
								                                                k.var<input_type>("mine")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								                 "equal = !compare_result && !" <<
							 | 
						||
| 
								 | 
							
								                     compare(k.var<input_type>("mine"),
							 | 
						||
| 
								 | 
							
								                             k.var<input_type>("other")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								                 "#else\n" <<
							 | 
						||
| 
								 | 
							
								                 "compare_result = " << compare(k.var<input_type>("mine"),
							 | 
						||
| 
								 | 
							
								                                                k.var<input_type>("other")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								                 "equal = !compare_result && !" <<
							 | 
						||
| 
								 | 
							
								                     compare(k.var<input_type>("other"),
							 | 
						||
| 
								 | 
							
								                             k.var<input_type>("mine")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								                 "#endif\n" <<
							 | 
						||
| 
								 | 
							
								                 "block[lid] = compare_result ? mine : other;\n" <<
							 | 
						||
| 
								 | 
							
								                 k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" <<
							 | 
						||
| 
								 | 
							
								                 k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" <<
							 | 
						||
| 
								 | 
							
								                 "block_idx[lid] = compare_result ? " <<
							 | 
						||
| 
								 | 
							
								                     "mine_idx : " <<
							 | 
						||
| 
								 | 
							
								                     "(equal ? min(mine_idx, other_idx) : other_idx);\n" <<
							 | 
						||
| 
								 | 
							
								             "}\n"
							 | 
						||
| 
								 | 
							
								             "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								         // write block result to global output
							 | 
						||
| 
								 | 
							
								        "if(lid == 0){\n" <<
							 | 
						||
| 
								 | 
							
								            result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" <<
							 | 
						||
| 
								 | 
							
								            result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" <<
							 | 
						||
| 
								 | 
							
								        "}";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    std::string options;
							 | 
						||
| 
								 | 
							
								    if(!find_minimum){
							 | 
						||
| 
								 | 
							
								        options = "-DBOOST_COMPUTE_FIND_MAXIMUM";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    if(use_input_idx){
							 | 
						||
| 
								 | 
							
								        options += " -DBOOST_COMPUTE_USE_INPUT_IDX";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    kernel kernel = k.compile(context, options);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    kernel.set_arg(count_arg, static_cast<uint_>(count));
							 | 
						||
| 
								 | 
							
								    kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size));
							 | 
						||
| 
								 | 
							
								    kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size));
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    queue.enqueue_1d_range_kernel(kernel,
							 | 
						||
| 
								 | 
							
								                                  0,
							 | 
						||
| 
								 | 
							
								                                  work_groups_no * work_group_size,
							 | 
						||
| 
								 | 
							
								                                  work_group_size);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class InputIterator, class ResultIterator, class Compare>
							 | 
						||
| 
								 | 
							
								inline void find_extrema_with_reduce(InputIterator input,
							 | 
						||
| 
								 | 
							
								                                     size_t count,
							 | 
						||
| 
								 | 
							
								                                     ResultIterator result,
							 | 
						||
| 
								 | 
							
								                                     vector<uint_>::iterator result_idx,
							 | 
						||
| 
								 | 
							
								                                     size_t work_groups_no,
							 | 
						||
| 
								 | 
							
								                                     size_t work_group_size,
							 | 
						||
| 
								 | 
							
								                                     Compare compare,
							 | 
						||
| 
								 | 
							
								                                     const bool find_minimum,
							 | 
						||
| 
								 | 
							
								                                     command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    // dummy will not be used
							 | 
						||
| 
								 | 
							
								    buffer_iterator<uint_> dummy = result_idx;
							 | 
						||
| 
								 | 
							
								    return find_extrema_with_reduce(
							 | 
						||
| 
								 | 
							
								        input, dummy, count, result, result_idx, work_groups_no,
							 | 
						||
| 
								 | 
							
								        work_group_size, compare, find_minimum, false, queue
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class InputIterator, class Compare>
							 | 
						||
| 
								 | 
							
								InputIterator find_extrema_with_reduce(InputIterator first,
							 | 
						||
| 
								 | 
							
								                                       InputIterator last,
							 | 
						||
| 
								 | 
							
								                                       Compare compare,
							 | 
						||
| 
								 | 
							
								                                       const bool find_minimum,
							 | 
						||
| 
								 | 
							
								                                       command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
							 | 
						||
| 
								 | 
							
								    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const context &context = queue.get_context();
							 | 
						||
| 
								 | 
							
								    const device &device = queue.get_device();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // Getting information about used queue and device
							 | 
						||
| 
								 | 
							
								    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
							 | 
						||
| 
								 | 
							
								    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const size_t count = detail::iterator_range_size(first, last);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
							 | 
						||
| 
								 | 
							
								        + type_name<input_type>();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // load parameters
							 | 
						||
| 
								 | 
							
								    boost::shared_ptr<parameter_cache> parameters =
							 | 
						||
| 
								 | 
							
								        detail::parameter_cache::get_global_cache(device);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // get preferred work group size and preferred number
							 | 
						||
| 
								 | 
							
								    // of work groups per compute unit
							 | 
						||
| 
								 | 
							
								    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
							 | 
						||
| 
								 | 
							
								    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // calculate work group size and number of work groups
							 | 
						||
| 
								 | 
							
								    work_group_size = (std::min)(max_work_group_size, work_group_size);
							 | 
						||
| 
								 | 
							
								    size_t work_groups_no = compute_units_no * work_groups_per_cu;
							 | 
						||
| 
								 | 
							
								    work_groups_no = (std::min)(
							 | 
						||
| 
								 | 
							
								        work_groups_no,
							 | 
						||
| 
								 | 
							
								        static_cast<size_t>(std::ceil(float(count) / work_group_size))
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // phase I: finding candidates for extremum
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // device buffors for extremum candidates and their indices
							 | 
						||
| 
								 | 
							
								    // each work-group computes its candidate
							 | 
						||
| 
								 | 
							
								    vector<input_type> candidates(work_groups_no, context);
							 | 
						||
| 
								 | 
							
								    vector<uint_> candidates_idx(work_groups_no, context);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // finding candidates for first extremum and their indices
							 | 
						||
| 
								 | 
							
								    find_extrema_with_reduce(
							 | 
						||
| 
								 | 
							
								        first, count, candidates.begin(), candidates_idx.begin(),
							 | 
						||
| 
								 | 
							
								        work_groups_no, work_group_size, compare, find_minimum, queue
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // phase II: finding extremum from among the candidates
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // zero-copy buffers for final result (value and index)
							 | 
						||
| 
								 | 
							
								    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
							 | 
						||
| 
								 | 
							
								        result(1, context);
							 | 
						||
| 
								 | 
							
								    vector<uint_, ::boost::compute::pinned_allocator<uint_> >
							 | 
						||
| 
								 | 
							
								        result_idx(1, context);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // get extremum from among the candidates
							 | 
						||
| 
								 | 
							
								    find_extrema_with_reduce(
							 | 
						||
| 
								 | 
							
								        candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(),
							 | 
						||
| 
								 | 
							
								        result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // mapping extremum index to host
							 | 
						||
| 
								 | 
							
								    uint_* result_idx_host_ptr =
							 | 
						||
| 
								 | 
							
								        static_cast<uint_*>(
							 | 
						||
| 
								 | 
							
								            queue.enqueue_map_buffer(
							 | 
						||
| 
								 | 
							
								                result_idx.get_buffer(), command_queue::map_read,
							 | 
						||
| 
								 | 
							
								                0, sizeof(uint_)
							 | 
						||
| 
								 | 
							
								            )
							 | 
						||
| 
								 | 
							
								        );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return first + static_cast<difference_type>(*result_idx_host_ptr);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class InputIterator>
							 | 
						||
| 
								 | 
							
								InputIterator find_extrema_with_reduce(InputIterator first,
							 | 
						||
| 
								 | 
							
								                                       InputIterator last,
							 | 
						||
| 
								 | 
							
								                                       ::boost::compute::less<
							 | 
						||
| 
								 | 
							
								                                           typename std::iterator_traits<
							 | 
						||
| 
								 | 
							
								                                               InputIterator
							 | 
						||
| 
								 | 
							
								                                           >::value_type
							 | 
						||
| 
								 | 
							
								                                       >
							 | 
						||
| 
								 | 
							
								                                       compare,
							 | 
						||
| 
								 | 
							
								                                       const bool find_minimum,
							 | 
						||
| 
								 | 
							
								                                       command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
							 | 
						||
| 
								 | 
							
								    typedef typename std::iterator_traits<InputIterator>::value_type input_type;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const context &context = queue.get_context();
							 | 
						||
| 
								 | 
							
								    const device &device = queue.get_device();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // Getting information about used queue and device
							 | 
						||
| 
								 | 
							
								    const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
							 | 
						||
| 
								 | 
							
								    const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const size_t count = detail::iterator_range_size(first, last);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    std::string cache_key = std::string("__boost_find_extrema_with_reduce_")
							 | 
						||
| 
								 | 
							
								        + type_name<input_type>();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // load parameters
							 | 
						||
| 
								 | 
							
								    boost::shared_ptr<parameter_cache> parameters =
							 | 
						||
| 
								 | 
							
								        detail::parameter_cache::get_global_cache(device);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // get preferred work group size and preferred number
							 | 
						||
| 
								 | 
							
								    // of work groups per compute unit
							 | 
						||
| 
								 | 
							
								    size_t work_group_size = parameters->get(cache_key, "wgsize", 256);
							 | 
						||
| 
								 | 
							
								    size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // calculate work group size and number of work groups
							 | 
						||
| 
								 | 
							
								    work_group_size = (std::min)(max_work_group_size, work_group_size);
							 | 
						||
| 
								 | 
							
								    size_t work_groups_no = compute_units_no * work_groups_per_cu;
							 | 
						||
| 
								 | 
							
								    work_groups_no = (std::min)(
							 | 
						||
| 
								 | 
							
								        work_groups_no,
							 | 
						||
| 
								 | 
							
								        static_cast<size_t>(std::ceil(float(count) / work_group_size))
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // phase I: finding candidates for extremum
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // device buffors for extremum candidates and their indices
							 | 
						||
| 
								 | 
							
								    // each work-group computes its candidate
							 | 
						||
| 
								 | 
							
								    // zero-copy buffers are used to eliminate copying data back to host
							 | 
						||
| 
								 | 
							
								    vector<input_type, ::boost::compute::pinned_allocator<input_type> >
							 | 
						||
| 
								 | 
							
								        candidates(work_groups_no, context);
							 | 
						||
| 
								 | 
							
								    vector<uint_, ::boost::compute::pinned_allocator <uint_> >
							 | 
						||
| 
								 | 
							
								        candidates_idx(work_groups_no, context);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // finding candidates for first extremum and their indices
							 | 
						||
| 
								 | 
							
								    find_extrema_with_reduce(
							 | 
						||
| 
								 | 
							
								        first, count, candidates.begin(), candidates_idx.begin(),
							 | 
						||
| 
								 | 
							
								        work_groups_no, work_group_size, compare, find_minimum, queue
							 | 
						||
| 
								 | 
							
								    );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // phase II: finding extremum from among the candidates
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // mapping candidates and their indices to host
							 | 
						||
| 
								 | 
							
								    input_type* candidates_host_ptr =
							 | 
						||
| 
								 | 
							
								        static_cast<input_type*>(
							 | 
						||
| 
								 | 
							
								            queue.enqueue_map_buffer(
							 | 
						||
| 
								 | 
							
								                candidates.get_buffer(), command_queue::map_read,
							 | 
						||
| 
								 | 
							
								                0, work_groups_no * sizeof(input_type)
							 | 
						||
| 
								 | 
							
								            )
							 | 
						||
| 
								 | 
							
								        );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    uint_* candidates_idx_host_ptr =
							 | 
						||
| 
								 | 
							
								        static_cast<uint_*>(
							 | 
						||
| 
								 | 
							
								            queue.enqueue_map_buffer(
							 | 
						||
| 
								 | 
							
								                candidates_idx.get_buffer(), command_queue::map_read,
							 | 
						||
| 
								 | 
							
								                0, work_groups_no * sizeof(uint_)
							 | 
						||
| 
								 | 
							
								            )
							 | 
						||
| 
								 | 
							
								        );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    input_type* i = candidates_host_ptr;
							 | 
						||
| 
								 | 
							
								    uint_* idx = candidates_idx_host_ptr;
							 | 
						||
| 
								 | 
							
								    uint_* extremum_idx = idx;
							 | 
						||
| 
								 | 
							
								    input_type extremum = *candidates_host_ptr;
							 | 
						||
| 
								 | 
							
								    i++; idx++;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // find extremum (serial) from among the candidates on host
							 | 
						||
| 
								 | 
							
								    if(!find_minimum) {
							 | 
						||
| 
								 | 
							
								        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
							 | 
						||
| 
								 | 
							
								            input_type next = *i;
							 | 
						||
| 
								 | 
							
								            bool compare_result =  next > extremum;
							 | 
						||
| 
								 | 
							
								            bool equal = next == extremum;
							 | 
						||
| 
								 | 
							
								            extremum = compare_result ? next : extremum;
							 | 
						||
| 
								 | 
							
								            extremum_idx = compare_result ? idx : extremum_idx;
							 | 
						||
| 
								 | 
							
								            extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
							 | 
						||
| 
								 | 
							
								            idx++, i++;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        while(idx != (candidates_idx_host_ptr + work_groups_no)) {
							 | 
						||
| 
								 | 
							
								            input_type next = *i;
							 | 
						||
| 
								 | 
							
								            bool compare_result = next < extremum;
							 | 
						||
| 
								 | 
							
								            bool equal = next == extremum;
							 | 
						||
| 
								 | 
							
								            extremum = compare_result ? next : extremum;
							 | 
						||
| 
								 | 
							
								            extremum_idx = compare_result ? idx : extremum_idx;
							 | 
						||
| 
								 | 
							
								            extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx;
							 | 
						||
| 
								 | 
							
								            idx++, i++;
							 | 
						||
| 
								 | 
							
								        }
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    return first + static_cast<difference_type>(*extremum_idx);
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								} // end detail namespace
							 | 
						||
| 
								 | 
							
								} // end compute namespace
							 | 
						||
| 
								 | 
							
								} // end boost namespace
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP
							 |