208 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			208 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| 
								 | 
							
								//---------------------------------------------------------------------------//
							 | 
						||
| 
								 | 
							
								// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
							 | 
						||
| 
								 | 
							
								//
							 | 
						||
| 
								 | 
							
								// Distributed under the Boost Software License, Version 1.0
							 | 
						||
| 
								 | 
							
								// See accompanying file LICENSE_1_0.txt or copy at
							 | 
						||
| 
								 | 
							
								// http://www.boost.org/LICENSE_1_0.txt
							 | 
						||
| 
								 | 
							
								//
							 | 
						||
| 
								 | 
							
								// See http://boostorg.github.com/compute for more information.
							 | 
						||
| 
								 | 
							
								//---------------------------------------------------------------------------//
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
							 | 
						||
| 
								 | 
							
								#define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include <iterator>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#include <boost/compute/device.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/kernel.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/command_queue.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/algorithm/detail/serial_scan.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/meta_kernel.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/iterator_range_size.hpp>
							 | 
						||
| 
								 | 
							
								#include <boost/compute/detail/parameter_cache.hpp>
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								namespace boost {
							 | 
						||
| 
								 | 
							
								namespace compute {
							 | 
						||
| 
								 | 
							
								namespace detail {
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								template<class InputIterator, class OutputIterator, class T, class BinaryOperator>
							 | 
						||
| 
								 | 
							
								inline OutputIterator scan_on_cpu(InputIterator first,
							 | 
						||
| 
								 | 
							
								                                  InputIterator last,
							 | 
						||
| 
								 | 
							
								                                  OutputIterator result,
							 | 
						||
| 
								 | 
							
								                                  bool exclusive,
							 | 
						||
| 
								 | 
							
								                                  T init,
							 | 
						||
| 
								 | 
							
								                                  BinaryOperator op,
							 | 
						||
| 
								 | 
							
								                                  command_queue &queue)
							 | 
						||
| 
								 | 
							
								{
							 | 
						||
| 
								 | 
							
								    typedef typename
							 | 
						||
| 
								 | 
							
								        std::iterator_traits<InputIterator>::value_type input_type;
							 | 
						||
| 
								 | 
							
								    typedef typename
							 | 
						||
| 
								 | 
							
								        std::iterator_traits<OutputIterator>::value_type output_type;
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    const context &context = queue.get_context();
							 | 
						||
| 
								 | 
							
								    const device &device = queue.get_device();
							 | 
						||
| 
								 | 
							
								    const size_t compute_units = queue.get_device().compute_units();
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    boost::shared_ptr<parameter_cache> parameters =
							 | 
						||
| 
								 | 
							
								        detail::parameter_cache::get_global_cache(device);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    std::string cache_key =
							 | 
						||
| 
								 | 
							
								        "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T));
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // for inputs smaller than serial_scan_threshold
							 | 
						||
| 
								 | 
							
								    // serial_scan algorithm is used
							 | 
						||
| 
								 | 
							
								    uint_ serial_scan_threshold =
							 | 
						||
| 
								 | 
							
								        parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T));
							 | 
						||
| 
								 | 
							
								    serial_scan_threshold =
							 | 
						||
| 
								 | 
							
								        (std::max)(serial_scan_threshold, uint_(compute_units));
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    size_t count = detail::iterator_range_size(first, last);
							 | 
						||
| 
								 | 
							
								    if(count == 0){
							 | 
						||
| 
								 | 
							
								        return result;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else if(count < serial_scan_threshold) {
							 | 
						||
| 
								 | 
							
								        return serial_scan(first, last, result, exclusive, init, op, queue);
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    buffer block_partial_sums(context, sizeof(output_type) * compute_units );
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // create scan kernel
							 | 
						||
| 
								 | 
							
								    meta_kernel k("scan_on_cpu_block_scan");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // Arguments
							 | 
						||
| 
								 | 
							
								    size_t count_arg = k.add_arg<uint_>("count");
							 | 
						||
| 
								 | 
							
								    size_t init_arg = k.add_arg<output_type>("initial_value");
							 | 
						||
| 
								 | 
							
								    size_t block_partial_sums_arg =
							 | 
						||
| 
								 | 
							
								        k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								        "uint block = " <<
							 | 
						||
| 
								 | 
							
								            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
							 | 
						||
| 
								 | 
							
								        "uint index = get_global_id(0) * block;\n" <<
							 | 
						||
| 
								 | 
							
								        "uint end = min(count, index + block);\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if(!exclusive){
							 | 
						||
| 
								 | 
							
								        k <<
							 | 
						||
| 
								 | 
							
								            k.decl<output_type>("sum") << " = " <<
							 | 
						||
| 
								 | 
							
								                first[k.var<uint_>("index")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								            result[k.var<uint_>("index")] << " = sum;\n" <<
							 | 
						||
| 
								 | 
							
								            "index++;\n";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        k <<
							 | 
						||
| 
								 | 
							
								            k.decl<output_type>("sum") << ";\n" <<
							 | 
						||
| 
								 | 
							
								            "if(index == 0){\n" <<
							 | 
						||
| 
								 | 
							
								                "sum = initial_value;\n" <<
							 | 
						||
| 
								 | 
							
								            "}\n" <<
							 | 
						||
| 
								 | 
							
								            "else {\n" <<
							 | 
						||
| 
								 | 
							
								                "sum = " << first[k.var<uint_>("index")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								                "index++;\n" <<
							 | 
						||
| 
								 | 
							
								            "}\n";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								        "while(index < end){\n" <<
							 | 
						||
| 
								 | 
							
								            // load next value
							 | 
						||
| 
								 | 
							
								            k.decl<const input_type>("value") << " = "
							 | 
						||
| 
								 | 
							
								                << first[k.var<uint_>("index")] << ";\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if(exclusive){
							 | 
						||
| 
								 | 
							
								        k <<
							 | 
						||
| 
								 | 
							
								            "if(get_global_id(0) == 0){\n" <<
							 | 
						||
| 
								 | 
							
								                result[k.var<uint_>("index")] << " = sum;\n" <<
							 | 
						||
| 
								 | 
							
								            "}\n";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								            "sum = " << op(k.var<output_type>("sum"),
							 | 
						||
| 
								 | 
							
								                           k.var<output_type>("value")) << ";\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    if(!exclusive){
							 | 
						||
| 
								 | 
							
								        k <<
							 | 
						||
| 
								 | 
							
								            "if(get_global_id(0) == 0){\n" <<
							 | 
						||
| 
								 | 
							
								                result[k.var<uint_>("index")] << " = sum;\n" <<
							 | 
						||
| 
								 | 
							
								            "}\n";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    k <<
							 | 
						||
| 
								 | 
							
								            "index++;\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n" << // end while
							 | 
						||
| 
								 | 
							
								        "block_partial_sums[get_global_id(0)] = sum;\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // compile scan kernel
							 | 
						||
| 
								 | 
							
								    kernel block_scan_kernel = k.compile(context);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // setup kernel arguments
							 | 
						||
| 
								 | 
							
								    block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
							 | 
						||
| 
								 | 
							
								    block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init));
							 | 
						||
| 
								 | 
							
								    block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // execute the kernel
							 | 
						||
| 
								 | 
							
								    size_t global_work_size = compute_units;
							 | 
						||
| 
								 | 
							
								    queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // scan is done
							 | 
						||
| 
								 | 
							
								    if(compute_units < 2) {
							 | 
						||
| 
								 | 
							
								        return result + count;
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // final scan kernel
							 | 
						||
| 
								 | 
							
								    meta_kernel l("scan_on_cpu_final_scan");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // Arguments
							 | 
						||
| 
								 | 
							
								    count_arg = l.add_arg<uint_>("count");
							 | 
						||
| 
								 | 
							
								    block_partial_sums_arg =
							 | 
						||
| 
								 | 
							
								        l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums");
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    l <<
							 | 
						||
| 
								 | 
							
								        "uint block = " <<
							 | 
						||
| 
								 | 
							
								            "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" <<
							 | 
						||
| 
								 | 
							
								        "uint index = block + get_global_id(0) * block;\n" <<
							 | 
						||
| 
								 | 
							
								        "uint end = min(count, index + block);\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        k.decl<output_type>("sum") << " = block_partial_sums[0];\n" <<
							 | 
						||
| 
								 | 
							
								        "for(uint i = 0; i < get_global_id(0); i++) {\n" <<
							 | 
						||
| 
								 | 
							
								            "sum = " << op(k.var<output_type>("sum"),
							 | 
						||
| 
								 | 
							
								                           k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n" <<
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								        "while(index < end){\n";
							 | 
						||
| 
								 | 
							
								    if(exclusive){
							 | 
						||
| 
								 | 
							
								        l <<
							 | 
						||
| 
								 | 
							
								            l.decl<output_type>("value") << " = "
							 | 
						||
| 
								 | 
							
								                << first[k.var<uint_>("index")] << ";\n" <<
							 | 
						||
| 
								 | 
							
								            result[k.var<uint_>("index")] << " = sum;\n" <<
							 | 
						||
| 
								 | 
							
								            "sum = " << op(k.var<output_type>("sum"),
							 | 
						||
| 
								 | 
							
								                           k.var<output_type>("value")) << ";\n";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    else {
							 | 
						||
| 
								 | 
							
								        l <<
							 | 
						||
| 
								 | 
							
								            "sum = " << op(k.var<output_type>("sum"),
							 | 
						||
| 
								 | 
							
								                           first[k.var<uint_>("index")]) << ";\n" <<
							 | 
						||
| 
								 | 
							
								            result[k.var<uint_>("index")] << " = sum;\n";
							 | 
						||
| 
								 | 
							
								    }
							 | 
						||
| 
								 | 
							
								    l <<
							 | 
						||
| 
								 | 
							
								            "index++;\n" <<
							 | 
						||
| 
								 | 
							
								        "}\n";
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // compile scan kernel
							 | 
						||
| 
								 | 
							
								    kernel final_scan_kernel = l.compile(context);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // setup kernel arguments
							 | 
						||
| 
								 | 
							
								    final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count));
							 | 
						||
| 
								 | 
							
								    final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // execute the kernel
							 | 
						||
| 
								 | 
							
								    global_work_size = compute_units;
							 | 
						||
| 
								 | 
							
								    queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0);
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								    // return iterator pointing to the end of the result range
							 | 
						||
| 
								 | 
							
								    return result + count;
							 | 
						||
| 
								 | 
							
								}
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								} // end detail namespace
							 | 
						||
| 
								 | 
							
								} // end compute namespace
							 | 
						||
| 
								 | 
							
								} // end boost namespace
							 | 
						||
| 
								 | 
							
								
							 | 
						||
| 
								 | 
							
								#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP
							 |