208 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			208 lines
		
	
	
		
			7.1 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | //---------------------------------------------------------------------------// | ||
|  | // Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com> | ||
|  | // | ||
|  | // Distributed under the Boost Software License, Version 1.0 | ||
|  | // See accompanying file LICENSE_1_0.txt or copy at | ||
|  | // http://www.boost.org/LICENSE_1_0.txt | ||
|  | // | ||
|  | // See http://boostorg.github.com/compute for more information. | ||
|  | //---------------------------------------------------------------------------// | ||
|  | 
 | ||
|  | #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP | ||
|  | #define BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP | ||
|  | 
 | ||
|  | #include <iterator> | ||
|  | 
 | ||
|  | #include <boost/compute/device.hpp> | ||
|  | #include <boost/compute/kernel.hpp> | ||
|  | #include <boost/compute/command_queue.hpp> | ||
|  | #include <boost/compute/algorithm/detail/serial_scan.hpp> | ||
|  | #include <boost/compute/detail/meta_kernel.hpp> | ||
|  | #include <boost/compute/detail/iterator_range_size.hpp> | ||
|  | #include <boost/compute/detail/parameter_cache.hpp> | ||
|  | 
 | ||
|  | namespace boost { | ||
|  | namespace compute { | ||
|  | namespace detail { | ||
|  | 
 | ||
|  | template<class InputIterator, class OutputIterator, class T, class BinaryOperator> | ||
|  | inline OutputIterator scan_on_cpu(InputIterator first, | ||
|  |                                   InputIterator last, | ||
|  |                                   OutputIterator result, | ||
|  |                                   bool exclusive, | ||
|  |                                   T init, | ||
|  |                                   BinaryOperator op, | ||
|  |                                   command_queue &queue) | ||
|  | { | ||
|  |     typedef typename | ||
|  |         std::iterator_traits<InputIterator>::value_type input_type; | ||
|  |     typedef typename | ||
|  |         std::iterator_traits<OutputIterator>::value_type output_type; | ||
|  | 
 | ||
|  |     const context &context = queue.get_context(); | ||
|  |     const device &device = queue.get_device(); | ||
|  |     const size_t compute_units = queue.get_device().compute_units(); | ||
|  | 
 | ||
|  |     boost::shared_ptr<parameter_cache> parameters = | ||
|  |         detail::parameter_cache::get_global_cache(device); | ||
|  | 
 | ||
|  |     std::string cache_key = | ||
|  |         "__boost_scan_cpu_" + boost::lexical_cast<std::string>(sizeof(T)); | ||
|  | 
 | ||
|  |     // for inputs smaller than serial_scan_threshold | ||
|  |     // serial_scan algorithm is used | ||
|  |     uint_ serial_scan_threshold = | ||
|  |         parameters->get(cache_key, "serial_scan_threshold", 16384 * sizeof(T)); | ||
|  |     serial_scan_threshold = | ||
|  |         (std::max)(serial_scan_threshold, uint_(compute_units)); | ||
|  | 
 | ||
|  |     size_t count = detail::iterator_range_size(first, last); | ||
|  |     if(count == 0){ | ||
|  |         return result; | ||
|  |     } | ||
|  |     else if(count < serial_scan_threshold) { | ||
|  |         return serial_scan(first, last, result, exclusive, init, op, queue); | ||
|  |     } | ||
|  | 
 | ||
|  |     buffer block_partial_sums(context, sizeof(output_type) * compute_units ); | ||
|  | 
 | ||
|  |     // create scan kernel | ||
|  |     meta_kernel k("scan_on_cpu_block_scan"); | ||
|  | 
 | ||
|  |     // Arguments | ||
|  |     size_t count_arg = k.add_arg<uint_>("count"); | ||
|  |     size_t init_arg = k.add_arg<output_type>("initial_value"); | ||
|  |     size_t block_partial_sums_arg = | ||
|  |         k.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums"); | ||
|  | 
 | ||
|  |     k << | ||
|  |         "uint block = " << | ||
|  |             "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" << | ||
|  |         "uint index = get_global_id(0) * block;\n" << | ||
|  |         "uint end = min(count, index + block);\n"; | ||
|  | 
 | ||
|  |     if(!exclusive){ | ||
|  |         k << | ||
|  |             k.decl<output_type>("sum") << " = " << | ||
|  |                 first[k.var<uint_>("index")] << ";\n" << | ||
|  |             result[k.var<uint_>("index")] << " = sum;\n" << | ||
|  |             "index++;\n"; | ||
|  |     } | ||
|  |     else { | ||
|  |         k << | ||
|  |             k.decl<output_type>("sum") << ";\n" << | ||
|  |             "if(index == 0){\n" << | ||
|  |                 "sum = initial_value;\n" << | ||
|  |             "}\n" << | ||
|  |             "else {\n" << | ||
|  |                 "sum = " << first[k.var<uint_>("index")] << ";\n" << | ||
|  |                 "index++;\n" << | ||
|  |             "}\n"; | ||
|  |     } | ||
|  | 
 | ||
|  |     k << | ||
|  |         "while(index < end){\n" << | ||
|  |             // load next value | ||
|  |             k.decl<const input_type>("value") << " = " | ||
|  |                 << first[k.var<uint_>("index")] << ";\n"; | ||
|  | 
 | ||
|  |     if(exclusive){ | ||
|  |         k << | ||
|  |             "if(get_global_id(0) == 0){\n" << | ||
|  |                 result[k.var<uint_>("index")] << " = sum;\n" << | ||
|  |             "}\n"; | ||
|  |     } | ||
|  |     k << | ||
|  |             "sum = " << op(k.var<output_type>("sum"), | ||
|  |                            k.var<output_type>("value")) << ";\n"; | ||
|  | 
 | ||
|  |     if(!exclusive){ | ||
|  |         k << | ||
|  |             "if(get_global_id(0) == 0){\n" << | ||
|  |                 result[k.var<uint_>("index")] << " = sum;\n" << | ||
|  |             "}\n"; | ||
|  |     } | ||
|  | 
 | ||
|  |     k << | ||
|  |             "index++;\n" << | ||
|  |         "}\n" << // end while | ||
|  |         "block_partial_sums[get_global_id(0)] = sum;\n"; | ||
|  | 
 | ||
|  |     // compile scan kernel | ||
|  |     kernel block_scan_kernel = k.compile(context); | ||
|  | 
 | ||
|  |     // setup kernel arguments | ||
|  |     block_scan_kernel.set_arg(count_arg, static_cast<uint_>(count)); | ||
|  |     block_scan_kernel.set_arg(init_arg, static_cast<output_type>(init)); | ||
|  |     block_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums); | ||
|  | 
 | ||
|  |     // execute the kernel | ||
|  |     size_t global_work_size = compute_units; | ||
|  |     queue.enqueue_1d_range_kernel(block_scan_kernel, 0, global_work_size, 0); | ||
|  | 
 | ||
|  |     // scan is done | ||
|  |     if(compute_units < 2) { | ||
|  |         return result + count; | ||
|  |     } | ||
|  | 
 | ||
|  |     // final scan kernel | ||
|  |     meta_kernel l("scan_on_cpu_final_scan"); | ||
|  | 
 | ||
|  |     // Arguments | ||
|  |     count_arg = l.add_arg<uint_>("count"); | ||
|  |     block_partial_sums_arg = | ||
|  |         l.add_arg<output_type *>(memory_object::global_memory, "block_partial_sums"); | ||
|  | 
 | ||
|  |     l << | ||
|  |         "uint block = " << | ||
|  |             "(uint)ceil(((float)count)/(get_global_size(0) + 1));\n" << | ||
|  |         "uint index = block + get_global_id(0) * block;\n" << | ||
|  |         "uint end = min(count, index + block);\n" << | ||
|  | 
 | ||
|  |         k.decl<output_type>("sum") << " = block_partial_sums[0];\n" << | ||
|  |         "for(uint i = 0; i < get_global_id(0); i++) {\n" << | ||
|  |             "sum = " << op(k.var<output_type>("sum"), | ||
|  |                            k.var<output_type>("block_partial_sums[i + 1]")) << ";\n" << | ||
|  |         "}\n" << | ||
|  | 
 | ||
|  |         "while(index < end){\n"; | ||
|  |     if(exclusive){ | ||
|  |         l << | ||
|  |             l.decl<output_type>("value") << " = " | ||
|  |                 << first[k.var<uint_>("index")] << ";\n" << | ||
|  |             result[k.var<uint_>("index")] << " = sum;\n" << | ||
|  |             "sum = " << op(k.var<output_type>("sum"), | ||
|  |                            k.var<output_type>("value")) << ";\n"; | ||
|  |     } | ||
|  |     else { | ||
|  |         l << | ||
|  |             "sum = " << op(k.var<output_type>("sum"), | ||
|  |                            first[k.var<uint_>("index")]) << ";\n" << | ||
|  |             result[k.var<uint_>("index")] << " = sum;\n"; | ||
|  |     } | ||
|  |     l << | ||
|  |             "index++;\n" << | ||
|  |         "}\n"; | ||
|  | 
 | ||
|  | 
 | ||
|  |     // compile scan kernel | ||
|  |     kernel final_scan_kernel = l.compile(context); | ||
|  | 
 | ||
|  |     // setup kernel arguments | ||
|  |     final_scan_kernel.set_arg(count_arg, static_cast<uint_>(count)); | ||
|  |     final_scan_kernel.set_arg(block_partial_sums_arg, block_partial_sums); | ||
|  | 
 | ||
|  |     // execute the kernel | ||
|  |     global_work_size = compute_units; | ||
|  |     queue.enqueue_1d_range_kernel(final_scan_kernel, 0, global_work_size, 0); | ||
|  | 
 | ||
|  |     // return iterator pointing to the end of the result range | ||
|  |     return result + count; | ||
|  | } | ||
|  | 
 | ||
|  | } // end detail namespace | ||
|  | } // end compute namespace | ||
|  | } // end boost namespace | ||
|  | 
 | ||
|  | #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_SCAN_ON_CPU_HPP |