213 lines
		
	
	
		
			8.5 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			213 lines
		
	
	
		
			8.5 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
|   | //---------------------------------------------------------------------------// | ||
|  | // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> | ||
|  | // | ||
|  | // Distributed under the Boost Software License, Version 1.0 | ||
|  | // See accompanying file LICENSE_1_0.txt or copy at | ||
|  | // http://www.boost.org/LICENSE_1_0.txt | ||
|  | // | ||
|  | // See http://boostorg.github.com/compute for more information. | ||
|  | //---------------------------------------------------------------------------// | ||
|  | 
 | ||
|  | #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP | ||
|  | #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP | ||
|  | 
 | ||
|  | #include <iterator> | ||
|  | 
 | ||
|  | #include <boost/compute/types.hpp> | ||
|  | #include <boost/compute/functional.hpp> | ||
|  | #include <boost/compute/command_queue.hpp> | ||
|  | #include <boost/compute/container/detail/scalar.hpp> | ||
|  | #include <boost/compute/iterator/buffer_iterator.hpp> | ||
|  | #include <boost/compute/type_traits/type_name.hpp> | ||
|  | #include <boost/compute/detail/meta_kernel.hpp> | ||
|  | #include <boost/compute/detail/iterator_range_size.hpp> | ||
|  | #include <boost/compute/detail/parameter_cache.hpp> | ||
|  | 
 | ||
|  | namespace boost { | ||
|  | namespace compute { | ||
|  | namespace detail { | ||
|  | 
 | ||
|  | template<class InputIterator, class UnaryPredicate> | ||
|  | inline InputIterator find_if_with_atomics_one_vpt(InputIterator first, | ||
|  |                                                   InputIterator last, | ||
|  |                                                   UnaryPredicate predicate, | ||
|  |                                                   const size_t count, | ||
|  |                                                   command_queue &queue) | ||
|  | { | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type value_type; | ||
|  |     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | ||
|  | 
 | ||
|  |     const context &context = queue.get_context(); | ||
|  | 
 | ||
|  |     detail::meta_kernel k("find_if"); | ||
|  |     size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index"); | ||
|  |     atomic_min<uint_> atomic_min_uint; | ||
|  | 
 | ||
|  |     k << k.decl<const uint_>("i") << " = get_global_id(0);\n" | ||
|  |       << k.decl<const value_type>("value") << "=" | ||
|  |       <<     first[k.var<const uint_>("i")] << ";\n" | ||
|  |       << "if(" << predicate(k.var<const value_type>("value")) << "){\n" | ||
|  |       << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n" | ||
|  |       << "}\n"; | ||
|  | 
 | ||
|  |     kernel kernel = k.compile(context); | ||
|  | 
 | ||
|  |     scalar<uint_> index(context); | ||
|  |     kernel.set_arg(index_arg, index.get_buffer()); | ||
|  | 
 | ||
|  |     // initialize index to the last iterator's index | ||
|  |     index.write(static_cast<uint_>(count), queue); | ||
|  |     queue.enqueue_1d_range_kernel(kernel, 0, count, 0); | ||
|  | 
 | ||
|  |     // read index and return iterator | ||
|  |     return first + static_cast<difference_type>(index.read(queue)); | ||
|  | } | ||
|  | 
 | ||
|  | template<class InputIterator, class UnaryPredicate> | ||
|  | inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first, | ||
|  |                                                        InputIterator last, | ||
|  |                                                        UnaryPredicate predicate, | ||
|  |                                                        const size_t count, | ||
|  |                                                        const size_t vpt, | ||
|  |                                                        command_queue &queue) | ||
|  | { | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type value_type; | ||
|  |     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | ||
|  | 
 | ||
|  |     const context &context = queue.get_context(); | ||
|  |     const device &device = queue.get_device(); | ||
|  | 
 | ||
|  |     detail::meta_kernel k("find_if"); | ||
|  |     size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index"); | ||
|  |     size_t count_arg = k.add_arg<const uint_>("count"); | ||
|  |     size_t vpt_arg = k.add_arg<const uint_>("vpt"); | ||
|  |     atomic_min<uint_> atomic_min_uint; | ||
|  | 
 | ||
|  |     // for GPUs reads from global memory are coalesced | ||
|  |     if(device.type() & device::gpu) { | ||
|  |         k << | ||
|  |             k.decl<const uint_>("lsize") << " = get_local_size(0);\n" << | ||
|  |             k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" << | ||
|  |             k.decl<const uint_>("end") << " = min(" << | ||
|  |                     "id + (lsize *" << k.var<uint_>("vpt") << ")," << | ||
|  |                     "count" << | ||
|  |             ");\n" << | ||
|  | 
 | ||
|  |             // checking if the index is already found | ||
|  |             "__local uint local_index;\n" << | ||
|  |             "if(get_local_id(0) == 0){\n" << | ||
|  |             "    local_index = *index;\n " << | ||
|  |             "};\n" << | ||
|  |             "barrier(CLK_LOCAL_MEM_FENCE);\n" << | ||
|  |             "if(local_index < id){\n" << | ||
|  |             "    return;\n" << | ||
|  |             "}\n" << | ||
|  | 
 | ||
|  |             "while(id < end){\n" << | ||
|  |             "    " << k.decl<const value_type>("value") << " = " << | ||
|  |                       first[k.var<const uint_>("id")] << ";\n" | ||
|  |             "    if(" << predicate(k.var<const value_type>("value")) << "){\n" << | ||
|  |             "        " << atomic_min_uint(k.var<uint_ *>("index"), | ||
|  |                                           k.var<uint_>("id")) << ";\n" << | ||
|  |             "        return;\n" | ||
|  |             "    }\n" << | ||
|  |             "    id+=lsize;\n" << | ||
|  |             "}\n"; | ||
|  |     // for CPUs (and other devices) reads are ordered so the big cache is | ||
|  |     // efficiently used. | ||
|  |     } else { | ||
|  |         k << | ||
|  |             k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" << | ||
|  |             k.decl<const uint_>("end") << " = min(" << | ||
|  |                     "id + " << k.var<uint_>("vpt") << "," << | ||
|  |                     "count" << | ||
|  |             ");\n" << | ||
|  |             "while(id < end && (*index) > id){\n" << | ||
|  |             "    " << k.decl<const value_type>("value") << " = " << | ||
|  |                       first[k.var<const uint_>("id")] << ";\n" | ||
|  |             "    if(" << predicate(k.var<const value_type>("value")) << "){\n" << | ||
|  |             "        " << atomic_min_uint(k.var<uint_ *>("index"), | ||
|  |                                           k.var<uint_>("id")) << ";\n" << | ||
|  |             "        return;\n" << | ||
|  |             "    }\n" << | ||
|  |             "    id++;\n" << | ||
|  |             "}\n"; | ||
|  |     } | ||
|  | 
 | ||
|  |     kernel kernel = k.compile(context); | ||
|  | 
 | ||
|  |     scalar<uint_> index(context); | ||
|  |     kernel.set_arg(index_arg, index.get_buffer()); | ||
|  |     kernel.set_arg(count_arg, static_cast<uint_>(count)); | ||
|  |     kernel.set_arg(vpt_arg, static_cast<uint_>(vpt)); | ||
|  | 
 | ||
|  |     // initialize index to the last iterator's index | ||
|  |     index.write(static_cast<uint_>(count), queue); | ||
|  | 
 | ||
|  |     const size_t global_wg_size = static_cast<size_t>( | ||
|  |         std::ceil(float(count) / vpt) | ||
|  |     ); | ||
|  |     queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0); | ||
|  | 
 | ||
|  |     // read index and return iterator | ||
|  |     return first + static_cast<difference_type>(index.read(queue)); | ||
|  | } | ||
|  | 
 | ||
|  | template<class InputIterator, class UnaryPredicate> | ||
|  | inline InputIterator find_if_with_atomics(InputIterator first, | ||
|  |                                           InputIterator last, | ||
|  |                                           UnaryPredicate predicate, | ||
|  |                                           command_queue &queue) | ||
|  | { | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type value_type; | ||
|  | 
 | ||
|  |     size_t count = detail::iterator_range_size(first, last); | ||
|  |     if(count == 0){ | ||
|  |         return last; | ||
|  |     } | ||
|  | 
 | ||
|  |     const device &device = queue.get_device(); | ||
|  | 
 | ||
|  |     // load cached parameters | ||
|  |     std::string cache_key = std::string("__boost_find_if_with_atomics_") | ||
|  |         + type_name<value_type>(); | ||
|  |     boost::shared_ptr<parameter_cache> parameters = | ||
|  |         detail::parameter_cache::get_global_cache(device); | ||
|  | 
 | ||
|  |     // for relatively small inputs on GPUs kernel checking one value per thread | ||
|  |     // (work-item) is more efficient than its multiple values per thread version | ||
|  |     if(device.type() & device::gpu){ | ||
|  |         const size_t one_vpt_threshold = | ||
|  |             parameters->get(cache_key, "one_vpt_threshold", 1048576); | ||
|  |         if(count <= one_vpt_threshold){ | ||
|  |             return find_if_with_atomics_one_vpt( | ||
|  |                 first, last, predicate, count, queue | ||
|  |             ); | ||
|  |         } | ||
|  |     } | ||
|  | 
 | ||
|  |     // values per thread | ||
|  |     size_t vpt; | ||
|  |     if(device.type() & device::gpu){ | ||
|  |         // get vpt parameter | ||
|  |         vpt = parameters->get(cache_key, "vpt", 32); | ||
|  |     } else { | ||
|  |         // for CPUs work is split equally between compute units | ||
|  |         const size_t max_compute_units = | ||
|  |             device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); | ||
|  |         vpt = static_cast<size_t>( | ||
|  |             std::ceil(float(count) / max_compute_units) | ||
|  |         ); | ||
|  |     } | ||
|  | 
 | ||
|  |     return find_if_with_atomics_multiple_vpt( | ||
|  |         first, last, predicate, count, vpt, queue | ||
|  |     ); | ||
|  | } | ||
|  | 
 | ||
|  | } // end detail namespace | ||
|  | } // end compute namespace | ||
|  | } // end boost namespace | ||
|  | 
 | ||
|  | #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP |