857 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
		
		
			
		
	
	
			857 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//
|  | 
 | ||
|  | #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP | ||
|  | #define BOOST_COMPUTE_ALGORITHM_COPY_HPP | ||
|  | 
 | ||
|  | #include <algorithm> | ||
|  | #include <iterator> | ||
|  | 
 | ||
|  | #include <boost/utility/enable_if.hpp> | ||
|  | 
 | ||
|  | #include <boost/mpl/and.hpp> | ||
|  | #include <boost/mpl/not.hpp> | ||
|  | #include <boost/mpl/or.hpp> | ||
|  | 
 | ||
|  | #include <boost/compute/buffer.hpp> | ||
|  | #include <boost/compute/system.hpp> | ||
|  | #include <boost/compute/command_queue.hpp> | ||
|  | #include <boost/compute/algorithm/detail/copy_on_device.hpp> | ||
|  | #include <boost/compute/algorithm/detail/copy_to_device.hpp> | ||
|  | #include <boost/compute/algorithm/detail/copy_to_host.hpp> | ||
|  | #include <boost/compute/async/future.hpp> | ||
|  | #include <boost/compute/container/mapped_view.hpp> | ||
|  | #include <boost/compute/detail/device_ptr.hpp> | ||
|  | #include <boost/compute/detail/is_contiguous_iterator.hpp> | ||
|  | #include <boost/compute/detail/iterator_range_size.hpp> | ||
|  | #include <boost/compute/detail/parameter_cache.hpp> | ||
|  | #include <boost/compute/iterator/buffer_iterator.hpp> | ||
|  | #include <boost/compute/type_traits/type_name.hpp> | ||
|  | #include <boost/compute/type_traits/is_device_iterator.hpp> | ||
|  | 
 | ||
|  | namespace boost { | ||
|  | namespace compute { | ||
|  | namespace detail { | ||
|  | 
 | ||
|  | namespace mpl = boost::mpl; | ||
|  | 
 | ||
|  | // meta-function returning true if copy() between InputIterator and | ||
|  | // OutputIterator can be implemented with clEnqueueCopyBuffer(). | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | struct can_copy_with_copy_buffer : | ||
|  |     mpl::and_< | ||
|  |         mpl::or_< | ||
|  |             boost::is_same< | ||
|  |                 InputIterator, | ||
|  |                 buffer_iterator<typename InputIterator::value_type> | ||
|  |             >, | ||
|  |             boost::is_same< | ||
|  |                 InputIterator, | ||
|  |                 detail::device_ptr<typename InputIterator::value_type> | ||
|  |             > | ||
|  |         >, | ||
|  |         mpl::or_< | ||
|  |             boost::is_same< | ||
|  |                 OutputIterator, | ||
|  |                 buffer_iterator<typename OutputIterator::value_type> | ||
|  |             >, | ||
|  |             boost::is_same< | ||
|  |                 OutputIterator, | ||
|  |                 detail::device_ptr<typename OutputIterator::value_type> | ||
|  |             > | ||
|  |         >, | ||
|  |         boost::is_same< | ||
|  |             typename InputIterator::value_type, | ||
|  |             typename OutputIterator::value_type | ||
|  |         > | ||
|  |     >::type {}; | ||
|  | 
 | ||
|  | // meta-function returning true if value_types of HostIterator and | ||
|  | // DeviceIterator are same | ||
|  | template<class HostIterator, class DeviceIterator> | ||
|  | struct is_same_value_type : | ||
|  |     boost::is_same< | ||
|  |         typename boost::remove_cv< | ||
|  |             typename std::iterator_traits<HostIterator>::value_type | ||
|  |         >::type, | ||
|  |         typename boost::remove_cv< | ||
|  |             typename DeviceIterator::value_type | ||
|  |         >::type | ||
|  |     >::type {}; | ||
|  | 
 | ||
|  | // meta-function returning true if value_type of HostIterator is bool | ||
|  | template<class HostIterator> | ||
|  | struct is_bool_value_type : | ||
|  |     boost::is_same< | ||
|  |         typename boost::remove_cv< | ||
|  |             typename std::iterator_traits<HostIterator>::value_type | ||
|  |         >::type, | ||
|  |         bool | ||
|  |     >::type {}; | ||
|  | 
 | ||
|  | // host -> device (async) | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline future<OutputIterator> | ||
|  | dispatch_copy_async(InputIterator first, | ||
|  |                     InputIterator last, | ||
|  |                     OutputIterator result, | ||
|  |                     command_queue &queue, | ||
|  |                     typename boost::enable_if< | ||
|  |                         mpl::and_< | ||
|  |                             mpl::not_< | ||
|  |                                 is_device_iterator<InputIterator> | ||
|  |                             >, | ||
|  |                             is_device_iterator<OutputIterator>, | ||
|  |                             is_same_value_type<InputIterator, OutputIterator> | ||
|  |                         > | ||
|  |                     >::type* = 0) | ||
|  | { | ||
|  |     BOOST_STATIC_ASSERT_MSG( | ||
|  |         is_contiguous_iterator<InputIterator>::value, | ||
|  |         "copy_async() is only supported for contiguous host iterators" | ||
|  |     ); | ||
|  | 
 | ||
|  |     return copy_to_device_async(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | // host -> device (async) | ||
|  | // Type mismatch between InputIterator and OutputIterator value_types | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline future<OutputIterator> | ||
|  | dispatch_copy_async(InputIterator first, | ||
|  |                     InputIterator last, | ||
|  |                     OutputIterator result, | ||
|  |                     command_queue &queue, | ||
|  |                     typename boost::enable_if< | ||
|  |                         mpl::and_< | ||
|  |                             mpl::not_< | ||
|  |                                 is_device_iterator<InputIterator> | ||
|  |                             >, | ||
|  |                             is_device_iterator<OutputIterator>, | ||
|  |                             mpl::not_< | ||
|  |                                 is_same_value_type<InputIterator, OutputIterator> | ||
|  |                             > | ||
|  |                         > | ||
|  |                     >::type* = 0) | ||
|  | { | ||
|  |     BOOST_STATIC_ASSERT_MSG( | ||
|  |         is_contiguous_iterator<InputIterator>::value, | ||
|  |         "copy_async() is only supported for contiguous host iterators" | ||
|  |     ); | ||
|  | 
 | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type input_type; | ||
|  | 
 | ||
|  |     const context &context = queue.get_context(); | ||
|  |     size_t count = iterator_range_size(first, last); | ||
|  | 
 | ||
|  |     if(count < size_t(1)) { | ||
|  |         return future<OutputIterator>(); | ||
|  |     } | ||
|  | 
 | ||
|  |     // map [first; last) to device and run copy kernel | ||
|  |     // on device for copying & casting | ||
|  |     ::boost::compute::mapped_view<input_type> mapped_host( | ||
|  |         // make sure it's a pointer to constant data | ||
|  |         // to force read only mapping | ||
|  |         const_cast<const input_type*>( | ||
|  |             ::boost::addressof(*first) | ||
|  |         ), | ||
|  |         count, | ||
|  |         context | ||
|  |     ); | ||
|  |     return copy_on_device_async( | ||
|  |         mapped_host.begin(), mapped_host.end(), result, queue | ||
|  |     ); | ||
|  | } | ||
|  | 
 | ||
|  | // host -> device | ||
|  | // InputIterator is a contiguous iterator | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       mpl::not_< | ||
|  |                           is_device_iterator<InputIterator> | ||
|  |                       >, | ||
|  |                       is_device_iterator<OutputIterator>, | ||
|  |                       is_same_value_type<InputIterator, OutputIterator>, | ||
|  |                       is_contiguous_iterator<InputIterator> | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     return copy_to_device(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | // host -> device | ||
|  | // Type mismatch between InputIterator and OutputIterator value_types | ||
|  | // InputIterator is a contiguous iterator | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       mpl::not_< | ||
|  |                           is_device_iterator<InputIterator> | ||
|  |                       >, | ||
|  |                       is_device_iterator<OutputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           is_same_value_type<InputIterator, OutputIterator> | ||
|  |                       >, | ||
|  |                       is_contiguous_iterator<InputIterator> | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     typedef typename OutputIterator::value_type output_type; | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type input_type; | ||
|  | 
 | ||
|  |     const device &device = queue.get_device(); | ||
|  | 
 | ||
|  |     // loading parameters | ||
|  |     std::string cache_key = | ||
|  |         std::string("__boost_compute_copy_to_device_") | ||
|  |             + type_name<input_type>() + "_" + type_name<output_type>(); | ||
|  |     boost::shared_ptr<parameter_cache> parameters = | ||
|  |         detail::parameter_cache::get_global_cache(device); | ||
|  | 
 | ||
|  |     size_t map_copy_threshold; | ||
|  |     size_t direct_copy_threshold; | ||
|  | 
 | ||
|  |     // calculate default values of thresholds | ||
|  |     if (device.type() & device::gpu) { | ||
|  |         // GPUs | ||
|  |         map_copy_threshold = 524288;  // 0.5 MB | ||
|  |         direct_copy_threshold = 52428800; // 50 MB | ||
|  |     } | ||
|  |     else { | ||
|  |         // CPUs and other devices | ||
|  |         map_copy_threshold = 134217728; // 128 MB | ||
|  |         direct_copy_threshold = 0; // it's never efficient for CPUs | ||
|  |     } | ||
|  | 
 | ||
|  |     // load thresholds | ||
|  |     map_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "map_copy_threshold", map_copy_threshold | ||
|  |         ); | ||
|  |     direct_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "direct_copy_threshold", direct_copy_threshold | ||
|  |         ); | ||
|  | 
 | ||
|  |     // select copy method based on thresholds & input_size_bytes | ||
|  |     size_t count = iterator_range_size(first, last); | ||
|  |     size_t input_size_bytes = count * sizeof(input_type); | ||
|  | 
 | ||
|  |     // [0; map_copy_threshold) -> copy_to_device_map() | ||
|  |     if(input_size_bytes < map_copy_threshold) { | ||
|  |         return copy_to_device_map(first, last, result, queue); | ||
|  |     } | ||
|  |     // [map_copy_threshold; direct_copy_threshold) -> convert [first; last) | ||
|  |     //     on host and then perform copy_to_device() | ||
|  |     else if(input_size_bytes < direct_copy_threshold) { | ||
|  |         std::vector<output_type> vector(first, last); | ||
|  |         return copy_to_device(vector.begin(), vector.end(), result, queue); | ||
|  |     } | ||
|  | 
 | ||
|  |     // [direct_copy_threshold; inf) -> map [first; last) to device and | ||
|  |     //     run copy kernel on device for copying & casting | ||
|  |     // At this point we are sure that count > 1 (first != last). | ||
|  | 
 | ||
|  |     // Perform async copy to device, wait for it to be finished and | ||
|  |     // return the result. | ||
|  |     // At this point we are sure that count > 1 (first != last), so event | ||
|  |     // returned by dispatch_copy_async() must be valid. | ||
|  |     return dispatch_copy_async(first, last, result, queue).get(); | ||
|  | } | ||
|  | 
 | ||
|  | // host -> device | ||
|  | // InputIterator is NOT a contiguous iterator | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       mpl::not_< | ||
|  |                           is_device_iterator<InputIterator> | ||
|  |                       >, | ||
|  |                       is_device_iterator<OutputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           is_contiguous_iterator<InputIterator> | ||
|  |                       > | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     typedef typename OutputIterator::value_type output_type; | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type input_type; | ||
|  | 
 | ||
|  |     const device &device = queue.get_device(); | ||
|  | 
 | ||
|  |     // loading parameters | ||
|  |     std::string cache_key = | ||
|  |         std::string("__boost_compute_copy_to_device_") | ||
|  |             + type_name<input_type>() + "_" + type_name<output_type>(); | ||
|  |     boost::shared_ptr<parameter_cache> parameters = | ||
|  |         detail::parameter_cache::get_global_cache(device); | ||
|  | 
 | ||
|  |     size_t map_copy_threshold; | ||
|  |     size_t direct_copy_threshold; | ||
|  | 
 | ||
|  |     // calculate default values of thresholds | ||
|  |     if (device.type() & device::gpu) { | ||
|  |         // GPUs | ||
|  |         map_copy_threshold = 524288;  // 0.5 MB | ||
|  |         direct_copy_threshold = 52428800; // 50 MB | ||
|  |     } | ||
|  |     else { | ||
|  |         // CPUs and other devices | ||
|  |         map_copy_threshold = 134217728; // 128 MB | ||
|  |         direct_copy_threshold = 0; // it's never efficient for CPUs | ||
|  |     } | ||
|  | 
 | ||
|  |     // load thresholds | ||
|  |     map_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "map_copy_threshold", map_copy_threshold | ||
|  |         ); | ||
|  |     direct_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "direct_copy_threshold", direct_copy_threshold | ||
|  |         ); | ||
|  | 
 | ||
|  |     // select copy method based on thresholds & input_size_bytes | ||
|  |     size_t input_size = iterator_range_size(first, last); | ||
|  |     size_t input_size_bytes = input_size * sizeof(input_type); | ||
|  | 
 | ||
|  |     // [0; map_copy_threshold) -> copy_to_device_map() | ||
|  |     // | ||
|  |     // if direct_copy_threshold is less than map_copy_threshold | ||
|  |     // copy_to_device_map() is used for every input | ||
|  |     if(input_size_bytes < map_copy_threshold | ||
|  |         || direct_copy_threshold <= map_copy_threshold) { | ||
|  |         return copy_to_device_map(first, last, result, queue); | ||
|  |     } | ||
|  |     // [map_copy_threshold; inf) -> convert [first; last) | ||
|  |     //     on host and then perform copy_to_device() | ||
|  |     std::vector<output_type> vector(first, last); | ||
|  |     return copy_to_device(vector.begin(), vector.end(), result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> host (async) | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline future<OutputIterator> | ||
|  | dispatch_copy_async(InputIterator first, | ||
|  |                     InputIterator last, | ||
|  |                     OutputIterator result, | ||
|  |                     command_queue &queue, | ||
|  |                     typename boost::enable_if< | ||
|  |                         mpl::and_< | ||
|  |                             is_device_iterator<InputIterator>, | ||
|  |                             mpl::not_< | ||
|  |                                 is_device_iterator<OutputIterator> | ||
|  |                             >, | ||
|  |                             is_same_value_type<OutputIterator, InputIterator> | ||
|  |                         > | ||
|  |                     >::type* = 0) | ||
|  | { | ||
|  |     BOOST_STATIC_ASSERT_MSG( | ||
|  |         is_contiguous_iterator<OutputIterator>::value, | ||
|  |         "copy_async() is only supported for contiguous host iterators" | ||
|  |     ); | ||
|  | 
 | ||
|  |     return copy_to_host_async(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> host (async) | ||
|  | // Type mismatch between InputIterator and OutputIterator value_types | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline future<OutputIterator> | ||
|  | dispatch_copy_async(InputIterator first, | ||
|  |                     InputIterator last, | ||
|  |                     OutputIterator result, | ||
|  |                     command_queue &queue, | ||
|  |                     typename boost::enable_if< | ||
|  |                         mpl::and_< | ||
|  |                             is_device_iterator<InputIterator>, | ||
|  |                             mpl::not_< | ||
|  |                                 is_device_iterator<OutputIterator> | ||
|  |                             >, | ||
|  |                             mpl::not_< | ||
|  |                                 is_same_value_type<OutputIterator, InputIterator> | ||
|  |                             > | ||
|  |                         > | ||
|  |                     >::type* = 0) | ||
|  | { | ||
|  |     BOOST_STATIC_ASSERT_MSG( | ||
|  |         is_contiguous_iterator<OutputIterator>::value, | ||
|  |         "copy_async() is only supported for contiguous host iterators" | ||
|  |     ); | ||
|  | 
 | ||
|  |     typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | ||
|  |     const context &context = queue.get_context(); | ||
|  |     size_t count = iterator_range_size(first, last); | ||
|  | 
 | ||
|  |     if(count < size_t(1)) { | ||
|  |         return future<OutputIterator>(); | ||
|  |     } | ||
|  | 
 | ||
|  |     // map host memory to device | ||
|  |     buffer mapped_host( | ||
|  |         context, | ||
|  |         count * sizeof(output_type), | ||
|  |         buffer::write_only | buffer::use_host_ptr, | ||
|  |         static_cast<void*>( | ||
|  |             ::boost::addressof(*result) | ||
|  |         ) | ||
|  |     ); | ||
|  |     // copy async on device | ||
|  |     ::boost::compute::future<buffer_iterator<output_type> > future = | ||
|  |         copy_on_device_async( | ||
|  |             first, | ||
|  |             last, | ||
|  |             make_buffer_iterator<output_type>(mapped_host), | ||
|  |             queue | ||
|  |         ); | ||
|  |     // update host memory asynchronously by maping and unmaping memory | ||
|  |     event map_event; | ||
|  |     void* ptr = queue.enqueue_map_buffer_async( | ||
|  |         mapped_host, | ||
|  |         CL_MAP_READ, | ||
|  |         0, | ||
|  |         count * sizeof(output_type), | ||
|  |         map_event, | ||
|  |         future.get_event() | ||
|  |     ); | ||
|  |     event unmap_event = | ||
|  |         queue.enqueue_unmap_buffer(mapped_host, ptr, map_event); | ||
|  |     return make_future(result + count, unmap_event); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> host | ||
|  | // OutputIterator is a contiguous iterator | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       is_device_iterator<InputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           is_device_iterator<OutputIterator> | ||
|  |                       >, | ||
|  |                       is_same_value_type<OutputIterator, InputIterator>, | ||
|  |                       is_contiguous_iterator<OutputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           is_bool_value_type<OutputIterator> | ||
|  |                       > | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     return copy_to_host(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> host | ||
|  | // Type mismatch between InputIterator and OutputIterator value_types | ||
|  | // OutputIterator is NOT a contiguous iterator or value_type of OutputIterator | ||
|  | // is a boolean type. | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       is_device_iterator<InputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           is_device_iterator<OutputIterator> | ||
|  |                       >, | ||
|  |                       mpl::or_< | ||
|  |                           mpl::not_< | ||
|  |                               is_contiguous_iterator<OutputIterator> | ||
|  |                           >, | ||
|  |                           is_bool_value_type<OutputIterator> | ||
|  |                       > | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | ||
|  |     typedef typename InputIterator::value_type input_type; | ||
|  | 
 | ||
|  |     const device &device = queue.get_device(); | ||
|  | 
 | ||
|  |     // loading parameters | ||
|  |     std::string cache_key = | ||
|  |         std::string("__boost_compute_copy_to_host_") | ||
|  |             + type_name<input_type>() + "_" + type_name<output_type>(); | ||
|  |     boost::shared_ptr<parameter_cache> parameters = | ||
|  |         detail::parameter_cache::get_global_cache(device); | ||
|  | 
 | ||
|  |     size_t map_copy_threshold; | ||
|  |     size_t direct_copy_threshold; | ||
|  | 
 | ||
|  |     // calculate default values of thresholds | ||
|  |     if (device.type() & device::gpu) { | ||
|  |         // GPUs | ||
|  |         map_copy_threshold = 33554432;  // 30 MB | ||
|  |         direct_copy_threshold = 0; // it's never efficient for GPUs | ||
|  |     } | ||
|  |     else { | ||
|  |         // CPUs and other devices | ||
|  |         map_copy_threshold = 134217728; // 128 MB | ||
|  |         direct_copy_threshold = 0; // it's never efficient for CPUs | ||
|  |     } | ||
|  | 
 | ||
|  |     // load thresholds | ||
|  |     map_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "map_copy_threshold", map_copy_threshold | ||
|  |         ); | ||
|  |     direct_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "direct_copy_threshold", direct_copy_threshold | ||
|  |         ); | ||
|  | 
 | ||
|  |     // select copy method based on thresholds & input_size_bytes | ||
|  |     size_t count = iterator_range_size(first, last); | ||
|  |     size_t input_size_bytes = count * sizeof(input_type); | ||
|  | 
 | ||
|  |     // [0; map_copy_threshold) -> copy_to_host_map() | ||
|  |     // | ||
|  |     // if direct_copy_threshold is less than map_copy_threshold | ||
|  |     // copy_to_host_map() is used for every input | ||
|  |     if(input_size_bytes < map_copy_threshold | ||
|  |         || direct_copy_threshold <= map_copy_threshold) { | ||
|  |         return copy_to_host_map(first, last, result, queue); | ||
|  |     } | ||
|  |     // [map_copy_threshold; inf) -> copy [first;last) to temporary vector | ||
|  |     //     then copy (and convert) to result using std::copy() | ||
|  |     std::vector<input_type> vector(count); | ||
|  |     copy_to_host(first, last, vector.begin(), queue); | ||
|  |     return std::copy(vector.begin(), vector.end(), result); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> host | ||
|  | // Type mismatch between InputIterator and OutputIterator value_types | ||
|  | // OutputIterator is a contiguous iterator | ||
|  | // value_type of OutputIterator is NOT a boolean type | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       is_device_iterator<InputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           is_device_iterator<OutputIterator> | ||
|  |                       >, | ||
|  |                       mpl::not_< | ||
|  |                           is_same_value_type<OutputIterator, InputIterator> | ||
|  |                       >, | ||
|  |                       is_contiguous_iterator<OutputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           is_bool_value_type<OutputIterator> | ||
|  |                       > | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     typedef typename std::iterator_traits<OutputIterator>::value_type output_type; | ||
|  |     typedef typename InputIterator::value_type input_type; | ||
|  | 
 | ||
|  |     const device &device = queue.get_device(); | ||
|  | 
 | ||
|  |     // loading parameters | ||
|  |     std::string cache_key = | ||
|  |         std::string("__boost_compute_copy_to_host_") | ||
|  |             + type_name<input_type>() + "_" + type_name<output_type>(); | ||
|  |     boost::shared_ptr<parameter_cache> parameters = | ||
|  |         detail::parameter_cache::get_global_cache(device); | ||
|  | 
 | ||
|  |     size_t map_copy_threshold; | ||
|  |     size_t direct_copy_threshold; | ||
|  | 
 | ||
|  |     // calculate default values of thresholds | ||
|  |     if (device.type() & device::gpu) { | ||
|  |         // GPUs | ||
|  |         map_copy_threshold = 524288;  // 0.5 MB | ||
|  |         direct_copy_threshold = 52428800; // 50 MB | ||
|  |     } | ||
|  |     else { | ||
|  |         // CPUs and other devices | ||
|  |         map_copy_threshold = 134217728; // 128 MB | ||
|  |         direct_copy_threshold = 0; // it's never efficient for CPUs | ||
|  |     } | ||
|  | 
 | ||
|  |     // load thresholds | ||
|  |     map_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "map_copy_threshold", map_copy_threshold | ||
|  |         ); | ||
|  |     direct_copy_threshold = | ||
|  |         parameters->get( | ||
|  |             cache_key, "direct_copy_threshold", direct_copy_threshold | ||
|  |         ); | ||
|  | 
 | ||
|  |     // select copy method based on thresholds & input_size_bytes | ||
|  |     size_t count = iterator_range_size(first, last); | ||
|  |     size_t input_size_bytes = count * sizeof(input_type); | ||
|  | 
 | ||
|  |     // [0; map_copy_threshold) -> copy_to_host_map() | ||
|  |     if(input_size_bytes < map_copy_threshold) { | ||
|  |         return copy_to_host_map(first, last, result, queue); | ||
|  |     } | ||
|  |     // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to | ||
|  |     //     temporary vector then copy (and convert) to result using std::copy() | ||
|  |     else if(input_size_bytes < direct_copy_threshold) { | ||
|  |         std::vector<input_type> vector(count); | ||
|  |         copy_to_host(first, last, vector.begin(), queue); | ||
|  |         return std::copy(vector.begin(), vector.end(), result); | ||
|  |     } | ||
|  | 
 | ||
|  |     // [direct_copy_threshold; inf) -> map [result; result + input_size) to | ||
|  |     //     device and run copy kernel on device for copying & casting | ||
|  |     // map host memory to device. | ||
|  | 
 | ||
|  |     // Perform async copy to host, wait for it to be finished and | ||
|  |     // return the result. | ||
|  |     // At this point we are sure that count > 1 (first != last), so event | ||
|  |     // returned by dispatch_copy_async() must be valid. | ||
|  |     return dispatch_copy_async(first, last, result, queue).get(); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> device | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       is_device_iterator<InputIterator>, | ||
|  |                       is_device_iterator<OutputIterator>, | ||
|  |                       mpl::not_< | ||
|  |                           can_copy_with_copy_buffer< | ||
|  |                               InputIterator, OutputIterator | ||
|  |                           > | ||
|  |                       > | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     return copy_on_device(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> device (specialization for buffer iterators) | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if< | ||
|  |                   mpl::and_< | ||
|  |                       is_device_iterator<InputIterator>, | ||
|  |                       is_device_iterator<OutputIterator>, | ||
|  |                       can_copy_with_copy_buffer< | ||
|  |                           InputIterator, OutputIterator | ||
|  |                       > | ||
|  |                   > | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type value_type; | ||
|  |     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | ||
|  | 
 | ||
|  |     difference_type n = std::distance(first, last); | ||
|  |     if(n < 1){ | ||
|  |         // nothing to copy | ||
|  |         return result; | ||
|  |     } | ||
|  | 
 | ||
|  |     queue.enqueue_copy_buffer(first.get_buffer(), | ||
|  |                               result.get_buffer(), | ||
|  |                               first.get_index() * sizeof(value_type), | ||
|  |                               result.get_index() * sizeof(value_type), | ||
|  |                               static_cast<size_t>(n) * sizeof(value_type)); | ||
|  |     return result + n; | ||
|  | } | ||
|  | 
 | ||
|  | // device -> device (async) | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline future<OutputIterator> | ||
|  | dispatch_copy_async(InputIterator first, | ||
|  |                     InputIterator last, | ||
|  |                     OutputIterator result, | ||
|  |                     command_queue &queue, | ||
|  |                     typename boost::enable_if< | ||
|  |                         mpl::and_< | ||
|  |                             is_device_iterator<InputIterator>, | ||
|  |                             is_device_iterator<OutputIterator>, | ||
|  |                             mpl::not_< | ||
|  |                                 can_copy_with_copy_buffer< | ||
|  |                                     InputIterator, OutputIterator | ||
|  |                                 > | ||
|  |                             > | ||
|  |                         > | ||
|  |                     >::type* = 0) | ||
|  | { | ||
|  |     return copy_on_device_async(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | // device -> device (async, specialization for buffer iterators) | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline future<OutputIterator> | ||
|  | dispatch_copy_async(InputIterator first, | ||
|  |                     InputIterator last, | ||
|  |                     OutputIterator result, | ||
|  |                     command_queue &queue, | ||
|  |                     typename boost::enable_if< | ||
|  |                         mpl::and_< | ||
|  |                             is_device_iterator<InputIterator>, | ||
|  |                             is_device_iterator<OutputIterator>, | ||
|  |                             can_copy_with_copy_buffer< | ||
|  |                                 InputIterator, OutputIterator | ||
|  |                             > | ||
|  |                         > | ||
|  |                     >::type* = 0) | ||
|  | { | ||
|  |     typedef typename std::iterator_traits<InputIterator>::value_type value_type; | ||
|  |     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | ||
|  | 
 | ||
|  |     difference_type n = std::distance(first, last); | ||
|  |     if(n < 1){ | ||
|  |         // nothing to copy | ||
|  |         return make_future(result, event()); | ||
|  |     } | ||
|  | 
 | ||
|  |     event event_ = | ||
|  |         queue.enqueue_copy_buffer( | ||
|  |             first.get_buffer(), | ||
|  |             result.get_buffer(), | ||
|  |             first.get_index() * sizeof(value_type), | ||
|  |             result.get_index() * sizeof(value_type), | ||
|  |             static_cast<size_t>(n) * sizeof(value_type) | ||
|  |         ); | ||
|  | 
 | ||
|  |     return make_future(result + n, event_); | ||
|  | } | ||
|  | 
 | ||
|  | // host -> host | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator | ||
|  | dispatch_copy(InputIterator first, | ||
|  |               InputIterator last, | ||
|  |               OutputIterator result, | ||
|  |               command_queue &queue, | ||
|  |               typename boost::enable_if_c< | ||
|  |                   !is_device_iterator<InputIterator>::value && | ||
|  |                   !is_device_iterator<OutputIterator>::value | ||
|  |               >::type* = 0) | ||
|  | { | ||
|  |     (void) queue; | ||
|  | 
 | ||
|  |     return std::copy(first, last, result); | ||
|  | } | ||
|  | 
 | ||
|  | } // end detail namespace | ||
|  | 
 | ||
|  | /// Copies the values in the range [\p first, \p last) to the range | ||
|  | /// beginning at \p result. | ||
|  | /// | ||
|  | /// The generic copy() function can be used for a variety of data | ||
|  | /// transfer tasks and provides a standard interface to the following | ||
|  | /// OpenCL functions: | ||
|  | /// | ||
|  | /// \li \c clEnqueueReadBuffer() | ||
|  | /// \li \c clEnqueueWriteBuffer() | ||
|  | /// \li \c clEnqueueCopyBuffer() | ||
|  | /// | ||
|  | /// Unlike the aforementioned OpenCL functions, copy() will also work | ||
|  | /// with non-contiguous data-structures (e.g. \c std::list<T>) as | ||
|  | /// well as with "fancy" iterators (e.g. transform_iterator). | ||
|  | /// | ||
|  | /// \param first first element in the range to copy | ||
|  | /// \param last last element in the range to copy | ||
|  | /// \param result first element in the result range | ||
|  | /// \param queue command queue to perform the operation | ||
|  | /// | ||
|  | /// \return \c OutputIterator to the end of the result range | ||
|  | /// | ||
|  | /// For example, to copy an array of \c int values on the host to a vector on | ||
|  | /// the device: | ||
|  | /// \code | ||
|  | /// // array on the host | ||
|  | /// int data[] = { 1, 2, 3, 4 }; | ||
|  | /// | ||
|  | /// // vector on the device | ||
|  | /// boost::compute::vector<int> vec(4, context); | ||
|  | /// | ||
|  | /// // copy values to the device vector | ||
|  | /// boost::compute::copy(data, data + 4, vec.begin(), queue); | ||
|  | /// \endcode | ||
|  | /// | ||
|  | /// The copy algorithm can also be used with standard containers such as | ||
|  | /// \c std::vector<T>: | ||
|  | /// \code | ||
|  | /// std::vector<int> host_vector = ... | ||
|  | /// boost::compute::vector<int> device_vector = ... | ||
|  | /// | ||
|  | /// // copy from the host to the device | ||
|  | /// boost::compute::copy( | ||
|  | ///     host_vector.begin(), host_vector.end(), device_vector.begin(), queue | ||
|  | /// ); | ||
|  | /// | ||
|  | /// // copy from the device to the host | ||
|  | /// boost::compute::copy( | ||
|  | ///     device_vector.begin(), device_vector.end(), host_vector.begin(), queue | ||
|  | /// ); | ||
|  | /// \endcode | ||
|  | /// | ||
|  | /// \see copy_n(), copy_if(), copy_async() | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline OutputIterator copy(InputIterator first, | ||
|  |                            InputIterator last, | ||
|  |                            OutputIterator result, | ||
|  |                            command_queue &queue = system::default_queue()) | ||
|  | { | ||
|  |     return detail::dispatch_copy(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | /// Copies the values in the range [\p first, \p last) to the range | ||
|  | /// beginning at \p result. The copy is performed asynchronously. | ||
|  | /// | ||
|  | /// \see copy() | ||
|  | template<class InputIterator, class OutputIterator> | ||
|  | inline future<OutputIterator> | ||
|  | copy_async(InputIterator first, | ||
|  |            InputIterator last, | ||
|  |            OutputIterator result, | ||
|  |            command_queue &queue = system::default_queue()) | ||
|  | { | ||
|  |     return detail::dispatch_copy_async(first, last, result, queue); | ||
|  | } | ||
|  | 
 | ||
|  | } // end compute namespace | ||
|  | } // end boost namespace | ||
|  | 
 | ||
|  | #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP |