857 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
			
		
		
	
	
			857 lines
		
	
	
		
			31 KiB
		
	
	
	
		
			Plaintext
		
	
	
	
	
	
| //---------------------------------------------------------------------------//
 | |
| // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
 | |
| //
 | |
| // Distributed under the Boost Software License, Version 1.0
 | |
| // See accompanying file LICENSE_1_0.txt or copy at
 | |
| // http://www.boost.org/LICENSE_1_0.txt
 | |
| //
 | |
| // See http://boostorg.github.com/compute for more information.
 | |
| //---------------------------------------------------------------------------//
 | |
| 
 | |
| #ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
 | |
| #define BOOST_COMPUTE_ALGORITHM_COPY_HPP
 | |
| 
 | |
| #include <algorithm>
 | |
| #include <iterator>
 | |
| 
 | |
| #include <boost/utility/enable_if.hpp>
 | |
| 
 | |
| #include <boost/mpl/and.hpp>
 | |
| #include <boost/mpl/not.hpp>
 | |
| #include <boost/mpl/or.hpp>
 | |
| 
 | |
| #include <boost/compute/buffer.hpp>
 | |
| #include <boost/compute/system.hpp>
 | |
| #include <boost/compute/command_queue.hpp>
 | |
| #include <boost/compute/algorithm/detail/copy_on_device.hpp>
 | |
| #include <boost/compute/algorithm/detail/copy_to_device.hpp>
 | |
| #include <boost/compute/algorithm/detail/copy_to_host.hpp>
 | |
| #include <boost/compute/async/future.hpp>
 | |
| #include <boost/compute/container/mapped_view.hpp>
 | |
| #include <boost/compute/detail/device_ptr.hpp>
 | |
| #include <boost/compute/detail/is_contiguous_iterator.hpp>
 | |
| #include <boost/compute/detail/iterator_range_size.hpp>
 | |
| #include <boost/compute/detail/parameter_cache.hpp>
 | |
| #include <boost/compute/iterator/buffer_iterator.hpp>
 | |
| #include <boost/compute/type_traits/type_name.hpp>
 | |
| #include <boost/compute/type_traits/is_device_iterator.hpp>
 | |
| 
 | |
| namespace boost {
 | |
| namespace compute {
 | |
| namespace detail {
 | |
| 
 | |
| namespace mpl = boost::mpl;
 | |
| 
 | |
| // meta-function returning true if copy() between InputIterator and
 | |
| // OutputIterator can be implemented with clEnqueueCopyBuffer().
 | |
| template<class InputIterator, class OutputIterator>
 | |
| struct can_copy_with_copy_buffer :
 | |
|     mpl::and_<
 | |
|         mpl::or_<
 | |
|             boost::is_same<
 | |
|                 InputIterator,
 | |
|                 buffer_iterator<typename InputIterator::value_type>
 | |
|             >,
 | |
|             boost::is_same<
 | |
|                 InputIterator,
 | |
|                 detail::device_ptr<typename InputIterator::value_type>
 | |
|             >
 | |
|         >,
 | |
|         mpl::or_<
 | |
|             boost::is_same<
 | |
|                 OutputIterator,
 | |
|                 buffer_iterator<typename OutputIterator::value_type>
 | |
|             >,
 | |
|             boost::is_same<
 | |
|                 OutputIterator,
 | |
|                 detail::device_ptr<typename OutputIterator::value_type>
 | |
|             >
 | |
|         >,
 | |
|         boost::is_same<
 | |
|             typename InputIterator::value_type,
 | |
|             typename OutputIterator::value_type
 | |
|         >
 | |
|     >::type {};
 | |
| 
 | |
| // meta-function returning true if value_types of HostIterator and
 | |
| // DeviceIterator are same
 | |
| template<class HostIterator, class DeviceIterator>
 | |
| struct is_same_value_type :
 | |
|     boost::is_same<
 | |
|         typename boost::remove_cv<
 | |
|             typename std::iterator_traits<HostIterator>::value_type
 | |
|         >::type,
 | |
|         typename boost::remove_cv<
 | |
|             typename DeviceIterator::value_type
 | |
|         >::type
 | |
|     >::type {};
 | |
| 
 | |
| // meta-function returning true if value_type of HostIterator is bool
 | |
| template<class HostIterator>
 | |
| struct is_bool_value_type :
 | |
|     boost::is_same<
 | |
|         typename boost::remove_cv<
 | |
|             typename std::iterator_traits<HostIterator>::value_type
 | |
|         >::type,
 | |
|         bool
 | |
|     >::type {};
 | |
| 
 | |
| // host -> device (async)
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline future<OutputIterator>
 | |
| dispatch_copy_async(InputIterator first,
 | |
|                     InputIterator last,
 | |
|                     OutputIterator result,
 | |
|                     command_queue &queue,
 | |
|                     typename boost::enable_if<
 | |
|                         mpl::and_<
 | |
|                             mpl::not_<
 | |
|                                 is_device_iterator<InputIterator>
 | |
|                             >,
 | |
|                             is_device_iterator<OutputIterator>,
 | |
|                             is_same_value_type<InputIterator, OutputIterator>
 | |
|                         >
 | |
|                     >::type* = 0)
 | |
| {
 | |
|     BOOST_STATIC_ASSERT_MSG(
 | |
|         is_contiguous_iterator<InputIterator>::value,
 | |
|         "copy_async() is only supported for contiguous host iterators"
 | |
|     );
 | |
| 
 | |
|     return copy_to_device_async(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| // host -> device (async)
 | |
| // Type mismatch between InputIterator and OutputIterator value_types
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline future<OutputIterator>
 | |
| dispatch_copy_async(InputIterator first,
 | |
|                     InputIterator last,
 | |
|                     OutputIterator result,
 | |
|                     command_queue &queue,
 | |
|                     typename boost::enable_if<
 | |
|                         mpl::and_<
 | |
|                             mpl::not_<
 | |
|                                 is_device_iterator<InputIterator>
 | |
|                             >,
 | |
|                             is_device_iterator<OutputIterator>,
 | |
|                             mpl::not_<
 | |
|                                 is_same_value_type<InputIterator, OutputIterator>
 | |
|                             >
 | |
|                         >
 | |
|                     >::type* = 0)
 | |
| {
 | |
|     BOOST_STATIC_ASSERT_MSG(
 | |
|         is_contiguous_iterator<InputIterator>::value,
 | |
|         "copy_async() is only supported for contiguous host iterators"
 | |
|     );
 | |
| 
 | |
|     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
 | |
| 
 | |
|     const context &context = queue.get_context();
 | |
|     size_t count = iterator_range_size(first, last);
 | |
| 
 | |
|     if(count < size_t(1)) {
 | |
|         return future<OutputIterator>();
 | |
|     }
 | |
| 
 | |
|     // map [first; last) to device and run copy kernel
 | |
|     // on device for copying & casting
 | |
|     ::boost::compute::mapped_view<input_type> mapped_host(
 | |
|         // make sure it's a pointer to constant data
 | |
|         // to force read only mapping
 | |
|         const_cast<const input_type*>(
 | |
|             ::boost::addressof(*first)
 | |
|         ),
 | |
|         count,
 | |
|         context
 | |
|     );
 | |
|     return copy_on_device_async(
 | |
|         mapped_host.begin(), mapped_host.end(), result, queue
 | |
|     );
 | |
| }
 | |
| 
 | |
| // host -> device
 | |
| // InputIterator is a contiguous iterator
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       mpl::not_<
 | |
|                           is_device_iterator<InputIterator>
 | |
|                       >,
 | |
|                       is_device_iterator<OutputIterator>,
 | |
|                       is_same_value_type<InputIterator, OutputIterator>,
 | |
|                       is_contiguous_iterator<InputIterator>
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     return copy_to_device(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| // host -> device
 | |
| // Type mismatch between InputIterator and OutputIterator value_types
 | |
| // InputIterator is a contiguous iterator
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       mpl::not_<
 | |
|                           is_device_iterator<InputIterator>
 | |
|                       >,
 | |
|                       is_device_iterator<OutputIterator>,
 | |
|                       mpl::not_<
 | |
|                           is_same_value_type<InputIterator, OutputIterator>
 | |
|                       >,
 | |
|                       is_contiguous_iterator<InputIterator>
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     typedef typename OutputIterator::value_type output_type;
 | |
|     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
 | |
| 
 | |
|     const device &device = queue.get_device();
 | |
| 
 | |
|     // loading parameters
 | |
|     std::string cache_key =
 | |
|         std::string("__boost_compute_copy_to_device_")
 | |
|             + type_name<input_type>() + "_" + type_name<output_type>();
 | |
|     boost::shared_ptr<parameter_cache> parameters =
 | |
|         detail::parameter_cache::get_global_cache(device);
 | |
| 
 | |
|     size_t map_copy_threshold;
 | |
|     size_t direct_copy_threshold;
 | |
| 
 | |
|     // calculate default values of thresholds
 | |
|     if (device.type() & device::gpu) {
 | |
|         // GPUs
 | |
|         map_copy_threshold = 524288;  // 0.5 MB
 | |
|         direct_copy_threshold = 52428800; // 50 MB
 | |
|     }
 | |
|     else {
 | |
|         // CPUs and other devices
 | |
|         map_copy_threshold = 134217728; // 128 MB
 | |
|         direct_copy_threshold = 0; // it's never efficient for CPUs
 | |
|     }
 | |
| 
 | |
|     // load thresholds
 | |
|     map_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "map_copy_threshold", map_copy_threshold
 | |
|         );
 | |
|     direct_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "direct_copy_threshold", direct_copy_threshold
 | |
|         );
 | |
| 
 | |
|     // select copy method based on thresholds & input_size_bytes
 | |
|     size_t count = iterator_range_size(first, last);
 | |
|     size_t input_size_bytes = count * sizeof(input_type);
 | |
| 
 | |
|     // [0; map_copy_threshold) -> copy_to_device_map()
 | |
|     if(input_size_bytes < map_copy_threshold) {
 | |
|         return copy_to_device_map(first, last, result, queue);
 | |
|     }
 | |
|     // [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
 | |
|     //     on host and then perform copy_to_device()
 | |
|     else if(input_size_bytes < direct_copy_threshold) {
 | |
|         std::vector<output_type> vector(first, last);
 | |
|         return copy_to_device(vector.begin(), vector.end(), result, queue);
 | |
|     }
 | |
| 
 | |
|     // [direct_copy_threshold; inf) -> map [first; last) to device and
 | |
|     //     run copy kernel on device for copying & casting
 | |
|     // At this point we are sure that count > 1 (first != last).
 | |
| 
 | |
|     // Perform async copy to device, wait for it to be finished and
 | |
|     // return the result.
 | |
|     // At this point we are sure that count > 1 (first != last), so event
 | |
|     // returned by dispatch_copy_async() must be valid.
 | |
|     return dispatch_copy_async(first, last, result, queue).get();
 | |
| }
 | |
| 
 | |
| // host -> device
 | |
| // InputIterator is NOT a contiguous iterator
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       mpl::not_<
 | |
|                           is_device_iterator<InputIterator>
 | |
|                       >,
 | |
|                       is_device_iterator<OutputIterator>,
 | |
|                       mpl::not_<
 | |
|                           is_contiguous_iterator<InputIterator>
 | |
|                       >
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     typedef typename OutputIterator::value_type output_type;
 | |
|     typedef typename std::iterator_traits<InputIterator>::value_type input_type;
 | |
| 
 | |
|     const device &device = queue.get_device();
 | |
| 
 | |
|     // loading parameters
 | |
|     std::string cache_key =
 | |
|         std::string("__boost_compute_copy_to_device_")
 | |
|             + type_name<input_type>() + "_" + type_name<output_type>();
 | |
|     boost::shared_ptr<parameter_cache> parameters =
 | |
|         detail::parameter_cache::get_global_cache(device);
 | |
| 
 | |
|     size_t map_copy_threshold;
 | |
|     size_t direct_copy_threshold;
 | |
| 
 | |
|     // calculate default values of thresholds
 | |
|     if (device.type() & device::gpu) {
 | |
|         // GPUs
 | |
|         map_copy_threshold = 524288;  // 0.5 MB
 | |
|         direct_copy_threshold = 52428800; // 50 MB
 | |
|     }
 | |
|     else {
 | |
|         // CPUs and other devices
 | |
|         map_copy_threshold = 134217728; // 128 MB
 | |
|         direct_copy_threshold = 0; // it's never efficient for CPUs
 | |
|     }
 | |
| 
 | |
|     // load thresholds
 | |
|     map_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "map_copy_threshold", map_copy_threshold
 | |
|         );
 | |
|     direct_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "direct_copy_threshold", direct_copy_threshold
 | |
|         );
 | |
| 
 | |
|     // select copy method based on thresholds & input_size_bytes
 | |
|     size_t input_size = iterator_range_size(first, last);
 | |
|     size_t input_size_bytes = input_size * sizeof(input_type);
 | |
| 
 | |
|     // [0; map_copy_threshold) -> copy_to_device_map()
 | |
|     //
 | |
|     // if direct_copy_threshold is less than map_copy_threshold
 | |
|     // copy_to_device_map() is used for every input
 | |
|     if(input_size_bytes < map_copy_threshold
 | |
|         || direct_copy_threshold <= map_copy_threshold) {
 | |
|         return copy_to_device_map(first, last, result, queue);
 | |
|     }
 | |
|     // [map_copy_threshold; inf) -> convert [first; last)
 | |
|     //     on host and then perform copy_to_device()
 | |
|     std::vector<output_type> vector(first, last);
 | |
|     return copy_to_device(vector.begin(), vector.end(), result, queue);
 | |
| }
 | |
| 
 | |
| // device -> host (async)
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline future<OutputIterator>
 | |
| dispatch_copy_async(InputIterator first,
 | |
|                     InputIterator last,
 | |
|                     OutputIterator result,
 | |
|                     command_queue &queue,
 | |
|                     typename boost::enable_if<
 | |
|                         mpl::and_<
 | |
|                             is_device_iterator<InputIterator>,
 | |
|                             mpl::not_<
 | |
|                                 is_device_iterator<OutputIterator>
 | |
|                             >,
 | |
|                             is_same_value_type<OutputIterator, InputIterator>
 | |
|                         >
 | |
|                     >::type* = 0)
 | |
| {
 | |
|     BOOST_STATIC_ASSERT_MSG(
 | |
|         is_contiguous_iterator<OutputIterator>::value,
 | |
|         "copy_async() is only supported for contiguous host iterators"
 | |
|     );
 | |
| 
 | |
|     return copy_to_host_async(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| // device -> host (async)
 | |
| // Type mismatch between InputIterator and OutputIterator value_types
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline future<OutputIterator>
 | |
| dispatch_copy_async(InputIterator first,
 | |
|                     InputIterator last,
 | |
|                     OutputIterator result,
 | |
|                     command_queue &queue,
 | |
|                     typename boost::enable_if<
 | |
|                         mpl::and_<
 | |
|                             is_device_iterator<InputIterator>,
 | |
|                             mpl::not_<
 | |
|                                 is_device_iterator<OutputIterator>
 | |
|                             >,
 | |
|                             mpl::not_<
 | |
|                                 is_same_value_type<OutputIterator, InputIterator>
 | |
|                             >
 | |
|                         >
 | |
|                     >::type* = 0)
 | |
| {
 | |
|     BOOST_STATIC_ASSERT_MSG(
 | |
|         is_contiguous_iterator<OutputIterator>::value,
 | |
|         "copy_async() is only supported for contiguous host iterators"
 | |
|     );
 | |
| 
 | |
|     typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
 | |
|     const context &context = queue.get_context();
 | |
|     size_t count = iterator_range_size(first, last);
 | |
| 
 | |
|     if(count < size_t(1)) {
 | |
|         return future<OutputIterator>();
 | |
|     }
 | |
| 
 | |
|     // map host memory to device
 | |
|     buffer mapped_host(
 | |
|         context,
 | |
|         count * sizeof(output_type),
 | |
|         buffer::write_only | buffer::use_host_ptr,
 | |
|         static_cast<void*>(
 | |
|             ::boost::addressof(*result)
 | |
|         )
 | |
|     );
 | |
|     // copy async on device
 | |
|     ::boost::compute::future<buffer_iterator<output_type> > future =
 | |
|         copy_on_device_async(
 | |
|             first,
 | |
|             last,
 | |
|             make_buffer_iterator<output_type>(mapped_host),
 | |
|             queue
 | |
|         );
 | |
|     // update host memory asynchronously by maping and unmaping memory
 | |
|     event map_event;
 | |
|     void* ptr = queue.enqueue_map_buffer_async(
 | |
|         mapped_host,
 | |
|         CL_MAP_READ,
 | |
|         0,
 | |
|         count * sizeof(output_type),
 | |
|         map_event,
 | |
|         future.get_event()
 | |
|     );
 | |
|     event unmap_event =
 | |
|         queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);
 | |
|     return make_future(result + count, unmap_event);
 | |
| }
 | |
| 
 | |
| // device -> host
 | |
| // OutputIterator is a contiguous iterator
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       is_device_iterator<InputIterator>,
 | |
|                       mpl::not_<
 | |
|                           is_device_iterator<OutputIterator>
 | |
|                       >,
 | |
|                       is_same_value_type<OutputIterator, InputIterator>,
 | |
|                       is_contiguous_iterator<OutputIterator>,
 | |
|                       mpl::not_<
 | |
|                           is_bool_value_type<OutputIterator>
 | |
|                       >
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     return copy_to_host(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| // device -> host
 | |
| // Type mismatch between InputIterator and OutputIterator value_types
 | |
| // OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
 | |
| // is a boolean type.
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       is_device_iterator<InputIterator>,
 | |
|                       mpl::not_<
 | |
|                           is_device_iterator<OutputIterator>
 | |
|                       >,
 | |
|                       mpl::or_<
 | |
|                           mpl::not_<
 | |
|                               is_contiguous_iterator<OutputIterator>
 | |
|                           >,
 | |
|                           is_bool_value_type<OutputIterator>
 | |
|                       >
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
 | |
|     typedef typename InputIterator::value_type input_type;
 | |
| 
 | |
|     const device &device = queue.get_device();
 | |
| 
 | |
|     // loading parameters
 | |
|     std::string cache_key =
 | |
|         std::string("__boost_compute_copy_to_host_")
 | |
|             + type_name<input_type>() + "_" + type_name<output_type>();
 | |
|     boost::shared_ptr<parameter_cache> parameters =
 | |
|         detail::parameter_cache::get_global_cache(device);
 | |
| 
 | |
|     size_t map_copy_threshold;
 | |
|     size_t direct_copy_threshold;
 | |
| 
 | |
|     // calculate default values of thresholds
 | |
|     if (device.type() & device::gpu) {
 | |
|         // GPUs
 | |
|         map_copy_threshold = 33554432;  // 30 MB
 | |
|         direct_copy_threshold = 0; // it's never efficient for GPUs
 | |
|     }
 | |
|     else {
 | |
|         // CPUs and other devices
 | |
|         map_copy_threshold = 134217728; // 128 MB
 | |
|         direct_copy_threshold = 0; // it's never efficient for CPUs
 | |
|     }
 | |
| 
 | |
|     // load thresholds
 | |
|     map_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "map_copy_threshold", map_copy_threshold
 | |
|         );
 | |
|     direct_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "direct_copy_threshold", direct_copy_threshold
 | |
|         );
 | |
| 
 | |
|     // select copy method based on thresholds & input_size_bytes
 | |
|     size_t count = iterator_range_size(first, last);
 | |
|     size_t input_size_bytes = count * sizeof(input_type);
 | |
| 
 | |
|     // [0; map_copy_threshold) -> copy_to_host_map()
 | |
|     //
 | |
|     // if direct_copy_threshold is less than map_copy_threshold
 | |
|     // copy_to_host_map() is used for every input
 | |
|     if(input_size_bytes < map_copy_threshold
 | |
|         || direct_copy_threshold <= map_copy_threshold) {
 | |
|         return copy_to_host_map(first, last, result, queue);
 | |
|     }
 | |
|     // [map_copy_threshold; inf) -> copy [first;last) to temporary vector
 | |
|     //     then copy (and convert) to result using std::copy()
 | |
|     std::vector<input_type> vector(count);
 | |
|     copy_to_host(first, last, vector.begin(), queue);
 | |
|     return std::copy(vector.begin(), vector.end(), result);
 | |
| }
 | |
| 
 | |
| // device -> host
 | |
| // Type mismatch between InputIterator and OutputIterator value_types
 | |
| // OutputIterator is a contiguous iterator
 | |
| // value_type of OutputIterator is NOT a boolean type
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       is_device_iterator<InputIterator>,
 | |
|                       mpl::not_<
 | |
|                           is_device_iterator<OutputIterator>
 | |
|                       >,
 | |
|                       mpl::not_<
 | |
|                           is_same_value_type<OutputIterator, InputIterator>
 | |
|                       >,
 | |
|                       is_contiguous_iterator<OutputIterator>,
 | |
|                       mpl::not_<
 | |
|                           is_bool_value_type<OutputIterator>
 | |
|                       >
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
 | |
|     typedef typename InputIterator::value_type input_type;
 | |
| 
 | |
|     const device &device = queue.get_device();
 | |
| 
 | |
|     // loading parameters
 | |
|     std::string cache_key =
 | |
|         std::string("__boost_compute_copy_to_host_")
 | |
|             + type_name<input_type>() + "_" + type_name<output_type>();
 | |
|     boost::shared_ptr<parameter_cache> parameters =
 | |
|         detail::parameter_cache::get_global_cache(device);
 | |
| 
 | |
|     size_t map_copy_threshold;
 | |
|     size_t direct_copy_threshold;
 | |
| 
 | |
|     // calculate default values of thresholds
 | |
|     if (device.type() & device::gpu) {
 | |
|         // GPUs
 | |
|         map_copy_threshold = 524288;  // 0.5 MB
 | |
|         direct_copy_threshold = 52428800; // 50 MB
 | |
|     }
 | |
|     else {
 | |
|         // CPUs and other devices
 | |
|         map_copy_threshold = 134217728; // 128 MB
 | |
|         direct_copy_threshold = 0; // it's never efficient for CPUs
 | |
|     }
 | |
| 
 | |
|     // load thresholds
 | |
|     map_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "map_copy_threshold", map_copy_threshold
 | |
|         );
 | |
|     direct_copy_threshold =
 | |
|         parameters->get(
 | |
|             cache_key, "direct_copy_threshold", direct_copy_threshold
 | |
|         );
 | |
| 
 | |
|     // select copy method based on thresholds & input_size_bytes
 | |
|     size_t count = iterator_range_size(first, last);
 | |
|     size_t input_size_bytes = count * sizeof(input_type);
 | |
| 
 | |
|     // [0; map_copy_threshold) -> copy_to_host_map()
 | |
|     if(input_size_bytes < map_copy_threshold) {
 | |
|         return copy_to_host_map(first, last, result, queue);
 | |
|     }
 | |
|     // [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
 | |
|     //     temporary vector then copy (and convert) to result using std::copy()
 | |
|     else if(input_size_bytes < direct_copy_threshold) {
 | |
|         std::vector<input_type> vector(count);
 | |
|         copy_to_host(first, last, vector.begin(), queue);
 | |
|         return std::copy(vector.begin(), vector.end(), result);
 | |
|     }
 | |
| 
 | |
|     // [direct_copy_threshold; inf) -> map [result; result + input_size) to
 | |
|     //     device and run copy kernel on device for copying & casting
 | |
|     // map host memory to device.
 | |
| 
 | |
|     // Perform async copy to host, wait for it to be finished and
 | |
|     // return the result.
 | |
|     // At this point we are sure that count > 1 (first != last), so event
 | |
|     // returned by dispatch_copy_async() must be valid.
 | |
|     return dispatch_copy_async(first, last, result, queue).get();
 | |
| }
 | |
| 
 | |
| // device -> device
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       is_device_iterator<InputIterator>,
 | |
|                       is_device_iterator<OutputIterator>,
 | |
|                       mpl::not_<
 | |
|                           can_copy_with_copy_buffer<
 | |
|                               InputIterator, OutputIterator
 | |
|                           >
 | |
|                       >
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     return copy_on_device(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| // device -> device (specialization for buffer iterators)
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if<
 | |
|                   mpl::and_<
 | |
|                       is_device_iterator<InputIterator>,
 | |
|                       is_device_iterator<OutputIterator>,
 | |
|                       can_copy_with_copy_buffer<
 | |
|                           InputIterator, OutputIterator
 | |
|                       >
 | |
|                   >
 | |
|               >::type* = 0)
 | |
| {
 | |
|     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
 | |
|     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
 | |
| 
 | |
|     difference_type n = std::distance(first, last);
 | |
|     if(n < 1){
 | |
|         // nothing to copy
 | |
|         return result;
 | |
|     }
 | |
| 
 | |
|     queue.enqueue_copy_buffer(first.get_buffer(),
 | |
|                               result.get_buffer(),
 | |
|                               first.get_index() * sizeof(value_type),
 | |
|                               result.get_index() * sizeof(value_type),
 | |
|                               static_cast<size_t>(n) * sizeof(value_type));
 | |
|     return result + n;
 | |
| }
 | |
| 
 | |
| // device -> device (async)
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline future<OutputIterator>
 | |
| dispatch_copy_async(InputIterator first,
 | |
|                     InputIterator last,
 | |
|                     OutputIterator result,
 | |
|                     command_queue &queue,
 | |
|                     typename boost::enable_if<
 | |
|                         mpl::and_<
 | |
|                             is_device_iterator<InputIterator>,
 | |
|                             is_device_iterator<OutputIterator>,
 | |
|                             mpl::not_<
 | |
|                                 can_copy_with_copy_buffer<
 | |
|                                     InputIterator, OutputIterator
 | |
|                                 >
 | |
|                             >
 | |
|                         >
 | |
|                     >::type* = 0)
 | |
| {
 | |
|     return copy_on_device_async(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| // device -> device (async, specialization for buffer iterators)
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline future<OutputIterator>
 | |
| dispatch_copy_async(InputIterator first,
 | |
|                     InputIterator last,
 | |
|                     OutputIterator result,
 | |
|                     command_queue &queue,
 | |
|                     typename boost::enable_if<
 | |
|                         mpl::and_<
 | |
|                             is_device_iterator<InputIterator>,
 | |
|                             is_device_iterator<OutputIterator>,
 | |
|                             can_copy_with_copy_buffer<
 | |
|                                 InputIterator, OutputIterator
 | |
|                             >
 | |
|                         >
 | |
|                     >::type* = 0)
 | |
| {
 | |
|     typedef typename std::iterator_traits<InputIterator>::value_type value_type;
 | |
|     typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
 | |
| 
 | |
|     difference_type n = std::distance(first, last);
 | |
|     if(n < 1){
 | |
|         // nothing to copy
 | |
|         return make_future(result, event());
 | |
|     }
 | |
| 
 | |
|     event event_ =
 | |
|         queue.enqueue_copy_buffer(
 | |
|             first.get_buffer(),
 | |
|             result.get_buffer(),
 | |
|             first.get_index() * sizeof(value_type),
 | |
|             result.get_index() * sizeof(value_type),
 | |
|             static_cast<size_t>(n) * sizeof(value_type)
 | |
|         );
 | |
| 
 | |
|     return make_future(result + n, event_);
 | |
| }
 | |
| 
 | |
| // host -> host
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator
 | |
| dispatch_copy(InputIterator first,
 | |
|               InputIterator last,
 | |
|               OutputIterator result,
 | |
|               command_queue &queue,
 | |
|               typename boost::enable_if_c<
 | |
|                   !is_device_iterator<InputIterator>::value &&
 | |
|                   !is_device_iterator<OutputIterator>::value
 | |
|               >::type* = 0)
 | |
| {
 | |
|     (void) queue;
 | |
| 
 | |
|     return std::copy(first, last, result);
 | |
| }
 | |
| 
 | |
| } // end detail namespace
 | |
| 
 | |
| /// Copies the values in the range [\p first, \p last) to the range
 | |
| /// beginning at \p result.
 | |
| ///
 | |
| /// The generic copy() function can be used for a variety of data
 | |
| /// transfer tasks and provides a standard interface to the following
 | |
| /// OpenCL functions:
 | |
| ///
 | |
| /// \li \c clEnqueueReadBuffer()
 | |
| /// \li \c clEnqueueWriteBuffer()
 | |
| /// \li \c clEnqueueCopyBuffer()
 | |
| ///
 | |
| /// Unlike the aforementioned OpenCL functions, copy() will also work
 | |
| /// with non-contiguous data-structures (e.g. \c std::list<T>) as
 | |
| /// well as with "fancy" iterators (e.g. transform_iterator).
 | |
| ///
 | |
| /// \param first first element in the range to copy
 | |
| /// \param last last element in the range to copy
 | |
| /// \param result first element in the result range
 | |
| /// \param queue command queue to perform the operation
 | |
| ///
 | |
| /// \return \c OutputIterator to the end of the result range
 | |
| ///
 | |
| /// For example, to copy an array of \c int values on the host to a vector on
 | |
| /// the device:
 | |
| /// \code
 | |
| /// // array on the host
 | |
| /// int data[] = { 1, 2, 3, 4 };
 | |
| ///
 | |
| /// // vector on the device
 | |
| /// boost::compute::vector<int> vec(4, context);
 | |
| ///
 | |
| /// // copy values to the device vector
 | |
| /// boost::compute::copy(data, data + 4, vec.begin(), queue);
 | |
| /// \endcode
 | |
| ///
 | |
| /// The copy algorithm can also be used with standard containers such as
 | |
| /// \c std::vector<T>:
 | |
| /// \code
 | |
| /// std::vector<int> host_vector = ...
 | |
| /// boost::compute::vector<int> device_vector = ...
 | |
| ///
 | |
| /// // copy from the host to the device
 | |
| /// boost::compute::copy(
 | |
| ///     host_vector.begin(), host_vector.end(), device_vector.begin(), queue
 | |
| /// );
 | |
| ///
 | |
| /// // copy from the device to the host
 | |
| /// boost::compute::copy(
 | |
| ///     device_vector.begin(), device_vector.end(), host_vector.begin(), queue
 | |
| /// );
 | |
| /// \endcode
 | |
| ///
 | |
| /// \see copy_n(), copy_if(), copy_async()
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline OutputIterator copy(InputIterator first,
 | |
|                            InputIterator last,
 | |
|                            OutputIterator result,
 | |
|                            command_queue &queue = system::default_queue())
 | |
| {
 | |
|     return detail::dispatch_copy(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| /// Copies the values in the range [\p first, \p last) to the range
 | |
| /// beginning at \p result. The copy is performed asynchronously.
 | |
| ///
 | |
| /// \see copy()
 | |
| template<class InputIterator, class OutputIterator>
 | |
| inline future<OutputIterator>
 | |
| copy_async(InputIterator first,
 | |
|            InputIterator last,
 | |
|            OutputIterator result,
 | |
|            command_queue &queue = system::default_queue())
 | |
| {
 | |
|     return detail::dispatch_copy_async(first, last, result, queue);
 | |
| }
 | |
| 
 | |
| } // end compute namespace
 | |
| } // end boost namespace
 | |
| 
 | |
| #endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP
 | 
