857 lines
31 KiB
Plaintext
857 lines
31 KiB
Plaintext
|
//---------------------------------------------------------------------------//
|
||
|
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
|
||
|
//
|
||
|
// Distributed under the Boost Software License, Version 1.0
|
||
|
// See accompanying file LICENSE_1_0.txt or copy at
|
||
|
// http://www.boost.org/LICENSE_1_0.txt
|
||
|
//
|
||
|
// See http://boostorg.github.com/compute for more information.
|
||
|
//---------------------------------------------------------------------------//
|
||
|
|
||
|
#ifndef BOOST_COMPUTE_ALGORITHM_COPY_HPP
|
||
|
#define BOOST_COMPUTE_ALGORITHM_COPY_HPP
|
||
|
|
||
|
#include <algorithm>
|
||
|
#include <iterator>
|
||
|
|
||
|
#include <boost/utility/enable_if.hpp>
|
||
|
|
||
|
#include <boost/mpl/and.hpp>
|
||
|
#include <boost/mpl/not.hpp>
|
||
|
#include <boost/mpl/or.hpp>
|
||
|
|
||
|
#include <boost/compute/buffer.hpp>
|
||
|
#include <boost/compute/system.hpp>
|
||
|
#include <boost/compute/command_queue.hpp>
|
||
|
#include <boost/compute/algorithm/detail/copy_on_device.hpp>
|
||
|
#include <boost/compute/algorithm/detail/copy_to_device.hpp>
|
||
|
#include <boost/compute/algorithm/detail/copy_to_host.hpp>
|
||
|
#include <boost/compute/async/future.hpp>
|
||
|
#include <boost/compute/container/mapped_view.hpp>
|
||
|
#include <boost/compute/detail/device_ptr.hpp>
|
||
|
#include <boost/compute/detail/is_contiguous_iterator.hpp>
|
||
|
#include <boost/compute/detail/iterator_range_size.hpp>
|
||
|
#include <boost/compute/detail/parameter_cache.hpp>
|
||
|
#include <boost/compute/iterator/buffer_iterator.hpp>
|
||
|
#include <boost/compute/type_traits/type_name.hpp>
|
||
|
#include <boost/compute/type_traits/is_device_iterator.hpp>
|
||
|
|
||
|
namespace boost {
|
||
|
namespace compute {
|
||
|
namespace detail {
|
||
|
|
||
|
// Local shorthand for Boost.MPL; the enable_if conditions in this namespace
// are written in terms of mpl::and_, mpl::or_ and mpl::not_.
namespace mpl = boost::mpl;
|
||
|
|
||
|
// meta-function returning true if copy() between InputIterator and
|
||
|
// OutputIterator can be implemented with clEnqueueCopyBuffer().
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
struct can_copy_with_copy_buffer :
|
||
|
mpl::and_<
|
||
|
mpl::or_<
|
||
|
boost::is_same<
|
||
|
InputIterator,
|
||
|
buffer_iterator<typename InputIterator::value_type>
|
||
|
>,
|
||
|
boost::is_same<
|
||
|
InputIterator,
|
||
|
detail::device_ptr<typename InputIterator::value_type>
|
||
|
>
|
||
|
>,
|
||
|
mpl::or_<
|
||
|
boost::is_same<
|
||
|
OutputIterator,
|
||
|
buffer_iterator<typename OutputIterator::value_type>
|
||
|
>,
|
||
|
boost::is_same<
|
||
|
OutputIterator,
|
||
|
detail::device_ptr<typename OutputIterator::value_type>
|
||
|
>
|
||
|
>,
|
||
|
boost::is_same<
|
||
|
typename InputIterator::value_type,
|
||
|
typename OutputIterator::value_type
|
||
|
>
|
||
|
>::type {};
|
||
|
|
||
|
// meta-function returning true if value_types of HostIterator and
|
||
|
// DeviceIterator are same
|
||
|
template<class HostIterator, class DeviceIterator>
|
||
|
struct is_same_value_type :
|
||
|
boost::is_same<
|
||
|
typename boost::remove_cv<
|
||
|
typename std::iterator_traits<HostIterator>::value_type
|
||
|
>::type,
|
||
|
typename boost::remove_cv<
|
||
|
typename DeviceIterator::value_type
|
||
|
>::type
|
||
|
>::type {};
|
||
|
|
||
|
// meta-function returning true if value_type of HostIterator is bool
|
||
|
template<class HostIterator>
|
||
|
struct is_bool_value_type :
|
||
|
boost::is_same<
|
||
|
typename boost::remove_cv<
|
||
|
typename std::iterator_traits<HostIterator>::value_type
|
||
|
>::type,
|
||
|
bool
|
||
|
>::type {};
|
||
|
|
||
|
// host -> device (async)
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline future<OutputIterator>
|
||
|
dispatch_copy_async(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
mpl::not_<
|
||
|
is_device_iterator<InputIterator>
|
||
|
>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
is_same_value_type<InputIterator, OutputIterator>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
BOOST_STATIC_ASSERT_MSG(
|
||
|
is_contiguous_iterator<InputIterator>::value,
|
||
|
"copy_async() is only supported for contiguous host iterators"
|
||
|
);
|
||
|
|
||
|
return copy_to_device_async(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
// host -> device (async)
|
||
|
// Type mismatch between InputIterator and OutputIterator value_types
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline future<OutputIterator>
|
||
|
dispatch_copy_async(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
mpl::not_<
|
||
|
is_device_iterator<InputIterator>
|
||
|
>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
mpl::not_<
|
||
|
is_same_value_type<InputIterator, OutputIterator>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
BOOST_STATIC_ASSERT_MSG(
|
||
|
is_contiguous_iterator<InputIterator>::value,
|
||
|
"copy_async() is only supported for contiguous host iterators"
|
||
|
);
|
||
|
|
||
|
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
|
||
|
|
||
|
const context &context = queue.get_context();
|
||
|
size_t count = iterator_range_size(first, last);
|
||
|
|
||
|
if(count < size_t(1)) {
|
||
|
return future<OutputIterator>();
|
||
|
}
|
||
|
|
||
|
// map [first; last) to device and run copy kernel
|
||
|
// on device for copying & casting
|
||
|
::boost::compute::mapped_view<input_type> mapped_host(
|
||
|
// make sure it's a pointer to constant data
|
||
|
// to force read only mapping
|
||
|
const_cast<const input_type*>(
|
||
|
::boost::addressof(*first)
|
||
|
),
|
||
|
count,
|
||
|
context
|
||
|
);
|
||
|
return copy_on_device_async(
|
||
|
mapped_host.begin(), mapped_host.end(), result, queue
|
||
|
);
|
||
|
}
|
||
|
|
||
|
// host -> device
|
||
|
// InputIterator is a contiguous iterator
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
mpl::not_<
|
||
|
is_device_iterator<InputIterator>
|
||
|
>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
is_same_value_type<InputIterator, OutputIterator>,
|
||
|
is_contiguous_iterator<InputIterator>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
return copy_to_device(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
// host -> device
|
||
|
// Type mismatch between InputIterator and OutputIterator value_types
|
||
|
// InputIterator is a contiguous iterator
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
mpl::not_<
|
||
|
is_device_iterator<InputIterator>
|
||
|
>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
mpl::not_<
|
||
|
is_same_value_type<InputIterator, OutputIterator>
|
||
|
>,
|
||
|
is_contiguous_iterator<InputIterator>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
typedef typename OutputIterator::value_type output_type;
|
||
|
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
|
||
|
|
||
|
const device &device = queue.get_device();
|
||
|
|
||
|
// loading parameters
|
||
|
std::string cache_key =
|
||
|
std::string("__boost_compute_copy_to_device_")
|
||
|
+ type_name<input_type>() + "_" + type_name<output_type>();
|
||
|
boost::shared_ptr<parameter_cache> parameters =
|
||
|
detail::parameter_cache::get_global_cache(device);
|
||
|
|
||
|
size_t map_copy_threshold;
|
||
|
size_t direct_copy_threshold;
|
||
|
|
||
|
// calculate default values of thresholds
|
||
|
if (device.type() & device::gpu) {
|
||
|
// GPUs
|
||
|
map_copy_threshold = 524288; // 0.5 MB
|
||
|
direct_copy_threshold = 52428800; // 50 MB
|
||
|
}
|
||
|
else {
|
||
|
// CPUs and other devices
|
||
|
map_copy_threshold = 134217728; // 128 MB
|
||
|
direct_copy_threshold = 0; // it's never efficient for CPUs
|
||
|
}
|
||
|
|
||
|
// load thresholds
|
||
|
map_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "map_copy_threshold", map_copy_threshold
|
||
|
);
|
||
|
direct_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "direct_copy_threshold", direct_copy_threshold
|
||
|
);
|
||
|
|
||
|
// select copy method based on thresholds & input_size_bytes
|
||
|
size_t count = iterator_range_size(first, last);
|
||
|
size_t input_size_bytes = count * sizeof(input_type);
|
||
|
|
||
|
// [0; map_copy_threshold) -> copy_to_device_map()
|
||
|
if(input_size_bytes < map_copy_threshold) {
|
||
|
return copy_to_device_map(first, last, result, queue);
|
||
|
}
|
||
|
// [map_copy_threshold; direct_copy_threshold) -> convert [first; last)
|
||
|
// on host and then perform copy_to_device()
|
||
|
else if(input_size_bytes < direct_copy_threshold) {
|
||
|
std::vector<output_type> vector(first, last);
|
||
|
return copy_to_device(vector.begin(), vector.end(), result, queue);
|
||
|
}
|
||
|
|
||
|
// [direct_copy_threshold; inf) -> map [first; last) to device and
|
||
|
// run copy kernel on device for copying & casting
|
||
|
// At this point we are sure that count > 1 (first != last).
|
||
|
|
||
|
// Perform async copy to device, wait for it to be finished and
|
||
|
// return the result.
|
||
|
// At this point we are sure that count > 1 (first != last), so event
|
||
|
// returned by dispatch_copy_async() must be valid.
|
||
|
return dispatch_copy_async(first, last, result, queue).get();
|
||
|
}
|
||
|
|
||
|
// host -> device
|
||
|
// InputIterator is NOT a contiguous iterator
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
mpl::not_<
|
||
|
is_device_iterator<InputIterator>
|
||
|
>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
mpl::not_<
|
||
|
is_contiguous_iterator<InputIterator>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
typedef typename OutputIterator::value_type output_type;
|
||
|
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
|
||
|
|
||
|
const device &device = queue.get_device();
|
||
|
|
||
|
// loading parameters
|
||
|
std::string cache_key =
|
||
|
std::string("__boost_compute_copy_to_device_")
|
||
|
+ type_name<input_type>() + "_" + type_name<output_type>();
|
||
|
boost::shared_ptr<parameter_cache> parameters =
|
||
|
detail::parameter_cache::get_global_cache(device);
|
||
|
|
||
|
size_t map_copy_threshold;
|
||
|
size_t direct_copy_threshold;
|
||
|
|
||
|
// calculate default values of thresholds
|
||
|
if (device.type() & device::gpu) {
|
||
|
// GPUs
|
||
|
map_copy_threshold = 524288; // 0.5 MB
|
||
|
direct_copy_threshold = 52428800; // 50 MB
|
||
|
}
|
||
|
else {
|
||
|
// CPUs and other devices
|
||
|
map_copy_threshold = 134217728; // 128 MB
|
||
|
direct_copy_threshold = 0; // it's never efficient for CPUs
|
||
|
}
|
||
|
|
||
|
// load thresholds
|
||
|
map_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "map_copy_threshold", map_copy_threshold
|
||
|
);
|
||
|
direct_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "direct_copy_threshold", direct_copy_threshold
|
||
|
);
|
||
|
|
||
|
// select copy method based on thresholds & input_size_bytes
|
||
|
size_t input_size = iterator_range_size(first, last);
|
||
|
size_t input_size_bytes = input_size * sizeof(input_type);
|
||
|
|
||
|
// [0; map_copy_threshold) -> copy_to_device_map()
|
||
|
//
|
||
|
// if direct_copy_threshold is less than map_copy_threshold
|
||
|
// copy_to_device_map() is used for every input
|
||
|
if(input_size_bytes < map_copy_threshold
|
||
|
|| direct_copy_threshold <= map_copy_threshold) {
|
||
|
return copy_to_device_map(first, last, result, queue);
|
||
|
}
|
||
|
// [map_copy_threshold; inf) -> convert [first; last)
|
||
|
// on host and then perform copy_to_device()
|
||
|
std::vector<output_type> vector(first, last);
|
||
|
return copy_to_device(vector.begin(), vector.end(), result, queue);
|
||
|
}
|
||
|
|
||
|
// device -> host (async)
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline future<OutputIterator>
|
||
|
dispatch_copy_async(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
mpl::not_<
|
||
|
is_device_iterator<OutputIterator>
|
||
|
>,
|
||
|
is_same_value_type<OutputIterator, InputIterator>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
BOOST_STATIC_ASSERT_MSG(
|
||
|
is_contiguous_iterator<OutputIterator>::value,
|
||
|
"copy_async() is only supported for contiguous host iterators"
|
||
|
);
|
||
|
|
||
|
return copy_to_host_async(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
// device -> host (async)
|
||
|
// Type mismatch between InputIterator and OutputIterator value_types
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline future<OutputIterator>
|
||
|
dispatch_copy_async(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
mpl::not_<
|
||
|
is_device_iterator<OutputIterator>
|
||
|
>,
|
||
|
mpl::not_<
|
||
|
is_same_value_type<OutputIterator, InputIterator>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
BOOST_STATIC_ASSERT_MSG(
|
||
|
is_contiguous_iterator<OutputIterator>::value,
|
||
|
"copy_async() is only supported for contiguous host iterators"
|
||
|
);
|
||
|
|
||
|
typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
|
||
|
const context &context = queue.get_context();
|
||
|
size_t count = iterator_range_size(first, last);
|
||
|
|
||
|
if(count < size_t(1)) {
|
||
|
return future<OutputIterator>();
|
||
|
}
|
||
|
|
||
|
// map host memory to device
|
||
|
buffer mapped_host(
|
||
|
context,
|
||
|
count * sizeof(output_type),
|
||
|
buffer::write_only | buffer::use_host_ptr,
|
||
|
static_cast<void*>(
|
||
|
::boost::addressof(*result)
|
||
|
)
|
||
|
);
|
||
|
// copy async on device
|
||
|
::boost::compute::future<buffer_iterator<output_type> > future =
|
||
|
copy_on_device_async(
|
||
|
first,
|
||
|
last,
|
||
|
make_buffer_iterator<output_type>(mapped_host),
|
||
|
queue
|
||
|
);
|
||
|
// update host memory asynchronously by maping and unmaping memory
|
||
|
event map_event;
|
||
|
void* ptr = queue.enqueue_map_buffer_async(
|
||
|
mapped_host,
|
||
|
CL_MAP_READ,
|
||
|
0,
|
||
|
count * sizeof(output_type),
|
||
|
map_event,
|
||
|
future.get_event()
|
||
|
);
|
||
|
event unmap_event =
|
||
|
queue.enqueue_unmap_buffer(mapped_host, ptr, map_event);
|
||
|
return make_future(result + count, unmap_event);
|
||
|
}
|
||
|
|
||
|
// device -> host
|
||
|
// OutputIterator is a contiguous iterator
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
mpl::not_<
|
||
|
is_device_iterator<OutputIterator>
|
||
|
>,
|
||
|
is_same_value_type<OutputIterator, InputIterator>,
|
||
|
is_contiguous_iterator<OutputIterator>,
|
||
|
mpl::not_<
|
||
|
is_bool_value_type<OutputIterator>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
return copy_to_host(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
// device -> host
|
||
|
// Type mismatch between InputIterator and OutputIterator value_types
|
||
|
// OutputIterator is NOT a contiguous iterator or value_type of OutputIterator
|
||
|
// is a boolean type.
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
mpl::not_<
|
||
|
is_device_iterator<OutputIterator>
|
||
|
>,
|
||
|
mpl::or_<
|
||
|
mpl::not_<
|
||
|
is_contiguous_iterator<OutputIterator>
|
||
|
>,
|
||
|
is_bool_value_type<OutputIterator>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
|
||
|
typedef typename InputIterator::value_type input_type;
|
||
|
|
||
|
const device &device = queue.get_device();
|
||
|
|
||
|
// loading parameters
|
||
|
std::string cache_key =
|
||
|
std::string("__boost_compute_copy_to_host_")
|
||
|
+ type_name<input_type>() + "_" + type_name<output_type>();
|
||
|
boost::shared_ptr<parameter_cache> parameters =
|
||
|
detail::parameter_cache::get_global_cache(device);
|
||
|
|
||
|
size_t map_copy_threshold;
|
||
|
size_t direct_copy_threshold;
|
||
|
|
||
|
// calculate default values of thresholds
|
||
|
if (device.type() & device::gpu) {
|
||
|
// GPUs
|
||
|
map_copy_threshold = 33554432; // 30 MB
|
||
|
direct_copy_threshold = 0; // it's never efficient for GPUs
|
||
|
}
|
||
|
else {
|
||
|
// CPUs and other devices
|
||
|
map_copy_threshold = 134217728; // 128 MB
|
||
|
direct_copy_threshold = 0; // it's never efficient for CPUs
|
||
|
}
|
||
|
|
||
|
// load thresholds
|
||
|
map_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "map_copy_threshold", map_copy_threshold
|
||
|
);
|
||
|
direct_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "direct_copy_threshold", direct_copy_threshold
|
||
|
);
|
||
|
|
||
|
// select copy method based on thresholds & input_size_bytes
|
||
|
size_t count = iterator_range_size(first, last);
|
||
|
size_t input_size_bytes = count * sizeof(input_type);
|
||
|
|
||
|
// [0; map_copy_threshold) -> copy_to_host_map()
|
||
|
//
|
||
|
// if direct_copy_threshold is less than map_copy_threshold
|
||
|
// copy_to_host_map() is used for every input
|
||
|
if(input_size_bytes < map_copy_threshold
|
||
|
|| direct_copy_threshold <= map_copy_threshold) {
|
||
|
return copy_to_host_map(first, last, result, queue);
|
||
|
}
|
||
|
// [map_copy_threshold; inf) -> copy [first;last) to temporary vector
|
||
|
// then copy (and convert) to result using std::copy()
|
||
|
std::vector<input_type> vector(count);
|
||
|
copy_to_host(first, last, vector.begin(), queue);
|
||
|
return std::copy(vector.begin(), vector.end(), result);
|
||
|
}
|
||
|
|
||
|
// device -> host
|
||
|
// Type mismatch between InputIterator and OutputIterator value_types
|
||
|
// OutputIterator is a contiguous iterator
|
||
|
// value_type of OutputIterator is NOT a boolean type
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
mpl::not_<
|
||
|
is_device_iterator<OutputIterator>
|
||
|
>,
|
||
|
mpl::not_<
|
||
|
is_same_value_type<OutputIterator, InputIterator>
|
||
|
>,
|
||
|
is_contiguous_iterator<OutputIterator>,
|
||
|
mpl::not_<
|
||
|
is_bool_value_type<OutputIterator>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
typedef typename std::iterator_traits<OutputIterator>::value_type output_type;
|
||
|
typedef typename InputIterator::value_type input_type;
|
||
|
|
||
|
const device &device = queue.get_device();
|
||
|
|
||
|
// loading parameters
|
||
|
std::string cache_key =
|
||
|
std::string("__boost_compute_copy_to_host_")
|
||
|
+ type_name<input_type>() + "_" + type_name<output_type>();
|
||
|
boost::shared_ptr<parameter_cache> parameters =
|
||
|
detail::parameter_cache::get_global_cache(device);
|
||
|
|
||
|
size_t map_copy_threshold;
|
||
|
size_t direct_copy_threshold;
|
||
|
|
||
|
// calculate default values of thresholds
|
||
|
if (device.type() & device::gpu) {
|
||
|
// GPUs
|
||
|
map_copy_threshold = 524288; // 0.5 MB
|
||
|
direct_copy_threshold = 52428800; // 50 MB
|
||
|
}
|
||
|
else {
|
||
|
// CPUs and other devices
|
||
|
map_copy_threshold = 134217728; // 128 MB
|
||
|
direct_copy_threshold = 0; // it's never efficient for CPUs
|
||
|
}
|
||
|
|
||
|
// load thresholds
|
||
|
map_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "map_copy_threshold", map_copy_threshold
|
||
|
);
|
||
|
direct_copy_threshold =
|
||
|
parameters->get(
|
||
|
cache_key, "direct_copy_threshold", direct_copy_threshold
|
||
|
);
|
||
|
|
||
|
// select copy method based on thresholds & input_size_bytes
|
||
|
size_t count = iterator_range_size(first, last);
|
||
|
size_t input_size_bytes = count * sizeof(input_type);
|
||
|
|
||
|
// [0; map_copy_threshold) -> copy_to_host_map()
|
||
|
if(input_size_bytes < map_copy_threshold) {
|
||
|
return copy_to_host_map(first, last, result, queue);
|
||
|
}
|
||
|
// [map_copy_threshold; direct_copy_threshold) -> copy [first;last) to
|
||
|
// temporary vector then copy (and convert) to result using std::copy()
|
||
|
else if(input_size_bytes < direct_copy_threshold) {
|
||
|
std::vector<input_type> vector(count);
|
||
|
copy_to_host(first, last, vector.begin(), queue);
|
||
|
return std::copy(vector.begin(), vector.end(), result);
|
||
|
}
|
||
|
|
||
|
// [direct_copy_threshold; inf) -> map [result; result + input_size) to
|
||
|
// device and run copy kernel on device for copying & casting
|
||
|
// map host memory to device.
|
||
|
|
||
|
// Perform async copy to host, wait for it to be finished and
|
||
|
// return the result.
|
||
|
// At this point we are sure that count > 1 (first != last), so event
|
||
|
// returned by dispatch_copy_async() must be valid.
|
||
|
return dispatch_copy_async(first, last, result, queue).get();
|
||
|
}
|
||
|
|
||
|
// device -> device
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
mpl::not_<
|
||
|
can_copy_with_copy_buffer<
|
||
|
InputIterator, OutputIterator
|
||
|
>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
return copy_on_device(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
// device -> device (specialization for buffer iterators)
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
can_copy_with_copy_buffer<
|
||
|
InputIterator, OutputIterator
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
|
||
|
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
|
||
|
|
||
|
difference_type n = std::distance(first, last);
|
||
|
if(n < 1){
|
||
|
// nothing to copy
|
||
|
return result;
|
||
|
}
|
||
|
|
||
|
queue.enqueue_copy_buffer(first.get_buffer(),
|
||
|
result.get_buffer(),
|
||
|
first.get_index() * sizeof(value_type),
|
||
|
result.get_index() * sizeof(value_type),
|
||
|
static_cast<size_t>(n) * sizeof(value_type));
|
||
|
return result + n;
|
||
|
}
|
||
|
|
||
|
// device -> device (async)
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline future<OutputIterator>
|
||
|
dispatch_copy_async(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
mpl::not_<
|
||
|
can_copy_with_copy_buffer<
|
||
|
InputIterator, OutputIterator
|
||
|
>
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
return copy_on_device_async(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
// device -> device (async, specialization for buffer iterators)
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline future<OutputIterator>
|
||
|
dispatch_copy_async(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if<
|
||
|
mpl::and_<
|
||
|
is_device_iterator<InputIterator>,
|
||
|
is_device_iterator<OutputIterator>,
|
||
|
can_copy_with_copy_buffer<
|
||
|
InputIterator, OutputIterator
|
||
|
>
|
||
|
>
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
typedef typename std::iterator_traits<InputIterator>::value_type value_type;
|
||
|
typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;
|
||
|
|
||
|
difference_type n = std::distance(first, last);
|
||
|
if(n < 1){
|
||
|
// nothing to copy
|
||
|
return make_future(result, event());
|
||
|
}
|
||
|
|
||
|
event event_ =
|
||
|
queue.enqueue_copy_buffer(
|
||
|
first.get_buffer(),
|
||
|
result.get_buffer(),
|
||
|
first.get_index() * sizeof(value_type),
|
||
|
result.get_index() * sizeof(value_type),
|
||
|
static_cast<size_t>(n) * sizeof(value_type)
|
||
|
);
|
||
|
|
||
|
return make_future(result + n, event_);
|
||
|
}
|
||
|
|
||
|
// host -> host
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator
|
||
|
dispatch_copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue,
|
||
|
typename boost::enable_if_c<
|
||
|
!is_device_iterator<InputIterator>::value &&
|
||
|
!is_device_iterator<OutputIterator>::value
|
||
|
>::type* = 0)
|
||
|
{
|
||
|
(void) queue;
|
||
|
|
||
|
return std::copy(first, last, result);
|
||
|
}
|
||
|
|
||
|
} // end detail namespace
|
||
|
|
||
|
/// Copies the values in the range [\p first, \p last) to the range
|
||
|
/// beginning at \p result.
|
||
|
///
|
||
|
/// The generic copy() function can be used for a variety of data
|
||
|
/// transfer tasks and provides a standard interface to the following
|
||
|
/// OpenCL functions:
|
||
|
///
|
||
|
/// \li \c clEnqueueReadBuffer()
|
||
|
/// \li \c clEnqueueWriteBuffer()
|
||
|
/// \li \c clEnqueueCopyBuffer()
|
||
|
///
|
||
|
/// Unlike the aforementioned OpenCL functions, copy() will also work
|
||
|
/// with non-contiguous data-structures (e.g. \c std::list<T>) as
|
||
|
/// well as with "fancy" iterators (e.g. transform_iterator).
|
||
|
///
|
||
|
/// \param first first element in the range to copy
|
||
|
/// \param last last element in the range to copy
|
||
|
/// \param result first element in the result range
|
||
|
/// \param queue command queue to perform the operation
|
||
|
///
|
||
|
/// \return \c OutputIterator to the end of the result range
|
||
|
///
|
||
|
/// For example, to copy an array of \c int values on the host to a vector on
|
||
|
/// the device:
|
||
|
/// \code
|
||
|
/// // array on the host
|
||
|
/// int data[] = { 1, 2, 3, 4 };
|
||
|
///
|
||
|
/// // vector on the device
|
||
|
/// boost::compute::vector<int> vec(4, context);
|
||
|
///
|
||
|
/// // copy values to the device vector
|
||
|
/// boost::compute::copy(data, data + 4, vec.begin(), queue);
|
||
|
/// \endcode
|
||
|
///
|
||
|
/// The copy algorithm can also be used with standard containers such as
|
||
|
/// \c std::vector<T>:
|
||
|
/// \code
|
||
|
/// std::vector<int> host_vector = ...
|
||
|
/// boost::compute::vector<int> device_vector = ...
|
||
|
///
|
||
|
/// // copy from the host to the device
|
||
|
/// boost::compute::copy(
|
||
|
/// host_vector.begin(), host_vector.end(), device_vector.begin(), queue
|
||
|
/// );
|
||
|
///
|
||
|
/// // copy from the device to the host
|
||
|
/// boost::compute::copy(
|
||
|
/// device_vector.begin(), device_vector.end(), host_vector.begin(), queue
|
||
|
/// );
|
||
|
/// \endcode
|
||
|
///
|
||
|
/// \see copy_n(), copy_if(), copy_async()
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline OutputIterator copy(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue = system::default_queue())
|
||
|
{
|
||
|
return detail::dispatch_copy(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
/// Copies the values in the range [\p first, \p last) to the range
|
||
|
/// beginning at \p result. The copy is performed asynchronously.
|
||
|
///
|
||
|
/// \see copy()
|
||
|
template<class InputIterator, class OutputIterator>
|
||
|
inline future<OutputIterator>
|
||
|
copy_async(InputIterator first,
|
||
|
InputIterator last,
|
||
|
OutputIterator result,
|
||
|
command_queue &queue = system::default_queue())
|
||
|
{
|
||
|
return detail::dispatch_copy_async(first, last, result, queue);
|
||
|
}
|
||
|
|
||
|
} // end compute namespace
|
||
|
} // end boost namespace
|
||
|
|
||
|
#endif // BOOST_COMPUTE_ALGORITHM_COPY_HPP
|