Initial Commit
This commit is contained in:
@@ -0,0 +1,190 @@
|
||||
//---------------------------------------------------------------------------//
|
||||
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
|
||||
//
|
||||
// Distributed under the Boost Software License, Version 1.0
|
||||
// See accompanying file LICENSE_1_0.txt or copy at
|
||||
// http://www.boost.org/LICENSE_1_0.txt
|
||||
//
|
||||
// See http://boostorg.github.com/compute for more information.
|
||||
//---------------------------------------------------------------------------//
|
||||
|
||||
#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
|
||||
#define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
|
||||
|
||||
#include <iterator>
|
||||
|
||||
#include <boost/compute/command_queue.hpp>
|
||||
#include <boost/compute/async/future.hpp>
|
||||
#include <boost/compute/iterator/buffer_iterator.hpp>
|
||||
#include <boost/compute/iterator/discard_iterator.hpp>
|
||||
#include <boost/compute/memory/svm_ptr.hpp>
|
||||
#include <boost/compute/detail/iterator_range_size.hpp>
|
||||
#include <boost/compute/detail/meta_kernel.hpp>
|
||||
#include <boost/compute/detail/parameter_cache.hpp>
|
||||
#include <boost/compute/detail/work_size.hpp>
|
||||
#include <boost/compute/detail/vendor.hpp>
|
||||
|
||||
namespace boost {
|
||||
namespace compute {
|
||||
namespace detail {
|
||||
|
||||
template<class InputIterator, class OutputIterator>
|
||||
inline event copy_on_device_cpu(InputIterator first,
|
||||
OutputIterator result,
|
||||
size_t count,
|
||||
command_queue &queue)
|
||||
{
|
||||
meta_kernel k("copy");
|
||||
const device& device = queue.get_device();
|
||||
|
||||
k <<
|
||||
"uint block = " <<
|
||||
"(uint)ceil(((float)count)/get_global_size(0));\n" <<
|
||||
"uint index = get_global_id(0) * block;\n" <<
|
||||
"uint end = min(count, index + block);\n" <<
|
||||
"while(index < end){\n" <<
|
||||
result[k.var<uint_>("index")] << '=' <<
|
||||
first[k.var<uint_>("index")] << ";\n" <<
|
||||
"index++;\n" <<
|
||||
"}\n";
|
||||
|
||||
k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
|
||||
|
||||
size_t global_work_size = device.compute_units();
|
||||
if(count <= 1024) global_work_size = 1;
|
||||
return k.exec_1d(queue, 0, global_work_size);
|
||||
}
|
||||
|
||||
template<class InputIterator, class OutputIterator>
|
||||
inline event copy_on_device_gpu(InputIterator first,
|
||||
OutputIterator result,
|
||||
size_t count,
|
||||
command_queue &queue)
|
||||
{
|
||||
typedef typename std::iterator_traits<InputIterator>::value_type input_type;
|
||||
|
||||
const device& device = queue.get_device();
|
||||
boost::shared_ptr<parameter_cache> parameters =
|
||||
detail::parameter_cache::get_global_cache(device);
|
||||
std::string cache_key =
|
||||
"__boost_copy_kernel_" + boost::lexical_cast<std::string>(sizeof(input_type));
|
||||
|
||||
uint_ vpt = parameters->get(cache_key, "vpt", 4);
|
||||
uint_ tpb = parameters->get(cache_key, "tpb", 128);
|
||||
|
||||
meta_kernel k("copy");
|
||||
k <<
|
||||
"uint index = get_local_id(0) + " <<
|
||||
"(" << vpt * tpb << " * get_group_id(0));\n" <<
|
||||
"for(uint i = 0; i < " << vpt << "; i++){\n" <<
|
||||
" if(index < count){\n" <<
|
||||
result[k.var<uint_>("index")] << '=' <<
|
||||
first[k.var<uint_>("index")] << ";\n" <<
|
||||
" index += " << tpb << ";\n"
|
||||
" }\n"
|
||||
"}\n";
|
||||
|
||||
k.add_set_arg<const uint_>("count", static_cast<uint_>(count));
|
||||
size_t global_work_size = calculate_work_size(count, vpt, tpb);
|
||||
return k.exec_1d(queue, 0, global_work_size, tpb);
|
||||
}
|
||||
|
||||
template<class InputIterator, class OutputIterator>
|
||||
inline event dispatch_copy_on_device(InputIterator first,
|
||||
InputIterator last,
|
||||
OutputIterator result,
|
||||
command_queue &queue)
|
||||
{
|
||||
const size_t count = detail::iterator_range_size(first, last);
|
||||
|
||||
if(count == 0){
|
||||
// nothing to do
|
||||
return event();
|
||||
}
|
||||
|
||||
const device& device = queue.get_device();
|
||||
// copy_on_device_cpu() does not work for CPU on Apple platform
|
||||
// due to bug in its compiler.
|
||||
// See https://github.com/boostorg/compute/pull/626
|
||||
if((device.type() & device::cpu) && !is_apple_platform_device(device))
|
||||
{
|
||||
return copy_on_device_cpu(first, result, count, queue);
|
||||
}
|
||||
return copy_on_device_gpu(first, result, count, queue);
|
||||
}
|
||||
|
||||
template<class InputIterator, class OutputIterator>
|
||||
inline OutputIterator copy_on_device(InputIterator first,
|
||||
InputIterator last,
|
||||
OutputIterator result,
|
||||
command_queue &queue)
|
||||
{
|
||||
dispatch_copy_on_device(first, last, result, queue);
|
||||
return result + std::distance(first, last);
|
||||
}
|
||||
|
||||
template<class InputIterator>
|
||||
inline discard_iterator copy_on_device(InputIterator first,
|
||||
InputIterator last,
|
||||
discard_iterator result,
|
||||
command_queue &queue)
|
||||
{
|
||||
(void) queue;
|
||||
|
||||
return result + std::distance(first, last);
|
||||
}
|
||||
|
||||
template<class InputIterator, class OutputIterator>
|
||||
inline future<OutputIterator> copy_on_device_async(InputIterator first,
|
||||
InputIterator last,
|
||||
OutputIterator result,
|
||||
command_queue &queue)
|
||||
{
|
||||
event event_ = dispatch_copy_on_device(first, last, result, queue);
|
||||
return make_future(result + std::distance(first, last), event_);
|
||||
}
|
||||
|
||||
#ifdef CL_VERSION_2_0
|
||||
// copy_on_device() specialization for svm_ptr
|
||||
template<class T>
|
||||
inline svm_ptr<T> copy_on_device(svm_ptr<T> first,
|
||||
svm_ptr<T> last,
|
||||
svm_ptr<T> result,
|
||||
command_queue &queue)
|
||||
{
|
||||
size_t count = iterator_range_size(first, last);
|
||||
if(count == 0){
|
||||
return result;
|
||||
}
|
||||
|
||||
queue.enqueue_svm_memcpy(
|
||||
result.get(), first.get(), count * sizeof(T)
|
||||
);
|
||||
|
||||
return result + count;
|
||||
}
|
||||
|
||||
template<class T>
|
||||
inline future<svm_ptr<T> > copy_on_device_async(svm_ptr<T> first,
|
||||
svm_ptr<T> last,
|
||||
svm_ptr<T> result,
|
||||
command_queue &queue)
|
||||
{
|
||||
size_t count = iterator_range_size(first, last);
|
||||
if(count == 0){
|
||||
return future<svm_ptr<T> >();
|
||||
}
|
||||
|
||||
event event_ = queue.enqueue_svm_memcpy_async(
|
||||
result.get(), first.get(), count * sizeof(T)
|
||||
);
|
||||
|
||||
return make_future(result + count, event_);
|
||||
}
|
||||
#endif // CL_VERSION_2_0
|
||||
|
||||
} // end detail namespace
|
||||
} // end compute namespace
|
||||
} // end boost namespace
|
||||
|
||||
#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP
|
||||
Reference in New Issue
Block a user