//---------------------------------------------------------------------------// // Copyright (c) 2013 Kyle Lutz // // Distributed under the Boost Software License, Version 1.0 // See accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt // // See http://boostorg.github.com/compute for more information. //---------------------------------------------------------------------------// #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP #define BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP #include #include #include #include #include #include #include #include #include #include #include namespace boost { namespace compute { namespace detail { template inline event copy_on_device_cpu(InputIterator first, OutputIterator result, size_t count, command_queue &queue) { meta_kernel k("copy"); const device& device = queue.get_device(); k << "uint block = " << "(uint)ceil(((float)count)/get_global_size(0));\n" << "uint index = get_global_id(0) * block;\n" << "uint end = min(count, index + block);\n" << "while(index < end){\n" << result[k.var("index")] << '=' << first[k.var("index")] << ";\n" << "index++;\n" << "}\n"; k.add_set_arg("count", static_cast(count)); size_t global_work_size = device.compute_units(); if(count <= 1024) global_work_size = 1; return k.exec_1d(queue, 0, global_work_size); } template inline event copy_on_device_gpu(InputIterator first, OutputIterator result, size_t count, command_queue &queue) { typedef typename std::iterator_traits::value_type input_type; const device& device = queue.get_device(); boost::shared_ptr parameters = detail::parameter_cache::get_global_cache(device); std::string cache_key = "__boost_copy_kernel_" + boost::lexical_cast(sizeof(input_type)); uint_ vpt = parameters->get(cache_key, "vpt", 4); uint_ tpb = parameters->get(cache_key, "tpb", 128); meta_kernel k("copy"); k << "uint index = get_local_id(0) + " << "(" << vpt * tpb << " * get_group_id(0));\n" << "for(uint i = 0; i < " << vpt << "; i++){\n" << " if(index < count){\n" << result[k.var("index")] << '=' << first[k.var("index")] << ";\n" << " index += " << tpb << ";\n" " }\n" "}\n"; k.add_set_arg("count", static_cast(count)); size_t global_work_size = calculate_work_size(count, vpt, tpb); return k.exec_1d(queue, 0, global_work_size, tpb); } template inline event dispatch_copy_on_device(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue) { const size_t count = detail::iterator_range_size(first, last); if(count == 0){ // nothing to do return event(); } const device& device = queue.get_device(); // copy_on_device_cpu() does not work for CPU on Apple platform // due to bug in its compiler. // See https://github.com/boostorg/compute/pull/626 if((device.type() & device::cpu) && !is_apple_platform_device(device)) { return copy_on_device_cpu(first, result, count, queue); } return copy_on_device_gpu(first, result, count, queue); } template inline OutputIterator copy_on_device(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue) { dispatch_copy_on_device(first, last, result, queue); return result + std::distance(first, last); } template inline discard_iterator copy_on_device(InputIterator first, InputIterator last, discard_iterator result, command_queue &queue) { (void) queue; return result + std::distance(first, last); } template inline future copy_on_device_async(InputIterator first, InputIterator last, OutputIterator result, command_queue &queue) { event event_ = dispatch_copy_on_device(first, last, result, queue); return make_future(result + std::distance(first, last), event_); } #ifdef CL_VERSION_2_0 // copy_on_device() specialization for svm_ptr template inline svm_ptr copy_on_device(svm_ptr first, svm_ptr last, svm_ptr result, command_queue &queue) { size_t count = iterator_range_size(first, last); if(count == 0){ return result; } queue.enqueue_svm_memcpy( result.get(), first.get(), count * sizeof(T) ); return result + count; } template inline future > copy_on_device_async(svm_ptr first, svm_ptr last, svm_ptr result, command_queue &queue) { size_t count = iterator_range_size(first, last); if(count == 0){ return future >(); } event event_ = queue.enqueue_svm_memcpy_async( result.get(), first.get(), count * sizeof(T) ); return make_future(result + count, event_); } #endif // CL_VERSION_2_0 } // end detail namespace } // end compute namespace } // end boost namespace #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_COPY_ON_DEVICE_HPP