js8call/.svn/pristine/74/74de4325cc17f784ddbb39c2013df8649fa17bb0.svn-base

//---------------------------------------------------------------------------//
// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP

#include <iterator>

#include <boost/compute/types.hpp>
#include <boost/compute/functional.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/detail/scalar.hpp>
#include <boost/compute/iterator/buffer_iterator.hpp>
#include <boost/compute/type_traits/type_name.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>
#include <boost/compute/detail/parameter_cache.hpp>

namespace boost {
namespace compute {
namespace detail {

template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,
                                                  InputIterator last,
                                                  UnaryPredicate predicate,
                                                  const size_t count,
                                                  command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");
    atomic_min<uint_> atomic_min_uint;

    k << k.decl<const uint_>("i") << " = get_global_id(0);\n"
      << k.decl<const value_type>("value") << "="
      <<     first[k.var<const uint_>("i")] << ";\n"
      << "if(" << predicate(k.var<const value_type>("value")) << "){\n"
      << "    " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"
      << "}\n";

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);
    queue.enqueue_1d_range_kernel(kernel, 0, count, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}

template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,
                                                       InputIterator last,
                                                       UnaryPredicate predicate,
                                                       const size_t count,
                                                       const size_t vpt,
                                                       command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;
    typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;

    const context &context = queue.get_context();
    const device &device = queue.get_device();

    detail::meta_kernel k("find_if");
    size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t vpt_arg = k.add_arg<const uint_>("vpt");
    atomic_min<uint_> atomic_min_uint;

    // for GPUs reads from global memory are coalesced
    if(device.type() & device::gpu) {
        k <<
            k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<
            k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + (lsize *" << k.var<uint_>("vpt") << ")," <<
                    "count" <<
            ");\n" <<

            // checking if the index is already found
            "__local uint local_index;\n" <<
            "if(get_local_id(0) == 0){\n" <<
            "    local_index = *index;\n " <<
            "};\n" <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "if(local_index < id){\n" <<
            "    return;\n" <<
            "}\n" <<

            "while(id < end){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n"
            "    }\n" <<
            "    id+=lsize;\n" <<
            "}\n";
    // for CPUs (and other devices) reads are ordered so the big cache is
    // efficiently used.
    } else {
        k <<
            k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<
            k.decl<const uint_>("end") << " = min(" <<
                    "id + " << k.var<uint_>("vpt") << "," <<
                    "count" <<
            ");\n" <<
            "while(id < end && (*index) > id){\n" <<
            "    " << k.decl<const value_type>("value") << " = " <<
                      first[k.var<const uint_>("id")] << ";\n"
            "    if(" << predicate(k.var<const value_type>("value")) << "){\n" <<
            "        " << atomic_min_uint(k.var<uint_ *>("index"),
                                          k.var<uint_>("id")) << ";\n" <<
            "        return;\n" <<
            "    }\n" <<
            "    id++;\n" <<
            "}\n";
    }

    kernel kernel = k.compile(context);

    scalar<uint_> index(context);
    kernel.set_arg(index_arg, index.get_buffer());
    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));

    // initialize index to the last iterator's index
    index.write(static_cast<uint_>(count), queue);

    const size_t global_wg_size = static_cast<size_t>(
        std::ceil(float(count) / vpt)
    );
    queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);

    // read index and return iterator
    return first + static_cast<difference_type>(index.read(queue));
}

template<class InputIterator, class UnaryPredicate>
inline InputIterator find_if_with_atomics(InputIterator first,
                                          InputIterator last,
                                          UnaryPredicate predicate,
                                          command_queue &queue)
{
    typedef typename std::iterator_traits<InputIterator>::value_type value_type;

    size_t count = detail::iterator_range_size(first, last);
    if(count == 0){
        return last;
    }

    const device &device = queue.get_device();

    // load cached parameters
    std::string cache_key = std::string("__boost_find_if_with_atomics_")
        + type_name<value_type>();
    boost::shared_ptr<parameter_cache> parameters =
        detail::parameter_cache::get_global_cache(device);

    // for relatively small inputs on GPUs kernel checking one value per thread
    // (work-item) is more efficient than its multiple values per thread version
    if(device.type() & device::gpu){
        const size_t one_vpt_threshold =
            parameters->get(cache_key, "one_vpt_threshold", 1048576);
        if(count <= one_vpt_threshold){
            return find_if_with_atomics_one_vpt(
                first, last, predicate, count, queue
            );
        }
    }

    // values per thread
    size_t vpt;
    if(device.type() & device::gpu){
        // get vpt parameter
        vpt = parameters->get(cache_key, "vpt", 32);
    } else {
        // for CPUs work is split equally between compute units
        const size_t max_compute_units =
            device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();
        vpt = static_cast<size_t>(
            std::ceil(float(count) / max_compute_units)
        );
    }

    return find_if_with_atomics_multiple_vpt(
        first, last, predicate, count, vpt, queue
    );
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP
Initial Commit 2018-02-08 21:28:33 -05:00			`//---------------------------------------------------------------------------//`
			`// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>`
			`//`
			`// Distributed under the Boost Software License, Version 1.0`
			`// See accompanying file LICENSE_1_0.txt or copy at`
			`// http://www.boost.org/LICENSE_1_0.txt`
			`//`
			`// See http://boostorg.github.com/compute for more information.`
			`//---------------------------------------------------------------------------//`

			`#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP`
			`#define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP`

			`#include <iterator>`

			`#include <boost/compute/types.hpp>`
			`#include <boost/compute/functional.hpp>`
			`#include <boost/compute/command_queue.hpp>`
			`#include <boost/compute/container/detail/scalar.hpp>`
			`#include <boost/compute/iterator/buffer_iterator.hpp>`
			`#include <boost/compute/type_traits/type_name.hpp>`
			`#include <boost/compute/detail/meta_kernel.hpp>`
			`#include <boost/compute/detail/iterator_range_size.hpp>`
			`#include <boost/compute/detail/parameter_cache.hpp>`

			`namespace boost {`
			`namespace compute {`
			`namespace detail {`

			`template<class InputIterator, class UnaryPredicate>`
			`inline InputIterator find_if_with_atomics_one_vpt(InputIterator first,`
			`InputIterator last,`
			`UnaryPredicate predicate,`
			`const size_t count,`
			`command_queue &queue)`
			`{`
			`typedef typename std::iterator_traits<InputIterator>::value_type value_type;`
			`typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;`

			`const context &context = queue.get_context();`

			`detail::meta_kernel k("find_if");`
			`size_t index_arg = k.add_arg<int *>(memory_object::global_memory, "index");`
			`atomic_min<uint_> atomic_min_uint;`

			`k << k.decl<const uint_>("i") << " = get_global_id(0);\n"`
			`<< k.decl<const value_type>("value") << "="`
			`<< first[k.var<const uint_>("i")] << ";\n"`
			`<< "if(" << predicate(k.var<const value_type>("value")) << "){\n"`
			`<< " " << atomic_min_uint(k.var<uint_ *>("index"), k.var<uint_>("i")) << ";\n"`
			`<< "}\n";`

			`kernel kernel = k.compile(context);`

			`scalar<uint_> index(context);`
			`kernel.set_arg(index_arg, index.get_buffer());`

			`// initialize index to the last iterator's index`
			`index.write(static_cast<uint_>(count), queue);`
			`queue.enqueue_1d_range_kernel(kernel, 0, count, 0);`

			`// read index and return iterator`
			`return first + static_cast<difference_type>(index.read(queue));`
			`}`

			`template<class InputIterator, class UnaryPredicate>`
			`inline InputIterator find_if_with_atomics_multiple_vpt(InputIterator first,`
			`InputIterator last,`
			`UnaryPredicate predicate,`
			`const size_t count,`
			`const size_t vpt,`
			`command_queue &queue)`
			`{`
			`typedef typename std::iterator_traits<InputIterator>::value_type value_type;`
			`typedef typename std::iterator_traits<InputIterator>::difference_type difference_type;`

			`const context &context = queue.get_context();`
			`const device &device = queue.get_device();`

			`detail::meta_kernel k("find_if");`
			`size_t index_arg = k.add_arg<uint_ *>(memory_object::global_memory, "index");`
			`size_t count_arg = k.add_arg<const uint_>("count");`
			`size_t vpt_arg = k.add_arg<const uint_>("vpt");`
			`atomic_min<uint_> atomic_min_uint;`

			`// for GPUs reads from global memory are coalesced`
			`if(device.type() & device::gpu) {`
			`k <<`
			`k.decl<const uint_>("lsize") << " = get_local_size(0);\n" <<`
			`k.decl<uint_>("id") << " = get_local_id(0) + get_group_id(0) * lsize * vpt;\n" <<`
			`k.decl<const uint_>("end") << " = min(" <<`
			`"id + (lsize *" << k.var<uint_>("vpt") << ")," <<`
			`"count" <<`
			`");\n" <<`

			`// checking if the index is already found`
			`"__local uint local_index;\n" <<`
			`"if(get_local_id(0) == 0){\n" <<`
			`" local_index = *index;\n " <<`
			`"};\n" <<`
			`"barrier(CLK_LOCAL_MEM_FENCE);\n" <<`
			`"if(local_index < id){\n" <<`
			`" return;\n" <<`
			`"}\n" <<`

			`"while(id < end){\n" <<`
			`" " << k.decl<const value_type>("value") << " = " <<`
			`first[k.var<const uint_>("id")] << ";\n"`
			`" if(" << predicate(k.var<const value_type>("value")) << "){\n" <<`
			`" " << atomic_min_uint(k.var<uint_ *>("index"),`
			`k.var<uint_>("id")) << ";\n" <<`
			`" return;\n"`
			`" }\n" <<`
			`" id+=lsize;\n" <<`
			`"}\n";`
			`// for CPUs (and other devices) reads are ordered so the big cache is`
			`// efficiently used.`
			`} else {`
			`k <<`
			`k.decl<uint_>("id") << " = get_global_id(0) * " << k.var<uint_>("vpt") << ";\n" <<`
			`k.decl<const uint_>("end") << " = min(" <<`
			`"id + " << k.var<uint_>("vpt") << "," <<`
			`"count" <<`
			`");\n" <<`
			`"while(id < end && (*index) > id){\n" <<`
			`" " << k.decl<const value_type>("value") << " = " <<`
			`first[k.var<const uint_>("id")] << ";\n"`
			`" if(" << predicate(k.var<const value_type>("value")) << "){\n" <<`
			`" " << atomic_min_uint(k.var<uint_ *>("index"),`
			`k.var<uint_>("id")) << ";\n" <<`
			`" return;\n" <<`
			`" }\n" <<`
			`" id++;\n" <<`
			`"}\n";`
			`}`

			`kernel kernel = k.compile(context);`

			`scalar<uint_> index(context);`
			`kernel.set_arg(index_arg, index.get_buffer());`
			`kernel.set_arg(count_arg, static_cast<uint_>(count));`
			`kernel.set_arg(vpt_arg, static_cast<uint_>(vpt));`

			`// initialize index to the last iterator's index`
			`index.write(static_cast<uint_>(count), queue);`

			`const size_t global_wg_size = static_cast<size_t>(`
			`std::ceil(float(count) / vpt)`
			`);`
			`queue.enqueue_1d_range_kernel(kernel, 0, global_wg_size, 0);`

			`// read index and return iterator`
			`return first + static_cast<difference_type>(index.read(queue));`
			`}`

			`template<class InputIterator, class UnaryPredicate>`
			`inline InputIterator find_if_with_atomics(InputIterator first,`
			`InputIterator last,`
			`UnaryPredicate predicate,`
			`command_queue &queue)`
			`{`
			`typedef typename std::iterator_traits<InputIterator>::value_type value_type;`

			`size_t count = detail::iterator_range_size(first, last);`
			`if(count == 0){`
			`return last;`
			`}`

			`const device &device = queue.get_device();`

			`// load cached parameters`
			`std::string cache_key = std::string("__boost_find_if_with_atomics_")`
			`+ type_name<value_type>();`
			`boost::shared_ptr<parameter_cache> parameters =`
			`detail::parameter_cache::get_global_cache(device);`

			`// for relatively small inputs on GPUs kernel checking one value per thread`
			`// (work-item) is more efficient than its multiple values per thread version`
			`if(device.type() & device::gpu){`
			`const size_t one_vpt_threshold =`
			`parameters->get(cache_key, "one_vpt_threshold", 1048576);`
			`if(count <= one_vpt_threshold){`
			`return find_if_with_atomics_one_vpt(`
			`first, last, predicate, count, queue`
			`);`
			`}`
			`}`

			`// values per thread`
			`size_t vpt;`
			`if(device.type() & device::gpu){`
			`// get vpt parameter`
			`vpt = parameters->get(cache_key, "vpt", 32);`
			`} else {`
			`// for CPUs work is split equally between compute units`
			`const size_t max_compute_units =`
			`device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>();`
			`vpt = static_cast<size_t>(`
			`std::ceil(float(count) / max_compute_units)`
			`);`
			`}`

			`return find_if_with_atomics_multiple_vpt(`
			`first, last, predicate, count, vpt, queue`
			`);`
			`}`

			`} // end detail namespace`
			`} // end compute namespace`
			`} // end boost namespace`

			`#endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_IF_WITH_ATOMICS_HPP`