js8call/.svn/pristine/d3/d3723d763600d7534842e8971b9b49dccff3f135.svn-base

//---------------------------------------------------------------------------//
// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>
//
// Distributed under the Boost Software License, Version 1.0
// See accompanying file LICENSE_1_0.txt or copy at
// http://www.boost.org/LICENSE_1_0.txt
//
// See http://boostorg.github.com/compute for more information.
//---------------------------------------------------------------------------//

#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_
#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_

#include <algorithm>

#include <boost/compute/kernel.hpp>
#include <boost/compute/program.hpp>
#include <boost/compute/command_queue.hpp>
#include <boost/compute/container/vector.hpp>
#include <boost/compute/memory/local_buffer.hpp>
#include <boost/compute/detail/meta_kernel.hpp>
#include <boost/compute/detail/iterator_range_size.hpp>

namespace boost {
namespace compute {
namespace detail {

template<class KeyType, class ValueType>
inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg,
                                                 size_t lmem_size,
                                                 bool sort_by_key)
{
    size_t n = proposed_wg;

    size_t lmem_required = n * sizeof(KeyType);
    if(sort_by_key) {
        lmem_required += n * sizeof(ValueType);
    }

    // try to force at least 4 work-groups of >64 elements
    // for better occupancy
    while(lmem_size < (lmem_required * 4) && (n > 64)) {
        n /= 2;
        lmem_required = n * sizeof(KeyType);
    }
    while(lmem_size < lmem_required && (n != 1)) {
        n /= 2;
        if(n < 1) n = 1;
        lmem_required = n * sizeof(KeyType);
    }

    if(n < 2)   { return 1; }
    else if(n < 4)   { return 2; }
    else if(n < 8)   { return 4; }
    else if(n < 16)  { return 8; }
    else if(n < 32)  { return 16; }
    else if(n < 64)  { return 32; }
    else if(n < 128) { return 64; }
    else if(n < 256) { return 128; }
    else             { return 256; }
}


/// Performs bitonic block sort according to \p compare.
///
/// Since bitonic sort can be only performed when input size is equal to 2^n,
/// in this case input size is block size (\p work_group_size), we would have
/// to require \p count be a exact multiple of block size. That would not be
/// great.
/// Instead, bitonic sort kernel is merged with odd-even merge sort so if the
/// last block is not equal to 2^n (where n is some natural number) the odd-even
/// sort is performed for that block. That way bitonic_block_sort() works for
/// input of any size. Block size (\p work_group_size) still have to be equal
/// to 2^n.
///
/// This is NOT stable.
///
/// \param keys_first first key element in the range to sort
/// \param values_first first value element in the range to sort
/// \param compare comparison function for keys
/// \param count number of elements in the range; count > 0
/// \param work_group_size size of the work group, also the block size; must be
///        equal to n^2 where n is natural number
/// \param queue command queue to perform the operation
template<class KeyIterator, class ValueIterator, class Compare>
inline size_t bitonic_block_sort(KeyIterator keys_first,
                                 ValueIterator values_first,
                                 Compare compare,
                                 const size_t count,
                                 const bool sort_by_key,
                                 command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;

    meta_kernel k("bitonic_block_sort");
    size_t count_arg = k.add_arg<const uint_>("count");

    size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");
    size_t local_vals_arg = 0;
    if(sort_by_key) {
        local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");
    }

    k <<
        // Work item global and local ids
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        k.decl<const uint_>("lid") << " = get_local_id(0);\n";

    // declare my_key and my_value
    k <<
        k.decl<key_type>("my_key") << ";\n";
    // Instead of copying values (my_value) in local memory with keys
    // we save local index (uchar) and copy my_value at the end at
    // final index. This saves local memory.
    if(sort_by_key)
    {
        k <<
            k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";
    }

    // load key
    k <<
        "if(gid < count) {\n" <<
            k.var<key_type>("my_key") <<  " = " <<
                keys_first[k.var<const uint_>("gid")] << ";\n" <<
        "}\n";

    // load key and index to local memory
    k <<
        "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
        k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<
        k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";

    // When work group size is a power of 2 bitonic sorter can be used;
    // otherwise, slower odd-even sort is used.

    k <<
        // check if n is power of 2
        "if(((n != 0) && ((n & (~n + 1)) == n))) {\n";

    // bitonic sort, not stable
    k <<
        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("length") << " = 1; " <<
            "length < n; " <<
            "length <<= 1" <<
        ") {\n" <<
            // direction of sort: false -> asc, true -> desc
            k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<
            "for(" <<
                k.decl<uint_>("k") << " = length; " <<
                "k > 0; " <<
                "k >>= 1" <<
            ") {\n" <<

            // sibling to compare with my key
            k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<
            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
            k.decl<bool>("compare") << " = " <<
                compare(k.var<key_type>("sibling_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            k.decl<bool>("swap") <<
                " = compare ^ (sibling_idx < lid) ^ direction;\n" <<
            "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key)
    {
        k <<
            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "}\n" <<
         "}\n";

    // end of bitonic sort

    // odd-even sort, not stable
    k <<
        "}\n" <<
        "else { \n";

    k <<
        k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<
        k.decl<uint_>("oddsibling_idx") << " = " <<
            "(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<
        k.decl<uint_>("evensibling_idx") << " = " <<
            "(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<

        // wait for keys and vals to be stored in local memory
        "barrier(CLK_LOCAL_MEM_FENCE);\n" <<

        "#pragma unroll\n" <<
        "for(" <<
            k.decl<uint_>("i") << " = 0; " <<
            "i < n; " <<
            "i++" <<
        ") {\n" <<
            k.decl<uint_>("sibling_idx") <<
                " = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<
            k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<
            k.decl<bool>("compare") << " = " <<
                compare(k.var<key_type>("sibling_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            k.decl<bool>("swap") <<
                " = compare ^ (sibling_idx < lid);\n" <<
            "my_key = swap ? sibling_key : my_key;\n";
    if(sort_by_key)
    {
        k <<
            "my_index = swap ? lidx[sibling_idx] : my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "lkeys[lid] = my_key;\n";
    if(sort_by_key)
    {
        k <<
            "lidx[lid] = my_index;\n";
    }
    k <<
            "barrier(CLK_LOCAL_MEM_FENCE);\n"
        "}\n" <<  // for

    "}\n"; // else
    // end of odd-even sort

    // save key and value
    k <<
        "if(gid < count) {\n" <<
        keys_first[k.var<const uint_>("gid")] << " = " <<
            k.var<key_type>("my_key") << ";\n";
    if(sort_by_key)
    {
        k << values_first[k.var<const uint_>("gid")] << " = " <<
                values_first[k.var<const uint_>("offset + my_index")] << ";\n";
    }
    k <<
        // end if
        "}\n";

    const context &context = queue.get_context();
    const device &device = queue.get_device();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size =
        pick_bitonic_block_sort_block_size<key_type, uchar_>(
            kernel.get_work_group_info<size_t>(
                device, CL_KERNEL_WORK_GROUP_SIZE
            ),
            device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),
            sort_by_key
        );

    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));
    if(sort_by_key) {
        kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));
    }

    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
    // return size of the block
    return work_group_size;
}

template<class KeyIterator, class ValueIterator, class Compare>
inline size_t block_sort(KeyIterator keys_first,
                         ValueIterator values_first,
                         Compare compare,
                         const size_t count,
                         const bool sort_by_key,
                         const bool stable,
                         command_queue &queue)
{
    if(stable) {
        // TODO: Implement stable block sort (stable odd-even merge sort)
        return size_t(1);
    }
    return bitonic_block_sort(
        keys_first, values_first,
        compare, count,
        sort_by_key, queue
    );
}

/// space: O(n + m); n - number of keys, m - number of values
template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_blocks_on_gpu(KeyIterator keys_first,
                                ValueIterator values_first,
                                KeyIterator out_keys_first,
                                ValueIterator out_values_first,
                                Compare compare,
                                const size_t count,
                                const size_t block_size,
                                const bool sort_by_key,
                                command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    meta_kernel k("merge_blocks");
    size_t count_arg = k.add_arg<const uint_>("count");
    size_t block_size_arg = k.add_arg<const uint_>("block_size");

    k <<
        // get global id
        k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<
        "if(gid >= count) {\n" <<
            "return;\n" <<
        "}\n" <<

        k.decl<const key_type>("my_key") << " = " <<
            keys_first[k.var<const uint_>("gid")] << ";\n";

    if(sort_by_key) {
        k <<
            k.decl<const value_type>("my_value") << " = " <<
                values_first[k.var<const uint_>("gid")] << ";\n";
    }

    k <<
        // get my block idx
        k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<
        k.decl<const bool>("my_block_idx_is_odd") << " = " <<
            "my_block_idx & 0x1;\n" <<

        k.decl<const uint_>("other_block_idx") << " = " <<
            // if(my_block_idx is odd) {} else {}
            "my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<

        // get ranges of my block and the other block
        // [my_block_start; my_block_end)
        // [other_block_start; other_block_end)
        k.decl<const uint_>("my_block_start") << " = " <<
            "min(my_block_idx * block_size, count);\n" << // including
        k.decl<const uint_>("my_block_end") << " = " <<
            "min((my_block_idx + 1) * block_size, count);\n" << // excluding

        k.decl<const uint_>("other_block_start") << " = " <<
            "min(other_block_idx * block_size, count);\n" << // including
        k.decl<const uint_>("other_block_end") << " = " <<
            "min((other_block_idx + 1) * block_size, count);\n" << // excluding

        // other block is empty, nothing to merge here
        "if(other_block_start == count){\n" <<
            out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";
        if(sort_by_key) {
            k <<
                out_values_first[k.var<uint_>("gid")] << " = my_value;\n";
        }

        k <<
        "return;\n" <<
        "}\n" <<

        // lower bound
        // left_idx - lower bound
        k.decl<uint_>("left_idx") << " = other_block_start;\n" <<
        k.decl<uint_>("right_idx") << " = other_block_end;\n" <<
        "while(left_idx < right_idx) {\n" <<
            k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
            k.decl<key_type>("mid_key") << " = " <<
                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
            k.decl<bool>("smaller") << " = " <<
                compare(k.var<key_type>("mid_key"),
                        k.var<key_type>("my_key")) << ";\n" <<
            "left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<
            "right_idx = smaller ? right_idx :  mid_idx;\n" <<
        "}\n" <<
        // left_idx is found position in other block

        // if my_block is odd we need to get the upper bound
        "right_idx = other_block_end;\n" <<
        "if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<
            k.decl<key_type>("upper_key") << " = " <<
                keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "while(" <<
                "!(" << compare(k.var<key_type>("upper_key"),
                                k.var<key_type>("my_key")) <<
                ") && " <<
                "!(" << compare(k.var<key_type>("my_key"),
                                k.var<key_type>("upper_key")) <<
                ") && " <<
                     "left_idx < right_idx" <<
                ")" <<
            "{\n" <<
                k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<
                k.decl<key_type>("mid_key") << " = " <<
                    keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<
                k.decl<bool>("equal") << " = " <<
                    "!(" << compare(k.var<key_type>("mid_key"),
                                    k.var<key_type>("my_key")) <<
                    ") && " <<
                    "!(" << compare(k.var<key_type>("my_key"),
                                    k.var<key_type>("mid_key")) <<
                    ");\n" <<
                "left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<
                "right_idx = equal ? right_idx : mid_idx;\n" <<
                "upper_key = equal ? upper_key : " <<
                    keys_first[k.var<const uint_>("left_idx")] << ";\n" <<
            "}\n" <<
        "}\n" <<

        k.decl<uint_>("offset") << " = 0;\n" <<
        "offset += gid - my_block_start;\n" <<
        "offset += left_idx - other_block_start;\n" <<
        "offset += min(my_block_start, other_block_start);\n" <<
        out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";
    if(sort_by_key) {
        k <<
            out_values_first[k.var<uint_>("offset")] << " = my_value;\n";
    }

    const context &context = queue.get_context();
    ::boost::compute::kernel kernel = k.compile(context);

    const size_t work_group_size = (std::min)(
        size_t(256),
        kernel.get_work_group_info<size_t>(
            queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE
        )
    );
    const size_t global_size =
        work_group_size * static_cast<size_t>(
            std::ceil(float(count) / work_group_size)
        );

    kernel.set_arg(count_arg, static_cast<uint_>(count));
    kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));
    queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);
}

template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
                                     KeyIterator keys_last,
                                     ValueIterator values_first,
                                     Compare compare,
                                     bool stable,
                                     command_queue &queue)
{
    typedef typename std::iterator_traits<KeyIterator>::value_type key_type;
    typedef typename std::iterator_traits<ValueIterator>::value_type value_type;

    size_t count = iterator_range_size(keys_first, keys_last);
    if(count < 2){
        return;
    }

    size_t block_size =
        block_sort(
            keys_first, values_first,
            compare, count,
            true /* sort_by_key */, stable /* stable */,
            queue
        );

    // for small input size only block sort is performed
    if(count <= block_size) {
        return;
    }

    const context &context = queue.get_context();

    bool result_in_temporary_buffer = false;
    ::boost::compute::vector<key_type> temp_keys(count, context);
    ::boost::compute::vector<value_type> temp_values(count, context);

    for(; block_size < count; block_size *= 2) {
        result_in_temporary_buffer = !result_in_temporary_buffer;
        if(result_in_temporary_buffer) {
            merge_blocks_on_gpu(keys_first, values_first,
                                temp_keys.begin(), temp_values.begin(),
                                compare, count, block_size,
                                true /* sort_by_key */, queue);
        } else {
            merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(),
                                keys_first, values_first,
                                compare, count, block_size,
                                true /* sort_by_key */, queue);
        }
    }

    if(result_in_temporary_buffer) {
        copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue);
        copy_async(temp_values.begin(), temp_values.end(), values_first, queue);
    }
}

template<class Iterator, class Compare>
inline void merge_sort_on_gpu(Iterator first,
                              Iterator last,
                              Compare compare,
                              bool stable,
                              command_queue &queue)
{
    typedef typename std::iterator_traits<Iterator>::value_type key_type;

    size_t count = iterator_range_size(first, last);
    if(count < 2){
        return;
    }

    Iterator dummy;
    size_t block_size =
        block_sort(
            first, dummy,
            compare, count,
            false /* sort_by_key */, stable /* stable */,
            queue
        );

    // for small input size only block sort is performed
    if(count <= block_size) {
        return;
    }

    const context &context = queue.get_context();

    bool result_in_temporary_buffer = false;
    ::boost::compute::vector<key_type> temp_keys(count, context);

    for(; block_size < count; block_size *= 2) {
        result_in_temporary_buffer = !result_in_temporary_buffer;
        if(result_in_temporary_buffer) {
            merge_blocks_on_gpu(first, dummy, temp_keys.begin(), dummy,
                                compare, count, block_size,
                                false /* sort_by_key */, queue);
        } else {
            merge_blocks_on_gpu(temp_keys.begin(), dummy, first, dummy,
                                compare, count, block_size,
                                false /* sort_by_key */, queue);
        }
    }

    if(result_in_temporary_buffer) {
        copy_async(temp_keys.begin(), temp_keys.end(), first, queue);
    }
}

template<class KeyIterator, class ValueIterator, class Compare>
inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,
                                     KeyIterator keys_last,
                                     ValueIterator values_first,
                                     Compare compare,
                                     command_queue &queue)
{
    merge_sort_by_key_on_gpu(
        keys_first, keys_last, values_first,
        compare, false /* not stable */, queue
    );
}

template<class Iterator, class Compare>
inline void merge_sort_on_gpu(Iterator first,
                              Iterator last,
                              Compare compare,
                              command_queue &queue)
{
    merge_sort_on_gpu(
        first, last, compare, false /* not stable */, queue
    );
}

} // end detail namespace
} // end compute namespace
} // end boost namespace

#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */
Initial Commit 2018-02-08 21:28:33 -05:00			`//---------------------------------------------------------------------------//`
			`// Copyright (c) 2016 Jakub Szuppe <j.szuppe@gmail.com>`
			`//`
			`// Distributed under the Boost Software License, Version 1.0`
			`// See accompanying file LICENSE_1_0.txt or copy at`
			`// http://www.boost.org/LICENSE_1_0.txt`
			`//`
			`// See http://boostorg.github.com/compute for more information.`
			`//---------------------------------------------------------------------------//`

			`#ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_`
			`#define BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_`

			`#include <algorithm>`

			`#include <boost/compute/kernel.hpp>`
			`#include <boost/compute/program.hpp>`
			`#include <boost/compute/command_queue.hpp>`
			`#include <boost/compute/container/vector.hpp>`
			`#include <boost/compute/memory/local_buffer.hpp>`
			`#include <boost/compute/detail/meta_kernel.hpp>`
			`#include <boost/compute/detail/iterator_range_size.hpp>`

			`namespace boost {`
			`namespace compute {`
			`namespace detail {`

			`template<class KeyType, class ValueType>`
			`inline size_t pick_bitonic_block_sort_block_size(size_t proposed_wg,`
			`size_t lmem_size,`
			`bool sort_by_key)`
			`{`
			`size_t n = proposed_wg;`

			`size_t lmem_required = n * sizeof(KeyType);`
			`if(sort_by_key) {`
			`lmem_required += n * sizeof(ValueType);`
			`}`

			`// try to force at least 4 work-groups of >64 elements`
			`// for better occupancy`
			`while(lmem_size < (lmem_required * 4) && (n > 64)) {`
			`n /= 2;`
			`lmem_required = n * sizeof(KeyType);`
			`}`
			`while(lmem_size < lmem_required && (n != 1)) {`
			`n /= 2;`
			`if(n < 1) n = 1;`
			`lmem_required = n * sizeof(KeyType);`
			`}`

			`if(n < 2) { return 1; }`
			`else if(n < 4) { return 2; }`
			`else if(n < 8) { return 4; }`
			`else if(n < 16) { return 8; }`
			`else if(n < 32) { return 16; }`
			`else if(n < 64) { return 32; }`
			`else if(n < 128) { return 64; }`
			`else if(n < 256) { return 128; }`
			`else { return 256; }`
			`}`


			`/// Performs bitonic block sort according to \p compare.`
			`///`
			`/// Since bitonic sort can be only performed when input size is equal to 2^n,`
			`/// in this case input size is block size (\p work_group_size), we would have`
			`/// to require \p count be a exact multiple of block size. That would not be`
			`/// great.`
			`/// Instead, bitonic sort kernel is merged with odd-even merge sort so if the`
			`/// last block is not equal to 2^n (where n is some natural number) the odd-even`
			`/// sort is performed for that block. That way bitonic_block_sort() works for`
			`/// input of any size. Block size (\p work_group_size) still have to be equal`
			`/// to 2^n.`
			`///`
			`/// This is NOT stable.`
			`///`
			`/// \param keys_first first key element in the range to sort`
			`/// \param values_first first value element in the range to sort`
			`/// \param compare comparison function for keys`
			`/// \param count number of elements in the range; count > 0`
			`/// \param work_group_size size of the work group, also the block size; must be`
			`/// equal to n^2 where n is natural number`
			`/// \param queue command queue to perform the operation`
			`template<class KeyIterator, class ValueIterator, class Compare>`
			`inline size_t bitonic_block_sort(KeyIterator keys_first,`
			`ValueIterator values_first,`
			`Compare compare,`
			`const size_t count,`
			`const bool sort_by_key,`
			`command_queue &queue)`
			`{`
			`typedef typename std::iterator_traits<KeyIterator>::value_type key_type;`

			`meta_kernel k("bitonic_block_sort");`
			`size_t count_arg = k.add_arg<const uint_>("count");`

			`size_t local_keys_arg = k.add_arg<key_type *>(memory_object::local_memory, "lkeys");`
			`size_t local_vals_arg = 0;`
			`if(sort_by_key) {`
			`local_vals_arg = k.add_arg<uchar_ *>(memory_object::local_memory, "lidx");`
			`}`

			`k <<`
			`// Work item global and local ids`
			`k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<`
			`k.decl<const uint_>("lid") << " = get_local_id(0);\n";`

			`// declare my_key and my_value`
			`k <<`
			`k.decl<key_type>("my_key") << ";\n";`
			`// Instead of copying values (my_value) in local memory with keys`
			`// we save local index (uchar) and copy my_value at the end at`
			`// final index. This saves local memory.`
			`if(sort_by_key)`
			`{`
			`k <<`
			`k.decl<uchar_>("my_index") << " = (uchar)(lid);\n";`
			`}`

			`// load key`
			`k <<`
			`"if(gid < count) {\n" <<`
			`k.var<key_type>("my_key") << " = " <<`
			`keys_first[k.var<const uint_>("gid")] << ";\n" <<`
			`"}\n";`

			`// load key and index to local memory`
			`k <<`
			`"lkeys[lid] = my_key;\n";`
			`if(sort_by_key)`
			`{`
			`k <<`
			`"lidx[lid] = my_index;\n";`
			`}`
			`k <<`
			`k.decl<const uint_>("offset") << " = get_group_id(0) * get_local_size(0);\n" <<`
			`k.decl<const uint_>("n") << " = min((uint)(get_local_size(0)),(count - offset));\n";`

			`// When work group size is a power of 2 bitonic sorter can be used;`
			`// otherwise, slower odd-even sort is used.`

			`k <<`
			`// check if n is power of 2`
			`"if(((n != 0) && ((n & (~n + 1)) == n))) {\n";`

			`// bitonic sort, not stable`
			`k <<`
			`// wait for keys and vals to be stored in local memory`
			`"barrier(CLK_LOCAL_MEM_FENCE);\n" <<`

			`"#pragma unroll\n" <<`
			`"for(" <<`
			`k.decl<uint_>("length") << " = 1; " <<`
			`"length < n; " <<`
			`"length <<= 1" <<`
			`") {\n" <<`
			`// direction of sort: false -> asc, true -> desc`
			`k.decl<bool>("direction") << "= ((lid & (length<<1)) != 0);\n" <<`
			`"for(" <<`
			`k.decl<uint_>("k") << " = length; " <<`
			`"k > 0; " <<`
			`"k >>= 1" <<`
			`") {\n" <<`

			`// sibling to compare with my key`
			`k.decl<uint_>("sibling_idx") << " = lid ^ k;\n" <<`
			`k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<`
			`k.decl<bool>("compare") << " = " <<`
			`compare(k.var<key_type>("sibling_key"),`
			`k.var<key_type>("my_key")) << ";\n" <<`
			`k.decl<bool>("swap") <<`
			`" = compare ^ (sibling_idx < lid) ^ direction;\n" <<`
			`"my_key = swap ? sibling_key : my_key;\n";`
			`if(sort_by_key)`
			`{`
			`k <<`
			`"my_index = swap ? lidx[sibling_idx] : my_index;\n";`
			`}`
			`k <<`
			`"barrier(CLK_LOCAL_MEM_FENCE);\n" <<`
			`"lkeys[lid] = my_key;\n";`
			`if(sort_by_key)`
			`{`
			`k <<`
			`"lidx[lid] = my_index;\n";`
			`}`
			`k <<`
			`"barrier(CLK_LOCAL_MEM_FENCE);\n" <<`
			`"}\n" <<`
			`"}\n";`

			`// end of bitonic sort`

			`// odd-even sort, not stable`
			`k <<`
			`"}\n" <<`
			`"else { \n";`

			`k <<`
			`k.decl<bool>("lid_is_even") << " = (lid%2) == 0;\n" <<`
			`k.decl<uint_>("oddsibling_idx") << " = " <<`
			`"(lid_is_even) ? max(lid,(uint)(1)) - 1 : min(lid+1,n-1);\n" <<`
			`k.decl<uint_>("evensibling_idx") << " = " <<`
			`"(lid_is_even) ? min(lid+1,n-1) : max(lid,(uint)(1)) - 1;\n" <<`

			`// wait for keys and vals to be stored in local memory`
			`"barrier(CLK_LOCAL_MEM_FENCE);\n" <<`

			`"#pragma unroll\n" <<`
			`"for(" <<`
			`k.decl<uint_>("i") << " = 0; " <<`
			`"i < n; " <<`
			`"i++" <<`
			`") {\n" <<`
			`k.decl<uint_>("sibling_idx") <<`
			`" = i%2 == 0 ? evensibling_idx : oddsibling_idx;\n" <<`
			`k.decl<key_type>("sibling_key") << " = lkeys[sibling_idx];\n" <<`
			`k.decl<bool>("compare") << " = " <<`
			`compare(k.var<key_type>("sibling_key"),`
			`k.var<key_type>("my_key")) << ";\n" <<`
			`k.decl<bool>("swap") <<`
			`" = compare ^ (sibling_idx < lid);\n" <<`
			`"my_key = swap ? sibling_key : my_key;\n";`
			`if(sort_by_key)`
			`{`
			`k <<`
			`"my_index = swap ? lidx[sibling_idx] : my_index;\n";`
			`}`
			`k <<`
			`"barrier(CLK_LOCAL_MEM_FENCE);\n" <<`
			`"lkeys[lid] = my_key;\n";`
			`if(sort_by_key)`
			`{`
			`k <<`
			`"lidx[lid] = my_index;\n";`
			`}`
			`k <<`
			`"barrier(CLK_LOCAL_MEM_FENCE);\n"`
			`"}\n" << // for`

			`"}\n"; // else`
			`// end of odd-even sort`

			`// save key and value`
			`k <<`
			`"if(gid < count) {\n" <<`
			`keys_first[k.var<const uint_>("gid")] << " = " <<`
			`k.var<key_type>("my_key") << ";\n";`
			`if(sort_by_key)`
			`{`
			`k << values_first[k.var<const uint_>("gid")] << " = " <<`
			`values_first[k.var<const uint_>("offset + my_index")] << ";\n";`
			`}`
			`k <<`
			`// end if`
			`"}\n";`

			`const context &context = queue.get_context();`
			`const device &device = queue.get_device();`
			`::boost::compute::kernel kernel = k.compile(context);`

			`const size_t work_group_size =`
			`pick_bitonic_block_sort_block_size<key_type, uchar_>(`
			`kernel.get_work_group_info<size_t>(`
			`device, CL_KERNEL_WORK_GROUP_SIZE`
			`),`
			`device.get_info<size_t>(CL_DEVICE_LOCAL_MEM_SIZE),`
			`sort_by_key`
			`);`

			`const size_t global_size =`
			`work_group_size * static_cast<size_t>(`
			`std::ceil(float(count) / work_group_size)`
			`);`

			`kernel.set_arg(count_arg, static_cast<uint_>(count));`
			`kernel.set_arg(local_keys_arg, local_buffer<key_type>(work_group_size));`
			`if(sort_by_key) {`
			`kernel.set_arg(local_vals_arg, local_buffer<uchar_>(work_group_size));`
			`}`

			`queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);`
			`// return size of the block`
			`return work_group_size;`
			`}`

			`template<class KeyIterator, class ValueIterator, class Compare>`
			`inline size_t block_sort(KeyIterator keys_first,`
			`ValueIterator values_first,`
			`Compare compare,`
			`const size_t count,`
			`const bool sort_by_key,`
			`const bool stable,`
			`command_queue &queue)`
			`{`
			`if(stable) {`
			`// TODO: Implement stable block sort (stable odd-even merge sort)`
			`return size_t(1);`
			`}`
			`return bitonic_block_sort(`
			`keys_first, values_first,`
			`compare, count,`
			`sort_by_key, queue`
			`);`
			`}`

			`/// space: O(n + m); n - number of keys, m - number of values`
			`template<class KeyIterator, class ValueIterator, class Compare>`
			`inline void merge_blocks_on_gpu(KeyIterator keys_first,`
			`ValueIterator values_first,`
			`KeyIterator out_keys_first,`
			`ValueIterator out_values_first,`
			`Compare compare,`
			`const size_t count,`
			`const size_t block_size,`
			`const bool sort_by_key,`
			`command_queue &queue)`
			`{`
			`typedef typename std::iterator_traits<KeyIterator>::value_type key_type;`
			`typedef typename std::iterator_traits<ValueIterator>::value_type value_type;`

			`meta_kernel k("merge_blocks");`
			`size_t count_arg = k.add_arg<const uint_>("count");`
			`size_t block_size_arg = k.add_arg<const uint_>("block_size");`

			`k <<`
			`// get global id`
			`k.decl<const uint_>("gid") << " = get_global_id(0);\n" <<`
			`"if(gid >= count) {\n" <<`
			`"return;\n" <<`
			`"}\n" <<`

			`k.decl<const key_type>("my_key") << " = " <<`
			`keys_first[k.var<const uint_>("gid")] << ";\n";`

			`if(sort_by_key) {`
			`k <<`
			`k.decl<const value_type>("my_value") << " = " <<`
			`values_first[k.var<const uint_>("gid")] << ";\n";`
			`}`

			`k <<`
			`// get my block idx`
			`k.decl<const uint_>("my_block_idx") << " = gid / block_size;\n" <<`
			`k.decl<const bool>("my_block_idx_is_odd") << " = " <<`
			`"my_block_idx & 0x1;\n" <<`

			`k.decl<const uint_>("other_block_idx") << " = " <<`
			`// if(my_block_idx is odd) {} else {}`
			`"my_block_idx_is_odd ? my_block_idx - 1 : my_block_idx + 1;\n" <<`

			`// get ranges of my block and the other block`
			`// [my_block_start; my_block_end)`
			`// [other_block_start; other_block_end)`
			`k.decl<const uint_>("my_block_start") << " = " <<`
			`"min(my_block_idx * block_size, count);\n" << // including`
			`k.decl<const uint_>("my_block_end") << " = " <<`
			`"min((my_block_idx + 1) * block_size, count);\n" << // excluding`

			`k.decl<const uint_>("other_block_start") << " = " <<`
			`"min(other_block_idx * block_size, count);\n" << // including`
			`k.decl<const uint_>("other_block_end") << " = " <<`
			`"min((other_block_idx + 1) * block_size, count);\n" << // excluding`

			`// other block is empty, nothing to merge here`
			`"if(other_block_start == count){\n" <<`
			`out_keys_first[k.var<uint_>("gid")] << " = my_key;\n";`
			`if(sort_by_key) {`
			`k <<`
			`out_values_first[k.var<uint_>("gid")] << " = my_value;\n";`
			`}`

			`k <<`
			`"return;\n" <<`
			`"}\n" <<`

			`// lower bound`
			`// left_idx - lower bound`
			`k.decl<uint_>("left_idx") << " = other_block_start;\n" <<`
			`k.decl<uint_>("right_idx") << " = other_block_end;\n" <<`
			`"while(left_idx < right_idx) {\n" <<`
			`k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<`
			`k.decl<key_type>("mid_key") << " = " <<`
			`keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<`
			`k.decl<bool>("smaller") << " = " <<`
			`compare(k.var<key_type>("mid_key"),`
			`k.var<key_type>("my_key")) << ";\n" <<`
			`"left_idx = smaller ? mid_idx + 1 : left_idx;\n" <<`
			`"right_idx = smaller ? right_idx : mid_idx;\n" <<`
			`"}\n" <<`
			`// left_idx is found position in other block`

			`// if my_block is odd we need to get the upper bound`
			`"right_idx = other_block_end;\n" <<`
			`"if(my_block_idx_is_odd && left_idx != right_idx) {\n" <<`
			`k.decl<key_type>("upper_key") << " = " <<`
			`keys_first[k.var<const uint_>("left_idx")] << ";\n" <<`
			`"while(" <<`
			`"!(" << compare(k.var<key_type>("upper_key"),`
			`k.var<key_type>("my_key")) <<`
			`") && " <<`
			`"!(" << compare(k.var<key_type>("my_key"),`
			`k.var<key_type>("upper_key")) <<`
			`") && " <<`
			`"left_idx < right_idx" <<`
			`")" <<`
			`"{\n" <<`
			`k.decl<uint_>("mid_idx") << " = (left_idx + right_idx) / 2;\n" <<`
			`k.decl<key_type>("mid_key") << " = " <<`
			`keys_first[k.var<const uint_>("mid_idx")] << ";\n" <<`
			`k.decl<bool>("equal") << " = " <<`
			`"!(" << compare(k.var<key_type>("mid_key"),`
			`k.var<key_type>("my_key")) <<`
			`") && " <<`
			`"!(" << compare(k.var<key_type>("my_key"),`
			`k.var<key_type>("mid_key")) <<`
			`");\n" <<`
			`"left_idx = equal ? mid_idx + 1 : left_idx + 1;\n" <<`
			`"right_idx = equal ? right_idx : mid_idx;\n" <<`
			`"upper_key = equal ? upper_key : " <<`
			`keys_first[k.var<const uint_>("left_idx")] << ";\n" <<`
			`"}\n" <<`
			`"}\n" <<`

			`k.decl<uint_>("offset") << " = 0;\n" <<`
			`"offset += gid - my_block_start;\n" <<`
			`"offset += left_idx - other_block_start;\n" <<`
			`"offset += min(my_block_start, other_block_start);\n" <<`
			`out_keys_first[k.var<uint_>("offset")] << " = my_key;\n";`
			`if(sort_by_key) {`
			`k <<`
			`out_values_first[k.var<uint_>("offset")] << " = my_value;\n";`
			`}`

			`const context &context = queue.get_context();`
			`::boost::compute::kernel kernel = k.compile(context);`

			`const size_t work_group_size = (std::min)(`
			`size_t(256),`
			`kernel.get_work_group_info<size_t>(`
			`queue.get_device(), CL_KERNEL_WORK_GROUP_SIZE`
			`)`
			`);`
			`const size_t global_size =`
			`work_group_size * static_cast<size_t>(`
			`std::ceil(float(count) / work_group_size)`
			`);`

			`kernel.set_arg(count_arg, static_cast<uint_>(count));`
			`kernel.set_arg(block_size_arg, static_cast<uint_>(block_size));`
			`queue.enqueue_1d_range_kernel(kernel, 0, global_size, work_group_size);`
			`}`

			`template<class KeyIterator, class ValueIterator, class Compare>`
			`inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,`
			`KeyIterator keys_last,`
			`ValueIterator values_first,`
			`Compare compare,`
			`bool stable,`
			`command_queue &queue)`
			`{`
			`typedef typename std::iterator_traits<KeyIterator>::value_type key_type;`
			`typedef typename std::iterator_traits<ValueIterator>::value_type value_type;`

			`size_t count = iterator_range_size(keys_first, keys_last);`
			`if(count < 2){`
			`return;`
			`}`

			`size_t block_size =`
			`block_sort(`
			`keys_first, values_first,`
			`compare, count,`
			`true /* sort_by_key /, stable / stable */,`
			`queue`
			`);`

			`// for small input size only block sort is performed`
			`if(count <= block_size) {`
			`return;`
			`}`

			`const context &context = queue.get_context();`

			`bool result_in_temporary_buffer = false;`
			`::boost::compute::vector<key_type> temp_keys(count, context);`
			`::boost::compute::vector<value_type> temp_values(count, context);`

			`for(; block_size < count; block_size *= 2) {`
			`result_in_temporary_buffer = !result_in_temporary_buffer;`
			`if(result_in_temporary_buffer) {`
			`merge_blocks_on_gpu(keys_first, values_first,`
			`temp_keys.begin(), temp_values.begin(),`
			`compare, count, block_size,`
			`true /* sort_by_key */, queue);`
			`} else {`
			`merge_blocks_on_gpu(temp_keys.begin(), temp_values.begin(),`
			`keys_first, values_first,`
			`compare, count, block_size,`
			`true /* sort_by_key */, queue);`
			`}`
			`}`

			`if(result_in_temporary_buffer) {`
			`copy_async(temp_keys.begin(), temp_keys.end(), keys_first, queue);`
			`copy_async(temp_values.begin(), temp_values.end(), values_first, queue);`
			`}`
			`}`

			`template<class Iterator, class Compare>`
			`inline void merge_sort_on_gpu(Iterator first,`
			`Iterator last,`
			`Compare compare,`
			`bool stable,`
			`command_queue &queue)`
			`{`
			`typedef typename std::iterator_traits<Iterator>::value_type key_type;`

			`size_t count = iterator_range_size(first, last);`
			`if(count < 2){`
			`return;`
			`}`

			`Iterator dummy;`
			`size_t block_size =`
			`block_sort(`
			`first, dummy,`
			`compare, count,`
			`false /* sort_by_key /, stable / stable */,`
			`queue`
			`);`

			`// for small input size only block sort is performed`
			`if(count <= block_size) {`
			`return;`
			`}`

			`const context &context = queue.get_context();`

			`bool result_in_temporary_buffer = false;`
			`::boost::compute::vector<key_type> temp_keys(count, context);`

			`for(; block_size < count; block_size *= 2) {`
			`result_in_temporary_buffer = !result_in_temporary_buffer;`
			`if(result_in_temporary_buffer) {`
			`merge_blocks_on_gpu(first, dummy, temp_keys.begin(), dummy,`
			`compare, count, block_size,`
			`false /* sort_by_key */, queue);`
			`} else {`
			`merge_blocks_on_gpu(temp_keys.begin(), dummy, first, dummy,`
			`compare, count, block_size,`
			`false /* sort_by_key */, queue);`
			`}`
			`}`

			`if(result_in_temporary_buffer) {`
			`copy_async(temp_keys.begin(), temp_keys.end(), first, queue);`
			`}`
			`}`

			`template<class KeyIterator, class ValueIterator, class Compare>`
			`inline void merge_sort_by_key_on_gpu(KeyIterator keys_first,`
			`KeyIterator keys_last,`
			`ValueIterator values_first,`
			`Compare compare,`
			`command_queue &queue)`
			`{`
			`merge_sort_by_key_on_gpu(`
			`keys_first, keys_last, values_first,`
			`compare, false /* not stable */, queue`
			`);`
			`}`

			`template<class Iterator, class Compare>`
			`inline void merge_sort_on_gpu(Iterator first,`
			`Iterator last,`
			`Compare compare,`
			`command_queue &queue)`
			`{`
			`merge_sort_on_gpu(`
			`first, last, compare, false /* not stable */, queue`
			`);`
			`}`

			`} // end detail namespace`
			`} // end compute namespace`
			`} // end boost namespace`

			`#endif /* BOOST_COMPUTE_ALGORITHM_DETAIL_MERGE_SORT_ON_GPU_HPP_ */`