我手头有 @ms 给出的如下代码:
#include <thrust/device_vector.h>
#include <thrust/scan.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <iostream>
// Unary functor that clamps negative values to zero and returns every other
// value unchanged. It is callable from both host and device code.
//
// operator() is const-qualified so the functor can also be invoked through a
// const object or a const reference; it holds no state, so nothing else changes.
struct omit_negative : public thrust::unary_function<int, int>
{
    // value: the element to clamp.
    // Returns 0 when value is negative, otherwise value itself.
    __host__ __device__
    int operator()(int value) const
    {
        return (value < 0) ? 0 : value;
    }
};
// Demo program: prints the input, the inclusive scan of the input with
// negative entries treated as 0 (via omit_negative), and finally the scan
// result after the negative input entries have been written back over it.
int main()
{
    typedef thrust::device_vector<int> DeviceInts;
    typedef std::ostream_iterator<int> IntWriter;

    int host_values[] = {2,1,-1,3,-1,2};
    const int value_count = sizeof(host_values) / sizeof(host_values[0]);

    // d_input holds the data, d_output receives the scan result.
    DeviceInts d_input(host_values, host_values + value_count);
    DeviceInts d_output(value_count);

    std::cout << "input data" << std::endl;
    thrust::copy(d_input.begin(), d_input.end(), IntWriter(std::cout, " "));

    // Scan over a transformed view of the input: omit_negative maps every
    // negative element to 0 on the fly, the stored input is left untouched.
    omit_negative clamp_op;
    thrust::inclusive_scan(thrust::make_transform_iterator(d_input.begin(), clamp_op),
                           thrust::make_transform_iterator(d_input.end(), clamp_op),
                           d_output.begin());

    std::cout << std::endl << "after inclusive_scan" << std::endl;
    thrust::copy(d_output.begin(), d_output.end(), IntWriter(std::cout, " "));

    // For every index i whose input element is negative (stencil = input,
    // predicate _1 < 0), copy that input element to d_output[i].
    using namespace thrust::placeholders;
    thrust::scatter_if(d_input.begin(),
                       d_input.end(),
                       thrust::make_counting_iterator(0),
                       d_input.begin(),
                       d_output.begin(),
                       _1 < 0);

    std::cout << std::endl << "after scatter_if" << std::endl;
    thrust::copy(d_output.begin(), d_output.end(), IntWriter(std::cout, " "));
    std::cout << std::endl;
}
这段代码来自我之前提出的一个问题。
我不熟悉 Thrust,但现在我打算放弃自己编写代码的想法,转而使用 Thrust。我修改了算法:用 0 代替 -1(因此不再需要 make_transform_iterator)。此外,您的示例是从主机数组创建数据的;而实际上我的数据已经存放在设备端数组中,我希望直接使用它(而不是 device_vector),以避免分配多余的内存和额外的内存拷贝(拷贝要花时间,而尽量缩短运行时间正是我的目标)。我不知道如何用设备数组代替 device_vector。这是我写的代码:
// Asker's first attempt (fragment, not compilable as-is): scan a raw
// cudaMalloc'ed buffer in place, then scatter into a separate device_vector.
int* dev_l_set = 0;
cudaMalloc((void**)&dev_l_set, actualVerticesRowCount * sizeof(int));
...prepare array in kernel...
thrust::device_vector<int> d_result(actualVerticesRowCount);
// NOTE(review): dev_l_set is passed as a raw int*. The answer further down
// wraps the pointer in thrust::device_ptr so Thrust knows the memory is on the GPU.
thrust::inclusive_scan(dev_l_set, dev_l_set + actualVerticesRowCount, dev_l_set);
using namespace thrust::placeholders;
// NOTE(review): the stencil is dev_l_set after the in-place scan above, so
// `_1 <= 0` tests the scanned values, not the original input.
thrust::scatter_if(dev_l_set, dev_l_set + actualVerticesRowCount, thrust::make_counting_iterator(0), dev_l_set, d_result.begin(), _1 <= 0);
// The original buffer is released and dev_l_set is re-pointed at d_result's storage.
// NOTE(review): that storage belongs to d_result; presumably the pointer is only
// usable while d_result is alive - confirm against the surrounding code.
cudaFree(dev_l_set);
dev_l_set = thrust::raw_pointer_cast(d_result.data());
我无法把 device_vector 强制转换为 int*,但我希望把扫描结果存回最初的 dev_l_set 数组。此外,如果能原地(in place)完成就更好了——在 scatter_if 中是否必须使用 d_result?
实际输入(存储在设备端的 int* 中),示例:
dev_l_set[0] = 0
dev_l_set[1] = 2
dev_l_set[2] = 0
dev_l_set[3] = 3
dev_l_set[4] = 0
dev_l_set[5] = 1
对应上述输入的期望输出:
dev_l_set[0] = 0
dev_l_set[1] = 2
dev_l_set[2] = 0
dev_l_set[3] = 5
dev_l_set[4] = 0
dev_l_set[5] = 6
dev_l_set 一开始存放输入,然后原地进行扫描,最终存放输出。
代码大概是这样的:
// Asker's sketch of the desired fully in-place version (fragment, not
// compilable as-is): dev_l_set is input, scan target and scatter target at once.
int* dev_l_set = 0;
cudaMalloc((void**)&dev_l_set, actualVerticesRowCount * sizeof(int));
...prepare array in kernel... (see input data)
// NOTE(review): dev_l_set is passed as a raw int*. The answer further down
// wraps the pointer in thrust::device_ptr so Thrust knows the memory is on the GPU.
thrust::inclusive_scan(dev_l_set, dev_l_set + actualVerticesRowCount, dev_l_set);
using namespace thrust::placeholders;
// NOTE(review): source, stencil and destination are all dev_l_set, and the
// stencil holds the scanned values at this point, not the original input.
thrust::scatter_if(dev_l_set, dev_l_set + actualVerticesRowCount, thrust::make_counting_iterator(0), dev_l_set, dev_l_set, _1 <= 0);
我的 CUDA 版本(也是应用需要支持的最低版本)是 5.5(Tesla M2070),遗憾的是我不能使用 C++11。
您可以原地完成 inclusive scan 和 scatter 这两个步骤,而不需要额外的结果向量。
下面的示例直接使用原始设备指针中的数据,无需 thrust::device_vector。inclusive scan 之后,原先为 0 的元素会被恢复为 0。
正如 @JaredHoberock 指出的,不应依赖位于 thrust::detail 中的代码。因此我编辑了答案,把 thrust::detail::head_flags 的相关代码直接复制到了这个示例中。
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/device_ptr.h>
#include <thrust/iterator/constant_iterator.h>
#include <iostream>
// the following code is copied from <thrust/detail/range/head_flags.h>
#include <thrust/detail/config.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/tuple.h>
#include <thrust/functional.h>
// Lazily evaluated range of "head flags" over [first, last).
//
// Element i of the range is ValueType(true) when i == 0 or when
// binary_pred(first[i], first[i - 1]) is false, i.e. when element i starts a
// new run of equal values; otherwise it is ValueType(false). Nothing is
// stored: each flag is computed on dereference by a transform_iterator over a
// zip of (index, first + i, first + i - 1).
template<typename RandomAccessIterator,
         typename BinaryPredicate = thrust::equal_to<typename thrust::iterator_value<RandomAccessIterator>::type>,
         typename ValueType = bool,
         typename IndexType = typename thrust::iterator_difference<RandomAccessIterator>::type>
class head_flags
{
  public:
    // Maps a (index, current element, previous element) tuple to its flag.
    struct head_flag_functor
    {
      BinaryPredicate binary_pred; // this must be the first member for performance reasons
      IndexType n;                 // length of the range; stored but not read by operator()

      typedef ValueType result_type;

      __host__ __device__
      head_flag_functor(IndexType n)
        : binary_pred(), n(n)
      {}

      __host__ __device__
      head_flag_functor(IndexType n, BinaryPredicate binary_pred)
        : binary_pred(binary_pred), n(n)
      {}

      template<typename Tuple>
      __host__ __device__ __thrust_forceinline__
      result_type operator()(const Tuple &t)
      {
        const IndexType position = thrust::get<0>(t);
        const bool at_front = (position == 0);

        // The short-circuit matters: at the front of the range the tuple's
        // third element refers to the slot before `first`, which must never
        // be read.
        return (at_front || !binary_pred(thrust::get<1>(t), thrust::get<2>(t)));
      }
    };

    typedef thrust::counting_iterator<IndexType> counting_iterator;

  public:
    typedef thrust::transform_iterator<
      head_flag_functor,
      thrust::zip_iterator<thrust::tuple<counting_iterator,RandomAccessIterator,RandomAccessIterator> >
    > iterator;

    // Flags for [first, last) using a default-constructed predicate.
    __host__ __device__
    head_flags(RandomAccessIterator first, RandomAccessIterator last)
      : m_begin(make_begin(first, head_flag_functor(last - first))),
        m_end(m_begin + (last - first))
    {}

    // Flags for [first, last) using the supplied predicate.
    __host__ __device__
    head_flags(RandomAccessIterator first, RandomAccessIterator last, BinaryPredicate binary_pred)
      : m_begin(make_begin(first, head_flag_functor(last - first, binary_pred))),
        m_end(m_begin + (last - first))
    {}

    __host__ __device__
    iterator begin() const
    {
      return m_begin;
    }

    __host__ __device__
    iterator end() const
    {
      return m_end;
    }

    // Flag of element i, computed on the fly.
    template<typename OtherIndex>
    __host__ __device__
    typename iterator::reference operator[](OtherIndex i)
    {
      return *(m_begin + i);
    }

  private:
    // Builds the iterator for position 0: indices start at 0, the "current"
    // sequence starts at `first` and the "previous" sequence one slot earlier.
    __host__ __device__
    static iterator make_begin(RandomAccessIterator first, head_flag_functor flag_of)
    {
      return thrust::make_transform_iterator(
               thrust::make_zip_iterator(
                 thrust::make_tuple(thrust::counting_iterator<IndexType>(0), first, first - 1)),
               flag_of);
    }

    // m_end is derived from m_begin in the constructors, so keep this order.
    iterator m_begin, m_end;
};
// Convenience factory: deduces the iterator type from its arguments and
// returns a head_flags range over [first, last) that uses the class's default
// predicate, value type and index type.
template<typename RandomAccessIterator>
__host__ __device__
head_flags<RandomAccessIterator>
make_head_flags(RandomAccessIterator first, RandomAccessIterator last)
{
  typedef head_flags<RandomAccessIterator> flags_type;

  flags_type flags(first, last);
  return flags;
}
// Reports a failed CUDA runtime call on std::cerr.
// status: return code of the call; what: short label used in the message.
// Returns true only when status is cudaSuccess.
static bool cuda_ok(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Demo program: runs an in-place inclusive scan on a raw device buffer and
// then resets to 0 every element that did not change the running sum (i.e.
// every element whose input value was 0). Prints the buffer after each step.
//
// Returns 0 on success and 1 when a CUDA runtime call fails.
int main()
{
    // copy data to device, this will be produced by your kernel
    int array[] = {0,2,0,3,0,1};
    const int array_size = sizeof(array)/sizeof(array[0]);

    int*  dev_l_set = 0;
    bool* dev_flags = 0;   // snapshot of the head flags, see below

    bool ok = cuda_ok(cudaMalloc((void**)&dev_l_set, array_size * sizeof(int)), "cudaMalloc(dev_l_set)")
           && cuda_ok(cudaMalloc((void**)&dev_flags, array_size * sizeof(bool)), "cudaMalloc(dev_flags)")
           && cuda_ok(cudaMemcpy(dev_l_set, array, array_size * sizeof(int), cudaMemcpyHostToDevice),
                      "cudaMemcpy(host to device)");

    // wrap raw pointers in thrust::device_ptr so thrust knows that this memory is located on the GPU
    thrust::device_ptr<int>  dev_ptr   = thrust::device_pointer_cast(dev_l_set);
    thrust::device_ptr<bool> flags_ptr = thrust::device_pointer_cast(dev_flags);

    if (ok)
    {
        thrust::inclusive_scan(dev_ptr,
                               dev_ptr+array_size,
                               dev_ptr);

        // copy result back to host for printing
        ok = cuda_ok(cudaMemcpy(array, dev_l_set, array_size * sizeof(int), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(scan result to host)");
    }

    if (ok)
    {
        std::cout << "after inclusive_scan" << std::endl;
        thrust::copy(array, array+array_size, std::ostream_iterator<int>(std::cout, " "));
        std::cout << std::endl;

        // The head flags are computed lazily from dev_ptr[i] and dev_ptr[i-1].
        // Feeding them straight into a scatter_if that writes to dev_ptr would
        // let one thread overwrite dev_ptr[i] while another still needs it to
        // compute flag i+1 (wrong result for runs of adjacent zeros), and it
        // violates scatter_if's requirement that the output does not overlap
        // the stencil. So materialize the flags first, then scatter.
        head_flags<thrust::device_ptr<int> > flags = make_head_flags(dev_ptr, dev_ptr+array_size);
        thrust::copy(flags.begin(), flags.end(), flags_ptr);

        // write 0 to every position whose flag is false
        using namespace thrust::placeholders;
        thrust::scatter_if(thrust::make_constant_iterator(0),
                           thrust::make_constant_iterator(0)+array_size,
                           thrust::make_counting_iterator(0),
                           flags_ptr,
                           dev_ptr,
                           !_1
                          );

        // copy result back to host for printing
        ok = cuda_ok(cudaMemcpy(array, dev_l_set, array_size * sizeof(int), cudaMemcpyDeviceToHost),
                     "cudaMemcpy(scatter result to host)");
    }

    if (ok)
    {
        std::cout << "after scatter_if" << std::endl;
        thrust::copy(array, array+array_size, std::ostream_iterator<int>(std::cout, " "));
        std::cout << std::endl;
    }

    // Release the device buffers on every path; cudaFree on a null pointer is a no-op.
    cudaFree(dev_flags);
    cudaFree(dev_l_set);

    return ok ? 0 : 1;
}
输出:
after inclusive_scan
0 2 2 5 5 6
after scatter_if
0 2 0 5 0 6