Simple Thrust code performs about half as fast as a naive CUDA kernel. Why?

Posted 2019-09-17 08:46

Question:

I'm pretty new to CUDA and Thrust, but my impression was that Thrust, when used well, is supposed to offer better performance than naively written CUDA kernels. Am I using Thrust in a sub-optimal way? Below is a complete, minimal example that takes an array u of length N+2 and, for each i between 1 and N, computes the average 0.5*(u[i-1] + u[i+1]) and puts the result in uNew[i]. (uNew[0] is set to u[0] and uNew[N+1] is set to u[N+1] so that the boundary terms don't change.) The code performs this averaging a large number of times so the timing tests give meaningful numbers. On my hardware, the Thrust computation takes roughly twice as long as the naive kernel. Is there a way to improve my Thrust code?

#include <iostream>
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/device_malloc.h>
#include <thrust/device_free.h>
#include <thrust/for_each.h>
#include <thrust/tuple.h>
#include <thrust/iterator/zip_iterator.h>
#include <boost/timer.hpp>

typedef double numtype;

// Functor for the interior update: reads the tuple (u[i-1], u[i+1], uNew[i])
// and writes the average of the two neighbors into uNew[i].
template <typename T> class NeighborAverageFunctor{
	int N;
public:
	NeighborAverageFunctor(int _N) : N(_N) {}

	template <typename Tuple>
	__host__ __device__ void operator()(Tuple t){
		T uL = thrust::get<0>(t);
		T uR = thrust::get<1>(t);

		thrust::get<2>(t) = 0.5*(uL + uR);
	}

	int getN(){
		return N;
	}
};

// One sweep: update the N interior points with a single zip-iterator for_each,
// then carry the boundary values over to the output buffer.
template <typename T> void thrust_sweep(thrust::device_ptr<T> u, thrust::device_ptr<T> uNew, NeighborAverageFunctor<T>& op){
	int N = op.getN();
	thrust::for_each(
		thrust::make_zip_iterator(thrust::make_tuple(u, u + 2, uNew + 1)),
		thrust::make_zip_iterator(thrust::make_tuple(u + N, u + N+2, uNew + N+1)),
		op);
	// Propagate boundary values without changing them
	uNew[0] = u[0];
	uNew[N+1] = u[N+1];
}


// Initialize all n+2 entries on the device: u[0] = 1.0, everything else 0.0
template <typename T> __global__ void initialization_kernel(int n, T* u){
	const int i = blockIdx.x * blockDim.x + threadIdx.x;
	if(i < n+2){
		if(i == 0){
			u[i] = 1.0;
		}
		else{
			u[i] = 0.0;
		}
	}
}

// Naive CUDA version of one sweep; expects n to be the total array length (N+2).
// hSquared is accepted but currently unused.
template <typename T> __global__ void sweep_kernel(int n, T hSquared, T* u, T* uNew){
	const int i = blockDim.x * blockIdx.x + threadIdx.x;
	if (i >= 1 && i < n-1){
		uNew[i] = 0.5*(u[i+1] + u[i-1]);
	}
	else if(i == 0 || i == n-1){
		// Copy the two boundary values through unchanged
		uNew[i] = u[i];
	}
}

int main(void){
	int sweeps = 2000;
	int N = 4096*2048;
	numtype h = 1.0/N;
	numtype hSquared = h*h;	// passed to sweep_kernel, but unused by the averaging

	NeighborAverageFunctor<numtype> op(N);

	thrust::device_ptr<numtype> u_d = thrust::device_malloc<numtype>(N+2);
	thrust::device_ptr<numtype> uNew_d = thrust::device_malloc<numtype>(N+2);
	thrust::device_ptr<numtype> uTemp_d;

	thrust::fill(u_d, u_d + (N+2), 0.0);
	u_d[0] = 1.0;

	boost::timer timer1;	// <boost/timer.hpp> provides boost::timer, not boost::timer::timer

	for(int k = 0; k < sweeps; k++){
		thrust_sweep<numtype>(u_d, uNew_d, op);
		// Ping-pong the buffers: the freshly written values become the next input
		uTemp_d = u_d;
		u_d = uNew_d;
		uNew_d = uTemp_d;
	}

	cudaDeviceSynchronize();	// ensure all queued device work has finished before reading the timer
	double thrust_time = timer1.elapsed();

	thrust::host_vector<numtype> u_h(N+2);
	thrust::copy(u_d, u_d + N+2, u_h.begin());
	for(int i = 0; i < 10; i++){
		std::cout << u_h[i] << " ";
	}
	std::cout << std::endl;

	thrust::device_free(u_d);
	thrust::device_free(uNew_d);

	numtype * u_raw_d, * uNew_raw_d, * uTemp_raw_d;
	cudaMalloc(&u_raw_d, (N+2)*sizeof(numtype));
	cudaMalloc(&uNew_raw_d, (N+2)*sizeof(numtype));

	numtype * u_raw_h = (numtype*)malloc((N+2)*sizeof(numtype));

	int block_size = 256;
	int grid_size = ((N+2) + block_size - 1) / block_size;

	initialization_kernel<numtype><<<grid_size, block_size>>>(N, u_raw_d);

	boost::timer timer2;

	for(int k = 0; k < sweeps; k++){
		sweep_kernel<numtype><<<grid_size, block_size>>>(N+2, hSquared, u_raw_d, uNew_raw_d);
		uTemp_raw_d = u_raw_d;
		u_raw_d = uNew_raw_d;
		uNew_raw_d = uTemp_raw_d;
	}

	cudaDeviceSynchronize();	// kernel launches are asynchronous; wait for them before reading the timer
	double raw_time = timer2.elapsed();

	cudaMemcpy(u_raw_h, u_raw_d, (N+2)*sizeof(numtype), cudaMemcpyDeviceToHost);

	for(int i = 0; i < 10; i++){
		std::cout << u_raw_h[i] << " ";
	}
	std::cout << std::endl;

	std::cout << "Thrust: " << thrust_time << " s" << std::endl;
	std::cout << "Raw: " << raw_time << " s" << std::endl;

	free(u_raw_h);

	cudaFree(u_raw_d);
	cudaFree(uNew_raw_d);

	return 0;
}

Answer 1:

According to my testing, these lines:

uNew[0] = u[0];
uNew[N+1] = u[N+1];

are killing your thrust performance relative to the kernel method. When I eliminate them, the computed results don't appear to be any different. Compared to how your kernel handles the boundary cases, the thrust code is using a very expensive method (cudaMemcpy operations, under the hood) to perform the boundary handling.
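Roughly, each of those host-side assignments through a thrust::device_ptr dereferences device memory from the host. Per sweep, the implied traffic looks something like this (a sketch of the cost, not Thrust's literal implementation; u_raw and uNew_raw stand in for the raw pointers behind the device_ptrs):

numtype tmp;
// uNew[0] = u[0]; behaves roughly like:
cudaMemcpy(&tmp, u_raw, sizeof(numtype), cudaMemcpyDeviceToHost);	// blocking read of u[0]
cudaMemcpy(uNew_raw, &tmp, sizeof(numtype), cudaMemcpyHostToDevice);	// blocking write of uNew[0]
// ...and the same again for uNew[N+1] = u[N+1];

Two tiny, synchronous transfers per boundary on every one of the 2000 sweeps stall the device between launches, which swamps the cost of the for_each itself.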

Since your thrust functor never actually writes to the boundary positions, it should be sufficient to write these values only once, before the timing loop, rather than on every sweep.
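For instance, a minimal sketch of that restructuring (assuming a hypothetical thrust_sweep_interior that is your thrust_sweep with the two boundary assignments deleted):

// The boundaries never change, so write them to both buffers once, up front;
// after each swap, whichever buffer is current already holds the right values.
uNew_d[0] = u_d[0];
uNew_d[N+1] = u_d[N+1];

for(int k = 0; k < sweeps; k++){
	thrust_sweep_interior<numtype>(u_d, uNew_d, op);	// for_each over the interior only
	uTemp_d = u_d;
	u_d = uNew_d;
	uNew_d = uTemp_d;
}

This pays the device_ptr transfer cost twice in total rather than twice per sweep, so each iteration is a single for_each launch.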

You can speed up your thrust performance significantly by doing a better job of handling the boundary cases.