Getting CUDA Thrust to use a CUDA stream of your choice -



Getting CUDA Thrust to use a CUDA stream of your choice -

looking @ kernel launches within code of cuda thrust, seems utilize default stream. can create thrust utilize stream of choice? missing in api?

i want update reply provided talonmies next release of thrust 1.8 introduces possibility of indicating cuda execution stream as

thrust::cuda::par.on(stream)

see also

thrust release 1.8.0.

in following, i'm recasting illustration in

false dependency issue fermi architecture

in terms of cuda thrust apis.

#include <iostream> #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include <thrust\device_vector.h> #include <thrust\execution_policy.h> #include "utilities.cuh" using namespace std; #define num_threads 32 #define num_blocks 16 #define num_streams 3 struct binaryop{ __host__ __device__ int operator()(const int& o1,const int& o2) { homecoming o1 * o2; } }; int main() { const int n = 6000000; // --- host side input info allocation , initialization. registering host memory page-locked (required asynch cudamemcpyasync). int *h_in = new int[n]; for(int = 0; < n; i++) h_in[i] = 5; gpuerrchk(cudahostregister(h_in, n * sizeof(int), cudahostregisterportable)); // --- host side input info allocation , initialization. registering host memory page-locked (required asynch cudamemcpyasync). int *h_out = new int[n]; for(int = 0; < n; i++) h_out[i] = 0; gpuerrchk(cudahostregister(h_out, n * sizeof(int), cudahostregisterportable)); // --- host side check results vector allocation , initialization int *h_checkresults = new int[n]; for(int = 0; < n; i++) h_checkresults[i] = h_in[i] * h_in[i]; // --- device side input info allocation. int *d_in = 0; gpuerrchk(cudamalloc((void **)&d_in, n * sizeof(int))); // --- device side output info allocation. int *d_out = 0; gpuerrchk( cudamalloc((void **)&d_out, n * sizeof(int))); int streamsize = n / num_streams; size_t streammemsize = n * sizeof(int) / num_streams; // --- set kernel launch configuration dim3 nthreads = dim3(num_threads,1,1); dim3 nblocks = dim3(num_blocks, 1,1); dim3 subkernelblock = dim3((int)ceil((float)nblocks.x / 2)); // --- create cuda streams cudastream_t streams[num_streams]; for(int = 0; < num_streams; i++) gpuerrchk(cudastreamcreate(&streams[i])); /**************************/ /* breadth-first approach */ /**************************/ for(int = 0; < num_streams; i++) { int offset = * streamsize; cudamemcpyasync(&d_in[offset], &h_in[offset], streammemsize, cudamemcpyhosttodevice, streams[i]); } for(int = 0; < num_streams; i++) { int offset = * streamsize; thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamsize/2, thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), binaryop()); thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamsize/2]), thrust::device_pointer_cast(&d_in[offset + streamsize/2]) + streamsize/2, thrust::device_pointer_cast(&d_in[offset + streamsize/2]), thrust::device_pointer_cast(&d_out[offset + streamsize/2]), binaryop()); } for(int = 0; < num_streams; i++) { int offset = * streamsize; cudamemcpyasync(&h_out[offset], &d_out[offset], streammemsize, cudamemcpydevicetohost, streams[i]); } for(int = 0; < num_streams; i++) gpuerrchk(cudastreamsynchronize(streams[i])); gpuerrchk(cudadevicesynchronize()); // --- release resources gpuerrchk(cudahostunregister(h_in)); gpuerrchk(cudahostunregister(h_out)); gpuerrchk(cudafree(d_in)); gpuerrchk(cudafree(d_out)); for(int = 0; < num_streams; i++) gpuerrchk(cudastreamdestroy(streams[i])); cudadevicereset(); // --- gpu output check int sum = 0; for(int = 0; < n; i++) { //printf("%i %i\n", h_out[i], h_checkresults[i]); sum += h_checkresults[i] - h_out[i]; } cout << "error between cpu , gpu: " << sum << endl; delete[] h_in; delete[] h_out; delete[] h_checkresults; homecoming 0; }

the utilities.cu , utilities.cuh files needed run such illustration maintained @ github page.

the visual profiler timeline shows concurrency of cuda thrust operations , memory transfers

cuda thrust

Comments

Popular posts from this blog

model view controller - MVC Rails Planning -

ruby on rails - Devise Logout Error in RoR -

html - Submenu setup with jquery and effect 'fold' -