Getting CUDA Thrust to use a CUDA stream of your choice -

looking @ kernel launches within code of cuda thrust, seems utilize default stream. can create thrust utilize stream of choice? missing in api?

i want update reply provided talonmies next release of thrust 1.8 introduces possibility of indicating cuda execution stream as

thrust::cuda::par.on(stream)

see also

thrust release 1.8.0.

in following, i'm recasting illustration in

false dependency issue fermi architecture

in terms of cuda thrust apis.

#include <iostream>  #include "cuda_runtime.h" #include "device_launch_parameters.h"  #include <stdio.h>  #include <thrust\device_vector.h> #include <thrust\execution_policy.h>  #include "utilities.cuh"  using namespace std;  #define num_threads 32 #define num_blocks 16 #define num_streams 3  struct binaryop{ __host__ __device__ int operator()(const int& o1,const int& o2) {  homecoming o1 * o2; } };  int main() {     const int n = 6000000;      // --- host side input   info allocation , initialization. registering host memory page-locked (required asynch cudamemcpyasync).     int *h_in = new int[n]; for(int = 0; < n; i++) h_in[i] = 5;     gpuerrchk(cudahostregister(h_in, n * sizeof(int), cudahostregisterportable));      // --- host side input   info allocation , initialization. registering host memory page-locked (required asynch cudamemcpyasync).     int *h_out = new int[n]; for(int = 0; < n; i++) h_out[i] = 0;     gpuerrchk(cudahostregister(h_out, n * sizeof(int), cudahostregisterportable));      // --- host side check results vector allocation , initialization     int *h_checkresults = new int[n]; for(int = 0; < n; i++) h_checkresults[i] = h_in[i] * h_in[i];      // --- device side input   info allocation.     int *d_in = 0;              gpuerrchk(cudamalloc((void **)&d_in, n * sizeof(int)));      // --- device side output   info allocation.      int *d_out = 0;             gpuerrchk( cudamalloc((void **)&d_out, n * sizeof(int)));      int streamsize = n / num_streams;     size_t streammemsize = n * sizeof(int) / num_streams;      // --- set kernel launch configuration     dim3 nthreads       = dim3(num_threads,1,1);     dim3 nblocks        = dim3(num_blocks, 1,1);     dim3 subkernelblock = dim3((int)ceil((float)nblocks.x / 2));      // --- create cuda streams     cudastream_t streams[num_streams];     for(int = 0; < num_streams; i++)         gpuerrchk(cudastreamcreate(&streams[i]));      /**************************/     /* breadth-first approach */     /**************************/      for(int = 0; < num_streams; i++) {         int offset = * streamsize;         cudamemcpyasync(&d_in[offset], &h_in[offset], streammemsize, cudamemcpyhosttodevice,     streams[i]);     }      for(int = 0; < num_streams; i++)     {         int offset = * streamsize;          thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_in[offset]) + streamsize/2,                                                              thrust::device_pointer_cast(&d_in[offset]), thrust::device_pointer_cast(&d_out[offset]), binaryop());         thrust::transform(thrust::cuda::par.on(streams[i]), thrust::device_pointer_cast(&d_in[offset + streamsize/2]), thrust::device_pointer_cast(&d_in[offset + streamsize/2]) + streamsize/2,                                                              thrust::device_pointer_cast(&d_in[offset + streamsize/2]), thrust::device_pointer_cast(&d_out[offset + streamsize/2]), binaryop());      }      for(int = 0; < num_streams; i++) {         int offset = * streamsize;         cudamemcpyasync(&h_out[offset], &d_out[offset], streammemsize, cudamemcpydevicetohost,   streams[i]);     }      for(int = 0; < num_streams; i++)         gpuerrchk(cudastreamsynchronize(streams[i]));      gpuerrchk(cudadevicesynchronize());      // --- release resources     gpuerrchk(cudahostunregister(h_in));     gpuerrchk(cudahostunregister(h_out));     gpuerrchk(cudafree(d_in));     gpuerrchk(cudafree(d_out));      for(int = 0; < num_streams; i++)         gpuerrchk(cudastreamdestroy(streams[i]));      cudadevicereset();        // --- gpu output check     int sum = 0;     for(int = 0; < n; i++) {              //printf("%i %i\n", h_out[i], h_checkresults[i]);         sum += h_checkresults[i] - h_out[i];     }      cout << "error between cpu , gpu: " << sum << endl;      delete[] h_in;     delete[] h_out;     delete[] h_checkresults;       homecoming 0; }

the utilities.cu , utilities.cuh files needed run such illustration maintained @ github page.

the visual profiler timeline shows concurrency of cuda thrust operations , memory transfers

cuda thrust

Search This Blog

Three

Getting CUDA Thrust to use a CUDA stream of your choice -

Comments

Post a Comment

Popular posts from this blog

model view controller - MVC Rails Planning -

html - Submenu setup with jquery and effect 'fold' -

ruby on rails - Devise Logout Error in RoR -