1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
|
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#define arraySize 1000
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, arraySize).
//
// Parameters (all three pointers are dereferenced on the device and must each
// reference at least arraySize ints):
//   c - output array
//   a - first input array
//   b - second input array
//
// Launch layout: 1D grid of 1D blocks. The loop below strides by the total
// number of launched threads, so any grid/block size covers every element
// exactly once; when launched as a single block of arraySize threads each
// thread handles exactly one element.
__global__ void addKernel( int *c, const int *a, const int *b )
{
    // Widen to size_t before multiplying so a large grid cannot overflow int.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    const size_t first  = (size_t)blockIdx.x * blockDim.x + threadIdx.x;

    // The loop condition doubles as the bounds check for surplus threads.
    for( size_t i = first ; i < (size_t)arraySize ; i += stride )
        c[i] = a[i] + b[i];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaSucceeded( cudaError_t status, const char *what )
{
    if( status == cudaSuccess )
        return true;
    fprintf( stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString( status ) );
    return false;
}

// Adds two host arrays of arraySize ints on the GPU and prints each sum.
//
// Steps: fill a[] and b[] on the host, allocate three device buffers, copy the
// inputs to the device, launch addKernel, copy the result back and print it.
// Every CUDA call is checked; on the first failure the remaining steps are
// skipped, the device buffers are released and the program exits with 1.
// Returns 0 on success.
int main()
{
    int a[arraySize];
    int b[arraySize];
    int c[arraySize];
    int *dev_a = 0;
    int *dev_b = 0;
    int *dev_c = 0;
    const size_t bytes = arraySize * sizeof(int);

    // fill the arrays 'a' and 'b' on the CPU
    for( int i = 0 ; i < arraySize ; i++ ) {
        a[i] = i;
        b[i] = i;
    }

    // Allocate GPU buffers for three vectors (two input, one output) and copy
    // the inputs over. The && chain stops at the first failing call.
    bool ok = cudaSucceeded( cudaMalloc( (void**)&dev_c, bytes ), "cudaMalloc(dev_c)" )
           && cudaSucceeded( cudaMalloc( (void**)&dev_a, bytes ), "cudaMalloc(dev_a)" )
           && cudaSucceeded( cudaMalloc( (void**)&dev_b, bytes ), "cudaMalloc(dev_b)" )
           && cudaSucceeded( cudaMemcpy( dev_a, a, bytes, cudaMemcpyHostToDevice ), "copy of 'a' to device" )
           && cudaSucceeded( cudaMemcpy( dev_b, b, bytes, cudaMemcpyHostToDevice ), "copy of 'b' to device" );

    if( ok ) {
        // Add vectors in parallel: a single block with one thread per element.
        addKernel<<<1, arraySize>>>( dev_c, dev_a, dev_b );

        // A kernel launch returns nothing: configuration errors are picked up
        // with cudaGetLastError(), execution errors at the synchronize.
        ok = cudaSucceeded( cudaGetLastError(), "addKernel launch" )
          && cudaSucceeded( cudaDeviceSynchronize(), "addKernel execution" )
          // copy the array 'c' back from the GPU to the CPU
          && cudaSucceeded( cudaMemcpy( c, dev_c, bytes, cudaMemcpyDeviceToHost ), "copy of 'c' to host" );
    }

    // display the results (only when 'c' actually holds them)
    if( ok ) {
        for( int i = 0 ; i < arraySize ; i++ ) {
            printf( "%d + %d = %d\n", a[i], b[i], c[i] );
        }
    }

    // free the memory allocated on the GPU; pointers that were never
    // allocated are still null, which cudaFree accepts as a no-op
    cudaFree( dev_c );
    cudaFree( dev_a );
    cudaFree( dev_b );

    return ok ? 0 : 1;
}
| |