#include <stdio.h>
#include <cuda.h>
#include <arrayfire.h>
/*
 * divup(x, y): integer division of x by y, rounded up.
 *
 * Every argument and the whole expansion are parenthesized so the macro can
 * be used inside larger expressions (e.g. 2 * divup(a, b)) and with compound
 * arguments (e.g. divup(a, b + c)) without operator-precedence surprises.
 * Note: x and y are evaluated more than once, so avoid side effects in them.
 */
#define divup(x,y) (((x) % (y)) ? (((x) + (y) - 1) / (y)) : ((x) / (y)))
/*
 * CUDA(call): evaluate a CUDA runtime call exactly once and abort on failure.
 *
 * If the call returns anything other than cudaSuccess, a message of the form
 *   <file>:<line>: cuda error: <description> (<numeric code>)
 * is written to stderr and the process terminates via exit(-1).
 *
 * The file name is passed as a "%s" argument instead of being concatenated
 * into the format string, so a '%' in the source path cannot be interpreted
 * as a conversion specifier.
 */
#define CUDA(call) do { \
    cudaError_t cuda_status_ = (call); \
    if (cuda_status_ != cudaSuccess) { \
        fprintf(stderr, "%s:%d: cuda error: %s (%d)\n", \
                __FILE__, __LINE__, \
                cudaGetErrorString(cuda_status_), (int)cuda_status_); \
        exit(-1); \
    } \
} while (0)
/*
 * AF(call): evaluate an ArrayFire call exactly once and abort on failure.
 *
 * If the call returns anything other than AF_SUCCESS, a message of the form
 *   <file>:<line>: arrayfire error: <af_errstr() text> (<numeric code>)
 * is written to stderr and the process terminates via exit(-1).
 *
 * The file name is passed as a "%s" argument instead of being concatenated
 * into the format string, so a '%' in the source path cannot be interpreted
 * as a conversion specifier.
 */
#define AF(call) do { \
    afError af_status_ = (call); \
    if (af_status_ != AF_SUCCESS) { \
        fprintf(stderr, "%s:%d: arrayfire error: %s (%d)\n", \
                __FILE__, __LINE__, af_errstr(), (int)af_status_); \
        exit(-1); \
    } \
} while (0)
/*
 * test_inside: for each of the n points (d_x[i], d_y[i]), store 1.0f in
 * d_inside[i] if the point's distance from the origin is at most 1, and
 * 0.0f otherwise.
 *
 * Launch layout: 1D thread blocks (only threadIdx.x / blockDim.x are used)
 * arranged in a 1D or 2D grid; blockIdx.y and blockIdx.x are flattened
 * row-major into a single block id. The grid may contain more threads than
 * n; surplus threads exit without touching memory.
 *
 * Parameters:
 *   n        - number of points; nothing is read or written when n <= 0.
 *   d_inside - output, at least n floats, written by this kernel.
 *   d_x, d_y - inputs, at least n floats each, only read by this kernel.
 */
__global__
void test_inside(int n, float *d_inside, float *d_x, float *d_y)
{
    // Build the flat index in size_t: a grid padded past INT_MAX threads
    // would otherwise produce a negative int index that slips through the
    // bounds check and accesses memory out of range.
    const size_t block_id = (size_t)blockIdx.y * gridDim.x + blockIdx.x;
    const size_t idx = block_id * blockDim.x + threadIdx.x;

    // Guard n <= 0 explicitly so the unsigned comparison below stays valid.
    if (n <= 0 || idx >= (size_t)n) return;

    const float x = d_x[idx];
    const float y = d_y[idx];

    // Single-precision distance test; the result is stored as 0.0f / 1.0f
    // so the output buffer can be summed to count the hits.
    d_inside[idx] = (sqrtf(x * x + y * y) <= 1.0f) ? 1.0f : 0.0f;
}
/*
 * Monte Carlo estimate of pi.
 *
 * Fills two device buffers with n samples each via af_randu_S, runs
 * test_inside to flag the points within distance 1 of the origin, sums the
 * flags with af_sum_vector_S and prints 4 * hits / n.
 *
 * Any failing CUDA or ArrayFire call terminates the program through the
 * CUDA()/AF() macros. On WIN32 builds the program waits for [enter] before
 * exiting unless it was started with exactly one argument beginning with '-'.
 */
int main(int argc, char *argv[])
{
    int n = 2e6;
    printf("using %d samples\n", n);

    // One float per sample. size_t keeps the byte count from wrapping for
    // large n, and the same value is used for all three allocations.
    const size_t bytes = sizeof(float) * (size_t)n;

    float *d_x = NULL, *d_y = NULL, *d_inside = NULL;
    CUDA(cudaMalloc((void **)&d_x, bytes));
    CUDA(cudaMalloc((void **)&d_y, bytes));
    CUDA(cudaMalloc((void **)&d_inside, bytes));

    // NOTE(review): af_randu_S presumably fills each device buffer with n
    // uniform random floats -- confirm against the ArrayFire version in use.
    AF(af_randu_S(d_x, n));
    AF(af_randu_S(d_y, n));

    // One thread per sample, 256 threads per block. If the block count does
    // not fit in a single grid dimension (limit assumed here: 65535), spread
    // the blocks over a 2D grid; the kernel flattens it again and discards
    // the surplus threads.
    dim3 threads(256);
    dim3 blocks(divup(n, threads.x));
    if (blocks.x > 65535) {
        blocks.y = divup(blocks.x, 65535);
        blocks.x = divup(blocks.x, blocks.y);
    }
    test_inside<<<blocks, threads>>>(n, d_inside, d_x, d_y);

    // A kernel launch returns no status of its own: an invalid launch
    // configuration is only visible through cudaGetLastError(), so check it
    // in every build, not just in DEBUG.
    CUDA(cudaGetLastError());
#ifdef DEBUG // For debugging purposes
    // Wait for the kernel so that execution faults are reported here rather
    // than at some later, unrelated call.
    CUDA(cudaDeviceSynchronize());
#endif

    // NOTE(review): af_sum_vector_S presumably reduces the n device floats
    // into the host-side result -- confirm against the ArrayFire version.
    float h_result = 0.0f;
    AF(af_sum_vector_S(&h_result, n, d_inside));

    // hits / n approximates the area ratio pi/4 when the samples are uniform
    // over the unit square (see the af_randu_S note above).
    float pi = 4.0f * h_result / n;
    printf("pi %f\n", pi);

    CUDA(cudaFree(d_x));
    CUDA(cudaFree(d_y));
    CUDA(cudaFree(d_inside));

#ifdef WIN32 // pause in Windows
    if (!(argc == 2 && argv[1][0] == '-')) {
        printf("hit [enter]...");
        getchar();
    }
#endif
    return 0;
}