// Monte Carlo estimation of pi

#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <err.h>
#include <cuda.h>
#include <jacket.h>

// Evaluate a CUDA runtime call once and, if it did not return cudaSuccess,
// terminate the program through errx() reporting the source location, the
// error string and the numeric error code.  The do/while(0) wrapper makes the
// macro usable as a single statement.
// The file name is passed as a "%s" argument instead of being pasted into the
// format string, so a path containing '%' cannot corrupt the format.
#define CUDA(call)                                      \
    do {                                                \
        cudaError_t _e = (call);                        \
        if (_e != cudaSuccess)                          \
            errx(-1, "%s:%d: cuda failure: %s (%d)",    \
                 __FILE__, __LINE__,                    \
                 cudaGetErrorString(_e), (int)_e);      \
    } while (0)

// Evaluate a Jacket library call once and, if it did not return JKT_SUCCESS,
// terminate the program through errx() reporting the source location, the
// jkt_strerror() text and the numeric error code.  The do/while(0) wrapper
// makes the macro usable as a single statement.
// The file name is passed as a "%s" argument instead of being pasted into the
// format string, so a path containing '%' cannot corrupt the format.
#define JKT(call)                                           \
    do {                                                    \
        jktError_t _e = (call);                             \
        if (_e != JKT_SUCCESS)                              \
            errx(-1, "%s:%d: jacket failure: %s (%d)",      \
                 __FILE__, __LINE__,                        \
                 jkt_strerror(_e), (int)_e);                \
    } while (0)





// Flags which sample points lie inside the closed unit disc.
//
//   n        - number of points
//   d_inside - output, n floats: 1.0f when x^2 + y^2 <= 1, otherwise 0.0f
//   d_x, d_y - input coordinates, n floats each (read only)
//
// Launch layout: 1-D grid of 1-D blocks providing at least n threads in
// total, one thread per point; surplus threads return immediately.
__global__
void test_inside(int n, float *d_inside, const float *d_x, const float *d_y)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= n) return;

    float x = d_x[i];
    float y = d_y[i];
    // Compare the squared distance directly.  The square root is monotonic so
    // it is not needed, and rounding its result can pull a point that lies
    // just outside the circle back to exactly 1, miscounting it as inside.
    d_inside[i] = (x*x + y*y <= 1.0f) ? 1.0f : 0.0f;
}

// Entry point: estimates pi by Monte Carlo sampling on the GPU.
//
// Usage: <program> <trials>
//   trials - number of random (x, y) points to draw; must be in [1, INT_MAX].
//
// Two device buffers are populated through jkt_grand_f32, test_inside flags
// the points that fall inside the unit circle, jkt_sum_vector_f32f32 adds up
// the flags, and pi is approximated as 4 * inside / trials.  An invalid
// argument or any CUDA / Jacket failure terminates the program via errx().
int main(int argc, char *args[])
{
    if (argc != 2)
        errx(-1, "usage: %s <trials>", args[0]);

    // Parse strictly: atoi() turns garbage into 0 and has undefined behaviour
    // on overflow, and a zero or negative count would produce an empty kernel
    // launch and a 0/0 division below.
    char *end = NULL;
    errno = 0;
    long trials = strtol(args[1], &end, 10);
    if (errno != 0 || end == args[1] || *end != '\0' ||
        trials <= 0 || trials > INT_MAX)
        errx(-1, "trials must be an integer in [1, %d], got '%s'",
             INT_MAX, args[1]);
    int n = (int)trials; // number of trials

    int bytes = -1;
    JKT(jkt_grand_f32(NULL, n, &bytes)); // determine required allocation

    // random scattering of (x,y) points
    float *d_x, *d_y;
    CUDA(cudaMalloc((void **)&d_x, bytes));
    CUDA(cudaMalloc((void **)&d_y, bytes));
    JKT(jkt_grand_f32(d_x,  n, &bytes));
    JKT(jkt_grand_f32(d_y,  n, &bytes));

    // test if inside unit circle
    float *d_inside;
    CUDA(cudaMalloc((void **)&d_inside, n * sizeof(float)));
    int threads = 256;
    // Ceiling division written so that it cannot overflow int for large n.
    int blocks = n / threads + (n % threads != 0);
    test_inside<<<blocks,threads>>>(n, d_inside, d_x, d_y);
    CUDA(cudaGetLastError());      // launch errors are not returned by <<<>>>
    CUDA(cudaDeviceSynchronize()); // surface execution errors before reducing

    // count how many fell inside
    float h_result;
    JKT(jkt_sum_vector_f32f32(&h_result, d_inside, n));

    // approximate PI
    float pi = 4.0f * h_result / n;
    printf("pi %f\n", pi);

    CUDA(cudaFree(d_x));
    CUDA(cudaFree(d_y));
    CUDA(cudaFree(d_inside));

    return 0;
}
