|
| 1 | +#include <stdio.h> |
| 2 | +#include <stdlib.h> |
| 3 | + |
| 4 | +#include "kernel_float.h" |
| 5 | + |
| 6 | +#define CUDA_CHECK(call) \ |
| 7 | + do { \ |
| 8 | + cudaError_t __err = call; \ |
| 9 | + if (__err != cudaSuccess) { \ |
| 10 | + fprintf( \ |
| 11 | + stderr, \ |
| 12 | + "CUDA error at %s:%d code=%d(%s) \"%s\" \n", \ |
| 13 | + __FILE__, \ |
| 14 | + __LINE__, \ |
| 15 | + __err, \ |
| 16 | + cudaGetErrorString(__err), \ |
| 17 | + #call); \ |
| 18 | + exit(EXIT_FAILURE); \ |
| 19 | + } \ |
| 20 | + } while (0) |
| 21 | + |
| 22 | +// Alias `kernel_float` as `kf` |
| 23 | +namespace kf = kernel_float; |
| 24 | + |
| 25 | +// Define the float type and vector size |
| 26 | +using float_type = float; |
| 27 | +static constexpr int VECTOR_SIZE = 4; |
| 28 | + |
| 29 | +__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) { |
| 30 | + // Calculate the global x and y indices for this thread within the grid |
| 31 | + int thread_x = blockIdx.x * blockDim.x + threadIdx.x; |
| 32 | + int thread_y = blockIdx.y * blockDim.y + threadIdx.y; |
| 33 | + |
| 34 | + // Calculate the x and y coordinates as integers. |
| 35 | + // The x coordinates are: [thread_x * VECTOR_SIZE, thread_x * VECTOR_SIZE + 1, ...] |
| 36 | + // The y coordinates are: [thread_y, thread_y, ...] |
| 37 | + kf::vec<int, VECTOR_SIZE> xi = thread_x * VECTOR_SIZE + kf::range<int, VECTOR_SIZE>(); |
| 38 | + kf::vec<int, VECTOR_SIZE> yi = thread_y; |
| 39 | + |
| 40 | + // Normalize the integers to values between 0 and 1. |
| 41 | + kf::vec<float_type, VECTOR_SIZE> xf = kf::cast<float_type>(xi) / float_type(nx); |
| 42 | + kf::vec<float_type, VECTOR_SIZE> yf = kf::cast<float_type>(yi) / float_type(ny); |
| 43 | + |
| 44 | + // Compute the squared distance to the origin and then take the |
| 45 | + // square root to get the distance to the origin. |
| 46 | + kf::vec<float_type, VECTOR_SIZE> dist_squared = xf * xf + yf * yf; |
| 47 | + kf::vec<float_type, VECTOR_SIZE> dist = kf::sqrt(dist_squared); |
| 48 | + |
| 49 | + // Count the number of points within the unit circle. |
| 50 | + // The expression `dist <= 1` returns a boolean vector |
| 51 | + // and `kf::count` counts how many elements are `true`. |
| 52 | + int n = kf::count(dist <= float_type(1)); |
| 53 | + |
| 54 | + // Atomically add 'n' to 'global_count' |
| 55 | + atomicAdd(global_count, n); |
| 56 | +} |
| 57 | + |
| 58 | +double calculate_pi(int nx, int ny) { |
| 59 | + // Allocate memory on the device (GPU) for 'global_count' to accumulate the count of points inside the circle |
| 60 | + int* d_global_count; |
| 61 | + CUDA_CHECK(cudaMalloc(&d_global_count, sizeof(int))); |
| 62 | + |
| 63 | + // Initialize the device memory to zero |
| 64 | + CUDA_CHECK(cudaMemset(d_global_count, 0, sizeof(int))); |
| 65 | + |
| 66 | + // Each thread processes 'VECTOR_SIZE' points in the x-direction |
| 67 | + int num_threads_x = (nx + VECTOR_SIZE - 1) / VECTOR_SIZE; |
| 68 | + |
| 69 | + // Define the dimensions of each thread block (number of threads per block) |
| 70 | + dim3 block_size(16, 16); // Each block contains 16 threads in x and y directions |
| 71 | + |
| 72 | + // Calculate the number of blocks needed in the grid to cover all threads |
| 73 | + dim3 grid_size( |
| 74 | + (num_threads_x + block_size.x - 1) / block_size.x, // Number of blocks in x-direction |
| 75 | + (ny + block_size.y - 1) / block_size.y // Number of blocks in y-direction |
| 76 | + ); |
| 77 | + |
| 78 | + // Launch the kernel on the GPU with the calculated grid and block dimensions |
| 79 | + calculate_pi_kernel<<<grid_size, block_size>>>(nx, ny, d_global_count); |
| 80 | + |
| 81 | + // Check for any errors during kernel launch (asynchronous) |
| 82 | + CUDA_CHECK(cudaGetLastError()); |
| 83 | + |
| 84 | + // Wait for the kernel to finish executing and check for errors (synchronization point) |
| 85 | + CUDA_CHECK(cudaDeviceSynchronize()); |
| 86 | + |
| 87 | + // Copy the result from device memory back to host memory |
| 88 | + int h_global_count = 0; // Host variable to store the count |
| 89 | + CUDA_CHECK(cudaMemcpy(&h_global_count, d_global_count, sizeof(int), cudaMemcpyDeviceToHost)); |
| 90 | + |
| 91 | + // Free the allocated device memory |
| 92 | + CUDA_CHECK(cudaFree(d_global_count)); |
| 93 | + |
| 94 | + // Calculate the estimated value of Pi using the ratio of points inside the circle to the total points |
| 95 | + int total_points = nx * ny; |
| 96 | + double pi_estimate = 4.0 * (double(h_global_count) / total_points); |
| 97 | + |
| 98 | + return pi_estimate; |
| 99 | +} |
| 100 | + |
| 101 | +int main() { |
| 102 | + CUDA_CHECK(cudaSetDevice(0)); |
| 103 | + |
| 104 | + for (int n = 1; n <= 16384; n *= 2) { |
| 105 | + double pi = calculate_pi(n, n); |
| 106 | + |
| 107 | + printf("nx=%d ny=%d pi=%f\n", n, n, pi); |
| 108 | + } |
| 109 | + |
| 110 | + return EXIT_SUCCESS; |
| 111 | +} |
0 commit comments