Add pi example

stijnh · stijnh · commit 86adf930710e · 2024-09-24T11:25:54.000+02:00
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -1,2 +1,3 @@
+add_subdirectory(pi)
 add_subdirectory(vector_add)
 add_subdirectory(vector_add_tiling)
diff --git a/examples/pi/CMakeLists.txt b/examples/pi/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Build script for the `kernel_float_pi` example executable (main.cu).
+# CMake 3.17 is the first release that ships the FindCUDAToolkit module used below.
+cmake_minimum_required(VERSION 3.17)
+
+set (PROJECT_NAME kernel_float_pi)
+project(${PROJECT_NAME} LANGUAGES CXX CUDA)
+set (CMAKE_CXX_STANDARD 17)
+
+# Single-source executable that links against the `kernel_float` target.
+add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
+target_link_libraries(${PROJECT_NAME} kernel_float)
+set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
+
+# Locate the toolkit headers through FindCUDAToolkit rather than the deprecated
+# FindCUDA module (`find_package(CUDA)`), which newer CMake releases warn about.
+find_package(CUDAToolkit REQUIRED)
+target_include_directories(${PROJECT_NAME} PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
diff --git a/examples/pi/main.cu b/examples/pi/main.cu
@@ -0,0 +1,111 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "kernel_float.h"
+
+// Evaluates a CUDA runtime call exactly once and terminates the program if it
+// did not return `cudaSuccess`.
+//
+// `call` is any expression yielding a `cudaError_t`. On failure the source
+// file, line, numeric error code, `cudaGetErrorString` text and the
+// stringified call are written to stderr, then the process exits with
+// EXIT_FAILURE.
+//
+// The argument is parenthesized so operators inside `call` cannot change the
+// meaning of the initialization, and the local name avoids a leading double
+// underscore because such identifiers are reserved for the implementation.
+// The `do { ... } while (0)` wrapper lets the macro be used as one statement.
+#define CUDA_CHECK(call)                                     \
+    do {                                                     \
+        const cudaError_t cuda_check_err_ = (call);          \
+        if (cuda_check_err_ != cudaSuccess) {                \
+            fprintf(                                         \
+                stderr,                                      \
+                "CUDA error at %s:%d code=%d(%s) \"%s\" \n", \
+                __FILE__,                                    \
+                __LINE__,                                    \
+                static_cast<int>(cuda_check_err_),           \
+                cudaGetErrorString(cuda_check_err_),         \
+                #call);                                      \
+            exit(EXIT_FAILURE);                              \
+        }                                                    \
+    } while (0)
+
+// Short alias: the code below refers to the `kernel_float` namespace as `kf`.
+namespace kf = kernel_float;
+
+// `float_type`: element type of the coordinate vectors; `VECTOR_SIZE`: x positions evaluated per thread.
+using float_type = float;
+static constexpr int VECTOR_SIZE = 4;
+
+// Counts the sample points (x / nx, y / ny), for integer 0 <= x < nx and
+// 0 <= y < ny, whose distance to the origin is at most 1, and atomically adds
+// the per-thread count to `*global_count`.
+//
+// Launch layout: 2D launch in which thread (thread_x, thread_y) evaluates the
+// VECTOR_SIZE consecutive x positions starting at thread_x * VECTOR_SIZE, all
+// in row thread_y. The grid may be larger than the nx-by-ny domain; positions
+// outside the domain are ignored.
+//
+// `global_count` is only ever incremented here, so the caller must initialize
+// it before the launch.
+__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) {
+    // Calculate the global x and y indices for this thread within the grid
+    int thread_x = blockIdx.x * blockDim.x + threadIdx.x;
+    int thread_y = blockIdx.y * blockDim.y + threadIdx.y;
+
+    // The grid is rounded up to whole blocks, so some threads lie entirely
+    // outside the domain. Without this guard the out-of-range points
+    // (x == nx, y == 0) and (x == 0, y == ny) sit at distance exactly 1 and
+    // would be counted, inflating the estimate for small grids.
+    if (thread_y >= ny || thread_x * VECTOR_SIZE >= nx) {
+        return;
+    }
+
+    // Calculate the x and y coordinates as integers.
+    // The x coordinates are: [thread_x * VECTOR_SIZE, thread_x * VECTOR_SIZE + 1, ...]
+    // The y coordinates are: [thread_y,               thread_y,                   ...]
+    kf::vec<int, VECTOR_SIZE> xi = thread_x * VECTOR_SIZE + kf::range<int, VECTOR_SIZE>();
+    kf::vec<int, VECTOR_SIZE> yi = thread_y;
+
+    // Normalize the integers by the grid extents.
+    kf::vec<float_type, VECTOR_SIZE> xf = kf::cast<float_type>(xi) / float_type(nx);
+    kf::vec<float_type, VECTOR_SIZE> yf = kf::cast<float_type>(yi) / float_type(ny);
+
+    // Compute the squared distance to the origin and then take the
+    // square root to get the distance to the origin.
+    kf::vec<float_type, VECTOR_SIZE> dist_squared = xf * xf + yf * yf;
+    kf::vec<float_type, VECTOR_SIZE> dist = kf::sqrt(dist_squared);
+
+    // Per-lane masks: the point is within the unit circle, and the lane's x
+    // position is inside the domain (the last thread of a row may only be
+    // partially in range when nx is not a multiple of VECTOR_SIZE).
+    kf::vec<bool, VECTOR_SIZE> inside_circle = kf::cast<bool>(dist <= float_type(1));
+    kf::vec<bool, VECTOR_SIZE> inside_domain = kf::cast<bool>(xi < nx);
+
+    // `kf::count` counts how many lanes satisfy both conditions.
+    int n = kf::count(inside_circle & inside_domain);
+
+    // Atomically add 'n' to 'global_count'; adding zero would be a no-op, so
+    // skip the atomic in that case.
+    if (n > 0) {
+        atomicAdd(global_count, n);
+    }
+}
+
+// Estimates pi by launching `calculate_pi_kernel` over an nx-by-ny grid of
+// sample points and returning 4 * (points counted by the kernel) / (nx * ny).
+//
+// Parameters:
+//   nx, ny - number of sample positions along x and y; expected to be positive.
+// Returns:
+//   The pi estimate as a double.
+// Any failing CUDA call terminates the program through CUDA_CHECK.
+//
+// Note: the device-side counter is a 32-bit `int`, so the number of counted
+// points must fit in an `int`.
+double calculate_pi(int nx, int ny) {
+    // Allocate memory on the device (GPU) for 'global_count' to accumulate the count of points inside the circle
+    int* d_global_count;
+    CUDA_CHECK(cudaMalloc(&d_global_count, sizeof(int)));
+
+    // Initialize the device memory to zero
+    CUDA_CHECK(cudaMemset(d_global_count, 0, sizeof(int)));
+
+    // Each thread processes 'VECTOR_SIZE' points in the x-direction
+    int num_threads_x = (nx + VECTOR_SIZE - 1) / VECTOR_SIZE;
+
+    // Define the dimensions of each thread block (number of threads per block)
+    dim3 block_size(16, 16);  // Each block contains 16 threads in x and y directions
+
+    // Calculate the number of blocks needed in the grid to cover all threads
+    dim3 grid_size(
+        (num_threads_x + block_size.x - 1) / block_size.x,  // Number of blocks in x-direction
+        (ny + block_size.y - 1) / block_size.y  // Number of blocks in y-direction
+    );
+
+    // Launch the kernel on the GPU with the calculated grid and block dimensions
+    calculate_pi_kernel<<<grid_size, block_size>>>(nx, ny, d_global_count);
+
+    // Check for any errors reported by the kernel launch itself
+    CUDA_CHECK(cudaGetLastError());
+
+    // Wait for the kernel to finish executing and check for errors (synchronization point)
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    // Copy the result from device memory back to host memory
+    int h_global_count = 0;  // Host variable to store the count
+    CUDA_CHECK(cudaMemcpy(&h_global_count, d_global_count, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Free the allocated device memory
+    CUDA_CHECK(cudaFree(d_global_count));
+
+    // Total number of sample points. Computed in double because `nx * ny`
+    // evaluated as `int` overflows (undefined behaviour) once the product
+    // exceeds INT_MAX.
+    double total_points = double(nx) * double(ny);
+
+    // Ratio of counted points to all points, scaled by 4
+    double pi_estimate = 4.0 * (double(h_global_count) / total_points);
+
+    return pi_estimate;
+}
+
+// Program entry point: selects CUDA device 0, then prints the pi estimate for
+// square grids whose side length doubles from 1 up to 16384.
+int main() {
+    CUDA_CHECK(cudaSetDevice(0));
+
+    const int max_side = 16384;
+    int side = 1;
+
+    while (side <= max_side) {
+        // Same extent in both directions for every run.
+        const double estimate = calculate_pi(side, side);
+        printf("nx=%d ny=%d pi=%f\n", side, side, estimate);
+
+        side *= 2;
+    }
+
+    return EXIT_SUCCESS;
+}

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
	`1`	`+add_subdirectory(pi)`
`1`	`2`	`add_subdirectory(vector_add)`
`2`	`3`	`add_subdirectory(vector_add_tiling)`