Skip to content

Commit 86adf93

Browse files
committed
Add pi example
1 parent d1f056d commit 86adf93

File tree

3 files changed

+124
-0
lines changed

3 files changed

+124
-0
lines changed

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1+
add_subdirectory(pi)
12
add_subdirectory(vector_add)
23
add_subdirectory(vector_add_tiling)

examples/pi/CMakeLists.txt

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
cmake_minimum_required(VERSION 3.17)
2+
3+
set (PROJECT_NAME kernel_float_pi)
4+
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
5+
set (CMAKE_CXX_STANDARD 17)
6+
7+
add_executable(${PROJECT_NAME} "${PROJECT_SOURCE_DIR}/main.cu")
8+
target_link_libraries(${PROJECT_NAME} kernel_float)
9+
set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "80")
10+
11+
find_package(CUDA REQUIRED)
12+
target_include_directories(${PROJECT_NAME} PRIVATE ${CUDA_TOOLKIT_INCLUDE})

examples/pi/main.cu

Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
#include <stdio.h>
2+
#include <stdlib.h>
3+
4+
#include "kernel_float.h"
5+
6+
#define CUDA_CHECK(call) \
7+
do { \
8+
cudaError_t __err = call; \
9+
if (__err != cudaSuccess) { \
10+
fprintf( \
11+
stderr, \
12+
"CUDA error at %s:%d code=%d(%s) \"%s\" \n", \
13+
__FILE__, \
14+
__LINE__, \
15+
__err, \
16+
cudaGetErrorString(__err), \
17+
#call); \
18+
exit(EXIT_FAILURE); \
19+
} \
20+
} while (0)
21+
22+
// Alias `kernel_float` as `kf`
23+
namespace kf = kernel_float;
24+
25+
// Define the float type and vector size
26+
using float_type = float;
27+
static constexpr int VECTOR_SIZE = 4;
28+
29+
__global__ void calculate_pi_kernel(int nx, int ny, int* global_count) {
30+
// Calculate the global x and y indices for this thread within the grid
31+
int thread_x = blockIdx.x * blockDim.x + threadIdx.x;
32+
int thread_y = blockIdx.y * blockDim.y + threadIdx.y;
33+
34+
// Calculate the x and y coordinates as integers.
35+
// The x coordinates are: [thread_x * VECTOR_SIZE, thread_x * VECTOR_SIZE + 1, ...]
36+
// The y coordinates are: [thread_y, thread_y, ...]
37+
kf::vec<int, VECTOR_SIZE> xi = thread_x * VECTOR_SIZE + kf::range<int, VECTOR_SIZE>();
38+
kf::vec<int, VECTOR_SIZE> yi = thread_y;
39+
40+
// Normalize the integers to values between 0 and 1.
41+
kf::vec<float_type, VECTOR_SIZE> xf = kf::cast<float_type>(xi) / float_type(nx);
42+
kf::vec<float_type, VECTOR_SIZE> yf = kf::cast<float_type>(yi) / float_type(ny);
43+
44+
// Compute the squared distance to the origin and then take the
45+
// square root to get the distance to the origin.
46+
kf::vec<float_type, VECTOR_SIZE> dist_squared = xf * xf + yf * yf;
47+
kf::vec<float_type, VECTOR_SIZE> dist = kf::sqrt(dist_squared);
48+
49+
// Count the number of points within the unit circle.
50+
// The expression `dist <= 1` returns a boolean vector
51+
// and `kf::count` counts how many elements are `true`.
52+
int n = kf::count(dist <= float_type(1));
53+
54+
// Atomically add 'n' to 'global_count'
55+
atomicAdd(global_count, n);
56+
}
57+
58+
double calculate_pi(int nx, int ny) {
59+
// Allocate memory on the device (GPU) for 'global_count' to accumulate the count of points inside the circle
60+
int* d_global_count;
61+
CUDA_CHECK(cudaMalloc(&d_global_count, sizeof(int)));
62+
63+
// Initialize the device memory to zero
64+
CUDA_CHECK(cudaMemset(d_global_count, 0, sizeof(int)));
65+
66+
// Each thread processes 'VECTOR_SIZE' points in the x-direction
67+
int num_threads_x = (nx + VECTOR_SIZE - 1) / VECTOR_SIZE;
68+
69+
// Define the dimensions of each thread block (number of threads per block)
70+
dim3 block_size(16, 16); // Each block contains 16 threads in x and y directions
71+
72+
// Calculate the number of blocks needed in the grid to cover all threads
73+
dim3 grid_size(
74+
(num_threads_x + block_size.x - 1) / block_size.x, // Number of blocks in x-direction
75+
(ny + block_size.y - 1) / block_size.y // Number of blocks in y-direction
76+
);
77+
78+
// Launch the kernel on the GPU with the calculated grid and block dimensions
79+
calculate_pi_kernel<<<grid_size, block_size>>>(nx, ny, d_global_count);
80+
81+
// Check for any errors during kernel launch (asynchronous)
82+
CUDA_CHECK(cudaGetLastError());
83+
84+
// Wait for the kernel to finish executing and check for errors (synchronization point)
85+
CUDA_CHECK(cudaDeviceSynchronize());
86+
87+
// Copy the result from device memory back to host memory
88+
int h_global_count = 0; // Host variable to store the count
89+
CUDA_CHECK(cudaMemcpy(&h_global_count, d_global_count, sizeof(int), cudaMemcpyDeviceToHost));
90+
91+
// Free the allocated device memory
92+
CUDA_CHECK(cudaFree(d_global_count));
93+
94+
// Calculate the estimated value of Pi using the ratio of points inside the circle to the total points
95+
int total_points = nx * ny;
96+
double pi_estimate = 4.0 * (double(h_global_count) / total_points);
97+
98+
return pi_estimate;
99+
}
100+
101+
int main() {
102+
CUDA_CHECK(cudaSetDevice(0));
103+
104+
for (int n = 1; n <= 16384; n *= 2) {
105+
double pi = calculate_pi(n, n);
106+
107+
printf("nx=%d ny=%d pi=%f\n", n, n, pi);
108+
}
109+
110+
return EXIT_SUCCESS;
111+
}

0 commit comments

Comments
 (0)