# Profiling

This section provides brief documentation on how to use the NVIDIA Nsight tools
to profile an application on Tursa. The process is illustrated with a short
example using a simple application.

For full details, see the NVIDIA Nsight documentation:

 - [NVIDIA Nsight Systems](https://docs.nvidia.com/nsight-systems/UserGuide/index.html)
 - [NVIDIA Nsight Compute](https://docs.nvidia.com/nsight-compute/)

!!! important
    The Nsight GUI is not available on Tursa and you cannot connect a local GUI to
    Tursa over SSH due to limitations in the SSH module in the Nsight GUI. If you want
    to visualise profiles, you must download them from Tursa to your local system
    where you have installed the GUI.

!!! credit
    Thanks to Paul Graham of NVIDIA for agreeing to share this example.

## Example code

Here is the CUDA code that will be used throughout this example. In the rest of the
exercise, we assume you have saved it to a file called `vector-add.cu` on Tursa.

| 25 | + |
| 26 | +``` |
| 27 | +#include <stdio.h> |
| 28 | +
|
| 29 | +/* |
| 30 | + * Host function to initialize vector elements. This function |
| 31 | + * simply initializes each element to equal its index in the |
| 32 | + * vector. |
| 33 | + */ |
| 34 | +
|
| 35 | +void initWith(float num, float *a, int N) |
| 36 | +{ |
| 37 | + for(int i = 0; i < N; ++i) |
| 38 | + { |
| 39 | + a[i] = num; |
| 40 | + } |
| 41 | +} |
| 42 | +
|
| 43 | +/* |
| 44 | + * Device kernel stores into `result` the sum of each |
| 45 | + * same-indexed value of `a` and `b`. |
| 46 | + */ |
| 47 | +
|
| 48 | +__global__ |
| 49 | +void addVectorsInto(float *result, float *a, float *b, int N) |
| 50 | +{ |
| 51 | + int index = threadIdx.x + blockIdx.x * blockDim.x; |
| 52 | + int stride = blockDim.x * gridDim.x; |
| 53 | +
|
| 54 | + for(int i = index; i < N; i += stride) |
| 55 | + { |
| 56 | + result[i] = a[i] + b[i]; |
| 57 | + } |
| 58 | +} |
| 59 | +
|
| 60 | +/* |
| 61 | + * Host function to confirm values in `vector`. This function |
| 62 | + * assumes all values are the same `target` value. |
| 63 | + */ |
| 64 | +
|
| 65 | +void checkElementsAre(float target, float *vector, int N) |
| 66 | +{ |
| 67 | + for(int i = 0; i < N; i++) |
| 68 | + { |
| 69 | + if(vector[i] != target) |
| 70 | + { |
| 71 | + printf("FAIL: vector[%d] - %0.0f does not equal %0.0f\n", i, vector[i], target); |
| 72 | + exit(1); |
| 73 | + } |
| 74 | + } |
| 75 | + printf("Success! All values calculated correctly.\n"); |
| 76 | +} |
| 77 | +
|
| 78 | +int main() |
| 79 | +{ |
| 80 | +
|
| 81 | + int deviceId; |
| 82 | + int numberOfSMs; |
| 83 | +
|
| 84 | + cudaGetDevice(&deviceId); |
| 85 | + cudaDeviceGetAttribute(&numberOfSMs, cudaDevAttrMultiProcessorCount, deviceId); |
| 86 | + printf("Device ID: %d\tNumber of SMs: %d\n", deviceId, numberOfSMs); |
| 87 | + |
| 88 | + const int N = 2<<24; |
| 89 | + size_t size = N * sizeof(float); |
| 90 | +
|
| 91 | + float *a; |
| 92 | + float *b; |
| 93 | + float *c; |
| 94 | +
|
| 95 | + cudaMallocManaged(&a, size); |
| 96 | + cudaMallocManaged(&b, size); |
| 97 | + cudaMallocManaged(&c, size); |
| 98 | +
|
| 99 | + initWith(3, a, N); |
| 100 | + initWith(4, b, N); |
| 101 | + initWith(0, c, N); |
| 102 | +
|
| 103 | + size_t threadsPerBlock; |
| 104 | + size_t numberOfBlocks; |
| 105 | +
|
| 106 | + /* |
| 107 | + * nsys should register performance changes when execution configuration |
| 108 | + * is updated. |
| 109 | + */ |
| 110 | +
|
| 111 | + threadsPerBlock = 256; |
| 112 | + numberOfBlocks = 32 * numberOfSMs; |
| 113 | +
|
| 114 | + cudaError_t addVectorsErr; |
| 115 | + cudaError_t asyncErr; |
| 116 | +
|
| 117 | + addVectorsInto<<<numberOfBlocks, threadsPerBlock>>>(c, a, b, N); |
| 118 | +
|
| 119 | + addVectorsErr = cudaGetLastError(); |
| 120 | + if(addVectorsErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(addVectorsErr)); |
| 121 | +
|
| 122 | + asyncErr = cudaDeviceSynchronize(); |
| 123 | + if(asyncErr != cudaSuccess) printf("Error: %s\n", cudaGetErrorString(asyncErr)); |
| 124 | +
|
| 125 | + checkElementsAre(7, c, N); |
| 126 | +
|
| 127 | + cudaFree(a); |
| 128 | + cudaFree(b); |
| 129 | + cudaFree(c); |
| 130 | +} |
| 131 | +``` |

## Compile the code

Compile the example code:

```
module load nvhpc/23.5-nompi
module load gcc/12.2.0

nvcc -o vector-add.exe vector-add.cu
```
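
If you plan to dig into the kernel with the profilers, it can also be useful to compile
with embedded source line information. This is an optional sketch using standard `nvcc`
flags (check that the version provided by the `nvhpc` module supports your choice of
architecture):

```
# -lineinfo embeds source line information so the profilers can correlate
# metrics back to lines in vector-add.cu
# -arch=sm_80 targets the A100 GPUs (compute capability 8.0) directly
nvcc -lineinfo -arch=sm_80 -o vector-add.exe vector-add.cu
```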

## Test the example

Create a job submission script to run the example code:

```slurm
#!/bin/bash

#SBATCH --job-name=vector-add
#SBATCH --time=0:5:0
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=32
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:4
#SBATCH --partition=gpu-a100-40
#SBATCH --qos=dev

#SBATCH --account=[add your budget code]

# Load the correct modules
module load nvhpc/23.5-nompi
module load gcc/12.2.0

./vector-add.exe
```
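
Assuming you have saved the script as `submit-vector-add.slurm` (the file name here is
purely illustrative), submit it to the queue with:

```
sbatch submit-vector-add.slurm
```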

When the job runs, you should see the code produce output like:

```
Device ID: 0 Number of SMs: 108
Success! All values calculated correctly.
```

## Use Nsight Systems to generate a profile

Create a job submission script to generate a profile of the example application:

```slurm
#!/bin/bash

#SBATCH --job-name=vector-add
#SBATCH --time=0:5:0
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=32
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:4
#SBATCH --partition=gpu-a100-40
#SBATCH --qos=dev

#SBATCH --account=[add your budget code]

# Load the correct modules
module load nvhpc/23.5-nompi
module load gcc/12.2.0

nsys profile --stats=true ./vector-add.exe
```
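
By default, `nsys` writes its report files with automatically numbered names such as
`report1.nsys-rep`. If you want a more descriptive name, the `-o` option sets the
output file; a minimal sketch (the report name is just an example):

```
# -o names the output report file; --stats=true also prints summary tables to stdout
nsys profile -o vector-add-profile --stats=true ./vector-add.exe
```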

This should produce output something like:

```
[1/8] [========================100%] report2.nsys-rep
[2/8] [========================100%] report2.sqlite
SKIPPED: /mnt/lustre/tursafs1/home/z01/z01/dc-turn1/test/nsight/report2.sqlite does not contain NV Tools Extension (NVTX) data.
[3/8] Executing 'nvtx_sum' stats report
[4/8] Executing 'osrt_sum' stats report

 Time (%)  Total Time (ns)  Num Calls    Avg (ns)    Med (ns)  Min (ns)    Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  ----------  ----------  --------  ----------  -----------  ---------------------
     84.5       3740154532        120  31167954.4  10103030.0      7054  1413120085  130155802.9  poll
      8.5        376952738       1065    353946.2     20882.0      1397    29245539    1359316.7  ioctl
      5.2        230556453        106   2175060.9   2094310.0      1815    20983399    2645399.3  sem_timedwait
      0.9         38619004          7   5517000.6      8521.0      1467    20053805    9423347.1  fread
      0.3         15445783         58    266306.6      5168.0      2794    11102745    1479367.5  fopen
      0.3         13440698         26    516949.9      6425.5      2305     5145173    1446432.8  mmap
      0.2          6658153         10    665815.3      1431.5      1396     6644466    2100683.7  dup
      0.0          1609153         42     38313.2      8520.0      6635      937695     143076.8  mmap64
      0.0           621661          4    155415.3    152814.0    121175      194858      30933.7  pthread_create
      0.0           536320        102      5258.0      1816.0       978      213296      25541.3  fcntl
      0.0           533379         52     10257.3      2375.0      1885      260160      40144.9  fclose
      0.0           485683         83      5851.6      5169.0      2794       22349       2604.6  open64
      0.0           106926         64      1670.7      1467.0       978        2864        389.8  pthread_mutex_trylock
      0.0            94711         29      3265.9      1397.0       908       58668      10657.4  fgets
      0.0            73122         14      5223.0      4260.0      1816       16552       3478.6  write
      0.0            54408         11      4946.2      4679.0      2794        7124       1502.6  munmap
      0.0            49799          7      7114.1      7054.0      2445       13829       3811.4  open
      0.0            47074         17      2769.1      2794.0      1815        5168        868.5  read
      0.0            23257          3      7752.3      8521.0      4260       10476       3178.5  pipe2
      0.0            18368          2      9184.0      9184.0      7054       11314       3012.3  socket
      0.0            11873          1     11873.0     11873.0     11873       11873          0.0  connect
      0.0             2864          1      2864.0      2864.0      2864        2864          0.0  bind
      0.0             1886          1      1886.0      1886.0      1886        1886          0.0  listen

[5/8] Executing 'cuda_api_sum' stats report

 Time (%)  Total Time (ns)  Num Calls     Avg (ns)    Med (ns)  Min (ns)   Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  -----------  ----------  --------  ---------  -----------  ----------------------
     66.9        326556539          3  108852179.7     69982.0     61601  326424956  188423551.5  cudaMallocManaged
     17.4         84859556          1   84859556.0  84859556.0  84859556   84859556          0.0  cudaDeviceSynchronize
     12.9         62931338          1   62931338.0  62931338.0  62931338   62931338          0.0  cudaLaunchKernel
      2.8         13646243          3    4548747.7   4418958.0   4016391    5210894     607736.3  cudaFree
      0.0             7054          1       7054.0      7054.0      7054       7054          0.0  cuModuleGetLoadingMode

[6/8] Executing 'cuda_gpu_kern_sum' stats report

 Time (%)  Total Time (ns)  Instances    Avg (ns)    Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Name
 --------  ---------------  ---------  ----------  ----------  --------  --------  -----------  ----------------------------------------------
    100.0         84862110          1  84862110.0  84862110.0  84862110  84862110          0.0  addVectorsInto(float *, float *, float *, int)

[7/8] Executing 'cuda_gpu_mem_time_sum' stats report

 Time (%)  Total Time (ns)  Count  Avg (ns)  Med (ns)  Min (ns)  Max (ns)  StdDev (ns)  Operation
 --------  ---------------  -----  --------  --------  --------  --------  -----------  ---------------------------------
     81.9         48978013  10109    4845.0    3455.0      2656     51328       5656.4  [CUDA Unified Memory memcpy HtoD]
     18.1         10801550    768   14064.5    4095.0      2463     79840      21695.6  [CUDA Unified Memory memcpy DtoH]

[8/8] Executing 'cuda_gpu_mem_size_sum' stats report

 Total (MB)  Count  Avg (MB)  Med (MB)  Min (MB)  Max (MB)  StdDev (MB)  Operation
 ----------  -----  --------  --------  --------  --------  -----------  ---------------------------------
    402.653  10109     0.040     0.008     0.004     1.044        0.135  [CUDA Unified Memory memcpy HtoD]
    134.218    768     0.175     0.033     0.004     1.044        0.301  [CUDA Unified Memory memcpy DtoH]

Generated:
    /mnt/lustre/tursafs1/home/z01/z01/dc-turn1/test/nsight/report1.nsys-rep
    /mnt/lustre/tursafs1/home/z01/z01/dc-turn1/test/nsight/report1.sqlite
```

You can download the `report1.nsys-rep` file to your local system and load it into the
Nsight Systems GUI for visualisation if you wish.
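
For example, from your local machine you could fetch the report with `scp` (the login
address and remote path here are illustrative; substitute your own username and the
directory you ran the job in):

```
scp username@tursa.dirac.ed.ac.uk:/path/to/job/directory/report1.nsys-rep .
```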

## Use Nsight Compute to investigate hardware counters

Create a job submission script to get a profile of the hardware counters for the
example application:

```slurm
#!/bin/bash

#SBATCH --job-name=vector-add
#SBATCH --time=0:5:0
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=32
#SBATCH --cpus-per-task=1
#SBATCH --gres=gpu:4
#SBATCH --partition=gpu-a100-40
#SBATCH --qos=dev

#SBATCH --account=[add your budget code]

# Load the correct modules
module load nvhpc/23.5-nompi
module load gcc/12.2.0

ncu ./vector-add.exe
```
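
By default, `ncu` profiles every kernel launch in the application and prints a basic
set of sections to stdout. For anything larger than this toy example you will usually
want to narrow the scope; a sketch using standard `ncu` options (the output file name
is just an example):

```
# -k filters profiling to kernels whose name matches addVectorsInto
# --set full collects the complete (slower) set of metric sections
# -o writes a vector-add.ncu-rep file you can open in the Nsight Compute GUI
ncu -k addVectorsInto --set full -o vector-add ./vector-add.exe
```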

This should produce output something like:

```
[165308] vector-add.exe@127.0.0.1
  addVectorsInto(float *, float *, float *, int) (3456, 1, 1)x(256, 1, 1), Context 1, Stream 7, Device 0, CC 8.0
    Section: GPU Speed Of Light Throughput
    ----------------------- ------------- ------------
    Metric Name             Metric Unit   Metric Value
    ----------------------- ------------- ------------
    DRAM Frequency          cycle/nsecond         1.20
    SM Frequency            cycle/nsecond         1.08
    Elapsed Cycles          cycle               322427
    Memory Throughput       %                    85.69
    DRAM Throughput         %                    85.69
    Duration                usecond             299.23
    L1/TEX Cache Throughput %                    17.69
    L2 Cache Throughput     %                    68.64
    SM Active Cycles        cycle            314986.82
    Compute (SM) Throughput %                     9.12
    ----------------------- ------------- ------------

    INF   The kernel is utilizing greater than 80.0% of the available compute or memory performance of the device. To
          further improve performance, work will likely need to be shifted from the most utilized to another unit.
          Start by analyzing DRAM in the Memory Workload Analysis section.

    Section: Launch Statistics
    -------------------------------- --------------- ---------------
    Metric Name                      Metric Unit        Metric Value
    -------------------------------- --------------- ---------------
    Block Size                                                   256
    Function Cache Configuration                     CachePreferNone
    Grid Size                                                   3456
    Registers Per Thread             register/thread              26
    Shared Memory Configuration Size Kbyte                     32.77
    Driver Shared Memory Per Block   Kbyte/block                1.02
    Dynamic Shared Memory Per Block  byte/block                    0
    Static Shared Memory Per Block   byte/block                    0
    Threads                          thread                   884736
    Waves Per SM                                                   4
    -------------------------------- --------------- ---------------

    Section: Occupancy
    ------------------------------- ----------- ------------
    Metric Name                     Metric Unit Metric Value
    ------------------------------- ----------- ------------
    Block Limit SM                  block                 32
    Block Limit Registers           block                  8
    Block Limit Shared Mem          block                 32
    Block Limit Warps               block                  8
    Theoretical Active Warps per SM warp                  64
    Theoretical Occupancy           %                    100
    Achieved Occupancy              %                  96.24
    Achieved Active Warps Per SM    warp               61.59
    ------------------------------- ----------- ------------

    INF   This kernel's theoretical occupancy is not impacted by any block limit.
```
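
The Speed Of Light section above shows this kernel is memory bound: DRAM throughput is
at roughly 86% of peak while SM (compute) throughput is around 9%, which is what you
would expect for a vector addition that streams three large arrays. If you want to
track a single counter across runs, `ncu` can also collect individual metrics by name;
a minimal sketch (assuming this metric name is available for the GPU and driver you
are running against):

```
ncu --metrics dram__throughput.avg.pct_of_peak_sustained_elapsed ./vector-add.exe
```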