|
193 | 193 | "outputs": [],
|
194 | 194 | "cell_type": "code",
|
195 | 195 | "source": [
|
| 196 | + "# solution\n", |
196 | 197 | "max_threads = attribute(device(),CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)\n",
|
197 | 198 | "thread_count = []\n",
|
198 | 199 | "throughputs = []\n",
|
199 | 200 | "for pow = 0:Int(log2(max_threads/32))\n",
|
200 | 201 | " threads = (32, 2^pow)\n",
|
201 |
| - " blocks = (nx÷threads[1], ny÷threads[2])\n", |
202 |
| - " t_it = @belapsed begin @cuda blocks=$blocks threads=$threads update_temperature!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); synchronize() end\n", |
203 |
| - " T_eff = (2*1+1)*1/1e9*nx*ny*sizeof(Float64)/t_it\n", |
| 202 | + " blocks = #...\n", |
| 203 | + " t_it = @belapsed begin @cuda #...\n", |
| 204 | + " T_eff = #...\n", |
204 | 205 | " push!(thread_count, prod(threads))\n",
|
205 | 206 | " push!(throughputs, T_eff)\n",
|
206 | 207 | " println(\"(threads=$threads) T_eff = $(T_eff)\")\n",
|
|
295 | 296 | "outputs": [],
|
296 | 297 | "cell_type": "code",
|
297 | 298 | "source": [
|
298 |
| - "# solution\n", |
| 299 | + "# hint\n", |
299 | 300 | "function update_temperature!(T2, T, Ci, lam, dt, _dx, _dy)\n",
|
300 | 301 | " ix = (blockIdx().x-1) * blockDim().x + threadIdx().x\n",
|
301 | 302 | " iy = (blockIdx().y-1) * blockDim().y + threadIdx().y\n",
|
302 |
| - " tx = threadIdx().x\n", |
303 |
| - " ty = threadIdx().y\n", |
304 |
| - " T_l = @cuDynamicSharedMem(eltype(T), (blockDim().x, blockDim().y))\n", |
305 |
| - " @inbounds T_l[tx,ty] = T[ix,iy]\n", |
| 303 | + " tx = # local thread id, x dimension\n", |
| 304 | + " ty = # local thread id, y dimension\n", |
| 305 | + " T_l = # allocation of a block-local temperature array (in shared memory)\n", |
| 306 | + " @inbounds T_l[tx,ty] = # read the values of the temperature array `T` into shared memory\n", |
306 | 307 | " if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))\n",
|
307 |
| - " @inbounds T2[ix,iy] = T_l[tx,ty] + dt*Ci[ix,iy]\n", |
| 308 | + " @inbounds T2[ix,iy] = #=read temperature values from shared memory=# + dt*Ci[ix,iy]\n", |
308 | 309 | " end\n",
|
309 | 310 | " return\n",
|
310 | 311 | "end"
|
|
326 | 327 | "outputs": [],
|
327 | 328 | "cell_type": "code",
|
328 | 329 | "source": [
|
329 |
| - "# solution\n", |
330 |
| - "t_it = @belapsed begin @cuda blocks=$blocks threads=$threads shmem=prod($threads)*sizeof(Float64) update_temperature!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); synchronize() end\n", |
331 |
| - "T_eff = (2*1+1)*1/1e9*nx*ny*sizeof(Float64)/t_it" |
| 330 | + "# solution" |
332 | 331 | ],
|
333 | 332 | "metadata": {},
|
334 | 333 | "execution_count": null
|
|
356 | 355 | "outputs": [],
|
357 | 356 | "cell_type": "code",
|
358 | 357 | "source": [
|
359 |
| - "# solution\n", |
| 358 | + "# hint\n", |
360 | 359 | "function update_temperature!(T2, T, Ci, lam, dt, _dx, _dy)\n",
|
361 | 360 | " ix = (blockIdx().x-1) * blockDim().x + threadIdx().x\n",
|
362 | 361 | " iy = (blockIdx().y-1) * blockDim().y + threadIdx().y\n",
|
363 |
| - " tx = threadIdx().x+1\n", |
364 |
| - " ty = threadIdx().y+1\n", |
365 |
| - " T_l = @cuDynamicSharedMem(eltype(T), (blockDim().x+2, blockDim().y+2))\n", |
| 362 | + " tx = # adjust the local thread id in x dimension\n", |
| 363 | + " ty = # adjust the local thread id in y dimension\n", |
| 364 | + " T_l = # adjust the shared memory allocation\n", |
366 | 365 | " @inbounds T_l[tx,ty] = T[ix,iy]\n",
|
367 | 366 | " if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))\n",
|
368 | 367 | " @inbounds T2[ix,iy] = T_l[tx,ty] + dt*Ci[ix,iy]\n",
|
369 | 368 | " end\n",
|
370 | 369 | " return\n",
|
371 | 370 | "end\n",
|
372 | 371 | "\n",
|
373 |
| - "t_it = @belapsed begin @cuda blocks=$blocks threads=$threads shmem=prod($threads.+2)*sizeof(Float64) update_temperature!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); synchronize() end\n", |
| 372 | + "t_it = @belapsed begin @cuda blocks=$blocks threads=$threads shmem=#=adjust the shared memory=# update_temperature!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); synchronize() end\n", |
374 | 373 | "T_eff = (2*1+1)*1/1e9*nx*ny*sizeof(Float64)/t_it"
|
375 | 374 | ],
|
376 | 375 | "metadata": {},
|
|
397 | 396 | "outputs": [],
|
398 | 397 | "cell_type": "code",
|
399 | 398 | "source": [
|
400 |
| - "# solution\n", |
| 399 | + "# hint\n", |
401 | 400 | "function update_temperature!(T2, T, Ci, lam, dt, _dx, _dy)\n",
|
402 | 401 | " ix = (blockIdx().x-1) * blockDim().x + threadIdx().x\n",
|
403 | 402 | " iy = (blockIdx().y-1) * blockDim().y + threadIdx().y\n",
|
|
406 | 405 | " T_l = @cuDynamicSharedMem(eltype(T), (blockDim().x+2, blockDim().y+2))\n",
|
407 | 406 | " @inbounds T_l[tx,ty] = T[ix,iy]\n",
|
408 | 407 | " if (ix>1 && ix<size(T2,1) && iy>1 && iy<size(T2,2))\n",
|
409 |
| - " @inbounds if (threadIdx().x == 1) T_l[tx-1,ty] = T[ix-1,iy] end\n", |
410 |
| - " @inbounds if (threadIdx().x == blockDim().x) T_l[tx+1,ty] = T[ix+1,iy] end\n", |
411 |
| - " @inbounds if (threadIdx().y == 1) T_l[tx,ty-1] = T[ix,iy-1] end\n", |
412 |
| - " @inbounds if (threadIdx().y == blockDim().y) T_l[tx,ty+1] = T[ix,iy+1] end\n", |
| 408 | + " @inbounds if (threadIdx().x == 1) #=read the required values to the left halo of `T_l`=# end\n", |
| 409 | + " @inbounds if (threadIdx().x == blockDim().x) #=read the required values to the right halo of `T_l`=# end\n", |
| 410 | + " @inbounds if #=read the required values to the bottom halo of `T_l`=# end\n", |
| 411 | + " @inbounds if #=read the required values to the top halo of `T_l`=# end\n", |
413 | 412 | " @inbounds T2[ix,iy] = T_l[tx,ty] + dt*Ci[ix,iy]\n",
|
414 | 413 | " end\n",
|
415 | 414 | " return\n",
|
|
443 | 442 | "outputs": [],
|
444 | 443 | "cell_type": "code",
|
445 | 444 | "source": [
|
446 |
| - "# solution\n", |
| 445 | + "# hint\n", |
447 | 446 | "function update_temperature!(T2, T, Ci, lam, dt, _dx, _dy)\n",
|
448 | 447 | " ix = (blockIdx().x-1) * blockDim().x + threadIdx().x\n",
|
449 | 448 | " iy = (blockIdx().y-1) * blockDim().y + threadIdx().y\n",
|
|
458 | 457 | " @inbounds if (threadIdx().y == blockDim().y) T_l[tx,ty+1] = T[ix,iy+1] end\n",
|
459 | 458 | " sync_threads()\n",
|
460 | 459 | " @inbounds T2[ix,iy] = T_l[tx,ty] + dt*Ci[ix,iy]*(\n",
|
461 |
| - " - ((-lam*(T_l[tx+1,ty] - T_l[tx,ty])*_dx) - (-lam*(T_l[tx,ty] - T_l[tx-1,ty])*_dx))*_dx\n", |
462 |
| - " - ((-lam*(T_l[tx,ty+1] - T_l[tx,ty])*_dy) - (-lam*(T_l[tx,ty] - T_l[tx,ty-1])*_dy))*_dy\n", |
| 460 | + " # add the computation of the derivatives\n", |
| 461 | + " # ...\n", |
463 | 462 | " )\n",
|
464 | 463 | " end\n",
|
465 | 464 | " return\n",
|
466 | 465 | "end\n",
|
467 | 466 | "\n",
|
468 |
| - "function diffusion2D_step!(T2, T, Ci, lam, dt, _dx, _dy)\n", |
469 |
| - " threads = (32, 8)\n", |
470 |
| - " blocks = (size(T2,1)÷threads[1], size(T2,2)÷threads[2])\n", |
471 |
| - " @cuda blocks=blocks threads=threads shmem=prod(threads.+2)*sizeof(Float64) update_temperature!(T2, T, Ci, lam, dt, _dx, _dy); synchronize()\n", |
472 |
| - "end\n", |
| 467 | + "diffusion2D()\n", |
473 | 468 | "\n",
|
474 |
| - "diffusion2D()" |
475 |
| - ], |
476 |
| - "metadata": {}, |
477 |
| - "execution_count": null |
478 |
| - }, |
479 |
| - { |
480 |
| - "outputs": [], |
481 |
| - "cell_type": "code", |
482 |
| - "source": [ |
483 | 469 | "t_it = @belapsed begin @cuda blocks=$blocks threads=$threads shmem=prod($threads.+2)*sizeof(Float64) update_temperature!($T2, $T, $Ci, $lam, $dt, $_dx, $_dy); synchronize() end\n",
|
484 | 470 | "T_eff = (2*1+1)*1/1e9*nx*ny*sizeof(Float64)/t_it"
|
485 | 471 | ],
|
|
513 | 499 | "outputs": [],
|
514 | 500 | "cell_type": "code",
|
515 | 501 | "source": [
|
516 |
| - "# solution\n", |
517 |
| - "T_peak = 561 # Peak memory throughput of the Tesla P100 GPU\n", |
518 |
| - "T_eff/T_peak" |
| 502 | + "# solution" |
519 | 503 | ],
|
520 | 504 | "metadata": {},
|
521 | 505 | "execution_count": null
|
|
534 | 518 | "file_extension": ".jl",
|
535 | 519 | "mimetype": "application/julia",
|
536 | 520 | "name": "julia",
|
537 |
| - "version": "1.10.3" |
| 521 | + "version": "1.10.5" |
538 | 522 | },
|
539 | 523 | "kernelspec": {
|
540 | 524 | "name": "julia-1.10",
|
541 |
| - "display_name": "Julia 1.10.3", |
| 525 | + "display_name": "Julia 1.10.5", |
542 | 526 | "language": "julia"
|
543 | 527 | }
|
544 | 528 | },
|
|
0 commit comments