Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions examples/10-function-attributes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
use cudarc::{
driver::{CudaContext, DriverError},
nvrtc::Ptx,
};

/// Example: load `sin_kernel` from a PTX file and print its function
/// attributes followed by the occupancy-suggested launch configuration.
///
/// Any driver error from a query is propagated with `?`, so output stops at
/// the first failing call.
fn main() -> Result<(), DriverError> {
    // Callback handed to the occupancy query below; it reports zero bytes of
    // dynamic shared memory regardless of the block size it is asked about.
    extern "C" fn no_dynamic_smem(_block_size: std::ffi::c_int) -> usize {
        0
    }

    let ctx = CudaContext::new(0)?;

    let device_name = ctx.name()?;
    println!("Device: {}", device_name);
    println!();

    // Load the PTX module and look up the kernel we want to inspect.
    let ptx = Ptx::from_file("./examples/sin.ptx");
    let module = ctx.load_module(ptx)?;
    let sin_kernel = module.load_function("sin_kernel")?;

    println!("=== Function Attributes for 'sin_kernel' ===");
    println!();

    // --- Resource usage -----------------------------------------------------
    println!("Resource Usage:");
    let regs = sin_kernel.num_regs()?;
    println!(" Registers per thread: {}", regs);
    let shared_bytes = sin_kernel.shared_size_bytes()?;
    println!(" Static shared memory: {} bytes", shared_bytes);
    let const_bytes = sin_kernel.const_size_bytes()?;
    println!(" Constant memory: {} bytes", const_bytes);
    let local_bytes = sin_kernel.local_size_bytes()?;
    println!(" Local memory per thread: {} bytes", local_bytes);
    println!();

    // --- Limits -------------------------------------------------------------
    println!("Limits:");
    let max_threads = sin_kernel.max_threads_per_block()?;
    println!(" Max threads per block: {}", max_threads);
    println!();

    // --- Compilation info ---------------------------------------------------
    println!("Compilation Info:");
    let ptx_ver = sin_kernel.ptx_version()?;
    let bin_ver = sin_kernel.binary_version()?;
    // Both versions are split into a tens part and a units part and shown as
    // "<tens>.<units>".
    let (ptx_major, ptx_minor) = (ptx_ver / 10, ptx_ver % 10);
    println!(" PTX version: {}.{}", ptx_major, ptx_minor);
    let (bin_major, bin_minor) = (bin_ver / 10, bin_ver % 10);
    println!(" Binary version: {}.{}", bin_major, bin_minor);
    println!();

    // --- Occupancy ----------------------------------------------------------
    // NOTE(review): the two zeros and `None` are presumably the dynamic shared
    // memory size, the block size limit and the occupancy flags; confirm
    // against the method signature.
    let (min_grid_size, block_size) =
        sin_kernel.occupancy_max_potential_block_size(no_dynamic_smem, 0, 0, None)?;

    println!("=== Optimal Launch Configuration (sin_kernel) ===");
    println!(" Suggested block size: {}", block_size);
    println!(" Min grid size: {}", min_grid_size);
    let total_threads = min_grid_size * block_size;
    println!(" Total threads per grid: {}", total_threads);

    Ok(())
}
18 changes: 18 additions & 0 deletions src/driver/result.rs
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,24 @@ pub mod device {

pub mod function {
use super::sys::{self, CUfunc_cache_enum, CUfunction_attribute_enum};
use std::mem::MaybeUninit;

/// Queries a single attribute of a CUDA function through `cuFuncGetAttribute`.
///
/// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b)
///
/// # Errors
/// Returns the [`super::DriverError`] produced when the driver call does not
/// report success.
///
/// # Safety
/// Function must exist.
pub unsafe fn get_function_attribute(
    f: sys::CUfunction,
    attribute: CUfunction_attribute_enum,
) -> Result<i32, super::DriverError> {
    let mut raw = MaybeUninit::<i32>::uninit();
    // SAFETY: `raw` provides writable storage for one `i32`; the caller
    // guarantees that `f` refers to an existing function.
    let status = unsafe { sys::cuFuncGetAttribute(raw.as_mut_ptr(), attribute, f) };
    status.result()?;
    // SAFETY: only reached when the driver reported success, which is taken to
    // mean it stored the attribute value into `raw`.
    Ok(unsafe { raw.assume_init() })
}

/// Sets the specific attribute of a cuda function.
///
Expand Down
45 changes: 45 additions & 0 deletions src/driver/safe/core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1890,6 +1890,51 @@ impl CudaFunction {
Ok(cluster_size as u32)
}

/// Query the value of one attribute of this [CudaFunction].
///
/// This is the generic entry point; it forwards the request to
/// `result::function::get_function_attribute` using this function's handle.
///
/// See [CUDA docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b)
///
/// # Errors
/// Returns a [result::DriverError] if the underlying driver query fails.
pub fn get_attribute(
    &self,
    attribute: CUfunction_attribute_enum,
) -> Result<i32, result::DriverError> {
    let handle = self.cu_function;
    // SAFETY: `handle` is the function handle stored in `self`.
    // NOTE(review): presumably it stays valid for as long as `self` is alive;
    // confirm against how `CudaFunction` is constructed.
    unsafe { result::function::get_function_attribute(handle, attribute) }
}

/// Number of registers used by each thread of this function.
///
/// Queries `CU_FUNC_ATTRIBUTE_NUM_REGS`; a failed query yields a
/// [result::DriverError].
pub fn num_regs(&self) -> Result<i32, result::DriverError> {
    let attr = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_NUM_REGS;
    self.get_attribute(attr)
}

/// Size, in bytes, of the statically-allocated shared memory of this function.
///
/// Queries `CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`; a failed query yields a
/// [result::DriverError].
pub fn shared_size_bytes(&self) -> Result<i32, result::DriverError> {
    let attr = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES;
    self.get_attribute(attr)
}

/// Size, in bytes, of the constant memory used by this function.
///
/// Queries `CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES`; a failed query yields a
/// [result::DriverError].
pub fn const_size_bytes(&self) -> Result<i32, result::DriverError> {
    let attr = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES;
    self.get_attribute(attr)
}

/// Size, in bytes, of the local memory used by each thread of this function.
///
/// Queries `CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES`; a failed query yields a
/// [result::DriverError].
pub fn local_size_bytes(&self) -> Result<i32, result::DriverError> {
    let attr = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES;
    self.get_attribute(attr)
}

/// Maximum number of threads per block supported for this function.
///
/// Queries `CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`; a failed query yields a
/// [result::DriverError].
pub fn max_threads_per_block(&self) -> Result<i32, result::DriverError> {
    let attr = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
    self.get_attribute(attr)
}

/// PTX virtual architecture version for which this function was compiled.
///
/// The value is encoded as `major * 10 + minor`, so split it with `/ 10` and
/// `% 10` to display it as `major.minor`.
///
/// Queries `CU_FUNC_ATTRIBUTE_PTX_VERSION`; a failed query yields a
/// [result::DriverError].
pub fn ptx_version(&self) -> Result<i32, result::DriverError> {
    let attr = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_PTX_VERSION;
    self.get_attribute(attr)
}

/// Binary architecture version for which this function was compiled.
///
/// The value is encoded as `major * 10 + minor`, so split it with `/ 10` and
/// `% 10` to display it as `major.minor`.
///
/// Queries `CU_FUNC_ATTRIBUTE_BINARY_VERSION`; a failed query yields a
/// [result::DriverError].
pub fn binary_version(&self) -> Result<i32, result::DriverError> {
    let attr = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_BINARY_VERSION;
    self.get_attribute(attr)
}

/// Set the value of a specific attribute of this [CudaFunction].
pub fn set_attribute(
&self,
Expand Down