coreylowman · coreylowman · Nov 1, 2025 · Nov 1, 2025 · Nov 1, 2025 · Nov 1, 2025
@@ -0,0 +1,74 @@
+use cudarc::{
+    driver::{CudaContext, DriverError},
+    nvrtc::Ptx,
+};
+
+/// Example entry point: loads `sin_kernel` from `./examples/sin.ptx` on device 0,
+/// prints its function attributes, then prints the launch shape suggested by the
+/// occupancy query.
+///
+/// # Errors
+/// Propagates the first [`DriverError`] returned by any of the driver calls below.
+fn main() -> Result<(), DriverError> {
+    // Callback handed to the occupancy query; it reports zero bytes for every
+    // candidate block size.
+    extern "C" fn no_dynamic_smem(_block_size: std::ffi::c_int) -> usize {
+        0
+    }
+
+    let ctx = CudaContext::new(0)?;
+
+    let device_name = ctx.name()?;
+    println!("Device: {}", device_name);
+    println!();
+
+    // Bring the PTX module in and look up the kernel to be inspected.
+    let ptx = Ptx::from_file("./examples/sin.ptx");
+    let module = ctx.load_module(ptx)?;
+    let sin_kernel = module.load_function("sin_kernel")?;
+
+    // Each attribute is fetched right before it is printed, so a failing query
+    // stops the report at that exact line.
+    println!("=== Function Attributes for 'sin_kernel' ===");
+    println!();
+
+    println!("Resource Usage:");
+    let regs = sin_kernel.num_regs()?;
+    println!("  Registers per thread:     {}", regs);
+    let static_smem = sin_kernel.shared_size_bytes()?;
+    println!("  Static shared memory:     {} bytes", static_smem);
+    let const_mem = sin_kernel.const_size_bytes()?;
+    println!("  Constant memory:          {} bytes", const_mem);
+    let local_mem = sin_kernel.local_size_bytes()?;
+    println!("  Local memory per thread:  {} bytes", local_mem);
+    println!();
+
+    println!("Limits:");
+    let thread_cap = sin_kernel.max_threads_per_block()?;
+    println!("  Max threads per block:    {}", thread_cap);
+    println!();
+
+    println!("Compilation Info:");
+    let ptx_ver = sin_kernel.ptx_version()?;
+    let bin_ver = sin_kernel.binary_version()?;
+    // Both versions are shown as `<value / 10>.<value % 10>`.
+    let (ptx_major, ptx_minor) = (ptx_ver / 10, ptx_ver % 10);
+    println!("  PTX version:              {}.{}", ptx_major, ptx_minor);
+    let (bin_major, bin_minor) = (bin_ver / 10, bin_ver % 10);
+    println!("  Binary version:           {}.{}", bin_major, bin_minor);
+    println!();
+
+    // Ask the occupancy API for a launch configuration.
+    let suggestion = sin_kernel.occupancy_max_potential_block_size(no_dynamic_smem, 0, 0, None)?;
+    let (min_grid_size, block_size) = suggestion;
+
+    println!("=== Optimal Launch Configuration (sin_kernel) ===");
+    println!("  Suggested block size:     {}", block_size);
+    println!("  Min grid size:            {}", min_grid_size);
+    let total_threads = min_grid_size * block_size;
+    println!("  Total threads per grid:   {}", total_threads);
+
+    Ok(())
+}
@@ -180,6 +180,24 @@ pub mod device {
 
 pub mod function {
     use super::sys::{self, CUfunc_cache_enum, CUfunction_attribute_enum};
+    use std::mem::MaybeUninit;
+
+    /// Queries a single integer attribute of a CUDA function.
+    ///
+    /// Thin wrapper over `cuFuncGetAttribute`; a failing driver status is returned
+    /// as a [`super::DriverError`].
+    ///
+    /// See [cuda docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b)
+    ///
+    /// # Safety
+    /// Function must exist.
+    pub unsafe fn get_function_attribute(
+        f: sys::CUfunction,
+        attribute: CUfunction_attribute_enum,
+    ) -> Result<i32, super::DriverError> {
+        let mut out = MaybeUninit::<i32>::uninit();
+        // SAFETY: `out` is a live, writable slot for one integer, and the caller
+        // guarantees `f` refers to an existing function.
+        unsafe { sys::cuFuncGetAttribute(out.as_mut_ptr(), attribute, f).result() }?;
+        // SAFETY: only reached when the driver call reported success, in which
+        // case the driver is documented to have stored the value in `out`.
+        Ok(unsafe { out.assume_init() })
+    }
 
     /// Sets the specific attribute of a cuda function.
     ///

@@ -1890,6 +1890,51 @@ impl CudaFunction {
         Ok(cluster_size as u32)
     }
 
+    /// Read one attribute of this [CudaFunction] as a raw integer.
+    ///
+    /// The typed getters on this type forward to this method.
+    ///
+    /// See [CUDA docs](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b)
+    pub fn get_attribute(
+        &self,
+        attribute: CUfunction_attribute_enum,
+    ) -> Result<i32, result::DriverError> {
+        let handle = self.cu_function;
+        // SAFETY: the handle comes from `self`; presumably it stays valid for as
+        // long as this `CudaFunction` is alive (confirm against the owning module).
+        unsafe { result::function::get_function_attribute(handle, attribute) }
+    }
+
+    /// Number of registers each thread of this function uses.
+    ///
+    /// Reads `CU_FUNC_ATTRIBUTE_NUM_REGS`.
+    pub fn num_regs(&self) -> Result<i32, result::DriverError> {
+        let attribute = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_NUM_REGS;
+        self.get_attribute(attribute)
+    }
+
+    /// Bytes of statically-allocated shared memory this function uses.
+    ///
+    /// Reads `CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`.
+    pub fn shared_size_bytes(&self) -> Result<i32, result::DriverError> {
+        let attribute = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES;
+        self.get_attribute(attribute)
+    }
+
+    /// Bytes of constant memory this function uses.
+    ///
+    /// Reads `CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES`.
+    pub fn const_size_bytes(&self) -> Result<i32, result::DriverError> {
+        let attribute = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES;
+        self.get_attribute(attribute)
+    }
+
+    /// Bytes of local memory each thread of this function uses.
+    ///
+    /// Reads `CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES`.
+    pub fn local_size_bytes(&self) -> Result<i32, result::DriverError> {
+        let attribute = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES;
+        self.get_attribute(attribute)
+    }
+
+    /// Largest number of threads per block this function can be launched with.
+    ///
+    /// Reads `CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`.
+    pub fn max_threads_per_block(&self) -> Result<i32, result::DriverError> {
+        let attribute = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK;
+        self.get_attribute(attribute)
+    }
+
+    /// PTX virtual architecture version this function was compiled for.
+    ///
+    /// Reads `CU_FUNC_ATTRIBUTE_PTX_VERSION`. Per the CUDA driver documentation the
+    /// value is `major * 10 + minor` (for example `13` means PTX 1.3).
+    pub fn ptx_version(&self) -> Result<i32, result::DriverError> {
+        let attribute = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_PTX_VERSION;
+        self.get_attribute(attribute)
+    }
+
+    /// Binary architecture version this function was compiled for.
+    ///
+    /// Reads `CU_FUNC_ATTRIBUTE_BINARY_VERSION`. Per the CUDA driver documentation
+    /// the value is `major * 10 + minor`.
+    pub fn binary_version(&self) -> Result<i32, result::DriverError> {
+        let attribute = CUfunction_attribute_enum::CU_FUNC_ATTRIBUTE_BINARY_VERSION;
+        self.get_attribute(attribute)
+    }
+
     /// Set the value of a specific attribute of this [CudaFunction].
     pub fn set_attribute(
         &self,