45 changes: 37 additions & 8 deletions docs/cudf/source/cudf_polars/usage.md
@@ -89,18 +89,47 @@ shape: (3, 3)
cudf-polars can optionally trace execution of each node in the query plan.
To enable tracing, set the environment variable ``CUDF_POLARS_LOG_TRACES`` to a
true value ("1", "true", "y", "yes") before starting your process. This will
capture and log information about each node before and after it executes, including information on
capture and log information about each node before and after it executes, and includes
information on timing, memory usage, and the input and output dataframes. The log message
includes the following fields:

| Field Name | Type | Description |
| ---------- | ----- | ----------- |
| type | string | The name of the IR node |
| start | int | A nanosecond-precision counter indicating when this node started executing |
| stop | int | A nanosecond-precision counter indicating when this node finished executing |
| overhead | int | The overhead, in nanoseconds, added by tracing |
| `count_frames_{phase}` | int | The number of dataframes for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_DATAFRAMES=0`. |
| `frames_{phase}` | `list[dict]` | A list of dictionaries with "shape" and "size" fields, one per dataframe, for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_DATAFRAMES=0`. |
| `total_bytes_{phase}` | int | The sum of the sizes (in bytes) of the dataframes for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_DATAFRAMES=0`. |
| `rmm_current_bytes_{phase}` | int | The current number of bytes allocated by the RMM memory resource used by cudf-polars for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_MEMORY=0`. |
| `rmm_current_count_{phase}` | int | The current number of allocations made by the RMM memory resource used by cudf-polars for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_MEMORY=0`. |
| `rmm_peak_bytes_{phase}` | int | The peak number of bytes allocated by the RMM memory resource used by cudf-polars for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_MEMORY=0`. |
| `rmm_peak_count_{phase}` | int | The peak number of allocations made by the RMM memory resource used by cudf-polars for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_MEMORY=0`. |
| `rmm_total_bytes_{phase}` | int | The total number of bytes allocated by the RMM memory resource used by cudf-polars for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_MEMORY=0`. |
| `rmm_total_count_{phase}` | int | The total number of allocations made by the RMM memory resource used by cudf-polars for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_MEMORY=0`. |
| `nvml_current_bytes_{phase}` | int | The device memory usage of this process, as reported by NVML, for the input / output `phase`. This metric can be disabled by setting `CUDF_POLARS_LOG_TRACES_MEMORY=0`. |
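
The `start`, `stop`, and `overhead` counters make it straightforward to compute per-node durations after the fact. As a rough sketch (not part of cudf-polars), assuming the trace records have been captured as JSON lines, something like the following aggregates the time spent per IR node type (the file name `traces.jsonl` is purely illustrative):

```
# Hypothetical post-processing of captured trace records. The file name
# "traces.jsonl" and the JSON-lines capture setup are assumptions; see the
# structlog notes below for configuring the output format.
import json
from collections import defaultdict

durations_ns: dict[str, int] = defaultdict(int)
with open("traces.jsonl") as f:
    for line in f:
        record = json.loads(line)
        if "start" in record and "stop" in record:
            # start/stop are nanosecond-precision counters.
            durations_ns[record["type"]] += record["stop"] - record["start"]

for ir_type, total_ns in sorted(durations_ns.items(), key=lambda kv: -kv[1]):
    print(f"{ir_type}: {total_ns / 1e6:.2f} ms")
```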

Setting `CUDF_POLARS_LOG_TRACES=1` enables all the metrics. Depending on the query, the overhead
from collecting the memory or dataframe metrics can be measurable. You can disable some metrics
through additional environment variables. For example, to disable the memory-related metrics, set:

```
CUDF_POLARS_LOG_TRACES=1 CUDF_POLARS_LOG_TRACES_MEMORY=0
```

To disable both the memory and dataframe metrics, which leaves essentially just
the timing fields, set:
```
CUDF_POLARS_LOG_TRACES=1 CUDF_POLARS_LOG_TRACES_MEMORY=0 CUDF_POLARS_LOG_TRACES_DATAFRAMES=0
```

Note that tracing still needs to be enabled with `CUDF_POLARS_LOG_TRACES=1`.
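
Because these flags are read from the environment when the cudf-polars tracing module is imported, they must be set before your process starts (or at least before cudf-polars is imported). As a minimal sketch, one way to launch a script with tracing on and the memory metrics off from Python (the script name `query.py` is only a placeholder):

```
# Hypothetical launcher: run a query script with tracing enabled and the
# memory metrics disabled. Replace "query.py" with your own script.
import os
import subprocess

env = dict(os.environ)
env["CUDF_POLARS_LOG_TRACES"] = "1"
env["CUDF_POLARS_LOG_TRACES_MEMORY"] = "0"
subprocess.run(["python", "query.py"], env=env, check=True)
```

Exporting the variables in your shell before invoking Python works just as well.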

The implementation uses [structlog] to build log records. You can configure the
output using structlog's [configuration][structlog-configure] and enrich the
records with [context variables][structlog-context].
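
For example, a configuration along these lines (an illustration, not the cudf-polars default) renders each trace record as a JSON line and binds an extra context variable that is attached to every record:

```
# Hypothetical structlog setup: JSON-lines output plus a bound context variable.
import structlog

structlog.configure(
    processors=[
        structlog.contextvars.merge_contextvars,  # include bound context variables
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.JSONRenderer(),
    ]
)

# Everything bound here is added to subsequent trace records.
structlog.contextvars.bind_contextvars(query_name="daily_aggregation")
```

With tracing enabled, executing a query such as the one below then emits trace records alongside the usual result: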


```
>>> df = pl.DataFrame({"a": ["a", "a", "b"], "b": [1, 2, 3]}).lazy()
>>> df.group_by("a").agg(pl.col("b").min().alias("min"), pl.col("b").max().alias("max")).collect(engine="gpu")
@@ -120,5 +149,5 @@ shape: (2, 3)
[nvml]: https://developer.nvidia.com/management-library-nvml
[rmm-stats]: https://docs.rapids.ai/api/rmm/stable/guide/#memory-statistics-and-profiling
[structlog]: https://www.structlog.org/
[structlog-configure]: https://www.structlog.org/en/stable/configuration.html
[structlog-context]: https://www.structlog.org/en/stable/contextvars.html
62 changes: 38 additions & 24 deletions python/cudf_polars/cudf_polars/dsl/tracing.py
@@ -30,6 +30,12 @@
LOG_TRACES = _HAS_STRUCTLOG and _bool_converter(
    os.environ.get("CUDF_POLARS_LOG_TRACES", "0")
)
LOG_MEMORY = LOG_TRACES and _bool_converter(
    os.environ.get("CUDF_POLARS_LOG_TRACES_MEMORY", "1")
)
LOG_DATAFRAMES = LOG_TRACES and _bool_converter(
    os.environ.get("CUDF_POLARS_LOG_TRACES_DATAFRAMES", "1")
)

CUDF_POLARS_NVTX_DOMAIN = "cudf_polars"

@@ -84,40 +90,48 @@ def make_snapshot(

    d: dict[str, Any] = {
        "type": ir_name,
    }

    if LOG_DATAFRAMES:
        # Frame count/shape/size metrics; skipped when
        # CUDF_POLARS_LOG_TRACES_DATAFRAMES=0.
        d.update(
            {
                f"count_frames_{phase}": len(frames),
                f"frames_{phase}": [
                    {
                        "shape": frame.table.shape(),
                        "size": sum(
                            col.device_buffer_size() for col in frame.table.columns()
                        ),
                    }
                    for frame in frames
                ],
            }
        )
        d[f"total_bytes_{phase}"] = sum(x["size"] for x in d[f"frames_{phase}"])

    if LOG_MEMORY:
        # RMM allocation statistics; skipped when CUDF_POLARS_LOG_TRACES_MEMORY=0.
        stats = rmm.statistics.get_statistics()
        if stats:
            d.update(
                {
                    f"rmm_current_bytes_{phase}": stats.current_bytes,
                    f"rmm_current_count_{phase}": stats.current_count,
                    f"rmm_peak_bytes_{phase}": stats.peak_bytes,
                    f"rmm_peak_count_{phase}": stats.peak_count,
                    f"rmm_total_bytes_{phase}": stats.total_bytes,
                    f"rmm_total_count_{phase}": stats.total_count,
                }
            )

    if extra:
        d.update(extra)

    if device_handle is not None:
        # Per-process device memory usage as reported by NVML (only when a
        # device handle is available).
        processes = pynvml.nvmlDeviceGetComputeRunningProcesses(device_handle)
        for proc in processes:
            if proc.pid == pid:
                d[f"nvml_current_bytes_{phase}"] = proc.usedGpuMemory
                break

    return d

