Skip to content

Commit 96e34d4

Browse files
authored
More mypy and docs fixes (#20224)
In the process of fixing mypy issues, I uncovered a number of documentation issues as well, which are now fixed:

- Due to how Breathe outputs names, C++ enums don't have their namespaces encoded in the symbols.
- Similarly, free functions should not have namespaces in their symbols.
- All classes and methods _should_ have namespaces in their symbols.

Additionally, now that we are properly exposing `regex_flags` as an enum and not an enum class, we need to ensure that this type of object also winds up in the docs.

Contributes to #17470

Authors:
- Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
- Lawrence Mitchell (https://github.com/wence-)
- Matthew Roeschke (https://github.com/mroeschke)

URL: #20224
1 parent 6a1a827 commit 96e34d4

40 files changed

+286
-203
lines changed

cpp/include/cudf/binaryop.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -292,6 +292,7 @@ namespace binops {
292292
/**
293293
* @brief Returns true if the binary operator is supported for the given input types.
294294
*
295+
* @ingroup transformation_binaryops
295296
* @param out The output data type
296297
* @param lhs The left-hand cudf::data_type
297298
* @param rhs The right-hand cudf::data_type

docs/cudf/source/conf.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import tempfile
2727
import warnings
2828
import xml.etree.ElementTree as ET
29-
from enum import IntEnum
29+
from enum import IntEnum, IntFlag
3030
from typing import Any
3131

3232
import cudf
@@ -684,7 +684,7 @@ def can_document_member(
684684
) -> bool:
685685
try:
686686
return issubclass(
687-
member, IntEnum
687+
member, (IntEnum, IntFlag)
688688
) and member.__module__.startswith("pylibcudf")
689689
except TypeError:
690690
return False
@@ -703,7 +703,7 @@ def add_content(self, more_content) -> None:
703703

704704
if self.object.__name__ != "Kind":
705705
self.add_line(
706-
f"See also :cpp:enum:`cudf::{self.object.__name__}`.",
706+
f"See also :cpp:enum:`{self.object.__name__}`.",
707707
source_name,
708708
)
709709
self.add_line("", source_name)

docs/cudf/source/pylibcudf/api_docs/io/index.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ I/O Functions
1818
avro
1919
csv
2020
json
21+
orc
2122
parquet
2223
parquet_metadata
2324
text
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
===
2+
ORC
3+
===
4+
5+
.. automodule:: pylibcudf.io.orc
6+
:members:

python/cudf/cudf/core/column/column.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ def set_mask(self, value) -> Self:
399399
mask = as_buffer(dbuf)
400400

401401
if mask is not None:
402-
new_mask: plc.gpumemoryview | None = plc.gpumemoryview(mask)
402+
new_mask = plc.gpumemoryview(mask)
403403
new_null_count = plc.null_mask.null_count(
404404
new_mask,
405405
0,
@@ -1346,6 +1346,7 @@ def fillna(
13461346
input_col = self.nans_to_nulls()
13471347

13481348
with acquire_spill_lock():
1349+
plc_replace: plc.replace.ReplacePolicy | plc.Scalar
13491350
if method:
13501351
plc_replace = (
13511352
plc.replace.ReplacePolicy.PRECEDING
@@ -2045,7 +2046,7 @@ def _process_for_reduction(
20452046
return _get_nan_for_dtype(self.dtype)
20462047
return col
20472048

2048-
def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
2049+
def _reduction_result_dtype(self, reduction_op: str) -> DtypeObj:
20492050
"""
20502051
Determine the correct dtype to pass to libcudf based on
20512052
the input dtype, data dtype, and specific reduction op
@@ -2353,6 +2354,7 @@ def _cast_self_and_other_for_where(
23532354
f"Type-casting from {other_col.dtype} "
23542355
f"to {self.dtype}, there could be potential data loss"
23552356
)
2357+
other_out: plc.Scalar | ColumnBase
23562358
if other_is_scalar:
23572359
other_out = pa_scalar_to_plc_scalar(
23582360
pa.scalar(other, type=cudf_dtype_to_pa_type(self.dtype))

python/cudf/cudf/core/column/string.py

Lines changed: 54 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
from __future__ import annotations
44

55
import itertools
6-
from functools import cached_property
6+
import re
7+
from collections.abc import Callable
8+
from functools import cached_property, lru_cache
79
from typing import TYPE_CHECKING, cast
810

911
import numpy as np
@@ -52,6 +54,29 @@
5254
from cudf.core.dtypes import DecimalDtype
5355

5456

57+
# For now all supported re flags have matching names in libcudf. If that ever changes
58+
# this construction will need to be updated with more explicit mapping.
59+
_FLAG_MAP = {
60+
getattr(re, flag): getattr(plc.strings.regex_flags.RegexFlags, flag)
61+
for flag in ("MULTILINE", "DOTALL")
62+
}
63+
64+
65+
@lru_cache
66+
def plc_flags_from_re_flags(
67+
flags: re.RegexFlag,
68+
) -> plc.strings.regex_flags.RegexFlags:
69+
# Convert Python re flags to pylibcudf RegexFlags
70+
plc_flags = plc.strings.regex_flags.RegexFlags(0)
71+
for re_flag, plc_flag in _FLAG_MAP.items():
72+
if flags & re_flag:
73+
plc_flags |= plc_flag
74+
flags &= ~re_flag
75+
if flags:
76+
raise ValueError(f"Unsupported re flags: {flags}")
77+
return plc_flags
78+
79+
5580
class StringColumn(ColumnBase):
5681
"""
5782
Implements operations for Columns of String type
@@ -323,7 +348,9 @@ def as_numerical_column(self, dtype: np.dtype) -> NumericalColumn:
323348
if not is_pandas_nullable_extension_dtype(dtype):
324349
result = result.fillna(False)
325350
return result._with_type_metadata(dtype) # type: ignore[return-value]
326-
elif dtype.kind in {"i", "u"}:
351+
352+
cast_func: Callable[[plc.Column, plc.DataType], plc.Column]
353+
if dtype.kind in {"i", "u"}:
327354
if not self.is_integer().all():
328355
raise ValueError(
329356
"Could not convert strings to integer "
@@ -362,7 +389,9 @@ def strptime(
362389
raise ValueError(
363390
"Cannot convert `None` value to datetime or timedelta."
364391
)
365-
elif dtype.kind == "M": # type: ignore[union-attr]
392+
393+
casting_func: Callable[[plc.Column, plc.DataType, str], plc.Column]
394+
if dtype.kind == "M": # type: ignore[union-attr]
366395
if format.endswith("%z"):
367396
raise NotImplementedError(
368397
"cuDF does not yet support timezone-aware datetimes"
@@ -587,10 +616,10 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
587616
}:
588617
if isinstance(other, pa.Scalar):
589618
other = pa_scalar_to_plc_scalar(other)
590-
lhs, rhs = (other, self) if reflect else (self, other)
619+
lhs_op, rhs_op = (other, self) if reflect else (self, other)
591620
return binaryop.binaryop(
592-
lhs=lhs,
593-
rhs=rhs,
621+
lhs=lhs_op,
622+
rhs=rhs_op,
594623
op=op,
595624
dtype=get_dtype_of_same_kind(
596625
self.dtype, np.dtype(np.bool_)
@@ -1062,7 +1091,7 @@ def _split(
10621091
self,
10631092
delimiter: plc.Scalar,
10641093
maxsplit: int,
1065-
method: Callable[[plc.Column, plc.Scalar, int], plc.Column],
1094+
method: Callable[[plc.Column, plc.Scalar, int], plc.Table],
10661095
) -> dict[int, Self]:
10671096
plc_table = method(
10681097
self.to_pylibcudf(mode="read"),
@@ -1086,7 +1115,7 @@ def rsplit(self, delimiter: plc.Scalar, maxsplit: int) -> dict[int, Self]:
10861115
def _partition(
10871116
self,
10881117
delimiter: plc.Scalar,
1089-
method: Callable[[plc.Column, plc.Scalar], plc.Column],
1118+
method: Callable[[plc.Column, plc.Scalar], plc.Table],
10901119
) -> dict[int, Self]:
10911120
plc_table = method(
10921121
self.to_pylibcudf(mode="read"),
@@ -1180,7 +1209,10 @@ def concatenate(
11801209
def extract(self, pattern: str, flags: int) -> dict[int, Self]:
11811210
plc_table = plc.strings.extract.extract(
11821211
self.to_pylibcudf(mode="read"),
1183-
plc.strings.regex_program.RegexProgram.create(pattern, flags),
1212+
plc.strings.regex_program.RegexProgram.create(
1213+
pattern,
1214+
plc_flags_from_re_flags(flags),
1215+
),
11841216
)
11851217
return dict(
11861218
enumerate(
@@ -1192,7 +1224,10 @@ def extract(self, pattern: str, flags: int) -> dict[int, Self]:
11921224
def contains_re(self, pattern: str, flags: int) -> Self:
11931225
plc_column = plc.strings.contains.contains_re(
11941226
self.to_pylibcudf(mode="read"),
1195-
plc.strings.regex_program.RegexProgram.create(pattern, flags),
1227+
plc.strings.regex_program.RegexProgram.create(
1228+
pattern,
1229+
plc_flags_from_re_flags(flags),
1230+
),
11961231
)
11971232
return type(self).from_pylibcudf(plc_column) # type: ignore[return-value]
11981233

@@ -1400,7 +1435,9 @@ def wrap(self, width: int) -> Self:
14001435
def count_re(self, pattern: str, flags: int) -> NumericalColumn:
14011436
plc_result = plc.strings.contains.count_re(
14021437
self.to_pylibcudf(mode="read"),
1403-
plc.strings.regex_program.RegexProgram.create(pattern, flags),
1438+
plc.strings.regex_program.RegexProgram.create(
1439+
pattern, plc_flags_from_re_flags(flags)
1440+
),
14041441
)
14051442
return type(self).from_pylibcudf(plc_result) # type: ignore[return-value]
14061443

@@ -1415,7 +1452,9 @@ def findall(
14151452
) -> Self:
14161453
plc_result = method(
14171454
self.to_pylibcudf(mode="read"),
1418-
plc.strings.regex_program.RegexProgram.create(pat, flags),
1455+
plc.strings.regex_program.RegexProgram.create(
1456+
pat, plc_flags_from_re_flags(flags)
1457+
),
14191458
)
14201459
return type(self).from_pylibcudf(plc_result) # type: ignore[return-value]
14211460

@@ -1464,7 +1503,9 @@ def find(
14641503
def matches_re(self, pattern: str, flags: int) -> Self:
14651504
plc_result = plc.strings.contains.matches_re(
14661505
self.to_pylibcudf(mode="read"),
1467-
plc.strings.regex_program.RegexProgram.create(pattern, flags),
1506+
plc.strings.regex_program.RegexProgram.create(
1507+
pattern, plc_flags_from_re_flags(flags)
1508+
),
14681509
)
14691510
return type(self).from_pylibcudf(plc_result) # type: ignore[return-value]
14701511

0 commit comments

Comments
 (0)