Skip to content

Correct full_domain=True For from_pandas #1239

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions tiledb/dataframe_.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,11 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non
dim_min = dtype_min

if np.issubdtype(dtype, np.integer):
tile_max = np.iinfo(np.uint64).max - tile
if np.uint64(dtype_max - dtype_min) > tile_max:
tile_max = np.iinfo(dtype).max - tile

if tile_max < 0:
dim_max -= 1
elif dtype_max - dtype_min > tile_max:
dim_max = dtype_max - tile
else:
dim_min, dim_max = None, None
Expand All @@ -261,8 +264,12 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non
dim_max = np.max(values)

if np.issubdtype(dtype, np.integer) or dtype.kind == "M":
# we can't make a tile larger than the dimension range or lower than 1
tile = max(1, min(tile, np.uint64(dim_max - dim_min)))
# When full_domain=True, the tile extent is capped at the maximum value of
# the datatype (dtype_max). When full_domain=False, it is capped at the
# range of the dimension (dim_max - dim_min). In both cases the tile
# extent must be at least 1.
dim_range = np.uint64(dim_max - dim_min)
tile_max = np.uint64(dtype_max) if full_domain else dim_range
tile = max(1, min(tile, tile_max))
elif np.issubdtype(dtype, np.floating):
# this difference can be inf
with np.errstate(over="ignore"):
Expand Down Expand Up @@ -477,11 +484,18 @@ def _from_pandas(uri, dataframe, tiledb_args):
elif mode != "ingest":
raise TileDBError(f"Invalid mode specified ('{mode}')")

# TODO: disentangle the full_domain logic
full_domain = tiledb_args.get("full_domain", False)
if sparse == False and (not index_dims or "index_col" not in kwargs):

if sparse == False and (not index_dims or "index_col" not in tiledb_args):
# for dense arrays, if there aren't any columns specified to use in
# creating the dimension (via `index_dims` or the Pandas `read_csv`
# argument `index_col`), then use the full domain
full_domain = True

if full_domain is None and tiledb_args.get("nrows"):
# The Pandas `read_csv` argument `nrows` limits reading to the first n
# rows of a CSV file, so the resulting dimension should have a domain
# length of n rather than the full domain
full_domain = False

date_spec = tiledb_args.get("date_spec")
Expand Down
42 changes: 42 additions & 0 deletions tiledb/tests/test_pandas_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1476,6 +1476,48 @@ def assert_filters_eq(left, right):
with tiledb.open(uri) as A:
assert_filters_eq(getter(A), f)

@pytest.mark.parametrize(
    "dtype",
    [
        np.uint8,
        np.uint16,
        np.uint32,
        np.uint64,
        np.int8,
        np.int16,
        np.int32,
        np.int64,
    ],
)
def test_full_domain(self, dtype):
    """Check the domain and tile of an integer dimension built with full_domain=True.

    A ten-row dataframe is ingested as a sparse array indexed on column
    ``d`` (of the parametrized integer ``dtype``) with ``full_domain=True``.
    The test then asserts that:

    * the lower bound of ``d`` equals the minimum value of ``dtype``;
    * for the 8-bit dtypes the tile equals the dtype maximum and the upper
      bound is the dtype maximum minus one;
    * for every other dtype the tile is 10000 and the upper bound is the
      dtype maximum minus the tile.
    """
    array_uri = self.path("test_full_domain")

    # Draw the index column first and the attribute column second, matching
    # the order in which the random generator is consumed.
    index_values = np.random.randint(0, 10, size=10, dtype=dtype)
    attr_values = np.random.random(size=10)
    frame = pd.DataFrame({"d": index_values, "a": attr_values})

    tiledb.from_pandas(
        array_uri,
        frame,
        index_dims=["d"],
        sparse=True,
        full_domain=True,
    )

    limits = np.iinfo(dtype)
    single_byte = dtype in (np.uint8, np.int8)

    with tiledb.open(array_uri, "r") as A:
        d_dim = A.dim("d")

        assert d_dim.domain[0] == limits.min

        if single_byte:
            # 8-bit dtypes: the tile is expected to equal the dtype maximum,
            # with the domain ending one below that maximum.
            assert d_dim.tile == limits.max
            assert d_dim.domain[1] == limits.max - 1
        else:
            # Wider dtypes: the tile is expected to be 10000, with the domain
            # shortened by exactly one tile extent.
            assert d_dim.tile == 10000
            assert d_dim.domain[1] == limits.max - d_dim.tile


###############################################################################

Expand Down