diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index 034299200c..b51b9882b3 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -249,8 +249,11 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non dim_min = dtype_min if np.issubdtype(dtype, np.integer): - tile_max = np.iinfo(np.uint64).max - tile - if np.uint64(dtype_max - dtype_min) > tile_max: + tile_max = np.iinfo(dtype).max - tile + + if tile_max < 0: + dim_max -= 1 + elif dtype_max - dtype_min > tile_max: dim_max = dtype_max - tile else: dim_min, dim_max = None, None @@ -261,8 +264,12 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non dim_max = np.max(values) if np.issubdtype(dtype, np.integer) or dtype.kind == "M": - # we can't make a tile larger than the dimension range or lower than 1 - tile = max(1, min(tile, np.uint64(dim_max - dim_min))) + # when full_domain=True, the tile cannot exceed the max range of the + # datatype. when full_domain=False, the tile cannot exceed the max range + # of the dimensions. the tile extent must be at least 1. + dim_range = np.uint64(dim_max - dim_min) + tile_max = np.uint64(dtype_max) if full_domain else dim_range + tile = max(1, min(tile, tile_max)) elif np.issubdtype(dtype, np.floating): # this difference can be inf with np.errstate(over="ignore"): @@ -477,11 +484,18 @@ def _from_pandas(uri, dataframe, tiledb_args): elif mode != "ingest": raise TileDBError(f"Invalid mode specified ('{mode}')") - # TODO: disentangle the full_domain logic full_domain = tiledb_args.get("full_domain", False) - if sparse == False and (not index_dims or "index_col" not in kwargs): + + if sparse == False and (not index_dims or "index_col" not in tiledb_args): + # for dense arrays, if there aren't any columns specified to use in + # creating the dimension (via `index_dims` or the Pandas `read_csv` + # argument `index_col`), then use the full domain full_domain = True + if full_domain is None and tiledb_args.get("nrows"): + # Pandas `read_csv` argument `nrows` specifies to only read the first n + # rows of a CSV file resulting in a dimension that should have a domain + # length of n, not the full domain full_domain = False date_spec = tiledb_args.get("date_spec") diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py index d59950571f..022334380c 100644 --- a/tiledb/tests/test_pandas_dataframe.py +++ b/tiledb/tests/test_pandas_dataframe.py @@ -1476,6 +1476,48 @@ def assert_filters_eq(left, right): with tiledb.open(uri) as A: assert_filters_eq(getter(A), f) + @pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.int8, + np.int16, + np.int32, + np.int64, + ], + ) + def test_full_domain(self, dtype): + uri = self.path("test_full_domain") + + df = pd.DataFrame( + { + "d": np.random.randint(0, 10, size=10, dtype=dtype), + "a": np.random.random(size=10), + } + ) + + tiledb.from_pandas( + uri, + df, + index_dims=["d"], + sparse=True, + full_domain=True, + ) + + with tiledb.open(uri, "r") as A: + dim = A.dim("d") + iinfo = np.iinfo(dtype) + assert dim.domain[0] == iinfo.min + if dtype in (np.uint8, np.int8): + assert A.dim("d").tile == iinfo.max + assert dim.domain[1] == iinfo.max - 1 + else: + assert A.dim("d").tile == 10000 + assert dim.domain[1] == iinfo.max - A.dim("d").tile + ###############################################################################