From 623eb326ce6d931a4cd1e922c2a64b2157ab1ece Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Thu, 21 Jul 2022 14:48:34 -0500 Subject: [PATCH 1/3] Correct `full_domain=True` For `from_pandas` * Previously, the `tile_max` was only correctly calculated for `np.int64`. This correctly set the `dim_max` to `dtype_max - tile` but only for `np.int64` * All other integer dtypes had `dim_max` set to `dtype_max` which resulted in a domain range that was too large * We also need to account for when the tile extent is larger than the range of the full domain. For an instance, when we have a dim dtype of `np.int8`, the default tile extent of 10000 will be much larger than the range of the full domain --- tiledb/dataframe_.py | 9 ++++-- tiledb/tests/test_pandas_dataframe.py | 43 +++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index 034299200c..90a4a7ca5c 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -249,8 +249,11 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non dim_min = dtype_min if np.issubdtype(dtype, np.integer): - tile_max = np.iinfo(np.uint64).max - tile - if np.uint64(dtype_max - dtype_min) > tile_max: + tile_max = np.iinfo(dtype).max - tile + + if tile_max < 0: + dim_max -= 1 + elif dtype_max - dtype_min > tile_max: dim_max = dtype_max - tile else: dim_min, dim_max = None, None @@ -479,7 +482,7 @@ def _from_pandas(uri, dataframe, tiledb_args): # TODO: disentangle the full_domain logic full_domain = tiledb_args.get("full_domain", False) - if sparse == False and (not index_dims or "index_col" not in kwargs): + if sparse == False and (not index_dims or "index_col" not in tiledb_args): full_domain = True if full_domain is None and tiledb_args.get("nrows"): full_domain = False diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py index d59950571f..47b6a1516a 100644 --- a/tiledb/tests/test_pandas_dataframe.py +++ b/tiledb/tests/test_pandas_dataframe.py @@ -1,3 +1,4 @@ +from curses import A_ATTRIBUTES import pytest pd = pytest.importorskip("pandas") @@ -1476,6 +1477,48 @@ def assert_filters_eq(left, right): with tiledb.open(uri) as A: assert_filters_eq(getter(A), f) + @pytest.mark.parametrize( + "dtype", + [ + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.int8, + np.int16, + np.int32, + np.int64, + ], + ) + def test_full_domain(self, dtype): + uri = self.path("test_full_domain") + + df = pd.DataFrame( + { + "d": np.random.randint(0, 10, size=10, dtype=dtype), + "a": np.random.random(size=10), + } + ) + + tiledb.from_pandas( + uri, + df, + index_dims=["d"], + sparse=True, + full_domain=True, + ) + + with tiledb.open(uri, "r") as A: + dim = A.dim("d") + iinfo = np.iinfo(dtype) + assert dim.domain[0] == iinfo.min + if dtype in (np.uint8, np.int8): + assert A.dim("d").tile == 254 + assert dim.domain[1] == iinfo.max - 1 + else: + assert A.dim("d").tile == 10000 + assert dim.domain[1] == iinfo.max - A.dim("d").tile + ############################################################################### From def094a19d1c75562413f659dd9ec0d6046516fa Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 24 Aug 2022 10:47:27 -0500 Subject: [PATCH 2/3] Add Documentation To Clarify The `full_domain` Logic --- tiledb/dataframe_.py | 9 ++++++++- tiledb/tests/test_pandas_dataframe.py | 1 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index 90a4a7ca5c..a1a3252bed 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -480,11 +480,18 @@ def _from_pandas(uri, dataframe, tiledb_args): elif mode != "ingest": raise TileDBError(f"Invalid mode specified ('{mode}')") - # TODO: disentangle the full_domain logic full_domain = tiledb_args.get("full_domain", False) + if sparse == False and (not index_dims or "index_col" not in tiledb_args): + # for dense arrays, if there aren't any columns specified to use in + # creating the dimension (via `index_dims` or the Pandas `read_csv` + # argument `index_col`), then use the full domain full_domain = True + if full_domain is None and tiledb_args.get("nrows"): + # Pandas `read_csv` argument `nrows` specifies to only read the first n + # rows of a CSV file resulting in a dimension that should have a domain + # length of n, not the full domain full_domain = False date_spec = tiledb_args.get("date_spec") diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py index 47b6a1516a..ea511a2a10 100644 --- a/tiledb/tests/test_pandas_dataframe.py +++ b/tiledb/tests/test_pandas_dataframe.py @@ -1,4 +1,3 @@ -from curses import A_ATTRIBUTES import pytest pd = pytest.importorskip("pandas") From 759b749e6f2c46d4fb13e1d9c0e9b13d6505a376 Mon Sep 17 00:00:00 2001 From: Vivian Nguyen Date: Wed, 24 Aug 2022 12:12:34 -0500 Subject: [PATCH 3/3] Correct Logic For Accounting For Signed Integers * I was mistakenly under the impression that `tiledb.Dim(domain=(-128, 126), tile=254, dtype=np.int8)` was a valid tile extent for int8. But since the max value of int8 is 128, not 256, we can't have an extent larger than that. --- tiledb/dataframe_.py | 8 ++++++-- tiledb/tests/test_pandas_dataframe.py | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py index a1a3252bed..b51b9882b3 100644 --- a/tiledb/dataframe_.py +++ b/tiledb/dataframe_.py @@ -264,8 +264,12 @@ def dim_for_column(name, values, dtype, tile, full_domain=False, dim_filters=Non dim_max = np.max(values) if np.issubdtype(dtype, np.integer) or dtype.kind == "M": - # we can't make a tile larger than the dimension range or lower than 1 - tile = max(1, min(tile, np.uint64(dim_max - dim_min))) + # when full_domain=True, the tile cannot exceed the max range of the + # datatype. when full_domain=False, the tile cannot exceed the max range + # of the dimensions. the tile extent must be at least 1. + dim_range = np.uint64(dim_max - dim_min) + tile_max = np.uint64(dtype_max) if full_domain else dim_range + tile = max(1, min(tile, tile_max)) elif np.issubdtype(dtype, np.floating): # this difference can be inf with np.errstate(over="ignore"): diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py index ea511a2a10..022334380c 100644 --- a/tiledb/tests/test_pandas_dataframe.py +++ b/tiledb/tests/test_pandas_dataframe.py @@ -1512,7 +1512,7 @@ def test_full_domain(self, dtype): iinfo = np.iinfo(dtype) assert dim.domain[0] == iinfo.min if dtype in (np.uint8, np.int8): - assert A.dim("d").tile == 254 + assert A.dim("d").tile == iinfo.max assert dim.domain[1] == iinfo.max - 1 else: assert A.dim("d").tile == 10000