Additional Support For Nullable Attributes (#1836)

nguyenv · web-flow · commit 656f54f67653 · 2023-09-28T08:41:47.000-04:00
**Background** As detailed in sc-34754, this fixes a bug found by a customer using the TileDB-SOMA Python API where a `SOMADataFrame` containing an enumerated nullable attribute was not being read back correctly. This highlights a larger deficit in the TileDB-Py codebase, in which we have [little support](https://docs.tiledb.com/main/how-to/arrays/writing-arrays/nullable-attributes) for writing nullable attributes outside of using `tiledb.from_pandas` with Pandas's `ExtensionDtype`. **Changes** - This PR supports writing PyArrow arrays and Pandas dataframes that contain nullable values (`pd.NA`, `pa.na`, `None`, etc.). - Nullable attributes are now represented in NumPy as [masked arrays](https://numpy.org/doc/stable/reference/maskedarray.html). - `PyQuery` results now also return the validity buffer. - Note that in PyArrow, the validity values represent 0 = invalid, 1 = valid, whereas in NumPy the convention is inverted: mask values represent 0 = valid, 1 = invalid. **Future Proposals** - Support writing `numpy.ma` for nullable attributes. ``` with tiledb.open(uri, "w") as A: A[:] = np.ma.array(data, mask) ``` - Support writing with built-in sequences (e.g. `list`, `tuple`). Internally, we check if the attribute `.isnullable()` and then cast using `np.ma.masked_invalid()`. ``` with tiledb.open(uri, "w") as A: A[:] = [1, 2, None, 3] ```
diff --git a/tiledb/core.cc b/tiledb/core.cc
@@ -1183,7 +1183,8 @@ class PyQuery {
     py::dict results;
     for (auto &buffer_name : buffers_order_) {
       auto bp = buffers_.at(buffer_name);
-      results[py::str(buffer_name)] = py::make_tuple(bp.data, bp.offsets);
+      results[py::str(buffer_name)] =
+          py::make_tuple(bp.data, bp.offsets, bp.validity);
     }
     return results;
   }
diff --git a/tiledb/libtiledb.pyx b/tiledb/libtiledb.pyx
@@ -1925,15 +1925,26 @@ cdef class DenseArrayImpl(Array):
         if self.view_attr:
             result = self.subarray(selection, attrs=(self.view_attr,))
             return result[self.view_attr]
-        else:
-            result = self.subarray(selection)
-            for i in range(self.schema.nattr):
-                attr = self.schema.attr(i)
-                enum_label = attr.enum_label
-                if enum_label is not None:
-                    values = self.enum(enum_label).values()
-                    result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
-            return result
+
+        result = self.subarray(selection)
+        for i in range(self.schema.nattr):
+            attr = self.schema.attr(i)
+            enum_label = attr.enum_label
+            if enum_label is not None:
+                values = self.enum(enum_label).values()
+                if attr.isnullable:
+                    data = np.array([values[idx] for idx in result[attr.name].data])
+                    result[attr.name] = np.ma.array(
+                        data, mask=~result[attr.name].mask)
+                else:
+                    result[attr.name] = np.array(
+                        [values[idx] for idx in result[attr.name]])
+            else:
+                if attr.isnullable:
+                    result[attr.name] = np.ma.array(result[attr.name].data, 
+                        mask=~result[attr.name].mask)
+
+        return result
 
     def __repr__(self):
         if self.isopen:
@@ -2182,6 +2193,10 @@ cdef class DenseArrayImpl(Array):
                     arr.shape = np.prod(output_shape)
 
                 out[name] = arr
+            
+            if self.schema.has_attr(name) and self.attr(name).isnullable:
+                out[name] = np.ma.array(out[name], mask=results[name][2].astype(bool))
+                
         return out
 
     def __setitem__(self, object selection, object val):
@@ -2272,14 +2287,34 @@ cdef class DenseArrayImpl(Array):
             # Create list of attribute names and values
             for attr_idx in range(self.schema.nattr):
                 attr = self.schema.attr(attr_idx)
-                k = attr.name
-                v = val[k]
-                attr = self.schema.attr(k)
+                name = attr.name
+                attr_val = val[name]
+
                 attributes.append(attr._internal_name)
                 # object arrays are var-len and handled later
-                if type(v) is np.ndarray and v.dtype is not np.dtype('O'):
-                    v = np.ascontiguousarray(v, dtype=attr.dtype)
-                values.append(v)
+                if type(attr_val) is np.ndarray and attr_val.dtype is not np.dtype('O'):
+                    attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
+                
+                try:
+                    if attr.isvar:
+                        # ensure that the value is array-convertible, for example: pandas.Series
+                        attr_val = np.asarray(attr_val)
+                        if attr.isnullable and name not in nullmaps:
+                            nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
+                    else:
+                        if (np.issubdtype(attr.dtype, np.string_) and not
+                            (np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
+                            raise ValueError("Cannot write a string value to non-string "
+                                            "typed attribute '{}'!".format(name))
+                        
+                        if attr.isnullable and name not in nullmaps:
+                            nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
+                            attr_val = np.nan_to_num(attr_val)
+                        attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
+                except Exception as exc:
+                    raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
+                
+                values.append(attr_val)
 
         elif np.isscalar(val):
             for i in range(self.schema.nattr):
@@ -2290,10 +2325,29 @@ cdef class DenseArrayImpl(Array):
                 values.append(A)
         elif self.schema.nattr == 1:
             attr = self.schema.attr(0)
+            name = attr.name
             attributes.append(attr._internal_name)
             # object arrays are var-len and handled later
             if type(val) is np.ndarray and val.dtype is not np.dtype('O'):
                 val = np.ascontiguousarray(val, dtype=attr.dtype)
+            try:
+                if attr.isvar:
+                    # ensure that the value is array-convertible, for example: pandas.Series
+                    val = np.asarray(val)
+                    if attr.isnullable and name not in nullmaps:
+                        nullmaps[name] = np.array([int(v is not None) for v in val], dtype=np.uint8)
+                else:
+                    if (np.issubdtype(attr.dtype, np.string_) and not
+                        (np.issubdtype(val.dtype, np.string_) or val.dtype == np.dtype('O'))):
+                        raise ValueError("Cannot write a string value to non-string "
+                                        "typed attribute '{}'!".format(name))
+                    
+                    if attr.isnullable and name not in nullmaps:
+                        nullmaps[name] = ~np.ma.fix_invalid(val).mask
+                        val = np.nan_to_num(val)
+                    val = np.ascontiguousarray(val, dtype=attr.dtype)
+            except Exception as exc:
+                raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
             values.append(val)
         elif self.view_attr is not None:
             # Support single-attribute assignment for multi-attr array
@@ -2329,9 +2383,6 @@ cdef class DenseArrayImpl(Array):
                 if not isinstance(val, np.ndarray):
                     raise TypeError(f"Expected NumPy array for attribute '{key}' "
                                     f"validity bitmap, got {type(val)}")
-                if val.dtype != np.uint8:
-                    raise TypeError(f"Expected NumPy uint8 array for attribute '{key}' "
-                                    f"validity bitmap, got {val.dtype}")
 
         _write_array(
             ctx_ptr,
@@ -2769,17 +2820,19 @@ def _setitem_impl_sparse(self: Array, selection, val, dict nullmaps):
             if attr.isvar:
                 # ensure that the value is array-convertible, for example: pandas.Series
                 attr_val = np.asarray(attr_val)
+                if attr.isnullable and name not in nullmaps:
+                    nullmaps[name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
             else:
                 if (np.issubdtype(attr.dtype, np.string_) and not
                     (np.issubdtype(attr_val.dtype, np.string_) or attr_val.dtype == np.dtype('O'))):
                     raise ValueError("Cannot write a string value to non-string "
                                      "typed attribute '{}'!".format(name))
-
+                
+                if attr.isnullable and name not in nullmaps:
+                    nullmaps[name] = ~np.ma.masked_invalid(attr_val).mask
+                    attr_val = np.nan_to_num(attr_val)
                 attr_val = np.ascontiguousarray(attr_val, dtype=attr.dtype)
 
-            if attr.isnullable and attr.name not in nullmaps:
-                nullmaps[attr.name] = np.array([int(v is not None) for v in attr_val], dtype=np.uint8)
-
         except Exception as exc:
             raise ValueError(f"NumPy array conversion check failed for attr '{name}'") from exc
 
@@ -2919,7 +2972,18 @@ cdef class SparseArrayImpl(Array):
             enum_label = attr.enum_label
             if enum_label is not None:
                 values = self.enum(enum_label).values()
-                result[attr.name] = np.array([values[idx] for idx in result[attr.name]])
+                if attr.isnullable:
+                    data = np.array([values[idx] for idx in result[attr.name].data])
+                    result[attr.name] = np.ma.array(
+                        data, mask=~result[attr.name].mask)
+                else:
+                    result[attr.name] = np.array(
+                        [values[idx] for idx in result[attr.name]])
+            else:
+                if attr.isnullable:
+                    result[attr.name] = np.ma.array(result[attr.name].data, 
+                        mask=~result[attr.name].mask)
+
         return result
 
     def query(self, attrs=None, cond=None, attr_cond=None, dims=None,
@@ -3207,6 +3271,9 @@ cdef class SparseArrayImpl(Array):
                 else:
                     arr.dtype = el_dtype
                     out[final_name] = arr
+            
+            if self.schema.has_attr(final_name) and self.attr(final_name).isnullable:
+                out[final_name] = np.ma.array(out[final_name], mask=results[name][2])
 
         return out
 
diff --git a/tiledb/tests/test_enumeration.py b/tiledb/tests/test_enumeration.py
@@ -4,7 +4,7 @@
 
 import tiledb
 
-from .common import DiskTestCase, has_pandas
+from .common import DiskTestCase, has_pandas, has_pyarrow
 
 
 class EnumerationTest(DiskTestCase):
@@ -82,47 +82,37 @@ def test_array_schema_enumeration(self):
                 assert_array_equal(A.df[:]["attr1"], A[:]["attr1"])
                 assert_array_equal(A.df[:]["attr2"], A[:]["attr2"])
 
-    def test_array_schema_enumeration_nullable(self):
-        uri = self.path("test_array_schema_enumeration")
-        dom = tiledb.Domain(tiledb.Dim(domain=(1, 8), tile=1))
-        enum1 = tiledb.Enumeration("enmr1", False, np.arange(3) * 10)
-        enum2 = tiledb.Enumeration("enmr2", False, ["a", "bb", "ccc"])
-        attr1 = tiledb.Attr("attr1", dtype=np.int32, enum_label="enmr1")
-        attr2 = tiledb.Attr("attr2", dtype=np.int32, enum_label="enmr2")
-        attr3 = tiledb.Attr("attr3", dtype=np.int32)
+    @pytest.mark.skipif(
+        not has_pyarrow() or not has_pandas(),
+        reason="pyarrow and/or pandas not installed",
+    )
+    @pytest.mark.parametrize("sparse", [True, False])
+    @pytest.mark.parametrize("pass_df", [True, False])
+    def test_array_schema_enumeration_nullable(self, sparse, pass_df):
+        import pyarrow as pa
+
+        uri = self.path("test_array_schema_enumeration_nullable")
+        enmr = tiledb.Enumeration("e", False, ["alpha", "beta", "gamma"])
+        dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
+        att = tiledb.Attr("a", dtype="int8", nullable=True, enum_label="e")
         schema = tiledb.ArraySchema(
-            domain=dom, attrs=(attr1, attr2, attr3), enums=(enum1, enum2)
+            domain=dom, attrs=[att], enums=[enmr], sparse=sparse
         )
         tiledb.Array.create(uri, schema)
 
-        data1 = np.random.randint(0, 3, 8)
-        data2 = np.random.randint(0, 3, 8)
-        data3 = np.random.randint(0, 3, 8)
-
         with tiledb.open(uri, "w") as A:
-            A[:] = {"attr1": data1, "attr2": data2, "attr3": data3}
-
-        with tiledb.open(uri, "r") as A:
-            assert A.enum("enmr1") == enum1
-            assert attr1.enum_label == "enmr1"
-            assert A.attr("attr1").enum_label == "enmr1"
+            dims = pa.array([1, 2, 3, 4, 5])
+            data = pa.array([1.0, 2.0, None, 0, 1.0])
+            if pass_df:
+                dims = dims.to_pandas()
+                data = data.to_pandas()
 
-            assert A.enum("enmr2") == enum2
-            assert attr2.enum_label == "enmr2"
-            assert A.attr("attr2").enum_label == "enmr2"
-
-            with self.assertRaises(tiledb.TileDBError) as excinfo:
-                assert A.enum("enmr3") == []
-            assert " No enumeration named 'enmr3'" in str(excinfo.value)
-            assert attr3.enum_label is None
-            assert A.attr("attr3").enum_label is None
+            if sparse:
+                A[dims] = data
+            else:
+                A[:] = data
 
-            if has_pandas():
-                assert_array_equal(A.df[:]["attr1"].cat.codes, data1)
-                assert_array_equal(A.df[:]["attr2"].cat.codes, data2)
-
-                assert_array_equal(A.df[:]["attr1"], A.multi_index[:]["attr1"])
-                assert_array_equal(A.df[:]["attr2"], A.multi_index[:]["attr2"])
-
-                assert_array_equal(A.df[:]["attr1"], A[:]["attr1"])
-                assert_array_equal(A.df[:]["attr2"], A[:]["attr2"])
+        with tiledb.open(uri, "r") as A:
+            expected_validity = [False, False, True, False, False]
+            assert_array_equal(A[:]["a"].mask, expected_validity)
+            assert_array_equal(A.df[:]["a"].isna(), expected_validity)
diff --git a/tiledb/tests/test_libtiledb.py b/tiledb/tests/test_libtiledb.py
@@ -27,6 +27,7 @@
     assert_unordered_equal,
     fx_sparse_cell_order,  # noqa: F401
     has_pandas,
+    has_pyarrow,
     rand_ascii,
     rand_ascii_bytes,
     rand_utf8,
@@ -381,6 +382,38 @@ def test_array_delete(self):
 
         assert tiledb.array_exists(uri) is False
 
+    @pytest.mark.skipif(
+        not has_pyarrow() or not has_pandas(),
+        reason="pyarrow and/or pandas not installed",
+    )
+    @pytest.mark.parametrize("sparse", [True, False])
+    @pytest.mark.parametrize("pass_df", [True, False])
+    def test_array_write_nullable(self, sparse, pass_df):
+        import pyarrow as pa
+
+        uri = self.path("test_array_write_nullable")
+        dom = tiledb.Domain(tiledb.Dim("d", domain=(1, 5), dtype="int64"))
+        att = tiledb.Attr("a", dtype="int8", nullable=True)
+        schema = tiledb.ArraySchema(domain=dom, attrs=[att], sparse=sparse)
+        tiledb.Array.create(uri, schema)
+
+        with tiledb.open(uri, "w") as A:
+            dims = pa.array([1, 2, 3, 4, 5])
+            data = pa.array([1.0, 2.0, None, 0, 1.0])
+            if pass_df:
+                dims = dims.to_pandas()
+                data = data.to_pandas()
+
+            if sparse:
+                A[dims] = data
+            else:
+                A[:] = data
+
+        with tiledb.open(uri, "r") as A:
+            expected_validity = [False, False, True, False, False]
+            assert_array_equal(A[:]["a"].mask, expected_validity)
+            assert_array_equal(A.df[:]["a"].isna(), expected_validity)
+
 
 class DenseArrayTest(DiskTestCase):
     def test_array_1d(self):

Original file line number	Diff line number	Diff line change
`@@ -1183,7 +1183,8 @@ class PyQuery {`
`1183`	`1183`	`py::dict results;`
`1184`	`1184`	`for (auto &buffer_name : buffers_order_) {`
`1185`	`1185`	`auto bp = buffers_.at(buffer_name);`
`1186`		`- results[py::str(buffer_name)] = py::make_tuple(bp.data, bp.offsets);`
	`1186`	`+ results[py::str(buffer_name)] =`
	`1187`	`+ py::make_tuple(bp.data, bp.offsets, bp.validity);`
`1187`	`1188`	`}`
`1188`	`1189`	`return results;`
`1189`	`1190`	`}`