diff --git a/examples/ndarray/lazyexpr_where_indexing.py b/examples/ndarray/lazyexpr_where_indexing.py new file mode 100644 index 00000000..a39b801c --- /dev/null +++ b/examples/ndarray/lazyexpr_where_indexing.py @@ -0,0 +1,42 @@ +# Imports + +import numpy as np + +import blosc2 + +N = 1000 +it = ((-x + 1, x - 2, 0.1 * x) for x in range(N)) +sa = blosc2.fromiter( + it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w" +) +expr = sa["(A < B)"] +A = sa["A"][:] +B = sa["B"][:] +C = sa["C"][:] +temp = sa[:] +indices = A < B +idx = np.argmax(indices) + +# One might think that expr[:10] gives the first 10 elements of the evaluated expression, but this is not the case. +# It actually computes the expression on the first 10 elements of the operands; since for some elements the condition +# is False, the result will be shorter than 10 elements. +# Returns less than 10 elements in general +sliced = expr.compute(slice(0, 10)) +gotitem = expr[:10] +np.testing.assert_array_equal(sliced[:], gotitem) +np.testing.assert_array_equal(gotitem, temp[:10][indices[:10]]) # Equivalent syntax +# Actually this makes sense since one can understand this as a request to compute on a portion of operands. +# If one desires a portion of the result, one should compute the whole expression and then slice it. + +# Get first element for which condition is true +sliced = expr.compute(idx) +gotitem = expr[idx] +# Arrays of one element +np.testing.assert_array_equal(sliced[()], gotitem) +np.testing.assert_array_equal(gotitem, temp[idx]) + +# Should return void arrays here. +sliced = expr.compute(0) +gotitem = expr[0] +np.testing.assert_array_equal(sliced[()], gotitem) +np.testing.assert_array_equal(gotitem, temp[0]) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 6aaaa531..1b6c3704 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -278,8 +278,9 @@ def compute(self, item: slice | list[slice] | None = None, **kwargs: Any) -> blo Parameters ---------- item: slice, list of slices, optional - If not None, only the chunks that intersect with the slices - in items will be evaluated. + If provided, item is used to slice the operands *prior* to computation; not to retrieve specified slices of + the evaluated result. This difference between slicing operands and slicing the final expression + is important when reductions or a where clause are used in the expression. kwargs: Any, optional Keyword arguments that are supported by the :func:`empty` constructor. @@ -328,7 +329,9 @@ def __getitem__(self, item: int | slice | Sequence[slice]) -> blosc2.NDArray: Parameters ---------- item: int, slice or sequence of slices - The slice(s) to be retrieved. Note that step parameter is not yet honored. + If provided, item is used to slice the operands *prior* to computation; not to retrieve specified slices of + the evaluated result. This difference between slicing operands and slicing the final expression + is important when reductions or a where clause are used in the expression. Returns ------- @@ -1378,7 +1381,8 @@ def slices_eval( # noqa: C901 for i, (c, s) in enumerate(zip(coords, chunks, strict=True)) ) # Check whether current slice_ intersects with _slice - if _slice is not None and _slice != (): + checker = _slice.item() if hasattr(_slice, "item") else _slice # can't use != when _slice is np.int + if checker is not None and checker != (): # Ensure that _slice is of type slice key = ndindex.ndindex(_slice).expand(shape).raw _slice = tuple(k if isinstance(k, slice) else slice(k, k + 1, None) for k in key) @@ -1508,19 +1512,7 @@ def slices_eval( # noqa: C901 else: raise ValueError("The where condition must be a tuple with one or two elements") - if orig_slice is not None: - if isinstance(out, np.ndarray): - out = out[orig_slice] - if _order is not None: - indices_ = indices_[orig_slice] - elif isinstance(out, blosc2.NDArray): - # It *seems* better to choose an automatic chunks and blocks for the output array - # out = out.slice(orig_slice, chunks=out.chunks, blocks=out.blocks) - out = out.slice(orig_slice) - else: - raise ValueError("The output array is not a NumPy array or a NDArray") - - if where is not None and len(where) < 2: + if where is not None and len(where) < 2: # Don't need to take orig_slice since filled up from 0 index if _order is not None: # argsort the result following _order new_order = np.argsort(out[:lenout]) @@ -1532,6 +1524,19 @@ def slices_eval( # noqa: C901 else: out.resize((lenout,)) + else: # Need to take orig_slice since filled up array according to slice_ for each chunk + if orig_slice is not None: + if isinstance(out, np.ndarray): + out = out[orig_slice] + if _order is not None: + indices_ = indices_[orig_slice] + elif isinstance(out, blosc2.NDArray): + # It *seems* better to choose an automatic chunks and blocks for the output array + # out = out.slice(orig_slice, chunks=out.chunks, blocks=out.blocks) + out = out.slice(orig_slice) + else: + raise ValueError("The output array is not a NumPy array or a NDArray") + return out @@ -1827,7 +1832,8 @@ def chunked_eval( # noqa: C901 operands: dict A dictionary containing the operands for the expression. item: int, slice or sequence of slices, optional - The slice(s) to be retrieved. Note that step parameter is not honored yet. + The slice(s) of the operands to be used in computation. Note that step parameter is not honored yet. + Item is used to slice the operands PRIOR to computation. kwargs: Any, optional Additional keyword arguments supported by the :func:`empty` constructor. In addition, the following keyword arguments are supported: diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py index 43c1ade5..b9a9b51a 100644 --- a/tests/ndarray/test_lazyexpr_fields.py +++ b/tests/ndarray/test_lazyexpr_fields.py @@ -279,17 +279,19 @@ def test_where_one_param(array_fixture): res = np.sort(res) nres = np.sort(nres) np.testing.assert_allclose(res[:], nres) + # Test with getitem sl = slice(100) res = expr.where(a1)[sl] + nres = na1[sl][ne_evaluate("na1**2 + na2**2 > 2 * na1 * na2 + 1")[sl]] if len(a1.shape) == 1 or a1.chunks == a1.shape: # TODO: fix this, as it seems that is not working well for numexpr? if blosc2.IS_WASM: return - np.testing.assert_allclose(res, nres[sl]) + np.testing.assert_allclose(res, nres) else: # In this case, we cannot compare results, only the length - assert len(res) == len(nres[sl]) + assert len(res) == len(nres) # Test where indirectly via a condition in getitem in a NDArray @@ -330,25 +332,26 @@ def test_where_getitem(array_fixture): # Test with partial slice sl = slice(100) res = sa1[a1**2 + a2**2 > 2 * a1 * a2 + 1][sl] + nres = nsa1[sl][ne_evaluate("na1**2 + na2**2 > 2 * na1 * na2 + 1")[sl]] if len(a1.shape) == 1 or a1.chunks == a1.shape: # TODO: fix this, as it seems that is not working well for numexpr? if blosc2.IS_WASM: return - np.testing.assert_allclose(res["a"], nres[sl]["a"]) - np.testing.assert_allclose(res["b"], nres[sl]["b"]) + np.testing.assert_allclose(res["a"], nres["a"]) + np.testing.assert_allclose(res["b"], nres["b"]) else: # In this case, we cannot compare results, only the length - assert len(res["a"]) == len(nres[sl]["a"]) - assert len(res["b"]) == len(nres[sl]["b"]) + assert len(res["a"]) == len(nres["a"]) + assert len(res["b"]) == len(nres["b"]) # string version res = sa1["a**2 + b**2 > 2 * a * b + 1"][sl] if len(a1.shape) == 1 or a1.chunks == a1.shape: - np.testing.assert_allclose(res["a"], nres[sl]["a"]) - np.testing.assert_allclose(res["b"], nres[sl]["b"]) + np.testing.assert_allclose(res["a"], nres["a"]) + np.testing.assert_allclose(res["b"], nres["b"]) else: # We cannot compare the results here, other than the length - assert len(res["a"]) == len(nres[sl]["a"]) - assert len(res["b"]) == len(nres[sl]["b"]) + assert len(res["a"]) == len(nres["a"]) + assert len(res["b"]) == len(nres["b"]) # Test where indirectly via a condition in getitem in a NDField @@ -631,3 +634,40 @@ def test_col_reduction(reduce_op): ns = nreduc(nC[nC > 0]) np.testing.assert_allclose(s, ns) np.testing.assert_allclose(s2, ns) + + +def test_fields_indexing(): + N = 1000 + it = ((-x + 1, x - 2, 0.1 * x) for x in range(N)) + sa = blosc2.fromiter( + it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w" + ) + expr = sa["(A < B)"] + A = sa["A"][:] + B = sa["B"][:] + C = sa["C"][:] + temp = sa[:] + indices = A < B + idx = np.argmax(indices) + + # Returns less than 10 elements in general + sliced = expr.compute(slice(0, 10)) + gotitem = expr[:10] + np.testing.assert_array_equal(sliced[:], gotitem) + np.testing.assert_array_equal(gotitem, temp[:10][indices[:10]]) + # Actually this makes sense since one can understand this as a request to compute on a portion of operands. + # If one desires a portion of the result, one should compute the whole expression and then slice it. + # For a general slice it is quite difficult to simply stop when the desired slice has been obtained. Or + # to try to optimise chunk computation order. + + # Get first true element + sliced = expr.compute(idx) + gotitem = expr[idx] + np.testing.assert_array_equal(sliced[()], gotitem) + np.testing.assert_array_equal(gotitem, temp[idx]) + + # Should return void arrays here. + sliced = expr.compute(0) # typically gives array of zeros + gotitem = expr[0] # gives an error + np.testing.assert_array_equal(sliced[()], gotitem) + np.testing.assert_array_equal(gotitem, temp[0])