From 52241c7da7c6460817f5bb5e461c88020a74d7b8 Mon Sep 17 00:00:00 2001 From: Luke Shaw Date: Fri, 30 May 2025 10:38:36 +0200 Subject: [PATCH 1/7] Make behaviour of compute consistent for slicing --- src/blosc2/lazyexpr.py | 29 ++++++------- tests/ndarray/test_lazyexpr_fields.py | 60 ++++++++++++++++++++++----- 2 files changed, 65 insertions(+), 24 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 6aaaa531..dbb032f7 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1378,7 +1378,7 @@ def slices_eval( # noqa: C901 for i, (c, s) in enumerate(zip(coords, chunks, strict=True)) ) # Check whether current slice_ intersects with _slice - if _slice is not None and _slice != (): + if _slice is not None and _slice != (): # can't use != when _slice is np.int # Ensure that _slice is of type slice key = ndindex.ndindex(_slice).expand(shape).raw _slice = tuple(k if isinstance(k, slice) else slice(k, k + 1, None) for k in key) @@ -1508,19 +1508,7 @@ def slices_eval( # noqa: C901 else: raise ValueError("The where condition must be a tuple with one or two elements") - if orig_slice is not None: - if isinstance(out, np.ndarray): - out = out[orig_slice] - if _order is not None: - indices_ = indices_[orig_slice] - elif isinstance(out, blosc2.NDArray): - # It *seems* better to choose an automatic chunks and blocks for the output array - # out = out.slice(orig_slice, chunks=out.chunks, blocks=out.blocks) - out = out.slice(orig_slice) - else: - raise ValueError("The output array is not a NumPy array or a NDArray") - - if where is not None and len(where) < 2: + if where is not None and len(where) < 2: # Don't need to take orig_slice since filled up from 0 index if _order is not None: # argsort the result following _order new_order = np.argsort(out[:lenout]) @@ -1532,6 +1520,19 @@ def slices_eval( # noqa: C901 else: out.resize((lenout,)) + else: # Need to take orig_slice since filled up array according to slice_ for each chunk + if orig_slice is not None: + if isinstance(out, np.ndarray): + out = out[orig_slice] + if _order is not None: + indices_ = indices_[orig_slice] + elif isinstance(out, blosc2.NDArray): + # It *seems* better to choose an automatic chunks and blocks for the output array + # out = out.slice(orig_slice, chunks=out.chunks, blocks=out.blocks) + out = out.slice(orig_slice) + else: + raise ValueError("The output array is not a NumPy array or a NDArray") + return out diff --git a/tests/ndarray/test_lazyexpr_fields.py b/tests/ndarray/test_lazyexpr_fields.py index 43c1ade5..b9a9b51a 100644 --- a/tests/ndarray/test_lazyexpr_fields.py +++ b/tests/ndarray/test_lazyexpr_fields.py @@ -279,17 +279,19 @@ def test_where_one_param(array_fixture): res = np.sort(res) nres = np.sort(nres) np.testing.assert_allclose(res[:], nres) + # Test with getitem sl = slice(100) res = expr.where(a1)[sl] + nres = na1[sl][ne_evaluate("na1**2 + na2**2 > 2 * na1 * na2 + 1")[sl]] if len(a1.shape) == 1 or a1.chunks == a1.shape: # TODO: fix this, as it seems that is not working well for numexpr? if blosc2.IS_WASM: return - np.testing.assert_allclose(res, nres[sl]) + np.testing.assert_allclose(res, nres) else: # In this case, we cannot compare results, only the length - assert len(res) == len(nres[sl]) + assert len(res) == len(nres) # Test where indirectly via a condition in getitem in a NDArray @@ -330,25 +332,26 @@ def test_where_getitem(array_fixture): # Test with partial slice sl = slice(100) res = sa1[a1**2 + a2**2 > 2 * a1 * a2 + 1][sl] + nres = nsa1[sl][ne_evaluate("na1**2 + na2**2 > 2 * na1 * na2 + 1")[sl]] if len(a1.shape) == 1 or a1.chunks == a1.shape: # TODO: fix this, as it seems that is not working well for numexpr? if blosc2.IS_WASM: return - np.testing.assert_allclose(res["a"], nres[sl]["a"]) - np.testing.assert_allclose(res["b"], nres[sl]["b"]) + np.testing.assert_allclose(res["a"], nres["a"]) + np.testing.assert_allclose(res["b"], nres["b"]) else: # In this case, we cannot compare results, only the length - assert len(res["a"]) == len(nres[sl]["a"]) - assert len(res["b"]) == len(nres[sl]["b"]) + assert len(res["a"]) == len(nres["a"]) + assert len(res["b"]) == len(nres["b"]) # string version res = sa1["a**2 + b**2 > 2 * a * b + 1"][sl] if len(a1.shape) == 1 or a1.chunks == a1.shape: - np.testing.assert_allclose(res["a"], nres[sl]["a"]) - np.testing.assert_allclose(res["b"], nres[sl]["b"]) + np.testing.assert_allclose(res["a"], nres["a"]) + np.testing.assert_allclose(res["b"], nres["b"]) else: # We cannot compare the results here, other than the length - assert len(res["a"]) == len(nres[sl]["a"]) - assert len(res["b"]) == len(nres[sl]["b"]) + assert len(res["a"]) == len(nres["a"]) + assert len(res["b"]) == len(nres["b"]) # Test where indirectly via a condition in getitem in a NDField @@ -631,3 +634,40 @@ def test_col_reduction(reduce_op): ns = nreduc(nC[nC > 0]) np.testing.assert_allclose(s, ns) np.testing.assert_allclose(s2, ns) + + +def test_fields_indexing(): + N = 1000 + it = ((-x + 1, x - 2, 0.1 * x) for x in range(N)) + sa = blosc2.fromiter( + it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w" + ) + expr = sa["(A < B)"] + A = sa["A"][:] + B = sa["B"][:] + C = sa["C"][:] + temp = sa[:] + indices = A < B + idx = np.argmax(indices) + + # Returns less than 10 elements in general + sliced = expr.compute(slice(0, 10)) + gotitem = expr[:10] + np.testing.assert_array_equal(sliced[:], gotitem) + np.testing.assert_array_equal(gotitem, temp[:10][indices[:10]]) + # Actually this makes sense since one can understand this as a request to compute on a portion of operands. + # If one desires a portion of the result, one should compute the whole expression and then slice it. + # For a general slice it is quite difficult to simply stop when the desired slice has been obtained. Or + # to try to optimise chunk computation order. + + # Get first true element + sliced = expr.compute(idx) + gotitem = expr[idx] + np.testing.assert_array_equal(sliced[()], gotitem) + np.testing.assert_array_equal(gotitem, temp[idx]) + + # Should return void arrays here. + sliced = expr.compute(0) # typically gives array of zeros + gotitem = expr[0] # gives an error + np.testing.assert_array_equal(sliced[()], gotitem) + np.testing.assert_array_equal(gotitem, temp[0]) From b0ba4d1ee1087f3b973980aaf508396ebec67ee6 Mon Sep 17 00:00:00 2001 From: Luke Shaw Date: Fri, 30 May 2025 10:50:38 +0200 Subject: [PATCH 2/7] Make behaviour of compute consistent for slicing --- src/blosc2/lazyexpr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index dbb032f7..1830f054 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1378,7 +1378,7 @@ def slices_eval( # noqa: C901 for i, (c, s) in enumerate(zip(coords, chunks, strict=True)) ) # Check whether current slice_ intersects with _slice - if _slice is not None and _slice != (): # can't use != when _slice is np.int + if _slice is not None and _slice is not (): # noqa: F632 # can't use != when _slice is np.int # Ensure that _slice is of type slice key = ndindex.ndindex(_slice).expand(shape).raw _slice = tuple(k if isinstance(k, slice) else slice(k, k + 1, None) for k in key) From 4bfad238fa14caca8548d473a539c2cb572114c4 Mon Sep 17 00:00:00 2001 From: Luke Shaw Date: Fri, 30 May 2025 11:00:09 +0200 Subject: [PATCH 3/7] Fix slice checking --- src/blosc2/lazyexpr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 1830f054..108c942c 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1378,7 +1378,7 @@ def slices_eval( # noqa: C901 for i, (c, s) in enumerate(zip(coords, chunks, strict=True)) ) # Check whether current slice_ intersects with _slice - if _slice is not None and _slice is not (): # noqa: F632 # can't use != when _slice is np.int + if _slice is not None and np.any(_slice != ()): # can't use != when _slice is np.int # Ensure that _slice is of type slice key = ndindex.ndindex(_slice).expand(shape).raw _slice = tuple(k if isinstance(k, slice) else slice(k, k + 1, None) for k in key) From 159fe8218a468dae0ed5a2974b76fe74bdb1729f Mon Sep 17 00:00:00 2001 From: Luke Shaw Date: Fri, 30 May 2025 11:25:26 +0200 Subject: [PATCH 4/7] Fix slice checking --- src/blosc2/lazyexpr.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 108c942c..2bb00af6 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1378,7 +1378,8 @@ def slices_eval( # noqa: C901 for i, (c, s) in enumerate(zip(coords, chunks, strict=True)) ) # Check whether current slice_ intersects with _slice - if _slice is not None and np.any(_slice != ()): # can't use != when _slice is np.int + checker = _slice.item() if hasattr(_slice, "item") else _slice # can't use != when _slice is np.int + if checker is not None and checker != (): # Ensure that _slice is of type slice key = ndindex.ndindex(_slice).expand(shape).raw _slice = tuple(k if isinstance(k, slice) else slice(k, k + 1, None) for k in key) From 1333dae368af0bf16aaf32b87cb8663c73466ac6 Mon Sep 17 00:00:00 2001 From: Luke Shaw Date: Fri, 30 May 2025 12:00:04 +0200 Subject: [PATCH 5/7] Add documentation of behaviour of get_item and compute for LazyExprs --- examples/ndarray/lazyexpr_where_indexing.py | 42 +++++++++++++++++++++ src/blosc2/lazyexpr.py | 29 +++++++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 examples/ndarray/lazyexpr_where_indexing.py diff --git a/examples/ndarray/lazyexpr_where_indexing.py b/examples/ndarray/lazyexpr_where_indexing.py new file mode 100644 index 00000000..a39b801c --- /dev/null +++ b/examples/ndarray/lazyexpr_where_indexing.py @@ -0,0 +1,42 @@ +# Imports + +import numpy as np + +import blosc2 + +N = 1000 +it = ((-x + 1, x - 2, 0.1 * x) for x in range(N)) +sa = blosc2.fromiter( + it, dtype=[("A", "i4"), ("B", "f4"), ("C", "f8")], shape=(N,), urlpath="sa-1M.b2nd", mode="w" +) +expr = sa["(A < B)"] +A = sa["A"][:] +B = sa["B"][:] +C = sa["C"][:] +temp = sa[:] +indices = A < B +idx = np.argmax(indices) + +# One might think that expr[:10] gives the first 10 elements of the evaluated expression, but this is not the case. +# It actually computes the expression on the first 10 elements of the operands; since for some elements the condition +# is False, the result will be shorter than 10 elements. +# Returns less than 10 elements in general +sliced = expr.compute(slice(0, 10)) +gotitem = expr[:10] +np.testing.assert_array_equal(sliced[:], gotitem) +np.testing.assert_array_equal(gotitem, temp[:10][indices[:10]]) # Equivalent syntax +# Actually this makes sense since one can understand this as a request to compute on a portion of operands. +# If one desires a portion of the result, one should compute the whole expression and then slice it. + +# Get first element for which condition is true +sliced = expr.compute(idx) +gotitem = expr[idx] +# Arrays of one element +np.testing.assert_array_equal(sliced[()], gotitem) +np.testing.assert_array_equal(gotitem, temp[idx]) + +# Should return void arrays here. +sliced = expr.compute(0) +gotitem = expr[0] +np.testing.assert_array_equal(sliced[()], gotitem) +np.testing.assert_array_equal(gotitem, temp[0]) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 2bb00af6..9d2aa80b 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1829,7 +1829,8 @@ def chunked_eval( # noqa: C901 operands: dict A dictionary containing the operands for the expression. item: int, slice or sequence of slices, optional - The slice(s) to be retrieved. Note that step parameter is not honored yet. + The slice(s) of the operands to be used in computation. Note that step parameter is not honored yet. + Item is used to slice the operands PRIOR to computation. kwargs: Any, optional Additional keyword arguments supported by the :func:`empty` constructor. In addition, the following keyword arguments are supported: @@ -2688,6 +2689,21 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray: return lazy_expr def compute(self, item=None, **kwargs) -> blosc2.NDArray: + """ + Compute the expression with the given item and kwargs. + Parameters + ---------- + item: int, slice or sequence of slices, optional + The slice(s) of the operands to be used in computation. Note that step parameter is not honored yet. + Item is used to slice the operands PRIOR to computation. + kwargs + + Returns: + blosc2.NDArray or numpy.ndarray + ------- + + """ + # When NumPy ufuncs are called, the user may add an `out` parameter to kwargs if "out" in kwargs: kwargs["_output"] = kwargs.pop("out") @@ -2722,6 +2738,17 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray: return result def __getitem__(self, item): + """ + Apply LazyExpr on a slice of the oeprands. + Parameters + ---------- + item: int, slice or sequence of slices, optional + The slice(s) of the operands to be used in computation. Note that step parameter is not honored yet. + Item is used to slice the operands PRIOR to computation. + Returns: + numpy.ndarray + """ + kwargs = {"_getitem": True} return self.compute(item, **kwargs) From a6b245823c611974fc30575bf8178dca715a0d27 Mon Sep 17 00:00:00 2001 From: Luke Shaw Date: Fri, 30 May 2025 12:12:30 +0200 Subject: [PATCH 6/7] Add documentation of behaviour of get_item and compute for LazyExprs --- src/blosc2/lazyexpr.py | 33 +++++---------------------------- 1 file changed, 5 insertions(+), 28 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 9d2aa80b..416f8583 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -279,7 +279,8 @@ def compute(self, item: slice | list[slice] | None = None, **kwargs: Any) -> blo ---------- item: slice, list of slices, optional If not None, only the chunks that intersect with the slices - in items will be evaluated. + in item will be evaluated. If provided, items of the operands to be used in computation. + Important to note that item is used to slice the operands PRIOR to computation. kwargs: Any, optional Keyword arguments that are supported by the :func:`empty` constructor. @@ -328,7 +329,9 @@ def __getitem__(self, item: int | slice | Sequence[slice]) -> blosc2.NDArray: Parameters ---------- item: int, slice or sequence of slices - The slice(s) to be retrieved. Note that step parameter is not yet honored. + If provided, items of the operands to be used in computation. + Important to note that item is used to slice the operands PRIOR to computation, not to retrieve specified + slices of the evaluated result. Returns ------- @@ -2689,21 +2692,6 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray: return lazy_expr def compute(self, item=None, **kwargs) -> blosc2.NDArray: - """ - Compute the expression with the given item and kwargs. - Parameters - ---------- - item: int, slice or sequence of slices, optional - The slice(s) of the operands to be used in computation. Note that step parameter is not honored yet. - Item is used to slice the operands PRIOR to computation. - kwargs - - Returns: - blosc2.NDArray or numpy.ndarray - ------- - - """ - # When NumPy ufuncs are called, the user may add an `out` parameter to kwargs if "out" in kwargs: kwargs["_output"] = kwargs.pop("out") @@ -2738,17 +2726,6 @@ def compute(self, item=None, **kwargs) -> blosc2.NDArray: return result def __getitem__(self, item): - """ - Apply LazyExpr on a slice of the oeprands. - Parameters - ---------- - item: int, slice or sequence of slices, optional - The slice(s) of the operands to be used in computation. Note that step parameter is not honored yet. - Item is used to slice the operands PRIOR to computation. - Returns: - numpy.ndarray - """ - kwargs = {"_getitem": True} return self.compute(item, **kwargs) From 3db4ddb9a324e0a3cf79fa0bcb7c6a8e5c686b81 Mon Sep 17 00:00:00 2001 From: Luke Shaw Date: Fri, 30 May 2025 13:25:15 +0200 Subject: [PATCH 7/7] Improve clarity of coments. --- src/blosc2/lazyexpr.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 416f8583..1b6c3704 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -278,9 +278,9 @@ def compute(self, item: slice | list[slice] | None = None, **kwargs: Any) -> blo Parameters ---------- item: slice, list of slices, optional - If not None, only the chunks that intersect with the slices - in item will be evaluated. If provided, items of the operands to be used in computation. - Important to note that item is used to slice the operands PRIOR to computation. + If provided, item is used to slice the operands *prior* to computation; not to retrieve specified slices of + the evaluated result. This difference between slicing operands and slicing the final expression + is important when reductions or a where clause are used in the expression. kwargs: Any, optional Keyword arguments that are supported by the :func:`empty` constructor. @@ -329,9 +329,9 @@ def __getitem__(self, item: int | slice | Sequence[slice]) -> blosc2.NDArray: Parameters ---------- item: int, slice or sequence of slices - If provided, items of the operands to be used in computation. - Important to note that item is used to slice the operands PRIOR to computation, not to retrieve specified - slices of the evaluated result. + If provided, item is used to slice the operands *prior* to computation; not to retrieve specified slices of + the evaluated result. This difference between slicing operands and slicing the final expression + is important when reductions or a where clause are used in the expression. Returns -------