pymc-devs · ricardoV94 · Jun 11, 2025 · Jun 11, 2025 · Feb 19, 2025 · Jun 12, 2025
diff --git a/doc/tutorial/gradients.rst b/doc/tutorial/gradients.rst
@@ -101,18 +101,20 @@ PyTensor implements the :func:`pytensor.gradient.jacobian` macro that does all
 that is needed to compute the Jacobian. The following text explains how
 to do it manually.
 
+Using Scan
+----------
+
 In order to manually compute the Jacobian of some function ``y`` with
-respect to some parameter ``x`` we need to use `scan`. What we
-do is to loop over the entries in ``y`` and compute the gradient of
+respect to some parameter ``x`` we can use `scan`.
+In this case, we loop over the entries in ``y`` and compute the gradient of
 ``y[i]`` with respect to ``x``.
 
 .. note::
 
     `scan` is a generic op in PyTensor that allows writing in a symbolic
     manner all kinds of recurrent equations. While creating
     symbolic loops (and optimizing them for performance) is a hard task,
-    effort is being done for improving the performance of `scan`. We
-    shall return to :ref:`scan<tutloop>` later in this tutorial.
+    efforts are being made to improve the performance of `scan`.
 
 >>> import pytensor
 >>> import pytensor.tensor as pt
@@ -124,9 +126,9 @@ do is to loop over the entries in ``y`` and compute the gradient of
 array([[ 8.,  0.],
        [ 0.,  8.]])
 
-What we do in this code is to generate a sequence of integers from ``0`` to
-``y.shape[0]`` using `pt.arange`. Then we loop through this sequence, and
-at each step, we compute the gradient of element ``y[i]`` with respect to
+This code generates a sequence of integers from ``0`` to
+``y.shape[0]`` using `pt.arange`. Then it loops through this sequence, and
+at each step, computes the gradient of element ``y[i]`` with respect to
 ``x``. `scan` automatically concatenates all these rows, generating a
 matrix which corresponds to the Jacobian.
 
@@ -139,6 +141,31 @@ matrix which corresponds to the Jacobian.
     ``x`` anymore, while ``y[i]`` still is.
 
 
+Using automatic vectorization
+-----------------------------
+An alternative way to build the Jacobian is to vectorize the graph that computes a single row or column of the Jacobian.
+We can use `Lop` or `Rop` (more about them below) to obtain a row or column of the Jacobian and `vectorize_graph`
+to vectorize it to the full Jacobian matrix.
+
+>>> import pytensor
+>>> import pytensor.tensor as pt
+>>> from pytensor.gradient import Lop
+>>> from pytensor.graph import vectorize_graph
+>>> x = pt.dvector('x')
+>>> y = x ** 2
+>>> row_cotangent = pt.dvector("row_cotangent")  # Helper variable, it will be replaced during vectorization
+>>> J_row = Lop(y, x, row_cotangent)
+>>> J = vectorize_graph(J_row, replace={row_cotangent: pt.eye(x.size)})
+>>> f = pytensor.function([x], J)
+>>> f([4, 4])
+array([[ 8.,  0.],
+       [ 0.,  8.]])
+
+This avoids the overhead of scan, at the cost of higher memory usage if the Jacobian expression has large intermediate operations.
+Also, not all graphs are safely vectorizable (e.g., if different rows require intermediate operations of different sizes).
+For these reasons `jacobian` uses scan by default. The behavior can be changed by setting `vectorize=True`.
+
+
 Computing the Hessian
 =====================
 

diff --git a/pytensor/gradient.py b/pytensor/gradient.py
@@ -11,7 +11,7 @@
 import pytensor
 from pytensor.compile.ops import ViewOp
 from pytensor.configdefaults import config
-from pytensor.graph import utils
+from pytensor.graph import utils, vectorize_graph
 from pytensor.graph.basic import Apply, NominalVariable, Variable
 from pytensor.graph.null_type import NullType, null_type
 from pytensor.graph.op import get_test_values
@@ -703,15 +703,15 @@ def grad(
         grad_dict[var] = g_var
 
     def handle_disconnected(var):
-        message = (
-            "grad method was asked to compute the gradient "
-            "with respect to a variable that is not part of "
-            "the computational graph of the cost, or is used "
-            f"only by a non-differentiable operator: {var}"
-        )
         if disconnected_inputs == "ignore":
-            pass
+            return
         elif disconnected_inputs == "warn":
+            message = (
+                "grad method was asked to compute the gradient "
+                "with respect to a variable that is not part of "
+                "the computational graph of the cost, or is used "
+                f"only by a non-differentiable operator: {var}"
+            )
             warnings.warn(message, stacklevel=2)
         elif disconnected_inputs == "raise":
             message = utils.get_variable_trace_string(var)
@@ -2021,13 +2021,19 @@ def __str__(self):
 Exception args: {args_msg}"""
 
 
-def jacobian(expression, wrt, consider_constant=None, disconnected_inputs="raise"):
+def jacobian(
+    expression,
+    wrt,
+    consider_constant=None,
+    disconnected_inputs="raise",
+    vectorize=False,
+):
     """
     Compute the full Jacobian, row by row.
 
     Parameters
     ----------
-    expression : Vector (1-dimensional) :class:`~pytensor.graph.basic.Variable`
+    expression : :class:`~pytensor.graph.basic.Variable`
         Values that we are differentiating (that we want the Jacobian of)
     wrt : :class:`~pytensor.graph.basic.Variable` or list of Variables
         Term[s] with respect to which we compute the Jacobian
@@ -2051,62 +2057,74 @@ def jacobian(expression, wrt, consider_constant=None, disconnected_inputs="raise
         output, then a zero variable is returned. The return value is
         of same type as `wrt`: a list/tuple or TensorVariable in all cases.
     """
+    from pytensor.tensor.basic import eye
+    from pytensor.tensor.extra_ops import broadcast_to
 
     if not isinstance(expression, Variable):
         raise TypeError("jacobian expects a Variable as `expression`")
 
-    if expression.ndim > 1:
-        raise ValueError(
-            "jacobian expects a 1 dimensional variable as `expression`."
-            " If not use flatten to make it a vector"
-        )
-
     using_list = isinstance(wrt, list)
     using_tuple = isinstance(wrt, tuple)
+    grad_kwargs = {
+        "consider_constant": consider_constant,
+        "disconnected_inputs": disconnected_inputs,
+    }
 
     if isinstance(wrt, list | tuple):
         wrt = list(wrt)
     else:
         wrt = [wrt]
 
     if all(expression.type.broadcastable):
-        # expression is just a scalar, use grad
-        return as_list_or_tuple(
-            using_list,
-            using_tuple,
-            grad(
-                expression.squeeze(),
-                wrt,
-                consider_constant=consider_constant,
-                disconnected_inputs=disconnected_inputs,
-            ),
+        jacobian_matrices = grad(expression.squeeze(), wrt, **grad_kwargs)
+
+    elif vectorize:
+        expression_flat = expression.ravel()
+        row_tangent = _float_ones_like(expression_flat).type("row_tangent")
+        jacobian_single_rows = Lop(expression.ravel(), wrt, row_tangent, **grad_kwargs)
+
+        n_rows = expression_flat.size
+        jacobian_matrices = vectorize_graph(
+            jacobian_single_rows,
+            replace={row_tangent: eye(n_rows, dtype=row_tangent.dtype)},
         )
+        if disconnected_inputs != "raise":
+            # If the input is disconnected from the cost, `vectorize_graph` has no effect on the respective jacobian
+            # We have to broadcast the zeros explicitly here
+            for i, (jacobian_single_row, jacobian_matrix) in enumerate(
+                zip(jacobian_single_rows, jacobian_matrices, strict=True)
+            ):
+                if jacobian_single_row.ndim == jacobian_matrix.ndim:
+                    jacobian_matrices[i] = broadcast_to(
+                        jacobian_matrix, shape=(n_rows, *jacobian_matrix.shape)
+                    )
 
-    def inner_function(*args):
-        idx = args[0]
-        expr = args[1]
-        rvals = []
-        for inp in args[2:]:
-            rval = grad(
-                expr[idx],
-                inp,
-                consider_constant=consider_constant,
-                disconnected_inputs=disconnected_inputs,
+    else:
+
+        def inner_function(*args):
+            idx, expr, *wrt = args
+            return grad(expr[idx], wrt, **grad_kwargs)
+
+        jacobian_matrices, updates = pytensor.scan(
+            inner_function,
+            sequences=pytensor.tensor.arange(expression.size),
+            non_sequences=[expression.ravel(), *wrt],
+            return_list=True,
+        )
+        if updates:
+            raise ValueError(
+                "The scan used to build the jacobian matrices returned a list of updates"
             )
-            rvals.append(rval)
-        return rvals
-
-    # Computing the gradients does not affect the random seeds on any random
-    # generator used n expression (because during computing gradients we are
-    # just backtracking over old values. (rp Jan 2012 - if anyone has a
-    # counter example please show me)
-    jacobs, updates = pytensor.scan(
-        inner_function,
-        sequences=pytensor.tensor.arange(expression.shape[0]),
-        non_sequences=[expression, *wrt],
-    )
-    assert not updates, "Scan has returned a list of updates; this should not happen."
-    return as_list_or_tuple(using_list, using_tuple, jacobs)
+
+    if jacobian_matrices[0].ndim < (expression.ndim + wrt[0].ndim):
+        # There was some raveling or squeezing done prior to getting the jacobians
+        # Reshape into original shapes
+        jacobian_matrices = [
+            jac_matrix.reshape((*expression.shape, *w.shape))
+            for jac_matrix, w in zip(jacobian_matrices, wrt, strict=True)
+        ]
+
+    return as_list_or_tuple(using_list, using_tuple, jacobian_matrices)
 
 
 def hessian(cost, wrt, consider_constant=None, disconnected_inputs="raise"):

diff --git a/pytensor/graph/replace.py b/pytensor/graph/replace.py
@@ -232,13 +232,13 @@ def vectorize_graph(
 def vectorize_graph(
     outputs: Sequence[Variable],
     replace: Mapping[Variable, Variable],
-) -> Sequence[Variable]: ...
+) -> list[Variable]: ...
 
 
 def vectorize_graph(
     outputs: Variable | Sequence[Variable],
     replace: Mapping[Variable, Variable],
-) -> Variable | Sequence[Variable]:
+) -> Variable | list[Variable]:
     """Vectorize outputs graph given mapping from old variables to expanded counterparts version.
 
     Expanded dimensions must be on the left. Behavior is similar to the functional `numpy.vectorize`.

diff --git a/pytensor/tensor/basic.py b/pytensor/tensor/basic.py
@@ -3081,6 +3081,10 @@ def flatten(x, ndim=1):
     else:
         dims = (-1,)
 
+    if len(dims) == _x.ndim:
+        # Nothing to ravel
+        return _x
+
     x_reshaped = _x.reshape(dims)
     shape_kept_dims = _x.type.shape[: ndim - 1]
     bcast_new_dim = builtins.all(s == 1 for s in _x.type.shape[ndim - 1 :])

diff --git a/pytensor/tensor/math.py b/pytensor/tensor/math.py
@@ -3916,23 +3916,7 @@ def logsumexp(x, axis=None, keepdims=False):
     return log(sum(exp(x), axis=axis, keepdims=keepdims))
 
 
-# Predefine all batched variations of Dot
-_inner_prod = Blockwise(
-    _dot,
-    signature="(n),(n)->()",
-)
-
-_matrix_vec_prod = Blockwise(
-    _dot,
-    signature="(m,k),(k)->(m)",
-)
-
-_vec_matrix_prod = Blockwise(
-    _dot,
-    signature="(k),(k,n)->(n)",
-)
-
-_matrix_matrix_matmul = Blockwise(
+_matmul = Blockwise(
     _dot,
     signature="(m,k),(k,n)->(m,n)",
     gufunc_spec=("numpy.matmul", 2, 1),
@@ -3988,11 +3972,11 @@ def matmul(x1: "ArrayLike", x2: "ArrayLike", dtype: Optional["DTypeLike"] = None
     if x1.type.ndim == 1 and x2.type.ndim == 1:
         out = _dot(x1, x2)
     elif x1.type.ndim == 1:
-        out = _matrix_matrix_matmul(x1[None], x2).squeeze(-2)
+        out = vecmat(x1, x2)
     elif x2.type.ndim == 1:
-        out = _matrix_matrix_matmul(x1, x2[:, None]).squeeze(-1)
+        out = matvec(x1, x2)
     else:
-        out = _matrix_matrix_matmul(x1, x2)
+        out = _matmul(x1, x2)
 
     if dtype is not None:
         out = out.astype(dtype)
@@ -4042,7 +4026,7 @@ def vecdot(
     >>> z_batch = pt.vecdot(x_batch, y_batch)  # shape (3,)
     >>> # Equivalent to numpy.vecdot(x_batch, y_batch)
     """
-    out = _inner_prod(x1, x2)
+    out = matmul(x1[..., None, :], x2[..., :, None]).squeeze((-2, -1))
 
     if dtype is not None:
         out = out.astype(dtype)
@@ -4091,7 +4075,7 @@ def matvec(
     >>> result = pt.matvec(batched_A, batched_v)  # shape (2, 3)
     >>> # Equivalent to numpy.matvec(batched_A, batched_v)
     """
-    out = _matrix_vec_prod(x1, x2)
+    out = matmul(x1, x2[..., None]).squeeze(-1)
 
     if dtype is not None:
         out = out.astype(dtype)
@@ -4129,18 +4113,18 @@ def vecmat(
     --------
     >>> import pytensor.tensor as pt
     >>> # Vector-matrix product
-    >>> v = pt.vector("v", shape=(3,))  # shape (3,)
-    >>> A = pt.matrix("A", shape=(3, 4))  # shape (3, 4)
+    >>> v = pt.vector("v", shape=(3,))
+    >>> A = pt.matrix("A", shape=(3, 4))
     >>> result = pt.vecmat(v, A)  # shape (4,)
     >>> # Equivalent to numpy.vecmat(v, A)
     >>>
     >>> # Batched vector-matrix product
-    >>> batched_v = pt.matrix("v", shape=(2, 3))  # shape (2, 3)
-    >>> batched_A = pt.tensor3("A", shape=(2, 3, 4))  # shape (2, 3, 4)
+    >>> batched_v = pt.matrix("v", shape=(2, 3))
+    >>> batched_A = pt.tensor3("A", shape=(2, 3, 4))
     >>> result = pt.vecmat(batched_v, batched_A)  # shape (2, 4)
     >>> # Equivalent to numpy.vecmat(batched_v, batched_A)
     """
-    out = _vec_matrix_prod(x1, x2)
+    out = matmul(x2.mT, x1[..., None]).squeeze(-1)
 
     if dtype is not None:
         out = out.astype(dtype)
@@ -4155,18 +4139,18 @@ def vectorize_node_dot(op, node, batched_x, batched_y):
     old_y_ndim = old_y.type.ndim
     match (old_x_ndim, old_y_ndim):
         case (1, 1):
-            batch_op = _inner_prod
+            batch_fn = vecdot
         case (2, 1):
-            batch_op = _matrix_vec_prod
+            batch_fn = matvec
         case (1, 2):
-            batch_op = _vec_matrix_prod
+            batch_fn = vecmat
         case (2, 2):
-            batch_op = _matrix_matrix_matmul
+            batch_fn = matmul
         case _:
             raise ValueError(
                 f"Core dot Op should have 1D or 2D inputs, got {old_x_ndim}D and {old_y_ndim}D."
             )
-    return batch_op(batched_x, batched_y).owner
+    return batch_fn(batched_x, batched_y).owner
 
 
 def nan_to_num(x, nan=0.0, posinf=None, neginf=None):

diff --git a/pytensor/tensor/rewriting/basic.py b/pytensor/tensor/rewriting/basic.py
@@ -31,6 +31,7 @@
 from pytensor.compile.ops import ViewOp
 from pytensor.graph import FunctionGraph
 from pytensor.graph.basic import Constant
+from pytensor.graph.op import _NoPythonOp
 from pytensor.graph.rewriting.basic import (
     NodeProcessingGraphRewriter,
     NodeRewriter,
@@ -1108,7 +1109,12 @@ def unconditional_constant_folding(fgraph, node):
         storage_map[o] = [None]
         compute_map[o] = [False]
 
-    thunk = node.op.make_thunk(node, storage_map, compute_map, no_recycling=[])
+    if isinstance(node.op, _NoPythonOp):
+        thunk = node.op.make_thunk(node, storage_map, compute_map, no_recycling=[])
+    else:
+        thunk = node.op.make_thunk(
+            node, storage_map, compute_map, no_recycling=[], impl="py"
+        )
     required = thunk()
 
     # A node whose inputs are all provided should always return successfully