|
28 | 28 | as_tensor_variable,
|
29 | 29 | cast,
|
30 | 30 | constant,
|
| 31 | + expand_dims, |
31 | 32 | get_underlying_scalar_constant_value,
|
32 | 33 | moveaxis,
|
33 | 34 | ones_like,
|
34 | 35 | register_infer_shape,
|
35 | 36 | switch,
|
36 | 37 | zeros_like,
|
37 | 38 | )
|
38 |
| -from pytensor.tensor.blockwise import Blockwise |
39 | 39 | from pytensor.tensor.elemwise import CAReduce, DimShuffle, Elemwise
|
40 | 40 | from pytensor.tensor.exceptions import NotScalarConstantError
|
41 | 41 | from pytensor.tensor.extra_ops import broadcast_arrays
|
|
45 | 45 | Sum,
|
46 | 46 | _conj,
|
47 | 47 | _dot,
|
48 |
| - _inner_prod, |
49 |
| - _matrix_matrix_matmul, |
50 |
| - _matrix_vec_prod, |
51 |
| - _vec_matrix_prod, |
| 48 | + _matmul, |
52 | 49 | add,
|
53 | 50 | digamma,
|
54 | 51 | dot,
|
@@ -196,60 +193,134 @@ def local_lift_transpose_through_dot(fgraph, node):
|
196 | 193 | return ret
|
197 | 194 |
|
198 | 195 |
|
199 |
@register_canonicalize
@register_specialize
@node_rewriter(tracks=[_matmul])
def local_batched_matmul_to_core_matmul(fgraph, node):
    """Move batch dimensions of matmul operands to core matmul.

    Batch dimensions of ``x`` that are broadcastable in ``y`` (and vice versa)
    do not participate in any pairing during the matmul, so they can be raveled
    into the respective core axis (the ``m`` axis of ``x`` or the ``n`` axis of
    ``y``), the core matmul performed on the smaller-rank operands, and the
    result reshaped/transposed back afterwards.

    Example, if x has batch dimensions that don't overlap with batch dimensions of y
    x @ y -> (x.reshape(-1, x.shape[-1]) @ y).reshape(*x.shape[:-1], y.shape[-1])

    It also works for batch dimensions of y that don't overlap with batch dimensions of x

    Parameters
    ----------
    fgraph : FunctionGraph
        The function graph being rewritten (unused directly here, required by
        the ``node_rewriter`` protocol).
    node : Apply
        A ``_matmul`` node with inputs ``x`` and ``y``.

    Returns
    -------
    list of Variable or None
        A single-element list with the rewritten output, or ``None`` when
        every batch dimension is shared (nothing to merge).
    """

    x, y = node.inputs
    batch_ndim = node.op.batch_ndim(node)

    # Batch axes of x that are broadcastable in y: safe to fold into x's core
    # (m) axis, since y contributes nothing along them.
    x_axis_to_merge = [
        i
        for i, (bcast_x, bcast_y) in enumerate(
            zip(x.type.broadcastable[:-2], y.type.broadcastable[:-2])
        )
        if bcast_y and not bcast_x
    ]

    # Symmetric case: batch axes of y that are broadcastable in x, foldable
    # into y's core (n) axis.
    y_axis_to_merge = [
        i
        for i, (bcast_x, bcast_y) in enumerate(
            zip(x.type.broadcastable[:-2], y.type.broadcastable[:-2])
        )
        if bcast_x and not bcast_y
    ]

    if not (x_axis_to_merge or y_axis_to_merge):
        return None

    # Symbolic shapes captured up-front, before x/y are rearranged below.
    x_shape = tuple(x.shape)
    y_shape = tuple(y.shape)
    # Degenerate core axes (m == 1 for x, n == 1 for y) are tracked so they
    # can be squeezed away instead of raveled, keeping the graph smaller.
    x_is_row = x.type.broadcastable[-2]
    y_is_col = y.type.broadcastable[-1]
    n_x_axis_to_merge = len(x_axis_to_merge)
    n_y_axis_to_merge = len(y_axis_to_merge)
    n_axis_to_merge = n_x_axis_to_merge + n_y_axis_to_merge

    x_stacked, y_stacked = x, y
    # Set when a reshape actually fused axes; decides whether the output must
    # be reshaped back at the end.
    dims_were_merged = False

    if n_x_axis_to_merge:
        # ravel batch dimensions of x on the core (m) axis
        x_axis_destination = tuple(range(-n_x_axis_to_merge - 2, -2))
        x_stacked = moveaxis(x, x_axis_to_merge, x_axis_destination)
        if x_is_row:
            # x was a row matrix, squeeze it to clean up the graph
            x_stacked = x_stacked.squeeze(-2)
        if n_x_axis_to_merge > 1 or not x_is_row:
            # Ravel moved batch dims together with (m) if needed
            x_stacked_shape = tuple(x_stacked.shape)
            x_stacked = x_stacked.reshape(
                (*x_stacked_shape[: batch_ndim - n_x_axis_to_merge], -1, x_shape[-1])
            )
            dims_were_merged = True

    if n_y_axis_to_merge:
        # ravel batch dimensions of y on the core (n) axis
        y_axis_destination = tuple(range(-n_y_axis_to_merge - 1, -1))
        y_stacked = moveaxis(y, y_axis_to_merge, y_axis_destination)
        if y_is_col:
            # y was a column matrix, squeeze it to clean up the graph
            y_stacked = y_stacked.squeeze(-1)
        if n_y_axis_to_merge > 1 or not y_is_col:
            # Ravel moved batch dims together with (n) if needed
            y_stacked_shape = tuple(y_stacked.shape)
            y_stacked = y_stacked.reshape(
                (*y_stacked_shape[: batch_ndim - n_y_axis_to_merge], y_shape[-2], -1)
            )
            dims_were_merged = True

    # Squeeze x_dims corresponding to merged dimensions of y
    # (those axes are broadcastable in x by construction of y_axis_to_merge).
    x_axis_to_squeeze = np.array(y_axis_to_merge)
    for i in reversed(x_axis_to_merge):
        # The corresponding dimensions of y may have shifted when we merged dimensions of x
        x_axis_to_squeeze[x_axis_to_squeeze > i] -= 1
    x_stacked = x_stacked.squeeze(tuple(x_axis_to_squeeze))

    # Same for y
    y_axis_to_squeeze = np.array(x_axis_to_merge)
    for i in reversed(y_axis_to_merge):
        y_axis_to_squeeze[y_axis_to_squeeze > i] -= 1
    y_stacked = y_stacked.squeeze(tuple(y_axis_to_squeeze))

    # Core (lower-rank) matmul on the rearranged operands.
    out_stacked = x_stacked @ y_stacked

    # Split back any merged dimensions
    if dims_were_merged:
        x_merged_shapes = [x_shape[i] for i in x_axis_to_merge]
        if not x_is_row:
            # Otherwise we handle that later with expand_dims, which is cleaner
            x_merged_shapes.append(x_shape[-2])
        y_merged_shapes = [y_shape[i] for i in y_axis_to_merge]
        if not y_is_col:
            # Otherwise we handle that later with expand_dims, which is cleaner
            y_merged_shapes.append(y_shape[-1])
        out_stacked_shape = tuple(out_stacked.shape)
        out_unstacked = out_stacked.reshape(
            (
                *out_stacked_shape[: batch_ndim - n_axis_to_merge],
                *x_merged_shapes,
                *y_merged_shapes,
            )
        )
    else:
        out_unstacked = out_stacked

    # Add back dummy row, col axis
    # We do this separately to avoid the reshape as much as we can
    if y_is_col and (n_y_axis_to_merge or dims_were_merged):
        out_unstacked = expand_dims(out_unstacked, -1)
    if x_is_row and (n_x_axis_to_merge or dims_were_merged):
        out_unstacked = expand_dims(out_unstacked, -n_y_axis_to_merge - 2)

    # Move batch axis back to their original location
    # Trailing layout at this point is (..., *x_merged, m, *y_merged, n).
    source = range(-n_axis_to_merge - 2, 0)
    destination = (*x_axis_to_merge, -2, *y_axis_to_merge, -1)
    out = moveaxis(out_unstacked, source, destination)
    return [out]
248 | 319 |
|
249 | 320 |
|
250 | 321 | @register_canonicalize
|
251 | 322 | @register_specialize
|
252 |
| -@node_rewriter([_inner_prod, _matrix_vec_prod, _vec_matrix_prod, _matrix_matrix_matmul]) |
| 323 | +@node_rewriter([_matmul]) |
253 | 324 | def local_blockwise_dot_to_mul(fgraph, node):
|
254 | 325 | """Rewrite blockwise dots that correspond to multiplication without summation.
|
255 | 326 |
|
|
0 commit comments