add cu_seqlens support and ensure numerical equality

zigzagcai · zigzagcai · commit d28e1b0d7b4a · 2024-03-14T15:17:16.000+08:00
diff --git a/csrc/selective_scan/selective_scan.cpp b/csrc/selective_scan/selective_scan.cpp
@@ -79,7 +79,9 @@ void set_ssm_params_fwd(SSMParamsBase &params,
                         void* delta_bias_ptr,
                         void* x_ptr,
                         bool has_z,
-                        bool delta_softplus) {
+                        bool delta_softplus,
+                        void* cu_seqlens_ptr,
+                        const int cu_seqlens_size) {
 
     // Reset the parameters
     memset(&params, 0, sizeof(params));
@@ -109,6 +111,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
     params.x_ptr = x_ptr;
     params.z_ptr = has_z ? z.data_ptr() : nullptr;
     params.out_z_ptr = has_z ? out_z.data_ptr() : nullptr;
+
+    params.cu_seqlens_ptr = cu_seqlens_ptr;
+    params.cu_seqlens_size = cu_seqlens_size;
+
     // All stride are in elements, not bytes.
     params.A_d_stride = A.stride(0);
     params.A_dstate_stride = A.stride(1);
@@ -173,15 +179,17 @@ void set_ssm_params_bwd(SSMParamsBwd &params,
                         void* ddelta_bias_ptr,
                         bool has_z,
                         bool delta_softplus,
-                        bool recompute_out_z) {
+                        bool recompute_out_z,
+                        void* cu_seqlens_ptr,
+                        const int cu_seqlens_size) {
     // Pass in "dout" instead of "out", we're not gonna use "out" unless we have z
     set_ssm_params_fwd(params, batch, dim, seqlen, dstate, n_groups, n_chunks, is_variable_B, is_variable_C,
                        u, delta, A, B, C, has_z ? out : dout,
                        has_z ? z : dout,
                        // If not recompute_out_z, pass dout instead of out_z.
                        // This won't be used by the bwd kernel
                        recompute_out_z ? out_z : dout,
-                       D_ptr, delta_bias_ptr, x_ptr, has_z, delta_softplus);
+                       D_ptr, delta_bias_ptr, x_ptr, has_z, delta_softplus, cu_seqlens_ptr, cu_seqlens_size);
     if (!recompute_out_z) { params.out_z_ptr = nullptr; }
 
     // Set the pointers and strides.
@@ -229,7 +237,8 @@ selective_scan_fwd(const at::Tensor &u, const at::Tensor &delta,
                   const c10::optional<at::Tensor> &D_,
                   const c10::optional<at::Tensor> &z_,
                   const c10::optional<at::Tensor> &delta_bias_,
-                  bool delta_softplus) {
+                  bool delta_softplus,
+                  const c10::optional<at::Tensor> &cu_seqlens_) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -319,7 +328,9 @@ selective_scan_fwd(const at::Tensor &u, const at::Tensor &delta,
                        delta_bias_.has_value() ? delta_bias_.value().data_ptr() : nullptr,
                        x.data_ptr(),
                        has_z,
-                       delta_softplus);
+                       delta_softplus,
+                       cu_seqlens_.has_value() ? cu_seqlens_.value().data_ptr() : nullptr,
+                       cu_seqlens_.has_value() ? cu_seqlens_.value().size(0) : 0);
 
     // Otherwise the kernel will be launched from cuda:0 device
     // Cast to char to avoid compiler warning about narrowing
@@ -346,7 +357,8 @@ selective_scan_bwd(const at::Tensor &u, const at::Tensor &delta,
                   const c10::optional<at::Tensor> &out_,
                   c10::optional<at::Tensor> &dz_,
                   bool delta_softplus,
-                  bool recompute_out_z) {
+                  bool recompute_out_z,
+                  const c10::optional<at::Tensor> &cu_seqlens_) {
     auto input_type = u.scalar_type();
     auto weight_type = A.scalar_type();
     TORCH_CHECK(input_type == at::ScalarType::Float || input_type == at::ScalarType::Half || input_type == at::ScalarType::BFloat16);
@@ -474,7 +486,9 @@ selective_scan_bwd(const at::Tensor &u, const at::Tensor &delta,
                        dout, du, ddelta, dA, dB, dC, dz,
                        D_.has_value() ? dD.data_ptr() : nullptr,
                        delta_bias_.has_value() ? ddelta_bias.data_ptr() : nullptr,
-                       has_z, delta_softplus, recompute_out_z);
+                       has_z, delta_softplus, recompute_out_z,
+                       cu_seqlens_.has_value() ? cu_seqlens_.value().data_ptr() : nullptr,
+                       cu_seqlens_.has_value() ? cu_seqlens_.value().size(0) : 0);
 
     // Otherwise the kernel will be launched from cuda:0 device
     // Cast to char to avoid compiler warning about narrowing
diff --git a/csrc/selective_scan/selective_scan.h b/csrc/selective_scan/selective_scan.h
@@ -33,6 +33,8 @@ struct SSMParamsBase {
 
     bool delta_softplus;
 
+    int cu_seqlens_size;
+
     index_t A_d_stride;
     index_t A_dstate_stride;
     index_t B_batch_stride;
@@ -66,6 +68,8 @@ struct SSMParamsBase {
     void *__restrict__ x_ptr;
     void *__restrict__ z_ptr;
     void *__restrict__ out_z_ptr;
+
+    void *__restrict__ cu_seqlens_ptr;
 };
 
 struct SSMParamsBwd: public SSMParamsBase {
diff --git a/csrc/selective_scan/selective_scan_bwd_kernel.cuh b/csrc/selective_scan/selective_scan_bwd_kernel.cuh
@@ -136,6 +136,7 @@ void selective_scan_bwd_kernel(SSMParamsBwd params) {
         : reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id) * (params.n_chunks) * params.dstate;
     float dD_val = 0;
     float ddelta_bias_val = 0;
+    long *cu_seqlens = reinterpret_cast<long *>(params.cu_seqlens_ptr) + batch_id * params.u_batch_stride
 
     constexpr int kChunkSize = kNThreads * kNItems;
     u += (params.n_chunks - 1) * kChunkSize;
@@ -245,7 +246,22 @@ void selective_scan_bwd_kernel(SSMParamsBwd params) {
                 #pragma unroll
                 for (int i = 0; i < kNItems; ++i) {
                     const float delta_a_exp = exp2f(delta_vals[i] * A_scaled);
+
+                    // Reset A bar for cumulative sequences (Real)
+                    int left = 1;
+                    int right = params.cu_seqlens_size - 2;
+                    while (left <= right) {
+                        if (cu_seqlens[(left + right) >> 1] == threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                            delta_a_exp = 0.f;
+                        } else if (cu_seqlens[(left + right) >> 1] < threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                            left = ((left + right) >> 1) + 1;
+                        } else {
+                            right = ((left + right) >> 1) - 1;
+                        }
+                    }
+
                     thread_data[i] = make_float2(delta_a_exp, !kIsVariableB ? delta_vals[i] * float(u_vals[i]) : delta_vals[i] * float(u_vals[i]) * B_vals[i]);
+
                     if (i == 0) {
                         smem_delta_a[threadIdx.x == 0 ? state_idx + (chunk % 2) * MAX_DSTATE : threadIdx.x + 2 * MAX_DSTATE] = delta_a_exp;
                     } else {
@@ -332,6 +348,21 @@ void selective_scan_bwd_kernel(SSMParamsBwd params) {
                 for (int i = 0; i < kNItems; ++i) {
                     // Pytorch's implementation of complex exp (which calls thrust) is very slow
                     complex_t delta_a_exp = cexp2f(delta_vals[i] * A_scaled);
+
+                    // Reset A bar for cumulative sequences (Complex)
+                    int left = 1;
+                    int right = params.cu_seqlens_size - 2;
+                    while (left <= right) {
+                        if (cu_seqlens[(left + right) >> 1] == threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                            delta_a_exp.real_ = 0.f;
+                            delta_a_exp.imag_ = 0.f;
+                        } else if (cu_seqlens[(left + right) >> 1] < threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                            left = ((left + right) >> 1) + 1;
+                        } else {
+                            right = ((left + right) >> 1) - 1;
+                        }
+                    }
+
                     weight_t B_delta_u_val = !kIsVariableB ? delta_vals[i] * float(u_vals[i]) : B_vals[i] * delta_vals[i] * float(u_vals[i]);
                     thread_data[i] = make_float4(delta_a_exp.real_, delta_a_exp.imag_, B_delta_u_val.real_, B_delta_u_val.imag_);
                     if (i == 0) {
diff --git a/csrc/selective_scan/selective_scan_fwd_kernel.cuh b/csrc/selective_scan/selective_scan_fwd_kernel.cuh
@@ -107,6 +107,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
     weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
     input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + batch_id * params.C_batch_stride + group_id * params.C_group_stride;
     scan_t *x = reinterpret_cast<scan_t *>(params.x_ptr) + (batch_id * params.dim + dim_id * kNRows) * params.n_chunks * params.dstate;
+    long *cu_seqlens = reinterpret_cast<long *>(params.cu_seqlens_ptr) + batch_id * params.u_batch_stride
 
     float D_val[kNRows] = {0};
     if (params.D_ptr != nullptr) {
@@ -215,6 +216,20 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                     if constexpr (!kIsComplex) {
                         thread_data[i] = make_float2(exp2f(delta_vals[r][i] * A_val[r]),
                                                      !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i]);
+                        
+                        // Reset A bar for cumulative sequences (Real)
+                        int left = 1;
+                        int right = params.cu_seqlens_size - 2;
+                        while (left <= right) {
+                            if (cu_seqlens[(left + right) >> 1] == threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                                thread_data[i].x = 0.f;
+                            } else if (cu_seqlens[(left + right) >> 1] < threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                                left = ((left + right) >> 1) + 1;
+                            } else {
+                                right = ((left + right) >> 1) - 1;
+                            }
+                        }
+
                         if constexpr (!Ktraits::kIsEvenLen) {  // So that the last state is correct
                             if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
                                 thread_data[i] = make_float2(1.f, 0.f);
@@ -225,6 +240,21 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
                         complex_t delta_a_exp = cexp2f(delta_vals[r][i] * A_val[r]);
                         weight_t B_delta_u_val = !kIsVariableB ? delta_u_vals[r][i] : B_vals[i] * delta_u_vals[r][i];
                         thread_data[i] = make_float4(delta_a_exp.real_, delta_a_exp.imag_, B_delta_u_val.real_, B_delta_u_val.imag_);
+
+                        // Reset A bar for cumulative sequences (Complex)
+                        int left = 1;
+                        int right = params.cu_seqlens_size - 2;
+                        while (left <= right) {
+                            if (cu_seqlens[(left + right) >> 1] == threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                                thread_data[i].x = 0.f;
+                                thread_data[i].y = 0.f;
+                            } else if (cu_seqlens[(left + right) >> 1] < threadIdx.x * kNItems + i + chunk * kChunkSize) {
+                                left = ((left + right) >> 1) + 1;
+                            } else {
+                                right = ((left + right) >> 1) - 1;
+                            }
+                        }
+
                         if constexpr (!Ktraits::kIsEvenLen) {  // So that the last state is correct
                             if (threadIdx.x * kNItems + i >= params.seqlen - chunk * kChunkSize) {
                                 thread_data[i] = make_float4(1.f, 0.f, 0.f, 0.f);
diff --git a/mamba_ssm/modules/mamba_simple.py b/mamba_ssm/modules/mamba_simple.py
@@ -10,7 +10,7 @@
 
 from einops import rearrange, repeat
 
-from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn
+from mamba_ssm.ops.selective_scan_interface import selective_scan_fn, mamba_inner_fn, selective_scan_ref
 
 try:
     from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
@@ -116,7 +116,7 @@ def __init__(
 
         self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
 
-    def forward(self, hidden_states, inference_params=None):
+    def forward(self, hidden_states, cu_seqlens=None, inference_params=None):
         """
         hidden_states: (B, L, D)
         Returns: same shape as hidden_states
@@ -157,9 +157,22 @@ def forward(self, hidden_states, inference_params=None):
                 self.D.float(),
                 delta_bias=self.dt_proj.bias.float(),
                 delta_softplus=True,
+                cu_seqlens=cu_seqlens[0] if cu_seqlens is not None else None,
             )
         else:
             x, z = xz.chunk(2, dim=1)
+
+            # (Optional Step1 for cu_seqlens): Right padding zeros at sequence boundary for con1d ops in cumulative sequences
+            if cu_seqlens is not None:
+                padded_x = x
+                count = 0
+                for idx in cu_seqlens[0][1:-1].tolist():
+                    padded_idx = idx + count*(self.d_conv - 1)
+                    padded_x = torch.cat((padded_x[:, :, :padded_idx], torch.zeros(1, x.shape[1], self.d_conv - 1, dtype=x.dtype, device=x.device), padded_x[:, :, padded_idx:]), dim=2)
+                    count = count + 1
+                x = padded_x
+                assert x.shape[2] == (self.d_conv - 1) * len(cu_seqlens[0][1:-1]) + z.shape[2]
+
             # Compute short convolution
             if conv_state is not None:
                 # If we just take x[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
@@ -175,6 +188,17 @@ def forward(self, hidden_states, inference_params=None):
                     bias=self.conv1d.bias,
                     activation=self.activation,
                 )
+            
+            # (Optional Step2 for cu_seqlens): Mask conv1d ops in cumulative sequences
+            if cu_seqlens is not None:
+                mask = []
+                for seq_len in (cu_seqlens[0][1:] - cu_seqlens[0][:-1]).tolist():
+                    mask.extend([True] * seq_len)
+                    mask.extend([False] * (self.d_conv - 1))
+                mask = mask[:-(self.d_conv - 1)]
+                assert x.shape[2] == len(mask)
+                x = x[:, :, mask]
+                assert x.shape[2] == z.shape[2]
 
             # We're careful here about the layout, to avoid extra transposes.
             # We want dt to have d as the slowest moving dimension
@@ -185,6 +209,7 @@ def forward(self, hidden_states, inference_params=None):
             dt = rearrange(dt, "d (b l) -> b d l", l=seqlen)
             B = rearrange(B, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
             C = rearrange(C, "(b l) dstate -> b dstate l", l=seqlen).contiguous()
+
             assert self.activation in ["silu", "swish"]
             y = selective_scan_fn(
                 x,
@@ -197,6 +222,7 @@ def forward(self, hidden_states, inference_params=None):
                 delta_bias=self.dt_proj.bias.float(),
                 delta_softplus=True,
                 return_last_state=ssm_state is not None,
+                cu_seqlens=cu_seqlens[0] if cu_seqlens is not None else None,
             )
             if ssm_state is not None:
                 y, last_state = y
diff --git a/mamba_ssm/ops/selective_scan_interface.py b/mamba_ssm/ops/selective_scan_interface.py