From 9c637b93583ed81845e008551d6f6fd0d6ab45bf Mon Sep 17 00:00:00 2001
From: ShepelIlya <chepsiilya@gmail.com>
Date: Wed, 18 May 2022 11:56:04 +0300
Subject: [PATCH 1/5] SoA syntax, some cleaning and optimization

---
 dogm/demo/main.cpp                            |   4 +-
 .../include/mapping/kernel/measurement_grid.h |   8 +-
 .../include/mapping/laser_to_meas_grid.h      |   4 +-
 .../mapping/kernel/measurement_grid.cu        |  20 +-
 .../simulator/mapping/laser_to_meas_grid.cu   |  18 +-
 .../simulator/mapping/opengl/framebuffer.cpp  |  12 +-
 .../demo/simulator/mapping/opengl/texture.cpp |  14 +-
 dogm/demo/utils/image_creation.cpp            |  43 ++-
 dogm/include/dogm/cuda_utils.h                |  19 +-
 dogm/include/dogm/dogm.h                      |  12 +-
 dogm/include/dogm/dogm_types.h                | 280 +++++++++++++++++-
 .../dogm/kernel/ego_motion_compensation.h     |   9 +-
 dogm/include/dogm/kernel/init.h               |   6 +-
 dogm/include/dogm/kernel/init_new_particles.h |  11 +-
 dogm/include/dogm/kernel/mass_update.h        |   8 +-
 dogm/include/dogm/kernel/particle_to_grid.h   |   5 +-
 .../include/dogm/kernel/statistical_moments.h |   5 +-
 .../dogm/kernel/update_persistent_particles.h |  12 +-
 dogm/src/dogm.cu                              | 115 ++++---
 dogm/src/kernel/ego_motion_compensation.cu    |   3 +-
 dogm/src/kernel/init.cu                       |  28 +-
 dogm/src/kernel/init_new_particles.cu         |  30 +-
 dogm/src/kernel/mass_update.cu                |  47 ++-
 dogm/src/kernel/particle_to_grid.cu           |   8 +-
 dogm/src/kernel/statistical_moments.cu        |  24 +-
 .../src/kernel/update_persistent_particles.cu |  41 ++-
 26 files changed, 502 insertions(+), 284 deletions(-)

diff --git a/dogm/demo/main.cpp b/dogm/demo/main.cpp
index 2be2c89..41ddcf7 100644
--- a/dogm/demo/main.cpp
+++ b/dogm/demo/main.cpp
@@ -80,14 +80,14 @@ int main(int argc, const char** argv)
 
     for (int step = 0; step < num_simulation_steps; ++step)
     {
-        dogm::MeasurementCell* meas_grid = grid_generator.generateGrid(sim_data[step].measurements);
+        dogm::MeasurementCellsSoA meas_grid = grid_generator.generateGrid(sim_data[step].measurements);
 
         const auto update_grid_caller = [&grid_map](auto&&... args) {
             grid_map.updateGrid(std::forward<decltype(args)>(args)...);
         };
 
         cycle_timer.timeFunctionCall(true, update_grid_caller, meas_grid, sim_data[step].ego_pose.x,
-                                     sim_data[step].ego_pose.y, 0.0f, simulation_step_period, true);
+                                     sim_data[step].ego_pose.y, simulation_step_period, true);
 
         const auto cells_with_velocity =
             computeCellsWithVelocity(grid_map, minimum_occupancy_threshold, minimum_velocity_threshold);
diff --git a/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h b/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h
index 49eb32d..90ff1ab 100644
--- a/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h
+++ b/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h
@@ -6,10 +6,6 @@
 
 #include <device_launch_parameters.h>
 
-namespace dogm
-{
-struct MeasurementCell;
-}
 
 __global__ void createPolarGridTextureKernel(cudaSurfaceObject_t polar, const float* __restrict__ measurements,
                                              int width, int height, float resolution);
@@ -17,8 +13,8 @@ __global__ void createPolarGridTextureKernel(cudaSurfaceObject_t polar, const fl
 __global__ void fusePolarGridTextureKernel(cudaSurfaceObject_t polar, const float* __restrict__ measurements, int width,
                                            int height, float resolution);
 
-__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid,
+__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid,
                                                      cudaSurfaceObject_t cart, int grid_size);
 
-__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid,
+__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid,
                                                  const float2* __restrict__ grid, int grid_size);
diff --git a/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h b/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h
index 0d210e5..7c74fea 100644
--- a/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h
+++ b/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h
@@ -23,10 +23,10 @@ class LaserMeasurementGrid
     LaserMeasurementGrid(const Params& params, float grid_length, float resolution);
     ~LaserMeasurementGrid();
 
-    dogm::MeasurementCell* generateGrid(const std::vector<float>& measurements);
+    dogm::MeasurementCellsSoA generateGrid(const std::vector<float>& measurements);
 
 private:
-    dogm::MeasurementCell* meas_grid;
+    dogm::MeasurementCellsSoA meas_grid;
     int grid_size;
 
     Params params;
diff --git a/dogm/demo/simulator/mapping/kernel/measurement_grid.cu b/dogm/demo/simulator/mapping/kernel/measurement_grid.cu
index 4337ba9..13717ef 100644
--- a/dogm/demo/simulator/mapping/kernel/measurement_grid.cu
+++ b/dogm/demo/simulator/mapping/kernel/measurement_grid.cu
@@ -112,7 +112,7 @@ __global__ void fusePolarGridTextureKernel(cudaSurfaceObject_t polar, const floa
     }
 }
 
-__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid,
+__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid,
                                                      cudaSurfaceObject_t cart, int grid_size)
 {
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -123,15 +123,15 @@ __global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCell* __re
     {
         float4 color = surf2Dread<float4>(cart, x * sizeof(float4), y);
 
-        meas_grid[index].occ_mass = color.x;
-        meas_grid[index].free_mass = color.y;
+        meas_grid.occ_mass[index] = color.x;
+        meas_grid.free_mass[index] = color.y;
 
-        meas_grid[index].likelihood = 1.0f;
-        meas_grid[index].p_A = 1.0f;
+        meas_grid.likelihood[index] = 1.0f;
+        meas_grid.p_A[index] = 1.0f;
     }
 }
 
-__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid,
+__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid,
                                                  const float2* __restrict__ grid, int grid_size)
 {
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
@@ -142,10 +142,10 @@ __global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCell* __restri
     {
         float2 masses = grid[index];
 
-        meas_grid[index].occ_mass = masses.x;
-        meas_grid[index].free_mass = masses.y;
+        meas_grid.occ_mass[index] = masses.x;
+        meas_grid.free_mass[index] = masses.y;
 
-        meas_grid[index].likelihood = 1.0f;
-        meas_grid[index].p_A = 1.0f;
+        meas_grid.likelihood[index] = 1.0f;
+        meas_grid.p_A[index] = 1.0f;
     }
 }
diff --git a/dogm/demo/simulator/mapping/laser_to_meas_grid.cu b/dogm/demo/simulator/mapping/laser_to_meas_grid.cu
index 035afd6..0c1ecb4 100644
--- a/dogm/demo/simulator/mapping/laser_to_meas_grid.cu
+++ b/dogm/demo/simulator/mapping/laser_to_meas_grid.cu
@@ -12,23 +12,23 @@ LaserMeasurementGrid::LaserMeasurementGrid(const Params& params, float grid_leng
 {
     int grid_cell_count = grid_size * grid_size;
 
-    CHECK_ERROR(cudaMalloc(&meas_grid, grid_cell_count * sizeof(dogm::MeasurementCell)));
+    meas_grid.init(grid_cell_count, true);
 
     renderer = std::make_unique<Renderer>(grid_size, params.fov, grid_length, params.max_range);
 }
 
 LaserMeasurementGrid::~LaserMeasurementGrid()
 {
-    CHECK_ERROR(cudaFree(meas_grid));
+    meas_grid.free();
 }
 
-dogm::MeasurementCell* LaserMeasurementGrid::generateGrid(const std::vector<float>& measurements)
+dogm::MeasurementCellsSoA LaserMeasurementGrid::generateGrid(const std::vector<float>& measurements)
 {
     const int num_measurements = measurements.size();
 
     float* d_measurements;
-    CHECK_ERROR(cudaMalloc(&d_measurements, num_measurements * sizeof(float)));
-    CHECK_ERROR(
+    CUDA_CALL(cudaMalloc(&d_measurements, num_measurements * sizeof(float)));
+    CUDA_CALL(
         cudaMemcpy(d_measurements, measurements.data(), num_measurements * sizeof(float), cudaMemcpyHostToDevice));
 
     const int polar_width = num_measurements;
@@ -47,7 +47,7 @@ dogm::MeasurementCell* LaserMeasurementGrid::generateGrid(const std::vector<floa
     createPolarGridTextureKernel<<<grid_dim, dim_block>>>(polar_surface, d_measurements, polar_width, polar_height,
                                                           params.resolution);
 
-    CHECK_ERROR(cudaGetLastError());
+    CUDA_CALL(cudaGetLastError());
     polar_texture.endCudaAccess(polar_surface);
 
     // render cartesian image to texture using polar texture
@@ -60,11 +60,11 @@ dogm::MeasurementCell* LaserMeasurementGrid::generateGrid(const std::vector<floa
     // transform RGBA texture to measurement grid
     cartesianGridToMeasurementGridKernel<<<cart_grid_dim, dim_block>>>(meas_grid, cartesian_surface, grid_size);
 
-    CHECK_ERROR(cudaGetLastError());
+    CUDA_CALL(cudaGetLastError());
     framebuffer->endCudaAccess(cartesian_surface);
 
-    CHECK_ERROR(cudaFree(d_measurements));
-    CHECK_ERROR(cudaDeviceSynchronize());
+    CUDA_CALL(cudaFree(d_measurements));
+    CUDA_CALL(cudaDeviceSynchronize());
 
     return meas_grid;
 }
diff --git a/dogm/demo/simulator/mapping/opengl/framebuffer.cpp b/dogm/demo/simulator/mapping/opengl/framebuffer.cpp
index 8c67d33..4c7144e 100644
--- a/dogm/demo/simulator/mapping/opengl/framebuffer.cpp
+++ b/dogm/demo/simulator/mapping/opengl/framebuffer.cpp
@@ -23,7 +23,7 @@ Framebuffer::Framebuffer(int width, int height)
 
     glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, texture, 0);
 
-    CHECK_ERROR(cudaGraphicsGLRegisterImage(&resource, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly));
+    CUDA_CALL(cudaGraphicsGLRegisterImage(&resource, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly));
 
     glBindFramebuffer(GL_FRAMEBUFFER, 0);
 }
@@ -36,23 +36,23 @@ Framebuffer::~Framebuffer()
 
 void Framebuffer::beginCudaAccess(cudaSurfaceObject_t* surfaceObject)
 {
-    CHECK_ERROR(cudaGraphicsMapResources(1, &resource, nullptr));
+    CUDA_CALL(cudaGraphicsMapResources(1, &resource, nullptr));
 
     cudaArray_t cudaArray;
-    CHECK_ERROR(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0));
+    CUDA_CALL(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0));
 
     cudaResourceDesc resourceDesc;
     memset(&resourceDesc, 0, sizeof(cudaResourceDesc));
     resourceDesc.resType = cudaResourceTypeArray;
     resourceDesc.res.array.array = cudaArray;
 
-    CHECK_ERROR(cudaCreateSurfaceObject(surfaceObject, &resourceDesc));
+    CUDA_CALL(cudaCreateSurfaceObject(surfaceObject, &resourceDesc));
 }
 
 void Framebuffer::endCudaAccess(cudaSurfaceObject_t surfaceObject)
 {
-    CHECK_ERROR(cudaGraphicsUnmapResources(1, &resource, nullptr));
-    CHECK_ERROR(cudaDestroySurfaceObject(surfaceObject));
+    CUDA_CALL(cudaGraphicsUnmapResources(1, &resource, nullptr));
+    CUDA_CALL(cudaDestroySurfaceObject(surfaceObject));
 }
 
 void Framebuffer::bind()
diff --git a/dogm/demo/simulator/mapping/opengl/texture.cpp b/dogm/demo/simulator/mapping/opengl/texture.cpp
index 682a440..63a06a5 100644
--- a/dogm/demo/simulator/mapping/opengl/texture.cpp
+++ b/dogm/demo/simulator/mapping/opengl/texture.cpp
@@ -31,7 +31,7 @@ Texture::Texture(int width, int height, float anisotropy_level)
     float color[] = {0.0f, 0.0f, 1.0f, 1.0f};
     glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, color);
 
-    CHECK_ERROR(
+    CUDA_CALL(
         cudaGraphicsGLRegisterImage(&resource, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore));
 
     glBindTexture(GL_TEXTURE_2D, 0);
@@ -44,24 +44,24 @@ Texture::~Texture()
 
 void Texture::beginCudaAccess(cudaSurfaceObject_t* surfaceObject)
 {
-    CHECK_ERROR(cudaGraphicsMapResources(1, &resource, nullptr));
+    CUDA_CALL(cudaGraphicsMapResources(1, &resource, nullptr));
 
     cudaArray_t cudaArray;
-    CHECK_ERROR(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0));
+    CUDA_CALL(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0));
 
     cudaResourceDesc resourceDesc;
     memset(&resourceDesc, 0, sizeof(cudaResourceDesc));
     resourceDesc.resType = cudaResourceTypeArray;
     resourceDesc.res.array.array = cudaArray;
 
-    CHECK_ERROR(cudaCreateSurfaceObject(surfaceObject, &resourceDesc));
+    CUDA_CALL(cudaCreateSurfaceObject(surfaceObject, &resourceDesc));
 }
 
 void Texture::endCudaAccess(cudaSurfaceObject_t surfaceObject)
 {
-    CHECK_ERROR(cudaGraphicsUnmapResources(1, &resource, nullptr));
-    CHECK_ERROR(cudaGraphicsUnregisterResource(resource));
-    CHECK_ERROR(cudaDestroySurfaceObject(surfaceObject));
+    CUDA_CALL(cudaGraphicsUnmapResources(1, &resource, nullptr));
+    CUDA_CALL(cudaGraphicsUnregisterResource(resource));
+    CUDA_CALL(cudaDestroySurfaceObject(surfaceObject));
 }
 
 void Texture::generateMipMap()
diff --git a/dogm/demo/utils/image_creation.cpp b/dogm/demo/utils/image_creation.cpp
index da2ed6e..8435ca7 100644
--- a/dogm/demo/utils/image_creation.cpp
+++ b/dogm/demo/utils/image_creation.cpp
@@ -32,17 +32,16 @@ std::vector<Point<dogm::GridCell>> computeCellsWithVelocity(const dogm::DOGM& gr
         {
             int index = y * grid_map.getGridSize() + x;
 
-            const dogm::GridCell& cell = grid_cells[index];
-            float occ = pignistic_transformation(cell.free_mass, cell.occ_mass);
+            float occ = pignistic_transformation(grid_cells.free_mass[index], grid_cells.occ_mass[index]);
             cv::Mat velocity_mean(2, 1, CV_32FC1);
-            velocity_mean.at<float>(0) = cell.mean_x_vel;
-            velocity_mean.at<float>(1) = cell.mean_y_vel;
+            velocity_mean.at<float>(0) = grid_cells.mean_x_vel[index];
+            velocity_mean.at<float>(1) = grid_cells.mean_y_vel[index];
 
             cv::Mat velocity_covar(2, 2, CV_32FC1);
-            velocity_covar.at<float>(0, 0) = cell.var_x_vel;
-            velocity_covar.at<float>(1, 0) = cell.covar_xy_vel;
-            velocity_covar.at<float>(0, 1) = cell.covar_xy_vel;
-            velocity_covar.at<float>(1, 1) = cell.var_y_vel;
+            velocity_covar.at<float>(0, 0) = grid_cells.var_x_vel[index];
+            velocity_covar.at<float>(1, 0) = grid_cells.covar_xy_vel[index];
+            velocity_covar.at<float>(0, 1) = grid_cells.covar_xy_vel[index];
+            velocity_covar.at<float>(1, 1) = grid_cells.var_y_vel[index];
 
             cv::Mat velocity_normalized_by_variance = velocity_mean.t() * velocity_covar.inv() * velocity_mean;
 
@@ -54,7 +53,22 @@ std::vector<Point<dogm::GridCell>> computeCellsWithVelocity(const dogm::DOGM& gr
                 // Storing the point as grid index to be consistent with cell.mean_x_vel and cell.mean_y_vel
                 point.x = static_cast<float>(x);
                 point.y = static_cast<float>(y);
-                point.data = cell;
+                point.data.start_idx = grid_cells.start_idx[index];
+                point.data.end_idx = grid_cells.end_idx[index];
+                point.data.new_born_occ_mass = grid_cells.new_born_occ_mass[index];
+                point.data.pers_occ_mass = grid_cells.pers_occ_mass[index];
+                point.data.free_mass = grid_cells.free_mass[index];
+                point.data.occ_mass = grid_cells.occ_mass[index];
+                point.data.pred_occ_mass = grid_cells.pred_occ_mass[index];
+                point.data.mu_A = grid_cells.mu_A[index];
+                point.data.mu_UA = grid_cells.mu_UA[index];
+                point.data.w_A = grid_cells.w_A[index];
+                point.data.w_UA = grid_cells.w_UA[index];
+                point.data.mean_x_vel = grid_cells.mean_x_vel[index];
+                point.data.mean_y_vel = grid_cells.mean_y_vel[index];
+                point.data.var_x_vel = grid_cells.var_x_vel[index];
+                point.data.var_y_vel = grid_cells.var_y_vel[index];
+                point.data.covar_xy_vel = grid_cells.covar_xy_vel[index];
                 point.cluster_id = UNCLASSIFIED;
 
                 cells_with_velocity.push_back(point);
@@ -76,8 +90,7 @@ cv::Mat compute_measurement_grid_image(const dogm::DOGM& grid_map)
         {
             int index = y * grid_map.getGridSize() + x;
 
-            const dogm::MeasurementCell& cell = meas_cells[index];
-            float occ = pignistic_transformation(cell.free_mass, cell.occ_mass);
+            float occ = pignistic_transformation(meas_cells.free_mass[index], meas_cells.occ_mass[index]);
             auto temp = static_cast<uchar>(occ * 255.0f);
 
             row_ptr[x] = cv::Vec3b(255 - temp, 255 - temp, 255 - temp);
@@ -97,9 +110,8 @@ cv::Mat compute_raw_measurement_grid_image(const dogm::DOGM& grid_map)
         for (int x = 0; x < grid_map.getGridSize(); x++)
         {
             int index = y * grid_map.getGridSize() + x;
-            const dogm::MeasurementCell& cell = meas_cells[index];
-            auto red = static_cast<int>(cell.occ_mass * 255.0f);
-            auto green = static_cast<int>(cell.free_mass * 255.0f);
+            auto red = static_cast<int>(meas_cells.occ_mass[index] * 255.0f);
+            auto green = static_cast<int>(meas_cells.free_mass[index] * 255.0f);
             int blue = 255 - red - green;
 
             row_ptr[x] = cv::Vec3b(blue, green, red);
@@ -120,8 +132,7 @@ cv::Mat compute_dogm_image(const dogm::DOGM& grid_map, const std::vector<Point<d
         {
             int index = y * grid_map.getGridSize() + x;
 
-            const dogm::GridCell& cell = grid_cells[index];
-            float occ = pignistic_transformation(cell.free_mass, cell.occ_mass);
+            float occ = pignistic_transformation(grid_cells.free_mass[index], grid_cells.occ_mass[index]);
             uchar grayscale_value = 255 - static_cast<uchar>(floor(occ * 255));
 
             row_ptr[x] = cv::Vec3b(grayscale_value, grayscale_value, grayscale_value);
diff --git a/dogm/include/dogm/cuda_utils.h b/dogm/include/dogm/cuda_utils.h
index 3efdbeb..6611a7e 100644
--- a/dogm/include/dogm/cuda_utils.h
+++ b/dogm/include/dogm/cuda_utils.h
@@ -10,18 +10,15 @@
 
 #define GPU_LAMBDA [=] __host__ __device__
 
-#define CHECK_ERROR(ans)                                                                                               \
-    {                                                                                                                  \
-        checkError((ans), __FILE__, __LINE__);                                                                         \
-    }
-
-inline void checkError(cudaError_t code, const char* file, int line)
-{
-    if (code != cudaSuccess)
-    {
-        printf("GPU Kernel Error: %s %s %d\n", cudaGetErrorString(code), file, line);
-    }
+#ifndef CUDA_CALL
+#define CUDA_CALL(call)\
+{\
+    auto status = static_cast<cudaError_t>(call);\
+    if (status != cudaSuccess)\
+        fprintf(stderr, "ERROR: CUDA RT call \"%s\" in line %d of file %s failed with %s (%d).\n",\
+            #call, __LINE__, __FILE__, cudaGetErrorString(status), status);\
 }
+#endif
 
 inline int divUp(int total, int grain)
 {
diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h
index 4a04a74..ee34ced 100644
--- a/dogm/include/dogm/dogm.h
+++ b/dogm/include/dogm/dogm.h
@@ -76,7 +76,7 @@ class DOGM
      * @param dt delta time since the last update.
      * @param device whether the measurement grid resides in GPU memory (default: true).
      */
-    void updateGrid(MeasurementCell* measurement_grid, float new_x, float new_y, float new_yaw, float dt,
+     void updateGrid(MeasurementCellsSoA measurement_grid, float new_x, float new_y, float dt,
                     bool device = true);
 
     /**
@@ -84,14 +84,14 @@ class DOGM
      *
      * @return grid map.
      */
-    std::vector<GridCell> getGridCells() const;
+    GridCellsSoA getGridCells() const;
 
     /**
      * Returns the measurement grid map in the host memory.
      *
      * @return measurement grid map.
      */
-    std::vector<MeasurementCell> getMeasurementCells() const;
+    MeasurementCellsSoA getMeasurementCells() const;
 
     /**
      * Returns the persistent particles of the particle filter.
@@ -141,7 +141,7 @@ class DOGM
     void initialize();
 
     void updatePose(float new_x, float new_y, float new_yaw);
-    void updateMeasurementGrid(MeasurementCell* measurement_grid, bool device);
+    void updateMeasurementGrid(MeasurementCellsSoA measurement_grid, bool device);
 
 public:
     void initializeParticles();
@@ -157,11 +157,11 @@ class DOGM
 public:
     Params params;
 
-    GridCell* grid_cell_array;
+    GridCellsSoA grid_cell_array;
     ParticlesSoA particle_array;
     ParticlesSoA particle_array_next;
     ParticlesSoA birth_particle_array;
-    MeasurementCell* meas_cell_array;
+    MeasurementCellsSoA meas_cell_array;
 
     float* weight_array;
     float* birth_weight_array;
diff --git a/dogm/include/dogm/dogm_types.h b/dogm/include/dogm/dogm_types.h
index 3c250ba..e856232 100644
--- a/dogm/include/dogm/dogm_types.h
+++ b/dogm/include/dogm/dogm_types.h
@@ -48,6 +48,262 @@ struct Particle
     glm::vec4 state;
 };
 
+struct GridCellsSoA
+{
+    int* start_idx;
+    int* end_idx;
+    float* new_born_occ_mass;
+    float* pers_occ_mass;
+    float* free_mass;
+    float* occ_mass;
+    float* pred_occ_mass;
+    float* mu_A;
+    float* mu_UA;
+
+    float* w_A;
+    float* w_UA;
+
+    float* mean_x_vel;
+    float* mean_y_vel;
+    float* var_x_vel;
+    float* var_y_vel;
+    float* covar_xy_vel;
+
+    int size;
+    bool device;
+
+    GridCellsSoA() : size(0), device(true) {}
+
+    GridCellsSoA(int new_size, bool is_device) { init(new_size, is_device); }
+
+    void init(int new_size, bool is_device)
+    {
+        size = new_size;
+        device = is_device;
+        if (device)
+        {
+            CUDA_CALL(cudaMalloc((void**)&start_idx, size * sizeof(int)));
+            CUDA_CALL(cudaMalloc((void**)&end_idx, size * sizeof(int)));
+            CUDA_CALL(cudaMalloc((void**)&new_born_occ_mass, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&pers_occ_mass, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&free_mass, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&occ_mass, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&pred_occ_mass, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&mu_A, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&mu_UA, size * sizeof(float)));
+
+            CUDA_CALL(cudaMalloc((void**)&w_A, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&w_UA, size * sizeof(float)));
+
+            CUDA_CALL(cudaMalloc((void**)&mean_x_vel, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&mean_y_vel, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&var_x_vel, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&var_y_vel, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&covar_xy_vel, size * sizeof(float)));
+        }
+        else
+        {
+            start_idx = (int*)malloc(size * sizeof(int));
+            end_idx = (int*)malloc(size * sizeof(int));
+            new_born_occ_mass = (float*)malloc(size * sizeof(float));
+            pers_occ_mass = (float*)malloc(size * sizeof(float));
+            free_mass = (float*)malloc(size * sizeof(float));
+            occ_mass = (float*)malloc(size * sizeof(float));
+            pred_occ_mass = (float*)malloc(size * sizeof(float));
+            mu_A = (float*)malloc(size * sizeof(float));
+            mu_UA = (float*)malloc(size * sizeof(float));
+
+            w_A = (float*)malloc(size * sizeof(float));
+            w_UA = (float*)malloc(size * sizeof(float));
+
+            mean_x_vel = (float*)malloc(size * sizeof(float));
+            mean_y_vel = (float*)malloc(size * sizeof(float));
+            var_x_vel = (float*)malloc(size * sizeof(float));
+            var_y_vel = (float*)malloc(size * sizeof(float));
+            covar_xy_vel = (float*)malloc(size * sizeof(float));
+        }
+    }
+
+    void free()
+    {
+        if (device)
+        {
+            CUDA_CALL(cudaFree(start_idx));
+            CUDA_CALL(cudaFree(end_idx));
+            CUDA_CALL(cudaFree(new_born_occ_mass));
+            CUDA_CALL(cudaFree(pers_occ_mass));
+            CUDA_CALL(cudaFree(free_mass));
+            CUDA_CALL(cudaFree(occ_mass));
+            CUDA_CALL(cudaFree(pred_occ_mass));
+            CUDA_CALL(cudaFree(mu_A));
+            CUDA_CALL(cudaFree(mu_UA));
+
+            CUDA_CALL(cudaFree(w_A));
+            CUDA_CALL(cudaFree(w_UA));
+
+            CUDA_CALL(cudaFree(mean_x_vel));
+            CUDA_CALL(cudaFree(mean_y_vel));
+            CUDA_CALL(cudaFree(var_x_vel));
+            CUDA_CALL(cudaFree(var_y_vel));
+            CUDA_CALL(cudaFree(covar_xy_vel));
+        }
+        else
+        {
+            ::free(start_idx);
+            ::free(end_idx);
+            ::free(new_born_occ_mass);
+            ::free(pers_occ_mass);
+            ::free(free_mass);
+            ::free(occ_mass);
+            ::free(pred_occ_mass);
+            ::free(mu_A);
+            ::free(mu_UA);
+
+            ::free(w_A);
+            ::free(w_UA);
+
+            ::free(mean_x_vel);
+            ::free(mean_y_vel);
+            ::free(var_x_vel);
+            ::free(var_y_vel);
+            ::free(covar_xy_vel);
+        }
+    }
+
+    void copy(const GridCellsSoA& other, cudaMemcpyKind kind)
+    {
+        CUDA_CALL(cudaMemcpy(start_idx, other.start_idx, size * sizeof(int), kind));
+        CUDA_CALL(cudaMemcpy(end_idx, other.end_idx, size * sizeof(int), kind));
+        CUDA_CALL(cudaMemcpy(new_born_occ_mass, other.new_born_occ_mass, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(pers_occ_mass, other.pers_occ_mass, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(free_mass, other.free_mass, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(occ_mass, other.occ_mass, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(pred_occ_mass, other.pred_occ_mass, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(mu_A, other.mu_A, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(mu_UA, other.mu_UA, size * sizeof(float), kind));
+
+        CUDA_CALL(cudaMemcpy(w_A, other.w_A, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(w_UA, other.w_UA, size * sizeof(float), kind));
+
+        CUDA_CALL(cudaMemcpy(mean_x_vel, other.mean_x_vel, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(mean_y_vel, other.mean_y_vel, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(var_x_vel, other.var_x_vel, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(var_y_vel, other.var_y_vel, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(covar_xy_vel, other.covar_xy_vel, size * sizeof(float), kind));
+    }
+
+    GridCellsSoA& operator=(const GridCellsSoA& other)
+    {
+        if (this != &other)
+        {
+            copy(other, cudaMemcpyDeviceToDevice);
+        }
+
+        return *this;
+    }
+
+    __device__ void copy(const GridCellsSoA& other, int index, int other_index)
+    {
+        start_idx[index] = other.start_idx[other_index];
+        end_idx[index] = other.end_idx[other_index];
+        new_born_occ_mass[index] = other.new_born_occ_mass[other_index];
+        pers_occ_mass[index] = other.pers_occ_mass[other_index];
+        free_mass[index] = other.free_mass[other_index];
+        occ_mass[index] = other.occ_mass[other_index];
+        pred_occ_mass[index] = other.pred_occ_mass[other_index];
+        mu_A[index] = other.mu_A[other_index];
+        mu_UA[index] = other.mu_UA[other_index];
+
+        w_A[index] = other.w_A[other_index];
+        w_UA[index] = other.w_UA[other_index];
+
+        mean_x_vel[index] = other.mean_x_vel[other_index];
+        mean_y_vel[index] = other.mean_y_vel[other_index];
+        var_x_vel[index] = other.var_x_vel[other_index];
+        var_y_vel[index] = other.var_y_vel[other_index];
+        covar_xy_vel[index] = other.covar_xy_vel[other_index];
+    }
+};
+
+struct MeasurementCellsSoA
+{
+    float* free_mass;
+    float* occ_mass;
+    float* likelihood;
+    float* p_A;
+
+    int size;
+    bool device;
+
+    MeasurementCellsSoA() : size(0), device(true) {}
+
+    MeasurementCellsSoA(int new_size, bool is_device) { init(new_size, is_device); }
+
+    void init(int new_size, bool is_device)
+    {
+        size = new_size;
+        device = is_device;
+        if (device)
+        {
+            CUDA_CALL(cudaMalloc((void**)&free_mass, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&occ_mass, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&likelihood, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&p_A, size * sizeof(float)));
+        }
+        else
+        {
+            free_mass = (float*)malloc(size * sizeof(float));
+            occ_mass = (float*)malloc(size * sizeof(float));
+            likelihood = (float*)malloc(size * sizeof(float));
+            p_A = (float*)malloc(size * sizeof(float));
+        }
+    }
+
+    void free()
+    {
+        if (device)
+        {
+            CUDA_CALL(cudaFree(free_mass));
+            CUDA_CALL(cudaFree(occ_mass));
+            CUDA_CALL(cudaFree(likelihood));
+            CUDA_CALL(cudaFree(p_A));
+        }
+        else
+        {
+            ::free(free_mass);
+            ::free(occ_mass);
+            ::free(likelihood);
+            ::free(p_A);
+        }
+    }
+
+    void copy(const MeasurementCellsSoA& other, cudaMemcpyKind kind)
+    {
+        CUDA_CALL(cudaMemcpy(free_mass, other.free_mass, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(occ_mass, other.occ_mass, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(likelihood, other.likelihood, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(p_A, other.p_A, size * sizeof(float), kind));
+    }
+
+    MeasurementCellsSoA& operator=(const MeasurementCellsSoA& other)
+    {
+        if (this != &other)
+        {
+            copy(other, cudaMemcpyDeviceToDevice);
+        }
+
+        return *this;
+    }
+
+    __device__ void copy(const MeasurementCellsSoA& other, int index, int other_index)
+    {
+        free_mass[index] = other.free_mass[other_index];
+        occ_mass[index] = other.occ_mass[other_index];
+        likelihood[index] = other.likelihood[other_index];
+        p_A[index] = other.p_A[other_index];
+    }
+};
+
 struct ParticlesSoA
 {
     glm::vec4* state;
@@ -68,10 +324,10 @@ struct ParticlesSoA
         device = is_device;
         if (device)
         {
-            CHECK_ERROR(cudaMalloc((void**)&state, size * sizeof(glm::vec4)));
-            CHECK_ERROR(cudaMalloc((void**)&grid_cell_idx, size * sizeof(int)));
-            CHECK_ERROR(cudaMalloc((void**)&weight, size * sizeof(float)));
-            CHECK_ERROR(cudaMalloc((void**)&associated, size * sizeof(bool)));
+            CUDA_CALL(cudaMalloc((void**)&state, size * sizeof(glm::vec4)));
+            CUDA_CALL(cudaMalloc((void**)&grid_cell_idx, size * sizeof(int)));
+            CUDA_CALL(cudaMalloc((void**)&weight, size * sizeof(float)));
+            CUDA_CALL(cudaMalloc((void**)&associated, size * sizeof(bool)));
         }
         else
         {
@@ -86,10 +342,10 @@ struct ParticlesSoA
     {
         if (device)
         {
-            CHECK_ERROR(cudaFree(state));
-            CHECK_ERROR(cudaFree(grid_cell_idx));
-            CHECK_ERROR(cudaFree(weight));
-            CHECK_ERROR(cudaFree(associated));
+            CUDA_CALL(cudaFree(state));
+            CUDA_CALL(cudaFree(grid_cell_idx));
+            CUDA_CALL(cudaFree(weight));
+            CUDA_CALL(cudaFree(associated));
         }
         else
         {
@@ -102,10 +358,10 @@ struct ParticlesSoA
 
     void copy(const ParticlesSoA& other, cudaMemcpyKind kind)
     {
-        CHECK_ERROR(cudaMemcpy(grid_cell_idx, other.grid_cell_idx, size * sizeof(int), kind));
-        CHECK_ERROR(cudaMemcpy(weight, other.weight, size * sizeof(float), kind));
-        CHECK_ERROR(cudaMemcpy(associated, other.associated, size * sizeof(bool), kind));
-        CHECK_ERROR(cudaMemcpy(state, other.state, size * sizeof(glm::vec4), kind));
+        CUDA_CALL(cudaMemcpy(grid_cell_idx, other.grid_cell_idx, size * sizeof(int), kind));
+        CUDA_CALL(cudaMemcpy(weight, other.weight, size * sizeof(float), kind));
+        CUDA_CALL(cudaMemcpy(associated, other.associated, size * sizeof(bool), kind));
+        CUDA_CALL(cudaMemcpy(state, other.state, size * sizeof(glm::vec4), kind));
     }
 
     ParticlesSoA& operator=(const ParticlesSoA& other)
diff --git a/dogm/include/dogm/kernel/ego_motion_compensation.h b/dogm/include/dogm/kernel/ego_motion_compensation.h
index 144b56c..0397e79 100644
--- a/dogm/include/dogm/kernel/ego_motion_compensation.h
+++ b/dogm/include/dogm/kernel/ego_motion_compensation.h
@@ -10,12 +10,11 @@
 namespace dogm
 {
 
-struct GridCell;
-struct ParticlesSoA;
+__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count,
+                                    float resolution, int grid_size);
 
-__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count);
-
-__global__ void moveMapKernel(GridCell* __restrict__ grid_cell_array, const GridCell* __restrict__ old_grid_cell_array,
+__global__ void moveMapKernel(GridCellsSoA grid_cell_array, GridCellsSoA old_grid_cell_array,
+                              MeasurementCellsSoA meas_cell_array, ParticlesSoA particle_array,
                               int x_move, int y_move, int grid_size);
 
 } /* namespace dogm */
diff --git a/dogm/include/dogm/kernel/init.h b/dogm/include/dogm/kernel/init.h
index b143215..dd273f7 100644
--- a/dogm/include/dogm/kernel/init.h
+++ b/dogm/include/dogm/kernel/init.h
@@ -22,9 +22,9 @@ __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __
 __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, curandState* __restrict__ global_state,
                                          float velocity, int grid_size, int particle_count);
 
-__global__ void initGridCellsKernel(GridCell* __restrict__ grid_cell_array,
-                                    MeasurementCell* __restrict__ meas_cell_array, int grid_size, int cell_count);
+__global__ void initGridCellsKernel(GridCellsSoA grid_cell_array,
+                                    MeasurementCellsSoA meas_cell_array, int grid_size, int cell_count);
 
-__global__ void reinitGridParticleIndices(GridCell* __restrict__ grid_cell_array, int cell_count);
+__global__ void reinitGridParticleIndices(GridCellsSoA grid_cell_array, int cell_count);
 
 } /* namespace dogm */
diff --git a/dogm/include/dogm/kernel/init_new_particles.h b/dogm/include/dogm/kernel/init_new_particles.h
index cecfafc..cdb87f2 100644
--- a/dogm/include/dogm/kernel/init_new_particles.h
+++ b/dogm/include/dogm/kernel/init_new_particles.h
@@ -16,24 +16,23 @@ struct Particle;
 
 void normalize_particle_orders(float* particle_orders_array_accum, int particle_orders_count, int v_B);
 
-__global__ void copyMassesKernel(const MeasurementCell* __restrict__ meas_cell_array, float* __restrict__ masses,
+__global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, float* __restrict__ masses,
                                  int cell_count);
 
-__global__ void initParticlesKernel1(GridCell* __restrict__ grid_cell_array,
-                                     const MeasurementCell* __restrict__ meas_cell_array, ParticlesSoA particle_array,
+__global__ void initParticlesKernel1(ParticlesSoA particle_array,
                                      const float* __restrict__ particle_orders_array_accum, int cell_count);
 
 __global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell* __restrict__ grid_cell_array,
                                      curandState* __restrict__ global_state, float velocity, int grid_size,
                                      float new_weight, int particle_count);
 
-__global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array,
-                                        const MeasurementCell* __restrict__ meas_cell_array,
+__global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array,
+                                        const MeasurementCellsSoA meas_cell_array,
                                         const float* __restrict__ weight_array,
                                         const float* __restrict__ born_masses_array, ParticlesSoA birth_particle_array,
                                         const float* __restrict__ particle_orders_array_accum, int cell_count);
 
-__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCell* __restrict__ grid_cell_array,
+__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array,
                                         curandState* __restrict__ global_state, float stddev_velocity,
                                         float max_velocity, int grid_size, int particle_count);
 
diff --git a/dogm/include/dogm/kernel/mass_update.h b/dogm/include/dogm/kernel/mass_update.h
index 67f16f9..d45546e 100644
--- a/dogm/include/dogm/kernel/mass_update.h
+++ b/dogm/include/dogm/kernel/mass_update.h
@@ -9,14 +9,10 @@
 namespace dogm
 {
 
-struct GridCell;
-struct MeasurementCell;
-struct Particle;
-
-__global__ void gridCellPredictionUpdateKernel(GridCell* __restrict__ grid_cell_array, ParticlesSoA particle_array,
+__global__ void gridCellPredictionUpdateKernel(GridCellsSoA grid_cell_array, ParticlesSoA particle_array,
                                                float* __restrict__ weight_array,
                                                const float* __restrict__ weight_array_accum,
-                                               const MeasurementCell* __restrict__ meas_cell_array,
+                                               const MeasurementCellsSoA meas_cell_array,
                                                float* __restrict__ born_masses_array, float p_B, int cell_count);
 
 } /* namespace dogm */
diff --git a/dogm/include/dogm/kernel/particle_to_grid.h b/dogm/include/dogm/kernel/particle_to_grid.h
index ef03861..26ccdc5 100644
--- a/dogm/include/dogm/kernel/particle_to_grid.h
+++ b/dogm/include/dogm/kernel/particle_to_grid.h
@@ -9,10 +9,7 @@
 namespace dogm
 {
 
-struct GridCell;
-struct Particle;
-
-__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCell* __restrict__ grid_cell_array,
+__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCellsSoA grid_cell_array,
                                      float* __restrict__ weight_array, int particle_count);
 
 } /* namespace dogm */
diff --git a/dogm/include/dogm/kernel/statistical_moments.h b/dogm/include/dogm/kernel/statistical_moments.h
index 8f715a1..396cb56 100644
--- a/dogm/include/dogm/kernel/statistical_moments.h
+++ b/dogm/include/dogm/kernel/statistical_moments.h
@@ -9,16 +9,13 @@
 namespace dogm
 {
 
-struct GridCell;
-struct Particle;
-
 __global__ void statisticalMomentsKernel1(const ParticlesSoA particle_array, const float* __restrict__ weight_array,
                                           float* __restrict__ vel_x_array, float* __restrict__ vel_y_array,
                                           float* __restrict__ vel_x_squared_array,
                                           float* __restrict__ vel_y_squared_array, float* __restrict__ vel_xy_array,
                                           int particle_count);
 
-__global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array,
+__global__ void statisticalMomentsKernel2(GridCellsSoA grid_cell_array,
                                           const float* __restrict__ vel_x_array_accum,
                                           const float* __restrict__ vel_y_array_accum,
                                           const float* __restrict__ vel_x_squared_array_accum,
diff --git a/dogm/include/dogm/kernel/update_persistent_particles.h b/dogm/include/dogm/kernel/update_persistent_particles.h
index 9b309fa..f417d9d 100644
--- a/dogm/include/dogm/kernel/update_persistent_particles.h
+++ b/dogm/include/dogm/kernel/update_persistent_particles.h
@@ -9,20 +9,16 @@
 namespace dogm
 {
 
-struct GridCell;
-struct MeasurementCell;
-struct Particle;
-
 __global__ void updatePersistentParticlesKernel1(const ParticlesSoA particle_array,
-                                                 const MeasurementCell* __restrict__ meas_cell_array,
+                                                 const MeasurementCellsSoA meas_cell_array,
                                                  float* __restrict__ weight_array, int particle_count);
 
-__global__ void updatePersistentParticlesKernel2(GridCell* __restrict__ grid_cell_array,
+__global__ void updatePersistentParticlesKernel2(GridCellsSoA grid_cell_array,
                                                  const float* __restrict__ weight_array_accum, int cell_count);
 
 __global__ void updatePersistentParticlesKernel3(const ParticlesSoA particle_array,
-                                                 const MeasurementCell* __restrict__ meas_cell_array,
-                                                 const GridCell* __restrict__ grid_cell_array,
+                                                 const MeasurementCellsSoA meas_cell_array,
+                                                 const GridCellsSoA grid_cell_array,
                                                  float* __restrict__ weight_array, int particle_count);
 
 } /* namespace dogm */
diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu
index 48f8146..17c6f9d 100644
--- a/dogm/src/dogm.cu
+++ b/dogm/src/dogm.cu
@@ -50,22 +50,22 @@ DOGM::DOGM(const Params& params)
     particle_array_next.init(particle_count, true);
     birth_particle_array.init(new_born_particle_count, true);
 
-    CHECK_ERROR(cudaMalloc(&grid_cell_array, grid_cell_count * sizeof(GridCell)));
-    CHECK_ERROR(cudaMalloc(&meas_cell_array, grid_cell_count * sizeof(MeasurementCell)));
+    grid_cell_array.init(grid_cell_count, true);
+    meas_cell_array.init(grid_cell_count, true);
 
-    CHECK_ERROR(cudaMalloc(&weight_array, particle_count * sizeof(float)));
-    CHECK_ERROR(cudaMalloc(&birth_weight_array, new_born_particle_count * sizeof(float)));
-    CHECK_ERROR(cudaMalloc(&born_masses_array, grid_cell_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&weight_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&birth_weight_array, new_born_particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&born_masses_array, grid_cell_count * sizeof(float)));
 
-    CHECK_ERROR(cudaMalloc(&vel_x_array, particle_count * sizeof(float)));
-    CHECK_ERROR(cudaMalloc(&vel_y_array, particle_count * sizeof(float)));
-    CHECK_ERROR(cudaMalloc(&vel_x_squared_array, particle_count * sizeof(float)));
-    CHECK_ERROR(cudaMalloc(&vel_y_squared_array, particle_count * sizeof(float)));
-    CHECK_ERROR(cudaMalloc(&vel_xy_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&vel_x_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&vel_y_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&vel_x_squared_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&vel_y_squared_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&vel_xy_array, particle_count * sizeof(float)));
 
-    CHECK_ERROR(cudaMalloc(&rand_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&rand_array, particle_count * sizeof(float)));
 
-    CHECK_ERROR(cudaMalloc(&rng_states, particles_grid.x * block_dim.x * sizeof(curandState)));
+    CUDA_CALL(cudaMalloc(&rng_states, particles_grid.x * block_dim.x * sizeof(curandState)));
 
     initialize();
 }
@@ -76,40 +76,37 @@ DOGM::~DOGM()
     particle_array_next.free();
     birth_particle_array.free();
 
-    CHECK_ERROR(cudaFree(grid_cell_array));
-    CHECK_ERROR(cudaFree(meas_cell_array));
+    grid_cell_array.free();
+    meas_cell_array.free();
 
-    CHECK_ERROR(cudaFree(weight_array));
-    CHECK_ERROR(cudaFree(birth_weight_array));
-    CHECK_ERROR(cudaFree(born_masses_array));
+    CUDA_CALL(cudaFree(weight_array));
+    CUDA_CALL(cudaFree(birth_weight_array));
+    CUDA_CALL(cudaFree(born_masses_array));
 
-    CHECK_ERROR(cudaFree(vel_x_array));
-    CHECK_ERROR(cudaFree(vel_y_array));
-    CHECK_ERROR(cudaFree(vel_x_squared_array));
-    CHECK_ERROR(cudaFree(vel_y_squared_array));
-    CHECK_ERROR(cudaFree(vel_xy_array));
+    CUDA_CALL(cudaFree(vel_x_array));
+    CUDA_CALL(cudaFree(vel_y_array));
+    CUDA_CALL(cudaFree(vel_x_squared_array));
+    CUDA_CALL(cudaFree(vel_y_squared_array));
+    CUDA_CALL(cudaFree(vel_xy_array));
 
-    CHECK_ERROR(cudaFree(rng_states));
+    CUDA_CALL(cudaFree(rand_array));
+
+    CUDA_CALL(cudaFree(rng_states));
 }
 
 void DOGM::initialize()
 {
     cudaStream_t particles_stream, grid_stream;
-    CHECK_ERROR(cudaStreamCreate(&particles_stream));
-    CHECK_ERROR(cudaStreamCreate(&grid_stream));
-
-    setupRandomStatesKernel<<<particles_grid, block_dim>>>(rng_states, 123456, particles_grid.x * block_dim.x);
+    CUDA_CALL(cudaStreamCreate(&particles_stream));
+    CUDA_CALL(cudaStreamCreate(&grid_stream));
 
-    CHECK_ERROR(cudaGetLastError());
-    CHECK_ERROR(cudaDeviceSynchronize());
+    setupRandomStatesKernel<<<particles_grid, block_dim>>>(rng_states, particles_grid.x * block_dim.x);
 
     initGridCellsKernel<<<grid_map_grid, block_dim, 0, grid_stream>>>(grid_cell_array, meas_cell_array, grid_size,
                                                                       grid_cell_count);
 
-    CHECK_ERROR(cudaGetLastError());
-
-    CHECK_ERROR(cudaStreamDestroy(particles_stream));
-    CHECK_ERROR(cudaStreamDestroy(grid_stream));
+    CUDA_CALL(cudaStreamDestroy(particles_stream));
+    CUDA_CALL(cudaStreamDestroy(grid_stream));
 }
 
 void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_y, float new_yaw, float dt, bool device)
@@ -132,22 +129,18 @@ void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_
     iteration++;
 }
 
-std::vector<GridCell> DOGM::getGridCells() const
+GridCellsSoA DOGM::getGridCells() const
 {
-    std::vector<GridCell> grid_cells(static_cast<std::vector<GridCell>::size_type>(grid_cell_count));
-
-    CHECK_ERROR(
-        cudaMemcpy(grid_cells.data(), grid_cell_array, grid_cell_count * sizeof(GridCell), cudaMemcpyDeviceToHost));
+    GridCellsSoA grid_cells(grid_cell_count, false);
+    grid_cells.copy(grid_cell_array, cudaMemcpyDeviceToHost);
 
     return grid_cells;
 }
 
-std::vector<MeasurementCell> DOGM::getMeasurementCells() const
+MeasurementCellsSoA DOGM::getMeasurementCells() const
 {
-    std::vector<MeasurementCell> meas_cells(static_cast<std::vector<GridCell>::size_type>(grid_cell_count));
-
-    CHECK_ERROR(cudaMemcpy(meas_cells.data(), meas_cell_array, grid_cell_count * sizeof(MeasurementCell),
-                           cudaMemcpyDeviceToHost));
+    MeasurementCellsSoA meas_cells(grid_cell_count, false);
+    meas_cells.copy(meas_cell_array, cudaMemcpyDeviceToHost);
 
     return meas_cells;
 }
@@ -176,26 +169,19 @@ void DOGM::updatePose(float new_x, float new_y, float new_yaw)
 
         if (fabsf(x_diff) > params.resolution || fabsf(y_diff) > params.resolution)
         {
-            const int x_move = -static_cast<int>(x_diff / params.resolution);
-            const int y_move = -static_cast<int>(y_diff / params.resolution);
-
-            GridCell* old_grid_cell_array;
-            CHECK_ERROR(cudaMalloc(&old_grid_cell_array, grid_cell_count * sizeof(GridCell)));
-
-            CHECK_ERROR(cudaMemcpy(old_grid_cell_array, grid_cell_array, grid_cell_count * sizeof(GridCell),
-                                   cudaMemcpyDeviceToDevice));
-            CHECK_ERROR(cudaMemset(grid_cell_array, 0, grid_cell_count * sizeof(GridCell)));
+            moveParticlesKernel<<<particles_grid, block_dim>>>(particle_array, x_move, y_move, particle_count,
+                params.resolution, grid_size);
 
             dim3 dim_block(32, 32);
             dim3 grid_dim(divUp(grid_size, dim_block.x), divUp(grid_size, dim_block.y));
 
-            moveParticlesKernel<<<particles_grid, block_dim>>>(particle_array, x_move, y_move, particle_count);
-            CHECK_ERROR(cudaGetLastError());
+            GridCellsSoA old_grid_cell_array(grid_cell_count, true);
+            old_grid_cell_array.copy(grid_cell_array, cudaMemcpyDeviceToDevice);
 
-            moveMapKernel<<<grid_dim, dim_block>>>(grid_cell_array, old_grid_cell_array, x_move, y_move, grid_size);
-            CHECK_ERROR(cudaGetLastError());
+            moveMapKernel<<<grid_dim, dim_block>>>(grid_cell_array, old_grid_cell_array, meas_cell_array, particle_array,
+                x_move, y_move, grid_size);
 
-            CHECK_ERROR(cudaFree(old_grid_cell_array));
+            old_grid_cell_array.free();
 
             position_x = new_x;
             position_y = new_y;
@@ -204,10 +190,10 @@ void DOGM::updatePose(float new_x, float new_y, float new_yaw)
     }
 }
 
-void DOGM::updateMeasurementGrid(MeasurementCell* measurement_grid, bool device)
+void DOGM::updateMeasurementGrid(MeasurementCellsSoA measurement_grid, bool device)
 {
     cudaMemcpyKind kind = device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
-    CHECK_ERROR(cudaMemcpy(meas_cell_array, measurement_grid, grid_cell_count * sizeof(MeasurementCell), kind));
+    meas_cell_array.copy(measurement_grid, kind);
 
     if (!first_measurement_received)
     {
@@ -245,13 +231,12 @@ void DOGM::initializeParticles()
 
 void DOGM::particlePrediction(float dt)
 {
-    // std::cout << "DOGM::particlePrediction" << std::endl;
-
+    // glm uses column major, we need row major
     // clang-format off
-    glm::mat4x4 transition_matrix(1, 0, dt, 0,
-                                  0, 1, 0, dt,
-                                  0, 0, 1, 0,
-                                  0, 0, 0, 1);
+    glm::mat4x4 transition_matrix(1, 0, 0, 0,
+                                  0, 1, 0, 0,
+                                  dt, 0, 1, 0,
+                                  0, dt, 0, 1);
     // clang-format on
 
     // FIXME: glm uses column major, we need row major
diff --git a/dogm/src/kernel/ego_motion_compensation.cu b/dogm/src/kernel/ego_motion_compensation.cu
index 5f833fe..e447203 100644
--- a/dogm/src/kernel/ego_motion_compensation.cu
+++ b/dogm/src/kernel/ego_motion_compensation.cu
@@ -22,7 +22,8 @@ __global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int
     }
 }
 
-__global__ void moveMapKernel(GridCell* __restrict__ grid_cell_array, const GridCell* __restrict__ old_grid_cell_array,
+__global__ void moveMapKernel(GridCellsSoA grid_cell_array, GridCellsSoA old_grid_cell_array,
+                              MeasurementCellsSoA meas_cell_array, ParticlesSoA particle_array,
                               int x_move, int y_move, int grid_size)
 {
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/dogm/src/kernel/init.cu b/dogm/src/kernel/init.cu
index 02eb196..6230e17 100644
--- a/dogm/src/kernel/init.cu
+++ b/dogm/src/kernel/init.cu
@@ -69,29 +69,29 @@ __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, cura
     // global_state[thread_id] = local_state;
 }
 
-__global__ void initGridCellsKernel(GridCell* __restrict__ grid_cell_array,
-                                    MeasurementCell* __restrict__ meas_cell_array, int grid_size, int cell_count)
+__global__ void initGridCellsKernel(GridCellsSoA grid_cell_array,
+                                    MeasurementCellsSoA meas_cell_array, int grid_size, int cell_count)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x)
     {
-        grid_cell_array[i].free_mass = 0.0f;
-        grid_cell_array[i].occ_mass = 0.0f;
-        grid_cell_array[i].start_idx = -1;
-        grid_cell_array[i].end_idx = -1;
-
-        meas_cell_array[i].occ_mass = 0.0f;
-        meas_cell_array[i].free_mass = 0.0f;
-        meas_cell_array[i].likelihood = 1.0f;
-        meas_cell_array[i].p_A = 1.0f;
+        grid_cell_array.free_mass[i] = 0.0f;
+        grid_cell_array.occ_mass[i] = 0.0f;
+        grid_cell_array.start_idx[i] = -1;
+        grid_cell_array.end_idx[i] = -1;
+
+        meas_cell_array.occ_mass[i] = 0.0f;
+        meas_cell_array.free_mass[i] = 0.0f;
+        meas_cell_array.likelihood[i] = 1.0f;
+        meas_cell_array.p_A[i] = 1.0f;
     }
 }
 
-__global__ void reinitGridParticleIndices(GridCell* __restrict__ grid_cell_array, int cell_count)
+__global__ void reinitGridParticleIndices(GridCellsSoA grid_cell_array, int cell_count)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x)
     {
-        grid_cell_array[i].start_idx = -1;
-        grid_cell_array[i].end_idx = -1;
+        grid_cell_array.start_idx[i] = -1;
+        grid_cell_array.end_idx[i] = -1;
     }
 }
 
diff --git a/dogm/src/kernel/init_new_particles.cu b/dogm/src/kernel/init_new_particles.cu
index e761a8a..90517f7 100644
--- a/dogm/src/kernel/init_new_particles.cu
+++ b/dogm/src/kernel/init_new_particles.cu
@@ -57,10 +57,10 @@ __device__ float calc_weight_unassoc(int nu_UA, float p_A, float born_mass)
     return nu_UA > 0 ? ((1.0 - p_A) * born_mass) / nu_UA : 0.0;
 }
 
-__device__ void store_weights(float w_A, float w_UA, GridCell* __restrict__ grid_cell_array, int j)
+__device__ void store_weights(float w_A, float w_UA, GridCellsSoA grid_cell_array, int j)
 {
-    grid_cell_array[j].w_A = w_A;
-    grid_cell_array[j].w_UA = w_UA;
+    grid_cell_array.w_A[j] = w_A;
+    grid_cell_array.w_UA[j] = w_UA;
 }
 
 void normalize_particle_orders(float* particle_orders_array_accum, int particle_orders_count, int v_B)
@@ -69,21 +69,21 @@ void normalize_particle_orders(float* particle_orders_array_accum, int particle_
 
     float max = 1.0f;
     cudaMemcpy(&max, &particle_orders_array_accum[particle_orders_count - 1], sizeof(float), cudaMemcpyDeviceToHost);
-    thrust::transform(particle_orders_accum, particle_orders_accum + particle_orders_count, particle_orders_accum,
-                      GPU_LAMBDA(float x) { return x * (v_B / max); });
+    thrust::transform(
+        particle_orders_accum, particle_orders_accum + particle_orders_count, particle_orders_accum,
+        GPU_LAMBDA(float x) { return x * (v_B / max); });
 }
 
-__global__ void copyMassesKernel(const MeasurementCell* __restrict__ meas_cell_array, float* __restrict__ masses,
+__global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, float* __restrict__ masses,
                                  int cell_count)
 {
     for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < cell_count; j += blockDim.x * gridDim.x)
     {
-        masses[j] = meas_cell_array[j].occ_mass;
+        masses[j] = meas_cell_array.occ_mass[j];
     }
 }
 
-__global__ void initParticlesKernel1(GridCell* __restrict__ grid_cell_array,
-                                     const MeasurementCell* __restrict__ meas_cell_array, ParticlesSoA particle_array,
+__global__ void initParticlesKernel1(ParticlesSoA particle_array,
                                      const float* __restrict__ particle_orders_array_accum, int cell_count)
 {
     for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < cell_count; j += blockDim.x * gridDim.x)
@@ -123,8 +123,8 @@ __global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell
     global_state[thread_id] = local_state;
 }
 
-__global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array,
-                                        const MeasurementCell* __restrict__ meas_cell_array,
+__global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array,
+                                        const MeasurementCellsSoA meas_cell_array,
                                         const float* __restrict__ weight_array,
                                         const float* __restrict__ born_masses_array, ParticlesSoA birth_particle_array,
                                         const float* __restrict__ particle_orders_array_accum, int cell_count)
@@ -137,7 +137,7 @@ __global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array,
         // printf("Start idx: %d, End idx: %d\n", start_idx, end_idx);
 
         int num_new_particles = start_idx <= end_idx ? end_idx - start_idx + 1 : 0;
-        float p_A = meas_cell_array[j].p_A;
+        float p_A = meas_cell_array.p_A[j];
         int nu_A = calc_num_assoc(num_new_particles, p_A);
         int nu_UA = num_new_particles - nu_A;
         float w_A = calc_weight_assoc(nu_A, p_A, born_masses_array[j]);
@@ -158,7 +158,7 @@ __global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array,
     }
 }
 
-__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCell* __restrict__ grid_cell_array,
+__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array,
                                         curandState* __restrict__ global_state, float stddev_velocity,
                                         float max_velocity, int grid_size, int particle_count)
 {
@@ -182,7 +182,7 @@ __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const
             float vel_x = curand_normal(&local_state, 0.0f, stddev_velocity);
             float vel_y = curand_normal(&local_state, 0.0f, stddev_velocity);
 
-            birth_particle_array.weight[i] = grid_cell.w_A;
+            birth_particle_array.weight[i] = grid_cell_array.w_A[cell_idx];
             birth_particle_array.state[i] = glm::vec4(x, y, vel_x, vel_y);
         }
         else
@@ -190,7 +190,7 @@ __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const
             float vel_x = curand_normal(&local_state, 0.0f, stddev_velocity);
             float vel_y = curand_normal(&local_state, 0.0f, stddev_velocity);
 
-            birth_particle_array.weight[i] = grid_cell.w_UA;
+            birth_particle_array.weight[i] = grid_cell_array.w_UA[cell_idx];
             birth_particle_array.state[i] = glm::vec4(x, y, vel_x, vel_y);
         }
     }
diff --git a/dogm/src/kernel/mass_update.cu b/dogm/src/kernel/mass_update.cu
index 77d1924..d021704 100644
--- a/dogm/src/kernel/mass_update.cu
+++ b/dogm/src/kernel/mass_update.cu
@@ -13,21 +13,21 @@
 namespace dogm
 {
 
-__device__ float predict_free_mass(const GridCell& grid_cell, float m_occ_pred, float alpha = 0.9)
+__device__ float predict_free_mass(float grid_cell_free_mass, float m_occ_pred, float alpha = 0.9)
 {
-    return min(alpha * grid_cell.free_mass, 1.0f - m_occ_pred);
+    return min(alpha * grid_cell_free_mass, 1.0f - m_occ_pred);
 }
 
-__device__ float2 update_masses(float m_occ_pred, float m_free_pred, const MeasurementCell& meas_cell)
+__device__ float2 update_masses(float m_occ_pred, float m_free_pred, const MeasurementCellsSoA meas_cells, int meas_idx)
 {
     float unknown_pred = 1.0 - m_occ_pred - m_free_pred;
-    float meas_unknown = 1.0 - meas_cell.free_mass - meas_cell.occ_mass;
-    float K = m_free_pred * meas_cell.occ_mass + m_occ_pred * meas_cell.free_mass;
+    float meas_unknown = 1.0 - meas_cells.free_mass[meas_idx] - meas_cells.occ_mass[meas_idx];
+    float K = m_free_pred * meas_cells.occ_mass[meas_idx] + m_occ_pred * meas_cells.free_mass[meas_idx];
 
     float occ_mass =
-        (m_occ_pred * meas_unknown + unknown_pred * meas_cell.occ_mass + m_occ_pred * meas_cell.occ_mass) / (1.0 - K);
+        (m_occ_pred * meas_unknown + unknown_pred * meas_cells.occ_mass[meas_idx] + m_occ_pred * meas_cells.occ_mass[meas_idx]) / (1.0 - K);
     float free_mass =
-        (m_free_pred * meas_unknown + unknown_pred * meas_cell.free_mass + m_free_pred * meas_cell.free_mass) /
+        (m_free_pred * meas_unknown + unknown_pred * meas_cells.free_mass[meas_idx] + m_free_pred * meas_cells.free_mass[meas_idx]) /
         (1.0 - K);
 
     return make_float2(occ_mass, free_mass);
@@ -39,13 +39,13 @@ __device__ float separate_newborn_part(float m_occ_pred, float m_occ_up, float p
 }
 
 __device__ void store_values(float rho_b, float rho_p, float m_free_up, float m_occ_up, float m_occ_pred,
-                             GridCell* __restrict__ grid_cell_array, int i)
+                             GridCellsSoA grid_cell_array, int i)
 {
-    grid_cell_array[i].pers_occ_mass = rho_p;
-    grid_cell_array[i].new_born_occ_mass = rho_b;
-    grid_cell_array[i].free_mass = m_free_up;
-    grid_cell_array[i].occ_mass = m_occ_up;
-    grid_cell_array[i].pred_occ_mass = m_occ_pred;
+    grid_cell_array.pers_occ_mass[i] = rho_p;
+    grid_cell_array.new_born_occ_mass[i] = rho_b;
+    grid_cell_array.free_mass[i] = m_free_up;
+    grid_cell_array.occ_mass[i] = m_occ_up;
+    grid_cell_array.pred_occ_mass[i] = m_occ_pred;
 }
 
 __device__ void normalize_weights(const ParticlesSoA& particle_array, float* __restrict__ weight_array, int start_idx,
@@ -58,16 +58,16 @@ __device__ void normalize_weights(const ParticlesSoA& particle_array, float* __r
     }
 }
 
-__global__ void gridCellPredictionUpdateKernel(GridCell* __restrict__ grid_cell_array, ParticlesSoA particle_array,
+__global__ void gridCellPredictionUpdateKernel(GridCellsSoA grid_cell_array, ParticlesSoA particle_array,
                                                float* __restrict__ weight_array,
                                                const float* __restrict__ weight_array_accum,
-                                               const MeasurementCell* __restrict__ meas_cell_array,
+                                               const MeasurementCellsSoA meas_cell_array,
                                                float* __restrict__ born_masses_array, float p_B, int cell_count)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x)
     {
-        int start_idx = grid_cell_array[i].start_idx;
-        int end_idx = grid_cell_array[i].end_idx;
+        int start_idx = grid_cell_array.start_idx[i];
+        int end_idx = grid_cell_array.end_idx[i];
 
         if (start_idx != -1)
         {
@@ -75,26 +75,23 @@ __global__ void gridCellPredictionUpdateKernel(GridCell* __restrict__ grid_cell_
 
             if (m_occ_pred > 1.0f)
             {
-                // printf("Predicted mass greater 1. Mass is: %f\n", m_occ_pred);
                 normalize_weights(particle_array, weight_array, start_idx, end_idx, m_occ_pred);
                 m_occ_pred = 1.0f;
             }
 
-            float m_free_pred = predict_free_mass(grid_cell_array[i], m_occ_pred);
-            float2 masses_up = update_masses(m_occ_pred, m_free_pred, meas_cell_array[i]);
+            float m_free_pred = predict_free_mass(grid_cell_array.free_mass[i], m_occ_pred);
+            float2 masses_up = update_masses(m_occ_pred, m_free_pred, meas_cell_array, i);
             float rho_b = separate_newborn_part(m_occ_pred, masses_up.x, p_B);
             float rho_p = masses_up.x - rho_b;
             born_masses_array[i] = rho_b;
 
-            // printf("Rho B: %f\n", rho_b);
-
             store_values(rho_b, rho_p, masses_up.y, masses_up.x, m_occ_pred, grid_cell_array, i);
         }
         else
         {
-            float m_occ = grid_cell_array[i].occ_mass;
-            float m_free = predict_free_mass(grid_cell_array[i], m_occ);
-            float2 masses_up = update_masses(m_occ, m_free, meas_cell_array[i]);
+            float m_occ = grid_cell_array.occ_mass[i];
+            float m_free = predict_free_mass(grid_cell_array.free_mass[i], m_occ);
+            float2 masses_up = update_masses(m_occ, m_free, meas_cell_array, i);
             born_masses_array[i] = 0.0f;
             store_values(0.0f, masses_up.x, masses_up.y, masses_up.x, 0.0f, grid_cell_array, i);
         }
diff --git a/dogm/src/kernel/particle_to_grid.cu b/dogm/src/kernel/particle_to_grid.cu
index ac339af..3d24f7f 100644
--- a/dogm/src/kernel/particle_to_grid.cu
+++ b/dogm/src/kernel/particle_to_grid.cu
@@ -24,7 +24,7 @@ __device__ bool is_last_particle(const ParticlesSoA& particle_array, int particl
     return i == particle_count - 1 || particle_array.grid_cell_idx[i] != particle_array.grid_cell_idx[i + 1];
 }
 
-__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCell* __restrict__ grid_cell_array,
+__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCellsSoA grid_cell_array,
                                      float* __restrict__ weight_array, int particle_count)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x)
@@ -33,15 +33,13 @@ __global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCell
 
         if (is_first_particle(particle_array, i))
         {
-            grid_cell_array[j].start_idx = i;
+            grid_cell_array.start_idx[j] = i;
         }
         if (is_last_particle(particle_array, particle_count, i))
         {
-            grid_cell_array[j].end_idx = i;
+            grid_cell_array.end_idx[j] = i;
         }
 
-        // printf("Cell: %d, Start idx: %d, End idx: %d\n", j, grid_cell_array[j].start_idx,
-        // grid_cell_array[j].end_idx);
         weight_array[i] = particle_array.weight[i];
     }
 }
diff --git a/dogm/src/kernel/statistical_moments.cu b/dogm/src/kernel/statistical_moments.cu
index 436428b..287d85c 100644
--- a/dogm/src/kernel/statistical_moments.cu
+++ b/dogm/src/kernel/statistical_moments.cu
@@ -45,14 +45,14 @@ __device__ float calc_covariance(const float* __restrict__ vel_xy_array_accum, i
     return 0.0f;
 }
 
-__device__ void store(GridCell* __restrict__ grid_cell_array, int j, float mean_x_vel, float mean_y_vel,
+__device__ void store(GridCellsSoA grid_cell_array, int j, float mean_x_vel, float mean_y_vel,
                       float var_x_vel, float var_y_vel, float covar_xy_vel)
 {
-    grid_cell_array[j].mean_x_vel = mean_x_vel;
-    grid_cell_array[j].mean_y_vel = mean_y_vel;
-    grid_cell_array[j].var_x_vel = var_x_vel;
-    grid_cell_array[j].var_y_vel = var_y_vel;
-    grid_cell_array[j].covar_xy_vel = covar_xy_vel;
+    grid_cell_array.mean_x_vel[j] = mean_x_vel;
+    grid_cell_array.mean_y_vel[j] = mean_y_vel;
+    grid_cell_array.var_x_vel[j] = var_x_vel;
+    grid_cell_array.var_y_vel[j] = var_y_vel;
+    grid_cell_array.covar_xy_vel[j] = covar_xy_vel;
 }
 
 __global__ void statisticalMomentsKernel1(const ParticlesSoA particle_array, const float* __restrict__ weight_array,
@@ -71,12 +71,10 @@ __global__ void statisticalMomentsKernel1(const ParticlesSoA particle_array, con
         vel_x_squared_array[i] = weight * vel_x * vel_x;
         vel_y_squared_array[i] = weight * vel_y * vel_y;
         vel_xy_array[i] = weight * vel_x * vel_y;
-
-        // printf("vx: %f, vy: %f\n", vel_x_array[i], vel_y_array[i]);
     }
 }
 
-__global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array,
+__global__ void statisticalMomentsKernel2(GridCellsSoA grid_cell_array,
                                           const float* __restrict__ vel_x_array_accum,
                                           const float* __restrict__ vel_y_array_accum,
                                           const float* __restrict__ vel_x_squared_array_accum,
@@ -85,10 +83,9 @@ __global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x)
     {
-        int start_idx = grid_cell_array[i].start_idx;
-        int end_idx = grid_cell_array[i].end_idx;
-        float rho_p = grid_cell_array[i].pers_occ_mass;
-        // printf("rho p: %f\n", rho_p);
+        int start_idx = grid_cell_array.start_idx[i];
+        int end_idx = grid_cell_array.end_idx[i];
+        float rho_p = grid_cell_array.pers_occ_mass[i];
 
         if (start_idx != -1)
         {
@@ -97,7 +94,6 @@ __global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array
             float var_x_vel = calc_variance(vel_x_squared_array_accum, start_idx, end_idx, rho_p, mean_x_vel);
             float var_y_vel = calc_variance(vel_y_squared_array_accum, start_idx, end_idx, rho_p, mean_y_vel);
             float covar_xy_vel = calc_covariance(vel_xy_array_accum, start_idx, end_idx, rho_p, mean_x_vel, mean_y_vel);
-            // printf("x: %f, y: %f\n", mean_x_vel, mean_y_vel);
 
             store(grid_cell_array, i, mean_x_vel, mean_y_vel, var_x_vel, var_y_vel, covar_xy_vel);
         }
diff --git a/dogm/src/kernel/update_persistent_particles.cu b/dogm/src/kernel/update_persistent_particles.cu
index b0d8aa9..e952fac 100644
--- a/dogm/src/kernel/update_persistent_particles.cu
+++ b/dogm/src/kernel/update_persistent_particles.cu
@@ -18,36 +18,34 @@ __device__ float calc_norm_assoc(float occ_accum, float rho_p)
     return occ_accum > 0.0f ? rho_p / occ_accum : 0.0f;
 }
 
-__device__ float calc_norm_unassoc(const GridCell& grid_cell)
+__device__ float calc_norm_unassoc(float pred_occ_mass, float pers_occ_mass)
 {
-    float pred_occ_mass = grid_cell.pred_occ_mass;
-    return pred_occ_mass > 0.0f ? grid_cell.pers_occ_mass / pred_occ_mass : 0.0f;
+    return pred_occ_mass > 0.0f ? pers_occ_mass / pred_occ_mass : 0.0f;
 }
 
-__device__ void set_normalization_components(GridCell* __restrict__ grid_cell_array, int i, float mu_A, float mu_UA)
+__device__ void set_normalization_components(GridCellsSoA grid_cell_array, int i, float mu_A, float mu_UA)
 {
-    grid_cell_array[i].mu_A = mu_A;
-    grid_cell_array[i].mu_UA = mu_UA;
+    grid_cell_array.mu_A[i] = mu_A;
+    grid_cell_array.mu_UA[i] = mu_UA;
 }
 
 __device__ float update_unnorm(const ParticlesSoA& particle_array, int i,
-                               const MeasurementCell* __restrict__ meas_cell_array)
+                               const MeasurementCellsSoA meas_cell_array)
 {
-    return meas_cell_array[particle_array.grid_cell_idx[i]].likelihood * particle_array.weight[i];
+    return meas_cell_array.likelihood[particle_array.grid_cell_idx[i]] * particle_array.weight[i];
 }
 
-__device__ float normalize(const ParticlesSoA& particle, int i, const GridCell* __restrict__ grid_cell_array,
-                           const MeasurementCell* __restrict__ meas_cell_array, float weight)
+__device__ float normalize(const ParticlesSoA& particle, int i, const GridCellsSoA grid_cell_array,
+                           const MeasurementCellsSoA meas_cell_array, float weight)
 {
     const int cell_idx = particle.grid_cell_idx[i];
-    const GridCell& cell = grid_cell_array[cell_idx];
-    const MeasurementCell& meas_cell = meas_cell_array[cell_idx];
+    const float p_A = meas_cell_array.p_A[cell_idx];
 
-    return meas_cell.p_A * cell.mu_A * weight + (1.0f - meas_cell.p_A) * cell.mu_UA * particle.weight[i];
+    return p_A * grid_cell_array.mu_A[cell_idx] * weight + (1.0f - p_A) * grid_cell_array.mu_UA[cell_idx] * particle.weight[i];
 }
 
 __global__ void updatePersistentParticlesKernel1(const ParticlesSoA particle_array,
-                                                 const MeasurementCell* __restrict__ meas_cell_array,
+                                                 const MeasurementCellsSoA meas_cell_array,
                                                  float* __restrict__ weight_array, int particle_count)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x)
@@ -56,29 +54,28 @@ __global__ void updatePersistentParticlesKernel1(const ParticlesSoA particle_arr
     }
 }
 
-__global__ void updatePersistentParticlesKernel2(GridCell* __restrict__ grid_cell_array,
+__global__ void updatePersistentParticlesKernel2(GridCellsSoA grid_cell_array,
                                                  const float* __restrict__ weight_array_accum, int cell_count)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x)
     {
-        int start_idx = grid_cell_array[i].start_idx;
-        int end_idx = grid_cell_array[i].end_idx;
+        int start_idx = grid_cell_array.start_idx[i];
+        int end_idx = grid_cell_array.end_idx[i];
 
         if (start_idx != -1)
         {
             float m_occ_accum = subtract(weight_array_accum, start_idx, end_idx);
-            float rho_p = grid_cell_array[i].pers_occ_mass;
+            float rho_p = grid_cell_array.pers_occ_mass[i];
             float mu_A = calc_norm_assoc(m_occ_accum, rho_p);
-            float mu_UA = calc_norm_unassoc(grid_cell_array[i]);
+            float mu_UA = calc_norm_unassoc(grid_cell_array.pred_occ_mass[i], grid_cell_array.pers_occ_mass[i]);
             set_normalization_components(grid_cell_array, i, mu_A, mu_UA);
-            // printf("mu_A: %f, mu_UA: %f\n", mu_A, mu_UA);
         }
     }
 }
 
 __global__ void updatePersistentParticlesKernel3(const ParticlesSoA particle_array,
-                                                 const MeasurementCell* __restrict__ meas_cell_array,
-                                                 const GridCell* __restrict__ grid_cell_array,
+                                                 const MeasurementCellsSoA meas_cell_array,
+                                                 const GridCellsSoA grid_cell_array,
                                                  float* __restrict__ weight_array, int particle_count)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x)

From 77760043117e140e73d4c335b8a2897ae4f5a772 Mon Sep 17 00:00:00 2001
From: ShepelIlya <chepsiilya@gmail.com>
Date: Wed, 18 May 2022 11:56:53 +0300
Subject: [PATCH 2/5] memory leak fix

---
 dogm/src/kernel/init_new_particles.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dogm/src/kernel/init_new_particles.cu b/dogm/src/kernel/init_new_particles.cu
index 90517f7..82e5929 100644
--- a/dogm/src/kernel/init_new_particles.cu
+++ b/dogm/src/kernel/init_new_particles.cu
@@ -146,12 +146,12 @@ __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array,
 
         // printf("w_A: %f, w_UA: %f\n", w_A, w_UA);
 
-        for (int i = start_idx; i < start_idx + nu_A + 1; i++)
+        for (int i = start_idx; i < start_idx + nu_A; i++)
         {
             set_cell_idx_A(birth_particle_array, i, j);
         }
 
-        for (int i = start_idx + nu_A + 1; i < end_idx + 1; i++)
+        for (int i = start_idx + nu_A; i < end_idx + 1; i++)
         {
             set_cell_idx_UA(birth_particle_array, i, j);
         }

From 28d13ef63b48ac6eeb2c1a2a92cb302313ab5f50 Mon Sep 17 00:00:00 2001
From: ShepelIlya <chepsiilya@gmail.com>
Date: Wed, 18 May 2022 12:01:42 +0300
Subject: [PATCH 3/5] fixed bug with resolution and translation.

---
 dogm/include/dogm/dogm.h                      |  3 +-
 dogm/include/dogm/kernel/init.h               |  8 +-
 dogm/include/dogm/kernel/init_new_particles.h | 13 +--
 dogm/include/dogm/kernel/predict.h            |  6 +-
 dogm/src/dogm.cu                              | 93 ++++---------------
 dogm/src/kernel/ego_motion_compensation.cu    | 47 ++++++++--
 dogm/src/kernel/init.cu                       | 18 +---
 dogm/src/kernel/init_new_particles.cu         | 18 ++--
 dogm/src/kernel/predict.cu                    | 10 +-
 9 files changed, 83 insertions(+), 133 deletions(-)

diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h
index ee34ced..0ac54ff 100644
--- a/dogm/include/dogm/dogm.h
+++ b/dogm/include/dogm/dogm.h
@@ -140,7 +140,7 @@ class DOGM
 private:
     void initialize();
 
-    void updatePose(float new_x, float new_y, float new_yaw);
+    void updatePose(float new_x, float new_y);
     void updateMeasurementGrid(MeasurementCellsSoA measurement_grid, bool device);
 
 public:
@@ -196,7 +196,6 @@ class DOGM
     bool first_measurement_received;
     float position_x;
     float position_y;
-    float yaw;
 };
 
 } /* namespace dogm */
diff --git a/dogm/include/dogm/kernel/init.h b/dogm/include/dogm/kernel/init.h
index dd273f7..07d1280 100644
--- a/dogm/include/dogm/kernel/init.h
+++ b/dogm/include/dogm/kernel/init.h
@@ -10,14 +10,10 @@
 namespace dogm
 {
 
-struct GridCell;
-struct MeasurementCell;
-struct Particle;
-
-__global__ void setupRandomStatesKernel(curandState* __restrict__ states, unsigned long long seed, int count);
+__global__ void setupRandomStatesKernel(curandState* __restrict__ states, int count);
 
 __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity,
-                                    int grid_size, int particle_count);
+                                    int grid_size, int particle_count, float resolution);
 
 __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, curandState* __restrict__ global_state,
                                          float velocity, int grid_size, int particle_count);
diff --git a/dogm/include/dogm/kernel/init_new_particles.h b/dogm/include/dogm/kernel/init_new_particles.h
index cdb87f2..17b71c6 100644
--- a/dogm/include/dogm/kernel/init_new_particles.h
+++ b/dogm/include/dogm/kernel/init_new_particles.h
@@ -10,10 +10,6 @@
 namespace dogm
 {
 
-struct GridCell;
-struct MeasurementCell;
-struct Particle;
-
 void normalize_particle_orders(float* particle_orders_array_accum, int particle_orders_count, int v_B);
 
 __global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, float* __restrict__ masses,
@@ -22,9 +18,9 @@ __global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, floa
 __global__ void initParticlesKernel1(ParticlesSoA particle_array,
                                      const float* __restrict__ particle_orders_array_accum, int cell_count);
 
-__global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell* __restrict__ grid_cell_array,
-                                     curandState* __restrict__ global_state, float velocity, int grid_size,
-                                     float new_weight, int particle_count);
+__global__ void initParticlesKernel2(ParticlesSoA particle_array, curandState* __restrict__ global_state,
+                                     float velocity, int grid_size, float new_weight, int particle_count,
+                                     float resolution);
 
 __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array,
                                         const MeasurementCellsSoA meas_cell_array,
@@ -34,7 +30,8 @@ __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array,
 
 __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array,
                                         curandState* __restrict__ global_state, float stddev_velocity,
-                                        float max_velocity, int grid_size, int particle_count);
+                                        float max_velocity, int grid_size, int particle_count,
+                                        float resolution);
 
 __global__ void copyBirthWeightKernel(const ParticlesSoA birth_particle_array, float* __restrict__ birth_weight_array,
                                       int particle_count);
diff --git a/dogm/include/dogm/kernel/predict.h b/dogm/include/dogm/kernel/predict.h
index bfec921..69bddf9 100644
--- a/dogm/include/dogm/kernel/predict.h
+++ b/dogm/include/dogm/kernel/predict.h
@@ -11,10 +11,8 @@
 namespace dogm
 {
 
-struct Particle;
-
 __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity,
-                              int grid_size, float p_S, const glm::mat4x4 transition_matrix,
-                              float process_noise_position, float process_noise_velocity, int particle_count);
+                              int grid_size, float p_S, const glm::mat4x4 transition_matrix, float process_noise_position,
+                              float process_noise_velocity, int particle_count, float resolution);
 
 } /* namespace dogm */
diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu
index 17c6f9d..d1d1119 100644
--- a/dogm/src/dogm.cu
+++ b/dogm/src/dogm.cu
@@ -34,13 +34,13 @@ DOGM::DOGM(const Params& params)
     : params(params), grid_size(static_cast<int>(params.size / params.resolution)),
       particle_count(params.particle_count), grid_cell_count(grid_size * grid_size),
       new_born_particle_count(params.new_born_particle_count), block_dim(BLOCK_SIZE), first_pose_received(false),
-      first_measurement_received(false), position_x(0.0f), position_y(0.0f)
+      first_measurement_received(false), position_x(0.0f), position_y(0.0f), iteration(0)
 {
     int device;
-    CHECK_ERROR(cudaGetDevice(&device));
+    CUDA_CALL(cudaGetDevice(&device));
 
     cudaDeviceProp device_prop;
-    CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device));
+    CUDA_CALL(cudaGetDeviceProperties(&device_prop, device));
 
     int blocks_per_sm = device_prop.maxThreadsPerMultiProcessor / block_dim.x;
     dim3 dim(device_prop.multiProcessorCount * blocks_per_sm);
@@ -109,10 +109,10 @@ void DOGM::initialize()
     CUDA_CALL(cudaStreamDestroy(grid_stream));
 }
 
-void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_y, float new_yaw, float dt, bool device)
+void DOGM::updateGrid(MeasurementCellsSoA measurement_grid, float new_x, float new_y, float dt, bool device)
 {
     updateMeasurementGrid(measurement_grid, device);
-    updatePose(new_x, new_y, new_yaw);
+    updatePose(new_x, new_y);
 
     particlePrediction(dt);
     particleAssignment();
@@ -124,8 +124,6 @@ void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_
 
     particle_array = particle_array_next;
 
-    CHECK_ERROR(cudaDeviceSynchronize());
-
     iteration++;
 }
 
@@ -153,21 +151,19 @@ ParticlesSoA DOGM::getParticles() const
     return particles;
 }
 
-void DOGM::updatePose(float new_x, float new_y, float new_yaw)
+void DOGM::updatePose(float new_x, float new_y)
 {
     if (!first_pose_received)
     {
         position_x = new_x;
         position_y = new_y;
-        yaw = new_yaw;
         first_pose_received = true;
     }
     else
     {
-        const float x_diff = new_x - position_x;
-        const float y_diff = new_y - position_y;
-
-        if (fabsf(x_diff) > params.resolution || fabsf(y_diff) > params.resolution)
+        const int x_move = std::nearbyint((new_x - position_x) / params.resolution);
+        const int y_move = std::nearbyint((new_y - position_y) / params.resolution);
+        if (x_move != 0 || y_move != 0)
         {
             moveParticlesKernel<<<particles_grid, block_dim>>>(particle_array, x_move, y_move, particle_count,
                 params.resolution, grid_size);
@@ -185,7 +181,6 @@ void DOGM::updatePose(float new_x, float new_y, float new_yaw)
 
             position_x = new_x;
             position_y = new_y;
-            yaw = new_yaw;
         }
     }
 }
@@ -206,27 +201,22 @@ void DOGM::initializeParticles()
 {
     copyMassesKernel<<<grid_map_grid, block_dim>>>(meas_cell_array, born_masses_array, grid_cell_count);
 
-    CHECK_ERROR(cudaGetLastError());
-    CHECK_ERROR(cudaDeviceSynchronize());
-
     thrust::device_vector<float> particle_orders_accum(grid_cell_count);
     accumulate(born_masses_array, particle_orders_accum);
     float* particle_orders_array_accum = thrust::raw_pointer_cast(particle_orders_accum.data());
 
     // TODO: particle_orders_accum.back() / particle_count is correct but leads to inferior results
-    float new_weight = 1.0f / particle_count;
+    // TODO: or back to 1 / particle_count
+    float new_weight = particle_orders_accum.back() / particle_count;
 
     normalize_particle_orders(particle_orders_array_accum, grid_cell_count, particle_count);
 
-    initParticlesKernel1<<<grid_map_grid, block_dim>>>(grid_cell_array, meas_cell_array, particle_array,
+    initParticlesKernel1<<<grid_map_grid, block_dim>>>(particle_array,
                                                        particle_orders_array_accum, grid_cell_count);
 
-    CHECK_ERROR(cudaGetLastError());
-
     initParticlesKernel2<<<particles_grid, block_dim>>>(
-        particle_array, grid_cell_array, rng_states, params.init_max_velocity, grid_size, new_weight, particle_count);
-
-    CHECK_ERROR(cudaGetLastError());
+        particle_array, rng_states, params.init_max_velocity, grid_size, new_weight, particle_count,
+        params.resolution);
 }
 
 void DOGM::particlePrediction(float dt)
@@ -239,25 +229,15 @@ void DOGM::particlePrediction(float dt)
                                   0, dt, 0, 1);
     // clang-format on
 
-    // FIXME: glm uses column major, we need row major
-    transition_matrix = glm::transpose(transition_matrix);
-
     predictKernel<<<particles_grid, block_dim>>>(
         particle_array, rng_states, params.stddev_velocity, grid_size, params.persistence_prob, transition_matrix,
-        params.stddev_process_noise_position, params.stddev_process_noise_velocity, particle_count);
-
-    CHECK_ERROR(cudaGetLastError());
+        params.stddev_process_noise_position, params.stddev_process_noise_velocity, particle_count, params.resolution);
 }
 
 void DOGM::particleAssignment()
 {
-    // std::cout << "DOGM::particleAssignment" << std::endl;
-
     reinitGridParticleIndices<<<grid_map_grid, block_dim>>>(grid_cell_array, grid_cell_count);
 
-    CHECK_ERROR(cudaGetLastError());
-    // CHECK_ERROR(cudaDeviceSynchronize());
-
     // sort particles
     thrust::device_ptr<int> grid_index_ptr(particle_array.grid_cell_idx);
     thrust::device_ptr<float> weight_ptr(particle_array.weight);
@@ -268,16 +248,10 @@ void DOGM::particleAssignment()
     thrust::sort_by_key(grid_index_ptr, grid_index_ptr + particle_count, it);
 
     particleToGridKernel<<<particles_grid, block_dim>>>(particle_array, grid_cell_array, weight_array, particle_count);
-
-    CHECK_ERROR(cudaGetLastError());
 }
 
 void DOGM::gridCellOccupancyUpdate()
 {
-    // std::cout << "DOGM::gridCellOccupancyUpdate" << std::endl;
-
-    // CHECK_ERROR(cudaDeviceSynchronize());
-
     thrust::device_vector<float> weights_accum(particle_count);
     accumulate(weight_array, weights_accum);
     float* weight_array_accum = thrust::raw_pointer_cast(weights_accum.data());
@@ -285,20 +259,13 @@ void DOGM::gridCellOccupancyUpdate()
     gridCellPredictionUpdateKernel<<<grid_map_grid, block_dim>>>(grid_cell_array, particle_array, weight_array,
                                                                  weight_array_accum, meas_cell_array, born_masses_array,
                                                                  params.birth_prob, grid_cell_count);
-
-    CHECK_ERROR(cudaGetLastError());
 }
 
 void DOGM::updatePersistentParticles()
 {
-    // std::cout << "DOGM::updatePersistentParticles" << std::endl;
-
     updatePersistentParticlesKernel1<<<particles_grid, block_dim>>>(particle_array, meas_cell_array, weight_array,
                                                                     particle_count);
 
-    CHECK_ERROR(cudaGetLastError());
-    // CHECK_ERROR(cudaDeviceSynchronize());
-
     thrust::device_vector<float> weights_accum(particle_count);
     accumulate(weight_array, weights_accum);
     float* weight_array_accum = thrust::raw_pointer_cast(weights_accum.data());
@@ -306,24 +273,15 @@ void DOGM::updatePersistentParticles()
     updatePersistentParticlesKernel2<<<divUp(grid_cell_count, BLOCK_SIZE), BLOCK_SIZE>>>(
         grid_cell_array, weight_array_accum, grid_cell_count);
 
-    CHECK_ERROR(cudaGetLastError());
-
     updatePersistentParticlesKernel3<<<particles_grid, block_dim>>>(particle_array, meas_cell_array, grid_cell_array,
                                                                     weight_array, particle_count);
-
-    CHECK_ERROR(cudaGetLastError());
 }
 
 void DOGM::initializeNewParticles()
 {
-    // std::cout << "DOGM::initializeNewParticles" << std::endl;
-
     initBirthParticlesKernel<<<birth_particles_grid, block_dim>>>(
         birth_particle_array, rng_states, params.stddev_velocity, grid_size, new_born_particle_count);
 
-    CHECK_ERROR(cudaGetLastError());
-    // CHECK_ERROR(cudaDeviceSynchronize());
-
     thrust::device_vector<float> particle_orders_accum(grid_cell_count);
     accumulate(born_masses_array, particle_orders_accum);
     float* particle_orders_array_accum = thrust::raw_pointer_cast(particle_orders_accum.data());
@@ -334,26 +292,18 @@ void DOGM::initializeNewParticles()
                                                           born_masses_array, birth_particle_array,
                                                           particle_orders_array_accum, grid_cell_count);
 
-    CHECK_ERROR(cudaGetLastError());
-
     initNewParticlesKernel2<<<birth_particles_grid, block_dim>>>(birth_particle_array, grid_cell_array, rng_states,
                                                                  params.stddev_velocity, params.init_max_velocity,
-                                                                 grid_size, new_born_particle_count);
+                                                                 grid_size, new_born_particle_count, params.resolution);
 
-    CHECK_ERROR(cudaGetLastError());
 }
 
 void DOGM::statisticalMoments()
 {
-    // std::cout << "DOGM::statisticalMoments" << std::endl;
-
     statisticalMomentsKernel1<<<particles_grid, block_dim>>>(particle_array, weight_array, vel_x_array, vel_y_array,
                                                              vel_x_squared_array, vel_y_squared_array, vel_xy_array,
                                                              particle_count);
 
-    CHECK_ERROR(cudaGetLastError());
-    // CHECK_ERROR(cudaDeviceSynchronize());
-
     thrust::device_vector<float> vel_x_accum(particle_count);
     accumulate(vel_x_array, vel_x_accum);
     float* vel_x_array_accum = thrust::raw_pointer_cast(vel_x_accum.data());
@@ -377,16 +327,10 @@ void DOGM::statisticalMoments()
     statisticalMomentsKernel2<<<grid_map_grid, block_dim>>>(grid_cell_array, vel_x_array_accum, vel_y_array_accum,
                                                             vel_x_squared_array_accum, vel_y_squared_array_accum,
                                                             vel_xy_array_accum, grid_cell_count);
-
-    CHECK_ERROR(cudaGetLastError());
 }
 
 void DOGM::resampling()
 {
-    // std::cout << "DOGM::resampling" << std::endl;
-
-    // CHECK_ERROR(cudaDeviceSynchronize());
-
     thrust::device_ptr<float> persistent_weights(weight_array);
     thrust::device_ptr<float> new_born_weights(birth_particle_array.weight);
 
@@ -402,9 +346,6 @@ void DOGM::resampling()
     resamplingGenerateRandomNumbersKernel<<<particles_grid, block_dim>>>(rand_array, rng_states, joint_max,
                                                                          particle_count);
 
-    CHECK_ERROR(cudaGetLastError());
-    // CHECK_ERROR(cudaDeviceSynchronize());
-
     thrust::device_ptr<float> rand_ptr(rand_array);
     thrust::device_vector<float> rand_vector(rand_ptr, rand_ptr + particle_count);
 
@@ -416,8 +357,6 @@ void DOGM::resampling()
 
     float new_weight = joint_max / particle_count;
 
-    // printf("joint_max: %f\n", joint_max);
-
     resamplingKernel<<<particles_grid, block_dim>>>(particle_array, particle_array_next, birth_particle_array,
                                                     idx_array_resampled, new_weight, particle_count);
 
diff --git a/dogm/src/kernel/ego_motion_compensation.cu b/dogm/src/kernel/ego_motion_compensation.cu
index e447203..e26b557 100644
--- a/dogm/src/kernel/ego_motion_compensation.cu
+++ b/dogm/src/kernel/ego_motion_compensation.cu
@@ -13,12 +13,16 @@
 namespace dogm
 {
 
-__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count)
+__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count,
+                                    float resolution, int grid_size)
 {
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x)
     {
-        particle_array.state[i][0] -= x_move;
-        particle_array.state[i][1] -= y_move;
+        particle_array.state[i][0] -= (x_move * resolution);
+        particle_array.state[i][1] -= (y_move * resolution);
+
+        particle_array.grid_cell_idx[i] = static_cast<int>(particle_array.state[i][1] / resolution) * grid_size
+            + static_cast<int>(particle_array.state[i][0] / resolution);
     }
 }
 
@@ -28,17 +32,44 @@ __global__ void moveMapKernel(GridCellsSoA grid_cell_array, GridCellsSoA old_gri
 {
     const int x = blockIdx.x * blockDim.x + threadIdx.x;
     const int y = blockIdx.y * blockDim.y + threadIdx.y;
+    float eps = 0.001f;
 
     if (x < grid_size && y < grid_size)
     {
         int index = x + grid_size * y;
-        int new_y = y + y_move;
-        int new_x = x + x_move;
-        int new_index = new_x + grid_size * new_y;
+        int old_y = y + y_move;
+        int old_x = x + x_move;
+        int old_index = old_x + grid_size * old_y;
 
-        if (new_x > 0 && new_x < grid_size && new_y > 0 && new_y < grid_size)
+        if (old_x >= 0 && old_x < grid_size && old_y >= 0 && old_y < grid_size && meas_cell_array.occ_mass[index] > eps)
+        {
+            grid_cell_array.copy(old_grid_cell_array, index, old_index);
+        }
+        else
         {
-            grid_cell_array[index] = old_grid_cell_array[new_index];
+            // delete particles on old cells? looks like it break something
+            // for (int i = old_grid_cell_array.start_idx[old_index]; i < old_grid_cell_array.end_idx[old_index]; ++i)
+            //     particle_array.weight[i] = 0;
+            grid_cell_array.start_idx[index] = -1;
+            grid_cell_array.end_idx[index] = -1;
+            grid_cell_array.new_born_occ_mass[index] = 0.0f;
+            grid_cell_array.pers_occ_mass[index] = 0.0f;
+            grid_cell_array.free_mass[index] = 0.0f;
+            grid_cell_array.occ_mass[index] = 0.0f;            
+            grid_cell_array.pred_occ_mass[index] = 0.0f;
+
+            grid_cell_array.mu_A[index] = 0.0f;
+            grid_cell_array.mu_UA[index] = 0.0f;
+
+            grid_cell_array.w_A[index] = 0.0f;
+            grid_cell_array.w_UA[index] = 0.0f;
+
+            grid_cell_array.mean_x_vel[index] = 0.0f;
+            grid_cell_array.mean_y_vel[index] = 0.0f;
+            grid_cell_array.var_x_vel[index] = 0.0f;
+            grid_cell_array.var_y_vel[index] = 0.0f;
+            grid_cell_array.covar_xy_vel[index] = 0.0f;
+            
         }
     }
 }
diff --git a/dogm/src/kernel/init.cu b/dogm/src/kernel/init.cu
index 6230e17..1435b10 100644
--- a/dogm/src/kernel/init.cu
+++ b/dogm/src/kernel/init.cu
@@ -13,8 +13,9 @@
 namespace dogm
 {
 
-__global__ void setupRandomStatesKernel(curandState* __restrict__ states, unsigned long long seed, int count)
+__global__ void setupRandomStatesKernel(curandState* __restrict__ states, int count)
 {
+    long long int seed = clock64();
     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x)
     {
         curand_init(seed, i, 0, &states[i]);
@@ -22,7 +23,7 @@ __global__ void setupRandomStatesKernel(curandState* __restrict__ states, unsign
 }
 
 __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity,
-                                    int grid_size, int particle_count)
+                                    int grid_size, int particle_count, float resolution)
 {
     int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
@@ -31,8 +32,8 @@ __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __
 
     for (int i = thread_id; i < particle_count; i += stride)
     {
-        float x = curand_uniform(&local_state, 0.0f, grid_size - 1);
-        float y = curand_uniform(&local_state, 0.0f, grid_size - 1);
+        float x = curand_uniform(&local_state, 0.0f, (grid_size - 1) * resolution);
+        float y = curand_uniform(&local_state, 0.0f, (grid_size - 1) * resolution);
         float vel_x = curand_uniform(&local_state, -velocity, velocity);
         float vel_y = curand_uniform(&local_state, -velocity, velocity);
 
@@ -52,21 +53,12 @@ __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, cura
     int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
 
-    // curandState local_state = global_state[thread_id];
-
     for (int i = thread_id; i < particle_count; i += stride)
     {
-        // float x = curand_uniform(&local_state, 0.0f, grid_size - 1);
-        // float y = curand_uniform(&local_state, 0.0f, grid_size - 1);
-        // float vel_x = curand_normal(&local_state, 0.0f, velocity);
-        // float vel_y = curand_normal(&local_state, 0.0f, velocity);
-
         birth_particle_array.weight[i] = 0.0f;
         birth_particle_array.associated[i] = false;
         birth_particle_array.state[i] = glm::vec4(0.0f, 0.0f, 0.0f, 0.0f);
     }
-
-    // global_state[thread_id] = local_state;
 }
 
 __global__ void initGridCellsKernel(GridCellsSoA grid_cell_array,
diff --git a/dogm/src/kernel/init_new_particles.cu b/dogm/src/kernel/init_new_particles.cu
index 82e5929..d6511af 100644
--- a/dogm/src/kernel/init_new_particles.cu
+++ b/dogm/src/kernel/init_new_particles.cu
@@ -98,9 +98,9 @@ __global__ void initParticlesKernel1(ParticlesSoA particle_array,
     }
 }
 
-__global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell* __restrict__ grid_cell_array,
-                                     curandState* __restrict__ global_state, float velocity, int grid_size,
-                                     float new_weight, int particle_count)
+__global__ void initParticlesKernel2(ParticlesSoA particle_array, curandState* __restrict__ global_state, 
+                                     float velocity, int grid_size, float new_weight, int particle_count,
+                                     float resolution)
 {
     int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
@@ -111,8 +111,8 @@ __global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell
     {
         int cell_idx = particle_array.grid_cell_idx[i];
 
-        float x = cell_idx % grid_size + 0.5f;
-        float y = cell_idx / grid_size + 0.5f;
+        float x = (cell_idx % grid_size + 0.5f) * resolution;
+        float y = (cell_idx / grid_size + 0.5f) * resolution;
         float vel_x = curand_uniform(&local_state, -velocity, velocity);
         float vel_y = curand_uniform(&local_state, -velocity, velocity);
 
@@ -160,7 +160,8 @@ __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array,
 
 __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array,
                                         curandState* __restrict__ global_state, float stddev_velocity,
-                                        float max_velocity, int grid_size, int particle_count)
+                                        float max_velocity, int grid_size, int particle_count,
+                                        float resolution)
 {
     int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
@@ -170,11 +171,10 @@ __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const
     for (int i = thread_id; i < particle_count; i += stride)
     {
         int cell_idx = birth_particle_array.grid_cell_idx[i];
-        const GridCell& grid_cell = grid_cell_array[cell_idx];
         bool associated = birth_particle_array.associated[i];
 
-        float x = cell_idx % grid_size + 0.5f;
-        float y = cell_idx / static_cast<float>(grid_size) + 0.5f;
+        float x = (cell_idx % grid_size + 0.5f) * resolution;
+        float y = (cell_idx / grid_size + 0.5f) * resolution;
 
         if (associated)
         {
diff --git a/dogm/src/kernel/predict.cu b/dogm/src/kernel/predict.cu
index 1dcdd38..1ca06fa 100644
--- a/dogm/src/kernel/predict.cu
+++ b/dogm/src/kernel/predict.cu
@@ -14,8 +14,8 @@ namespace dogm
 {
 
 __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity,
-                              int grid_size, float p_S, const glm::mat4x4 transition_matrix,
-                              float process_noise_position, float process_noise_velocity, int particle_count)
+                              int grid_size, float p_S, const glm::mat4x4 transition_matrix, float process_noise_position,
+                              float process_noise_velocity, int particle_count, float resolution)
 {
     int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
     int stride = blockDim.x * gridDim.x;
@@ -34,8 +34,8 @@ __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restri
         particle_array.weight[i] = p_S * particle_array.weight[i];
 
         glm::vec4 state = particle_array.state[i];
-        float x = state[0];
-        float y = state[1];
+        float x = state[0] / resolution;
+        float y = state[1] / resolution;
 
         // Particle out of grid so decrease its chance of being resampled
         if ((x > grid_size - 1 || x < 0) || (y > grid_size - 1 || y < 0))
@@ -46,8 +46,6 @@ __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restri
         int pos_x = clamp(static_cast<int>(x), 0, grid_size - 1);
         int pos_y = clamp(static_cast<int>(y), 0, grid_size - 1);
         particle_array.grid_cell_idx[i] = pos_x + grid_size * pos_y;
-
-        // printf("X: %d, Y: %d, Cell index: %d\n", pos_x, pos_y, (pos_x + grid_size * pos_y));
     }
 
     global_state[thread_id] = local_state;

From b21f1b8da51eb2dab7e5d7a0b5783222caad532d Mon Sep 17 00:00:00 2001
From: ShepelIlya <chepsiilya@gmail.com>
Date: Wed, 18 May 2022 12:09:20 +0300
Subject: [PATCH 4/5] custom commit - merge this with this care! added some
 debug functions, changed cmeke files for building with ros

---
 dogm/CMakeLists.txt                | 56 +++++++++++++++++++------
 dogm/demo/CMakeLists.txt           |  7 +++-
 dogm/demo/simulator/CMakeLists.txt | 14 +++----
 dogm/demo/utils/CMakeLists.txt     |  8 ++--
 dogm/dogm-config.cmake.in          | 14 +++++++
 dogm/include/dogm/dogm.h           | 13 +++---
 dogm/package.xml                   | 16 +++++++
 dogm/src/dogm.cu                   | 67 +++++++++++++++++++++++++++++-
 8 files changed, 160 insertions(+), 35 deletions(-)
 create mode 100644 dogm/dogm-config.cmake.in
 create mode 100644 dogm/package.xml

diff --git a/dogm/CMakeLists.txt b/dogm/CMakeLists.txt
index 2bc88cd..1f63cca 100644
--- a/dogm/CMakeLists.txt
+++ b/dogm/CMakeLists.txt
@@ -1,9 +1,10 @@
 cmake_minimum_required(VERSION 3.9)
-project(DOGM LANGUAGES CXX CUDA)
+project(dogm LANGUAGES CXX CUDA)
 
 set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
-option(BUILD_TESTS "Build unit tests" ON)
+option(BUILD_TESTS "Build unit tests" OFF)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/modules/")
 find_package(OpenGL REQUIRED)
@@ -27,13 +28,11 @@ if (NOT GLEW_FOUND)
 	set(GLEW_LIBRARIES "" CACHE FILEPATH "" )
 endif()
 
-LIST (APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-INCLUDE(GoogleTestDownloadAndBuild)
+find_package(OpenCV REQUIRED)
+find_package(Eigen3 REQUIRED)
 
-#if(WIN32)
-#    install(FILES "${OpenCV_DIR}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/bin/opencv_world${OpenCV_VERSION_MAJOR}${OpenCV_VERSION_MINOR}${OpenCV_VERSION_PATCH}d.dll" DESTINATION ${CMAKE_INSTALL_BINARY}/debug CONFIGURATIONS Debug)
-#    install(FILES "${OpenCV_DIR}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/bin/opencv_world${OpenCV_VERSION_MAJOR}${OpenCV_VERSION_MINOR}${OpenCV_VERSION_PATCH}.dll" DESTINATION ${CMAKE_INSTALL_BINARY} CONFIGURATIONS RelWithDebInfo Release)
-#endif(WIN32)
+LIST (APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+#INCLUDE(GoogleTestDownloadAndBuild)
 
 include_directories(
 	SYSTEM
@@ -41,6 +40,8 @@ include_directories(
 	${GLFW3_INCLUDE_DIR}
 	${GLEW_INCLUDE_DIRS}
 	${GLM_INCLUDE_DIR}
+	${OpenCV_INCLUDE_DIRS}
+	${EIGEN3_INCLUDE_DIR}
 )
 
 set(HEADER_FILES
@@ -76,7 +77,7 @@ set(SRC_FILES
 
 source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SRC_FILES})
 
-add_library(dogm STATIC
+add_library(dogm SHARED
     ${HEADER_FILES}
 	${SRC_FILES}
 )
@@ -91,12 +92,13 @@ string(APPEND CMAKE_CUDA_FLAGS " --expt-extended-lambda")
 #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored")
 set(CUDA_PROPAGATE_HOST_FLAGS ON)
 
-set_target_properties(dogm PROPERTIES PUBLIC_HEADER "include/dogm.h;include/dogm_types.h")
+set_target_properties(dogm PROPERTIES PUBLIC_HEADER "include/dogm/dogm.h;include/dogm/dogm_types.h")
 
 target_link_libraries(dogm
-	PRIVATE ${OPENGL_LIBRARY}
-	PRIVATE ${GLFW3_LIBRARY}
-	PRIVATE ${GLEW_LIBRARIES}
+	${OPENGL_LIBRARY}
+	${GLFW3_LIBRARY}
+	${GLEW_LIBRARIES}
+	${OpenCV_LIBS}
 )
 
 #INSTALL(TARGETS dogm
@@ -111,3 +113,31 @@ if(BUILD_TESTS)
 endif()
 
 add_subdirectory(demo)
+
+# Install package.xml for catkin
+install(FILES package.xml DESTINATION share/${PROJECT_NAME}/)
+
+set(CONF_INCLUDE_DIRS "${CMAKE_INSTALL_PREFIX}/include")
+set(DOGM_CMAKE_DIR share/${PROJECT_NAME}/cmake)
+include(CMakePackageConfigHelpers)
+configure_package_config_file(
+	dogm-config.cmake.in
+	${PROJECT_BINARY_DIR}/cmake/${PROJECT_NAME}/dogm-config.cmake
+	PATH_VARS DOGM_CMAKE_DIR
+	INSTALL_DESTINATION ${CMAKE_INSTALL_PREFIX}/share/${PROJECT_NAME}
+)
+
+install(
+	FILES ${PROJECT_BINARY_DIR}/cmake/${PROJECT_NAME}/dogm-config.cmake
+	DESTINATION share/${PROJECT_NAME}/
+)
+
+install(TARGETS dogm
+	ARCHIVE DESTINATION lib
+	LIBRARY DESTINATION lib
+)
+
+install(
+	DIRECTORY include/
+	DESTINATION include/
+)
diff --git a/dogm/demo/CMakeLists.txt b/dogm/demo/CMakeLists.txt
index 12aea53..a00b670 100644
--- a/dogm/demo/CMakeLists.txt
+++ b/dogm/demo/CMakeLists.txt
@@ -4,6 +4,8 @@ if (NOT OpenCV_FOUND)
 	set(OpenCV_LIBS "" CACHE FILEPATH "" )
 endif()
 
+find_package(CUDA REQUIRED)
+
 add_definitions(-D_USE_MATH_DEFINES)  # Required to make M_PI from cmath available in MSVC
 
 add_executable(demo
@@ -14,9 +16,10 @@ target_include_directories(demo PUBLIC ${OpenCV_INCLUDE_DIRS} ${GLM_INCLUDE_DIR}
 
 target_link_libraries(demo
 	dogm
-	utils
-	simulator
+	dogm_utils
+	dogm_simulator
 	${OpenCV_LIBS}
+	${CUDA_CUDART_LIBRARY}
 )
 
 add_subdirectory(utils)
diff --git a/dogm/demo/simulator/CMakeLists.txt b/dogm/demo/simulator/CMakeLists.txt
index 256317d..6659081 100644
--- a/dogm/demo/simulator/CMakeLists.txt
+++ b/dogm/demo/simulator/CMakeLists.txt
@@ -1,10 +1,5 @@
 find_package(OpenGL REQUIRED)
 
-find_package(GLM)
-if (NOT GLM_FOUND)
-	set(GLM_INCLUDE_DIR "" CACHE PATH "")
-endif()
-
 find_package(GLFW3)
 if (NOT GLFW3_FOUND)
 	set(GLFW3_INCLUDE_DIR "" CACHE PATH "")
@@ -45,13 +40,18 @@ SET(SRC_FILES
 
 source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SRC_FILES})
 
-ADD_LIBRARY(simulator STATIC ${SRC_FILES} ${HEADER_FILES})
+ADD_LIBRARY(dogm_simulator SHARED ${SRC_FILES} ${HEADER_FILES})
 
-TARGET_INCLUDE_DIRECTORIES(simulator
+TARGET_INCLUDE_DIRECTORIES(dogm_simulator
     PUBLIC
     ${CMAKE_CURRENT_SOURCE_DIR}/include
     ${PROJECT_SOURCE_DIR}/include)
 
+TARGET_LINK_LIBRARIES(dogm_simulator
+    ${GLFW3_LIBRARY}
+    ${GLEW_LIBRARIES}
+)
+
 if(BUILD_TESTS)
     add_subdirectory(test)
 endif()
diff --git a/dogm/demo/utils/CMakeLists.txt b/dogm/demo/utils/CMakeLists.txt
index cd8359c..a5efed3 100644
--- a/dogm/demo/utils/CMakeLists.txt
+++ b/dogm/demo/utils/CMakeLists.txt
@@ -25,18 +25,18 @@ SET(SRC_FILES
     timer.cpp
 )
 
-ADD_LIBRARY(utils STATIC ${SRC_FILES} ${HEADER_FILES})
+ADD_LIBRARY(dogm_utils SHARED ${SRC_FILES} ${HEADER_FILES})
 
-TARGET_COMPILE_FEATURES(utils PUBLIC cxx_std_17)
+TARGET_COMPILE_FEATURES(dogm_utils PUBLIC cxx_std_17)
 
-TARGET_INCLUDE_DIRECTORIES(utils
+TARGET_INCLUDE_DIRECTORIES(dogm_utils
     PUBLIC
     simulator
     ${CMAKE_CURRENT_SOURCE_DIR}/include
     ${PROJECT_SOURCE_DIR}/include
     ${OpenCV_INCLUDE_DIRS})
 
-TARGET_LINK_LIBRARIES(utils PRIVATE simulator ${OpenCV_LIBS})
+TARGET_LINK_LIBRARIES(dogm_utils dogm_simulator ${OpenCV_LIBS})
 
 if(BUILD_TESTS)
     add_subdirectory(test)
diff --git a/dogm/dogm-config.cmake.in b/dogm/dogm-config.cmake.in
new file mode 100644
index 0000000..731cbc5
--- /dev/null
+++ b/dogm/dogm-config.cmake.in
@@ -0,0 +1,14 @@
+get_filename_component(DOGM_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+set(DOGM_INCLUDE_DIRS "@CONF_INCLUDE_DIRS@" "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@")
+
+find_library(DOGM_LIBRARY NAMES dogm NO_DEFAULT_PATH HINTS ${DOGM_CMAKE_DIR}/../../lib/ REQUIRED)
+#find_library(DOGM_SIMULATOR_LIBRARY NAMES dogm_simulator NO_DEFAULT_PATH HINTS ${DOGM_CMAKE_DIR}/../../lib/dogm/ REQUIRED)
+#find_library(DOGM_UTILS_LIBRARY NAMES dogm_utils NO_DEFAULT_PATH HINTS ${DOGM_CMAKE_DIR}/../../lib/dogm/ REQUIRED)
+find_package(CUDA REQUIRED)
+
+SET(DOGM_LIBRARIES
+    ${DOGM_LIBRARY}
+#    ${DOGM_SIMULATOR_LIBRARY}
+#    ${DOGM_UTILS_LIBRARY}
+    ${CUDA_CUDART_LIBRARY}
+)
diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h
index 0ac54ff..7783712 100644
--- a/dogm/include/dogm/dogm.h
+++ b/dogm/include/dogm/dogm.h
@@ -13,6 +13,7 @@
 #include <memory>
 
 #include <vector>
+#include <opencv2/opencv.hpp>
 
 namespace dogm
 {
@@ -72,7 +73,6 @@ class DOGM
      * @param measurement_grid new measurement grid map.
      * @param new_x new x pose.
      * @param new_y new y pose.
-     * @param new_yaw new yaw.
      * @param dt delta time since the last update.
      * @param device whether the measurement grid resides in GPU memory (default: true).
      */
@@ -121,13 +121,6 @@ class DOGM
      */
     float getPositionX() const { return position_x; }
 
-    /**
-     * Returns the vehicles yaw.
-     *
-     * @return yaw.
-     */
-    float getYaw() const { return yaw; }
-
     /**
      * Returns the y position.
      *
@@ -137,6 +130,10 @@ class DOGM
 
     int getIteration() const { return iteration; }
 
+    cv::Mat getPredOccMassImage(GridCellsSoA& grid_cells) const;
+    cv::Mat getNewBornOccMassImage(GridCellsSoA& grid_cells) const;
+    cv::Mat getPersOccMassImage(GridCellsSoA& grid_cells) const;
+    cv::Mat getOccupancyImage(GridCellsSoA& grid_cells) const;
 private:
     void initialize();
 
diff --git a/dogm/package.xml b/dogm/package.xml
new file mode 100644
index 0000000..58004df
--- /dev/null
+++ b/dogm/package.xml
@@ -0,0 +1,16 @@
+<package format="2">
+  <name>dogm</name>
+  <version>0.0.0</version>
+  <description>The dogm package</description>
+
+  <maintainer email="user@todo.todo">user</maintainer>
+
+  <license>TODO</license>
+
+  <buildtool_depend>cmake</buildtool_depend>
+  
+
+  <export>
+    <build_type>cmake</build_type>
+  </export>
+</package>
diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu
index d1d1119..a9233ab 100644
--- a/dogm/src/dogm.cu
+++ b/dogm/src/dogm.cu
@@ -24,6 +24,13 @@
 
 #include <cmath>
 #include <vector>
+#include <Eigen/Dense>
+#include <stdexcept>
+
+#include <opencv2/opencv.hpp>
+#include <opencv2/core/cuda.hpp>
+#include <opencv2/cudawarping.hpp>
+
 
 namespace dogm
 {
@@ -359,8 +366,66 @@ void DOGM::resampling()
 
     resamplingKernel<<<particles_grid, block_dim>>>(particle_array, particle_array_next, birth_particle_array,
                                                     idx_array_resampled, new_weight, particle_count);
+}
+
+cv::Mat DOGM::getPredOccMassImage(GridCellsSoA& grid_cells) const
+{
+    cv::Mat image(grid_size, grid_size, CV_8UC3);
+    for (int i = 0; i < grid_cell_count; i++)
+    {
+        cv::Vec3b color;
+        int x = i % grid_size;
+        int y = i / grid_size;
+        color[0] = color[1] = color[2] = uchar(grid_cells.pred_occ_mass[i] * 255);
+        image.at<cv::Vec3b>(grid_size - x - 1, grid_size - y - 1) = color;
+    }
+    return image;
+}
+
+cv::Mat DOGM::getNewBornOccMassImage(GridCellsSoA& grid_cells) const
+{
+    cv::Mat image(grid_size, grid_size, CV_8UC3);
+    for (int i = 0; i < grid_cell_count; i++)
+    {
+        cv::Vec3b color;
+        int x = i % grid_size;
+        int y = i / grid_size;
+        color[0] = color[1] = color[2] = uchar(grid_cells.new_born_occ_mass[i] * 255);
+        image.at<cv::Vec3b>(grid_size - x - 1, grid_size - y - 1) = color;
+    }
+    return image;
+}
+
+cv::Mat DOGM::getPersOccMassImage(GridCellsSoA& grid_cells) const
+{
+    cv::Mat image(grid_size, grid_size, CV_8UC3);
+    for (int i = 0; i < grid_cell_count; i++)
+    {
+        cv::Vec3b color;
+        int x = i % grid_size;
+        int y = i / grid_size;
+        color[0] = color[1] = color[2] = uchar(grid_cells.pers_occ_mass[i] * 255);
+        image.at<cv::Vec3b>(grid_size - x - 1, grid_size - y - 1) = color;
+    }
+    return image;
+}
+
+cv::Mat DOGM::getOccupancyImage(GridCellsSoA& grid_cells) const
+{
+    cv::Mat image(grid_size, grid_size, CV_8UC3);
+    for (int i = 0; i < grid_cell_count; i++)
+    {
+        const auto occ_mass = grid_cells.occ_mass[i];
+        const auto free_mass = grid_cells.free_mass[i];
+        cv::Vec3b color;
+        int x = i % grid_size;
+        int y = i / grid_size;
+        color[0] = color[1] = color[2] = uchar((occ_mass + (1 - occ_mass - free_mass) / 2) * 255);
+        image.at<cv::Vec3b>(grid_size - x - 1, grid_size - y - 1) = color;
+    }
+    return image;
+}
 
-    CHECK_ERROR(cudaGetLastError());
 }
 
 } /* namespace dogm */

From 0e91442c9ae5a3e29636a8f1540639288ca333f0 Mon Sep 17 00:00:00 2001
From: ShepelIlya <chepsiilya@gmail.com>
Date: Wed, 18 May 2022 17:22:29 +0300
Subject: [PATCH 5/5] debug commit for parallel systematic resampling

---
 dogm/CMakeLists.txt                           |   2 +
 dogm/demo/CMakeLists.txt                      |   5 +-
 dogm/include/dogm/dogm.h                      |   5 +-
 dogm/include/dogm/kernel/resampling.h         |   2 -
 .../include/dogm/kernel/resampling_parallel.h |  44 ++++
 dogm/src/dogm.cu                              |  72 +++++-
 dogm/src/kernel/resampling_parallel.cu        | 233 ++++++++++++++++++
 7 files changed, 353 insertions(+), 10 deletions(-)
 create mode 100644 dogm/include/dogm/kernel/resampling_parallel.h
 create mode 100644 dogm/src/kernel/resampling_parallel.cu

diff --git a/dogm/CMakeLists.txt b/dogm/CMakeLists.txt
index 1f63cca..307f204 100644
--- a/dogm/CMakeLists.txt
+++ b/dogm/CMakeLists.txt
@@ -51,6 +51,7 @@ set(HEADER_FILES
 	include/dogm/kernel/particle_to_grid.h
 	include/dogm/kernel/predict.h
 	include/dogm/kernel/resampling.h
+	include/dogm/kernel/resampling_parallel.h
 	include/dogm/kernel/statistical_moments.h
 	include/dogm/kernel/update_persistent_particles.h
 	include/dogm/kernel/ego_motion_compensation.h
@@ -69,6 +70,7 @@ set(SRC_FILES
 	src/kernel/particle_to_grid.cu
 	src/kernel/predict.cu
 	src/kernel/resampling.cu
+	src/kernel/resampling_parallel.cu
 	src/kernel/statistical_moments.cu
 	src/kernel/update_persistent_particles.cu
 	src/kernel/ego_motion_compensation.cu
diff --git a/dogm/demo/CMakeLists.txt b/dogm/demo/CMakeLists.txt
index a00b670..e2c8cfa 100644
--- a/dogm/demo/CMakeLists.txt
+++ b/dogm/demo/CMakeLists.txt
@@ -12,7 +12,10 @@ add_executable(demo
 	main.cpp
 )
 
-target_include_directories(demo PUBLIC ${OpenCV_INCLUDE_DIRS} ${GLM_INCLUDE_DIR})
+target_include_directories(demo PUBLIC
+	${OpenCV_INCLUDE_DIRS}
+	${GLM_INCLUDE_DIR}
+)
 
 target_link_libraries(demo
 	dogm
diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h
index 7783712..e35a3c0 100644
--- a/dogm/include/dogm/dogm.h
+++ b/dogm/include/dogm/dogm.h
@@ -11,7 +11,6 @@
 #include <glm/mat4x4.hpp>
 #include <glm/vec4.hpp>
 #include <memory>
-
 #include <vector>
 #include <opencv2/opencv.hpp>
 
@@ -150,6 +149,7 @@ class DOGM
     void initializeNewParticles();
     void statisticalMoments();
     void resampling();
+    void resampling_parallel_ns();
 
 public:
     Params params;
@@ -172,6 +172,8 @@ class DOGM
     float* vel_xy_array;
 
     float* rand_array;
+    int* idx_array_up;
+    int* idx_array_down;
 
     curandState* rng_states;
 
@@ -181,6 +183,7 @@ class DOGM
     int particle_count;
     int new_born_particle_count;
 
+    cudaDeviceProp device_prop;
     dim3 block_dim;
     dim3 particles_grid;
     dim3 birth_particles_grid;
diff --git a/dogm/include/dogm/kernel/resampling.h b/dogm/include/dogm/kernel/resampling.h
index 665f0e5..b69019c 100644
--- a/dogm/include/dogm/kernel/resampling.h
+++ b/dogm/include/dogm/kernel/resampling.h
@@ -13,8 +13,6 @@
 namespace dogm
 {
 
-struct Particle;
-
 __global__ void resamplingGenerateRandomNumbersKernel(float* __restrict__ rand_array,
                                                       curandState* __restrict__ global_state, float max,
                                                       int particle_count);
diff --git a/dogm/include/dogm/kernel/resampling_parallel.h b/dogm/include/dogm/kernel/resampling_parallel.h
new file mode 100644
index 0000000..797a029
--- /dev/null
+++ b/dogm/include/dogm/kernel/resampling_parallel.h
@@ -0,0 +1,44 @@
+// Copyright (c) 2020 Michael Koesel and respective contributors
+// SPDX-License-Identifier: MIT
+// See accompanying LICENSE file for detailed information
+
+#pragma once
+
+#include <device_launch_parameters.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+
+#include <curand_kernel.h>
+
+#include <cooperative_groups.h>
+
+namespace cg = cooperative_groups;
+
+constexpr auto kTRI {256};
+constexpr auto kWarpSize {32};
+constexpr bool systematic {true};
+
+namespace dogm
+{
+
+__global__ void resampleIndexKernel(const ParticlesSoA particle_array, ParticlesSoA particle_array_next,
+    const ParticlesSoA birth_particle_array, const int* __restrict__ idx_array_up,
+    const int* __restrict__ idx_array_down, float new_weight, int particle_count);
+
+// Systematic / Stratified max optimized
+
+__global__ void resampleSystematicIndexUp(int const num_particles,
+    unsigned long long int const seed, int* __restrict__ resampling_index_up, float* __restrict__ prefix_sum);
+
+__device__ void ResamplingUpPerWarp(cg::thread_block_tile<kWarpSize> const &tile_32,
+    unsigned int const &tid, int const &num_particles, float const &distro,
+    float *shared, float *__restrict__ prefix_sum, int *__restrict__ resampling_index_up);
+
+__global__ void resampleSystematicIndexDown(int const num_particles,
+    unsigned long long int const seed, int *__restrict__ resampling_index_down, float *__restrict__ prefix_sum);
+
+__device__ void ResamplingDownPerWarp(cg::thread_block_tile<kWarpSize> const &tile_32,
+    unsigned int const &tid, int const &num_particles, float const &distro,
+    float *shared, float *__restrict__ prefix_sum, int *__restrict__ resampling_index_down );
+
+} /* namespace dogm */
diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu
index a9233ab..f7deefc 100644
--- a/dogm/src/dogm.cu
+++ b/dogm/src/dogm.cu
@@ -14,6 +14,7 @@
 #include "dogm/kernel/particle_to_grid.h"
 #include "dogm/kernel/predict.h"
 #include "dogm/kernel/resampling.h"
+#include "dogm/kernel/resampling_parallel.h"
 #include "dogm/kernel/statistical_moments.h"
 #include "dogm/kernel/update_persistent_particles.h"
 
@@ -71,6 +72,8 @@ DOGM::DOGM(const Params& params)
     CUDA_CALL(cudaMalloc(&vel_xy_array, particle_count * sizeof(float)));
 
     CUDA_CALL(cudaMalloc(&rand_array, particle_count * sizeof(float)));
+    CUDA_CALL(cudaMalloc(&idx_array_up, particle_count * sizeof(int)));
+    CUDA_CALL(cudaMalloc(&idx_array_down, particle_count * sizeof(int)));
 
     CUDA_CALL(cudaMalloc(&rng_states, particles_grid.x * block_dim.x * sizeof(curandState)));
 
@@ -99,6 +102,8 @@ DOGM::~DOGM()
     CUDA_CALL(cudaFree(rand_array));
 
     CUDA_CALL(cudaFree(rng_states));
+    CUDA_CALL(cudaFree(idx_array_up));
+    CUDA_CALL(cudaFree(idx_array_down));
 }
 
 void DOGM::initialize()
@@ -127,7 +132,9 @@ void DOGM::updateGrid(MeasurementCellsSoA measurement_grid, float new_x, float n
     updatePersistentParticles();
     initializeNewParticles();
     statisticalMoments();
-    resampling();
+   
+    // resampling();
+    resampling_parallel_ns();
 
     particle_array = particle_array_next;
 
@@ -336,6 +343,7 @@ void DOGM::statisticalMoments()
                                                             vel_xy_array_accum, grid_cell_count);
 }
 
+// Multinomial origin resampling
 void DOGM::resampling()
 {
     thrust::device_ptr<float> persistent_weights(weight_array);
@@ -355,19 +363,73 @@ void DOGM::resampling()
 
     thrust::device_ptr<float> rand_ptr(rand_array);
     thrust::device_vector<float> rand_vector(rand_ptr, rand_ptr + particle_count);
-
     thrust::sort(rand_vector.begin(), rand_vector.end());
-
     thrust::device_vector<int> idx_resampled(particle_count);
     calc_resampled_indices(joint_weight_accum, rand_vector, idx_resampled, joint_max);
     int* idx_array_resampled = thrust::raw_pointer_cast(idx_resampled.data());
-
     float new_weight = joint_max / particle_count;
 
     resamplingKernel<<<particles_grid, block_dim>>>(particle_array, particle_array_next, birth_particle_array,
                                                     idx_array_resampled, new_weight, particle_count);
 }
 
+void DOGM::resampling_parallel_ns()
+{
+    thrust::device_ptr<float> persistent_weights(weight_array);
+    thrust::device_ptr<float> new_born_weights(birth_particle_array.weight);
+
+    thrust::device_vector<float> joint_weight_array;
+    joint_weight_array.insert(joint_weight_array.end(), persistent_weights, persistent_weights + particle_count);
+    joint_weight_array.insert(joint_weight_array.end(), new_born_weights, new_born_weights + new_born_particle_count);
+
+    thrust::device_vector<float> joint_weight_accum(joint_weight_array.size());
+    accumulate(joint_weight_array, joint_weight_accum);
+
+    float joint_max = joint_weight_accum.back();
+    thrust::transform(joint_weight_accum.begin(), joint_weight_accum.end(),
+        joint_weight_accum.begin(), thrust::placeholders::_1 /= joint_max);
+    float new_weight = joint_max / particle_count;
+
+
+    unsigned long long int seed {static_cast<unsigned long long int>(clock())};
+    // thrust::device_vector<int> up_vec(particle_count, 0);
+    // thrust::device_vector<int> down_vec(particle_count, 0);
+    // int* idx_array_up = thrust::raw_pointer_cast(up_vec.data());
+    // int* idx_array_down = thrust::raw_pointer_cast(down_vec.data());
+    float* accumulated_sum = thrust::raw_pointer_cast(joint_weight_accum.data());
+
+    // void *args_up[] {const_cast<int *>( &particle_count ), &seed,                          
+    //     &idx_array_up, &accumulated_sum};
+
+    CUDA_CALL(cudaGetLastError());
+
+    // CUDA_RT_CALL(cudaLaunchKernel(reinterpret_cast<void*>(&resampleSystematicIndexUp),
+    //     particles_grid, block_dim, args_up, 0, cuda_streams[0]));
+
+    // cudaDeviceSynchronize();
+
+    resampleSystematicIndexUp<<<particles_grid, block_dim>>>(particle_count,
+        seed, idx_array_up, accumulated_sum);
+    
+    // cudaDeviceSynchronize(); 
+    // std::cout << up_vec[0] << "\n";
+
+    // void *args_down[] {const_cast<int *>( &particle_count ), &seed,                          
+    //     &idx_array_down, &accumulated_sum};
+
+    // CUDA_RT_CALL(cudaLaunchKernel(reinterpret_cast<void*>(&resampleSystematicIndexDown),
+    //     40, 256, args_down, 0, cuda_streams[1]));
+
+    resampleSystematicIndexDown<<<particles_grid, block_dim>>>(particle_count,
+        seed, idx_array_down, accumulated_sum);
+    
+    // CHECK_ERROR(cudaDeviceSynchronize());
+    // cudaDeviceSynchronize();
+
+    resampleIndexKernel<<<particles_grid, block_dim>>>(particle_array, particle_array_next,
+        birth_particle_array, idx_array_up, idx_array_down, new_weight, particle_count);
+}
+
 cv::Mat DOGM::getPredOccMassImage(GridCellsSoA& grid_cells) const
 {
     cv::Mat image(grid_size, grid_size, CV_8UC3);
@@ -426,6 +488,4 @@ cv::Mat DOGM::getOccupancyImage(GridCellsSoA& grid_cells) const
     return image;
 }
 
-}
-
 } /* namespace dogm */
diff --git a/dogm/src/kernel/resampling_parallel.cu b/dogm/src/kernel/resampling_parallel.cu
new file mode 100644
index 0000000..853fcbd
--- /dev/null
+++ b/dogm/src/kernel/resampling_parallel.cu
@@ -0,0 +1,233 @@
+#include "dogm/common.h"
+#include "dogm/cuda_utils.h"
+#include "dogm/dogm_types.h"
+#include "dogm/kernel/resampling_parallel.h"
+
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include <thrust/binary_search.h>
+
+namespace dogm 
+{
+
+__global__ void resampleIndexKernel(const ParticlesSoA particle_array, ParticlesSoA particle_array_next,
+    const ParticlesSoA birth_particle_array, const int* __restrict__ idx_array_up,
+    const int* __restrict__ idx_array_down, float new_weight, int particle_count)
+{
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x)
+    {
+        int idx = i + idx_array_up[i] + idx_array_down[i];
+
+        // if (idx_array_up[i] != 0 || idx_array_down[i] != 0)
+        //     printf("\t(%d, %d,%d, %d)", i, idx_array_up[i], idx_array_down[i], idx);
+
+        if (idx < particle_count)
+        {
+            particle_array_next.copy(particle_array, i, idx);
+        }
+        else
+        {
+            particle_array_next.copy(birth_particle_array, i, idx - particle_count);
+            // printf("!");
+        }
+
+        particle_array_next.weight[i] = new_weight;
+    }
+}
+
+// Systematic / Stratified max optimized
+
+__global__ void __launch_bounds__(kTRI) resampleSystematicIndexUp(int const num_particles,
+    unsigned long long int const seed, int* __restrict__ resampling_index_up, float* __restrict__ prefix_sum) {
+    auto const tile_32 = cg::tiled_partition<kWarpSize>(cg::this_thread_block());
+
+    __shared__ float s_warp_0[kTRI]; // strange diff *2
+    __shared__ float s_warp_1[kTRI]; // strange diff *2
+
+    // Setting prefix_sum[n - 1] in each block versus call a separate kernel
+    // beforehand. Set last value in prefix-sum to 1.0f
+    if ( threadIdx.x == 0 ) {
+        prefix_sum[num_particles - 1] = 1.0f;  //
+    }
+
+    for ( unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < num_particles;
+          tid += blockDim.x * gridDim.x ) {
+
+        curandStateXORWOW_t local_state {};
+
+        float distro {};
+
+        if (systematic) {
+            curand_init( seed, 0, 0, &local_state );
+            distro = curand_uniform( &local_state );
+        } else {
+            curand_init( seed + tid, 0, 0, &local_state );
+            distro = curand_uniform( &local_state );
+        }
+
+        if ( threadIdx.x < kWarpSize ) {
+            ResamplingUpPerWarp(tile_32, tid, num_particles, distro, s_warp_0, prefix_sum, resampling_index_up);
+        } else {
+            ResamplingUpPerWarp(tile_32, tid, num_particles, distro, s_warp_1, prefix_sum, resampling_index_up);
+        }
+    }
+}
+
+__device__ void ResamplingUpPerWarp(cg::thread_block_tile<kWarpSize> const &tile_32,
+    unsigned int const &tid, int const &num_particles, float const &distro,
+    float* shared, float* __restrict__ prefix_sum, int* __restrict__ resampling_index_up) {
+
+    float const    tidf { static_cast<float>( tid ) };
+    auto const t { tile_32.thread_rank( ) };
+
+    int l {0};
+    int idx {0};
+    float   a {};
+    float   b {};
+
+    bool mask { true };
+
+    if ( tid < num_particles - kWarpSize - l ) { // strange diff kWarpSize or kTRI
+        shared[t] = prefix_sum[tid + l];     
+        shared[t + kWarpSize] = prefix_sum[tid + kWarpSize + l];   // strange diff kWarpSize || kTRI
+    }
+
+    // Distribution will be the same for each Monte Carlo
+    float const draw = ( distro + tidf ) / num_particles;
+
+    tile_32.sync();
+
+    while (tile_32.any(mask)) {
+        if (tid < num_particles - (kTRI) - l) {  // strange diff (+ kWarpSize)
+
+            a = prefix_sum[tid + kWarpSize + l];
+            b = prefix_sum[tid + kTRI + l]; // strange diff + kWarpSize
+
+            #pragma unroll         // strange diff  kWarpSize or kTRI
+            for ( int i = 0; i < kWarpSize; i++ ) {          // strange diff kWarpSize or kTRI
+                mask = shared[t + i] < draw;
+                if ( mask ) {
+                    idx++;
+                }
+            }
+            l += kWarpSize;          // strange diff kWarpSize or kTRI
+            shared[t] = a;
+            shared[t + kWarpSize] = b;           // strange diff kWarpSize or kTRI
+
+            tile_32.sync();
+        } else {
+            while ( mask && tid < ( num_particles - l ) ) {
+                mask = prefix_sum[tid + l] < draw;
+                if ( mask ) {
+                    idx++;
+                }
+                l++;
+            }
+        }
+
+        tile_32.sync( );
+    }
+    resampling_index_up[tid] = idx;
+}
+
+__global__ void __launch_bounds__(kTRI) resampleSystematicIndexDown(int const num_particles,
+    unsigned long long int const seed, int *__restrict__ resampling_index_down, float *__restrict__ prefix_sum) {
+
+    auto const tile_32 = cg::tiled_partition<kWarpSize>( cg::this_thread_block( ) );
+
+    __shared__ float s_warp_0[kTRI]; // strange diff *2
+    __shared__ float s_warp_1[kTRI]; // strange diff *2
+
+    // Setting prefix_sum_particle_weights[n - 1] in each block versus call a
+    // separate kernel beforehand
+    if ( threadIdx.x == 0 ) {
+        prefix_sum[num_particles - 1] = 1.0f;
+    }
+
+    for ( unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < num_particles;
+          tid += blockDim.x * gridDim.x ) {
+
+        curandStateXORWOW_t local_state {};
+
+        float distro {};
+
+        if ( systematic ) {
+            curand_init( seed, 0, 0, &local_state );
+            distro = curand_uniform( &local_state );
+        } else {
+            curand_init( seed + tid, 0, 0, &local_state );
+            distro = curand_uniform( &local_state );
+        }
+
+        if ( threadIdx.x < kWarpSize ) {
+            ResamplingDownPerWarp( tile_32, tid, num_particles, distro, s_warp_0, prefix_sum, resampling_index_down );
+        } else {
+            ResamplingDownPerWarp( tile_32, tid, num_particles, distro, s_warp_1, prefix_sum, resampling_index_down );
+        }
+    }
+}
+
+__device__ void ResamplingDownPerWarp( cg::thread_block_tile<kWarpSize> const &tile_32,
+    unsigned int const &tid, int const &num_particles, float const &distro,
+    float *shared, float *__restrict__ prefix_sum, int *__restrict__ resampling_index_down ) {
+
+    float const tidf { static_cast<float>( tid ) };
+    auto const t { tile_32.thread_rank( ) };
+
+    int l {1};
+    int idx {0};
+    float a{};
+    float b{};
+
+    bool mask { false };
+
+    // Preload in into shared memory
+    if ( tid >= kWarpSize + l ) { // strange diff kWarpSize or kTRI
+        shared[t] = prefix_sum[tid - kWarpSize - l]; // strange diff kWarpSize or kTRI
+        shared[t + kWarpSize] = prefix_sum[tid - l]; // strange diff kWarpSize or kTRI
+    }
+
+    // Distribution will be the same for each Monte Carlo
+    float const draw = ( distro + tidf ) / num_particles;
+
+    tile_32.sync( );
+
+    while ( !tile_32.all( mask ) ) {
+
+        if ( tid >= kTRI + l ) { // strange diff  + kWarpSize
+            a = prefix_sum[tid - ( kTRI )-l]; // strange diff + kWarpSize)
+            b = prefix_sum[tid - kWarpSize - l];
+
+            #pragma unroll
+            for ( int i = 1; i < kWarpSize + 1; i++ ) { // strange diff kWarpSize or kTRI
+                mask = shared[t + kWarpSize - i] < draw; // strange diff kWarpSize or kTRI
+                if ( !mask ) {
+                    idx--;
+                }
+            }
+            l += kWarpSize; // strange diff kWarpSize or kTRI
+            shared[t]             = a;
+            shared[t + kWarpSize] = b; // strange diff kWarpSize or kTRI
+            tile_32.sync( );
+
+        } else {
+
+            while ( !mask ) {
+                if ( tid > l ) {
+                    mask = prefix_sum[tid - ( l + 1 )] < draw;
+                } else {
+                    mask = true;
+                }
+                if ( !mask ) {
+                    idx--;
+                }
+                l++;
+            }
+        }
+
+        tile_32.sync( );
+    }
+    resampling_index_down[tid] = idx;
+}
+
+}
\ No newline at end of file