From 9c637b93583ed81845e008551d6f6fd0d6ab45bf Mon Sep 17 00:00:00 2001 From: ShepelIlya Date: Wed, 18 May 2022 11:56:04 +0300 Subject: [PATCH 1/5] SoA syntax, some cleaning and optimization --- dogm/demo/main.cpp | 4 +- .../include/mapping/kernel/measurement_grid.h | 8 +- .../include/mapping/laser_to_meas_grid.h | 4 +- .../mapping/kernel/measurement_grid.cu | 20 +- .../simulator/mapping/laser_to_meas_grid.cu | 18 +- .../simulator/mapping/opengl/framebuffer.cpp | 12 +- .../demo/simulator/mapping/opengl/texture.cpp | 14 +- dogm/demo/utils/image_creation.cpp | 43 ++- dogm/include/dogm/cuda_utils.h | 19 +- dogm/include/dogm/dogm.h | 12 +- dogm/include/dogm/dogm_types.h | 280 +++++++++++++++++- .../dogm/kernel/ego_motion_compensation.h | 9 +- dogm/include/dogm/kernel/init.h | 6 +- dogm/include/dogm/kernel/init_new_particles.h | 11 +- dogm/include/dogm/kernel/mass_update.h | 8 +- dogm/include/dogm/kernel/particle_to_grid.h | 5 +- .../include/dogm/kernel/statistical_moments.h | 5 +- .../dogm/kernel/update_persistent_particles.h | 12 +- dogm/src/dogm.cu | 115 ++++--- dogm/src/kernel/ego_motion_compensation.cu | 3 +- dogm/src/kernel/init.cu | 28 +- dogm/src/kernel/init_new_particles.cu | 30 +- dogm/src/kernel/mass_update.cu | 47 ++- dogm/src/kernel/particle_to_grid.cu | 8 +- dogm/src/kernel/statistical_moments.cu | 24 +- .../src/kernel/update_persistent_particles.cu | 41 ++- 26 files changed, 502 insertions(+), 284 deletions(-) diff --git a/dogm/demo/main.cpp b/dogm/demo/main.cpp index 2be2c89..41ddcf7 100644 --- a/dogm/demo/main.cpp +++ b/dogm/demo/main.cpp @@ -80,14 +80,14 @@ int main(int argc, const char** argv) for (int step = 0; step < num_simulation_steps; ++step) { - dogm::MeasurementCell* meas_grid = grid_generator.generateGrid(sim_data[step].measurements); + dogm::MeasurementCellsSoA meas_grid = grid_generator.generateGrid(sim_data[step].measurements); const auto update_grid_caller = [&grid_map](auto&&... args) { grid_map.updateGrid(std::forward(args)...); }; cycle_timer.timeFunctionCall(true, update_grid_caller, meas_grid, sim_data[step].ego_pose.x, - sim_data[step].ego_pose.y, 0.0f, simulation_step_period, true); + sim_data[step].ego_pose.y, simulation_step_period, true); const auto cells_with_velocity = computeCellsWithVelocity(grid_map, minimum_occupancy_threshold, minimum_velocity_threshold); diff --git a/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h b/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h index 49eb32d..90ff1ab 100644 --- a/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h +++ b/dogm/demo/simulator/include/mapping/kernel/measurement_grid.h @@ -6,10 +6,6 @@ #include -namespace dogm -{ -struct MeasurementCell; -} __global__ void createPolarGridTextureKernel(cudaSurfaceObject_t polar, const float* __restrict__ measurements, int width, int height, float resolution); @@ -17,8 +13,8 @@ __global__ void createPolarGridTextureKernel(cudaSurfaceObject_t polar, const fl __global__ void fusePolarGridTextureKernel(cudaSurfaceObject_t polar, const float* __restrict__ measurements, int width, int height, float resolution); -__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid, +__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid, cudaSurfaceObject_t cart, int grid_size); -__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid, +__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid, const float2* __restrict__ grid, int grid_size); diff --git a/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h b/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h index 0d210e5..7c74fea 100644 --- a/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h +++ b/dogm/demo/simulator/include/mapping/laser_to_meas_grid.h @@ -23,10 +23,10 @@ class LaserMeasurementGrid LaserMeasurementGrid(const Params& params, float grid_length, float resolution); ~LaserMeasurementGrid(); - dogm::MeasurementCell* generateGrid(const std::vector& measurements); + dogm::MeasurementCellsSoA generateGrid(const std::vector& measurements); private: - dogm::MeasurementCell* meas_grid; + dogm::MeasurementCellsSoA meas_grid; int grid_size; Params params; diff --git a/dogm/demo/simulator/mapping/kernel/measurement_grid.cu b/dogm/demo/simulator/mapping/kernel/measurement_grid.cu index 4337ba9..13717ef 100644 --- a/dogm/demo/simulator/mapping/kernel/measurement_grid.cu +++ b/dogm/demo/simulator/mapping/kernel/measurement_grid.cu @@ -112,7 +112,7 @@ __global__ void fusePolarGridTextureKernel(cudaSurfaceObject_t polar, const floa } } -__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid, +__global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid, cudaSurfaceObject_t cart, int grid_size) { const int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -123,15 +123,15 @@ __global__ void cartesianGridToMeasurementGridKernel(dogm::MeasurementCell* __re { float4 color = surf2Dread(cart, x * sizeof(float4), y); - meas_grid[index].occ_mass = color.x; - meas_grid[index].free_mass = color.y; + meas_grid.occ_mass[index] = color.x; + meas_grid.free_mass[index] = color.y; - meas_grid[index].likelihood = 1.0f; - meas_grid[index].p_A = 1.0f; + meas_grid.likelihood[index] = 1.0f; + meas_grid.p_A[index] = 1.0f; } } -__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCell* __restrict__ meas_grid, +__global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCellsSoA meas_grid, const float2* __restrict__ grid, int grid_size) { const int x = blockIdx.x * blockDim.x + threadIdx.x; @@ -142,10 +142,10 @@ __global__ void gridArrayToMeasurementGridKernel(dogm::MeasurementCell* __restri { float2 masses = grid[index]; - meas_grid[index].occ_mass = masses.x; - meas_grid[index].free_mass = masses.y; + meas_grid.occ_mass[index] = masses.x; + meas_grid.free_mass[index] = masses.y; - meas_grid[index].likelihood = 1.0f; - meas_grid[index].p_A = 1.0f; + meas_grid.likelihood[index] = 1.0f; + meas_grid.p_A[index] = 1.0f; } } diff --git a/dogm/demo/simulator/mapping/laser_to_meas_grid.cu b/dogm/demo/simulator/mapping/laser_to_meas_grid.cu index 035afd6..0c1ecb4 100644 --- a/dogm/demo/simulator/mapping/laser_to_meas_grid.cu +++ b/dogm/demo/simulator/mapping/laser_to_meas_grid.cu @@ -12,23 +12,23 @@ LaserMeasurementGrid::LaserMeasurementGrid(const Params& params, float grid_leng { int grid_cell_count = grid_size * grid_size; - CHECK_ERROR(cudaMalloc(&meas_grid, grid_cell_count * sizeof(dogm::MeasurementCell))); + meas_grid.init(grid_cell_count, true); renderer = std::make_unique(grid_size, params.fov, grid_length, params.max_range); } LaserMeasurementGrid::~LaserMeasurementGrid() { - CHECK_ERROR(cudaFree(meas_grid)); + meas_grid.free(); } -dogm::MeasurementCell* LaserMeasurementGrid::generateGrid(const std::vector& measurements) +dogm::MeasurementCellsSoA LaserMeasurementGrid::generateGrid(const std::vector& measurements) { const int num_measurements = measurements.size(); float* d_measurements; - CHECK_ERROR(cudaMalloc(&d_measurements, num_measurements * sizeof(float))); - CHECK_ERROR( + CUDA_CALL(cudaMalloc(&d_measurements, num_measurements * sizeof(float))); + CUDA_CALL( cudaMemcpy(d_measurements, measurements.data(), num_measurements * sizeof(float), cudaMemcpyHostToDevice)); const int polar_width = num_measurements; @@ -47,7 +47,7 @@ dogm::MeasurementCell* LaserMeasurementGrid::generateGrid(const std::vector>>(polar_surface, d_measurements, polar_width, polar_height, params.resolution); - CHECK_ERROR(cudaGetLastError()); + CUDA_CALL(cudaGetLastError()); polar_texture.endCudaAccess(polar_surface); // render cartesian image to texture using polar texture @@ -60,11 +60,11 @@ dogm::MeasurementCell* LaserMeasurementGrid::generateGrid(const std::vector>>(meas_grid, cartesian_surface, grid_size); - CHECK_ERROR(cudaGetLastError()); + CUDA_CALL(cudaGetLastError()); framebuffer->endCudaAccess(cartesian_surface); - CHECK_ERROR(cudaFree(d_measurements)); - CHECK_ERROR(cudaDeviceSynchronize()); + CUDA_CALL(cudaFree(d_measurements)); + CUDA_CALL(cudaDeviceSynchronize()); return meas_grid; } diff --git a/dogm/demo/simulator/mapping/opengl/framebuffer.cpp b/dogm/demo/simulator/mapping/opengl/framebuffer.cpp index 8c67d33..4c7144e 100644 --- a/dogm/demo/simulator/mapping/opengl/framebuffer.cpp +++ b/dogm/demo/simulator/mapping/opengl/framebuffer.cpp @@ -23,7 +23,7 @@ Framebuffer::Framebuffer(int width, int height) glFramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, texture, 0); - CHECK_ERROR(cudaGraphicsGLRegisterImage(&resource, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly)); + CUDA_CALL(cudaGraphicsGLRegisterImage(&resource, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsReadOnly)); glBindFramebuffer(GL_FRAMEBUFFER, 0); } @@ -36,23 +36,23 @@ Framebuffer::~Framebuffer() void Framebuffer::beginCudaAccess(cudaSurfaceObject_t* surfaceObject) { - CHECK_ERROR(cudaGraphicsMapResources(1, &resource, nullptr)); + CUDA_CALL(cudaGraphicsMapResources(1, &resource, nullptr)); cudaArray_t cudaArray; - CHECK_ERROR(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0)); + CUDA_CALL(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0)); cudaResourceDesc resourceDesc; memset(&resourceDesc, 0, sizeof(cudaResourceDesc)); resourceDesc.resType = cudaResourceTypeArray; resourceDesc.res.array.array = cudaArray; - CHECK_ERROR(cudaCreateSurfaceObject(surfaceObject, &resourceDesc)); + CUDA_CALL(cudaCreateSurfaceObject(surfaceObject, &resourceDesc)); } void Framebuffer::endCudaAccess(cudaSurfaceObject_t surfaceObject) { - CHECK_ERROR(cudaGraphicsUnmapResources(1, &resource, nullptr)); - CHECK_ERROR(cudaDestroySurfaceObject(surfaceObject)); + CUDA_CALL(cudaGraphicsUnmapResources(1, &resource, nullptr)); + CUDA_CALL(cudaDestroySurfaceObject(surfaceObject)); } void Framebuffer::bind() diff --git a/dogm/demo/simulator/mapping/opengl/texture.cpp b/dogm/demo/simulator/mapping/opengl/texture.cpp index 682a440..63a06a5 100644 --- a/dogm/demo/simulator/mapping/opengl/texture.cpp +++ b/dogm/demo/simulator/mapping/opengl/texture.cpp @@ -31,7 +31,7 @@ Texture::Texture(int width, int height, float anisotropy_level) float color[] = {0.0f, 0.0f, 1.0f, 1.0f}; glTexParameterfv(GL_TEXTURE_2D, GL_TEXTURE_BORDER_COLOR, color); - CHECK_ERROR( + CUDA_CALL( cudaGraphicsGLRegisterImage(&resource, texture, GL_TEXTURE_2D, cudaGraphicsRegisterFlagsSurfaceLoadStore)); glBindTexture(GL_TEXTURE_2D, 0); @@ -44,24 +44,24 @@ Texture::~Texture() void Texture::beginCudaAccess(cudaSurfaceObject_t* surfaceObject) { - CHECK_ERROR(cudaGraphicsMapResources(1, &resource, nullptr)); + CUDA_CALL(cudaGraphicsMapResources(1, &resource, nullptr)); cudaArray_t cudaArray; - CHECK_ERROR(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0)); + CUDA_CALL(cudaGraphicsSubResourceGetMappedArray(&cudaArray, resource, 0, 0)); cudaResourceDesc resourceDesc; memset(&resourceDesc, 0, sizeof(cudaResourceDesc)); resourceDesc.resType = cudaResourceTypeArray; resourceDesc.res.array.array = cudaArray; - CHECK_ERROR(cudaCreateSurfaceObject(surfaceObject, &resourceDesc)); + CUDA_CALL(cudaCreateSurfaceObject(surfaceObject, &resourceDesc)); } void Texture::endCudaAccess(cudaSurfaceObject_t surfaceObject) { - CHECK_ERROR(cudaGraphicsUnmapResources(1, &resource, nullptr)); - CHECK_ERROR(cudaGraphicsUnregisterResource(resource)); - CHECK_ERROR(cudaDestroySurfaceObject(surfaceObject)); + CUDA_CALL(cudaGraphicsUnmapResources(1, &resource, nullptr)); + CUDA_CALL(cudaGraphicsUnregisterResource(resource)); + CUDA_CALL(cudaDestroySurfaceObject(surfaceObject)); } void Texture::generateMipMap() diff --git a/dogm/demo/utils/image_creation.cpp b/dogm/demo/utils/image_creation.cpp index da2ed6e..8435ca7 100644 --- a/dogm/demo/utils/image_creation.cpp +++ b/dogm/demo/utils/image_creation.cpp @@ -32,17 +32,16 @@ std::vector> computeCellsWithVelocity(const dogm::DOGM& gr { int index = y * grid_map.getGridSize() + x; - const dogm::GridCell& cell = grid_cells[index]; - float occ = pignistic_transformation(cell.free_mass, cell.occ_mass); + float occ = pignistic_transformation(grid_cells.free_mass[index], grid_cells.occ_mass[index]); cv::Mat velocity_mean(2, 1, CV_32FC1); - velocity_mean.at(0) = cell.mean_x_vel; - velocity_mean.at(1) = cell.mean_y_vel; + velocity_mean.at(0) = grid_cells.mean_x_vel[index]; + velocity_mean.at(1) = grid_cells.mean_y_vel[index]; cv::Mat velocity_covar(2, 2, CV_32FC1); - velocity_covar.at(0, 0) = cell.var_x_vel; - velocity_covar.at(1, 0) = cell.covar_xy_vel; - velocity_covar.at(0, 1) = cell.covar_xy_vel; - velocity_covar.at(1, 1) = cell.var_y_vel; + velocity_covar.at(0, 0) = grid_cells.var_x_vel[index]; + velocity_covar.at(1, 0) = grid_cells.covar_xy_vel[index]; + velocity_covar.at(0, 1) = grid_cells.covar_xy_vel[index]; + velocity_covar.at(1, 1) = grid_cells.var_y_vel[index]; cv::Mat velocity_normalized_by_variance = velocity_mean.t() * velocity_covar.inv() * velocity_mean; @@ -54,7 +53,22 @@ std::vector> computeCellsWithVelocity(const dogm::DOGM& gr // Storing the point as grid index to be consistent with cell.mean_x_vel and cell.mean_y_vel point.x = static_cast(x); point.y = static_cast(y); - point.data = cell; + point.data.start_idx = grid_cells.start_idx[index]; + point.data.end_idx = grid_cells.end_idx[index]; + point.data.new_born_occ_mass = grid_cells.new_born_occ_mass[index]; + point.data.pers_occ_mass = grid_cells.pers_occ_mass[index]; + point.data.free_mass = grid_cells.free_mass[index]; + point.data.occ_mass = grid_cells.occ_mass[index]; + point.data.pred_occ_mass = grid_cells.pred_occ_mass[index]; + point.data.mu_A = grid_cells.mu_A[index]; + point.data.mu_UA = grid_cells.mu_UA[index]; + point.data.w_A = grid_cells.w_A[index]; + point.data.w_UA = grid_cells.w_UA[index]; + point.data.mean_x_vel = grid_cells.mean_x_vel[index]; + point.data.mean_y_vel = grid_cells.mean_y_vel[index]; + point.data.var_x_vel = grid_cells.var_x_vel[index]; + point.data.var_y_vel = grid_cells.var_y_vel[index]; + point.data.covar_xy_vel = grid_cells.covar_xy_vel[index]; point.cluster_id = UNCLASSIFIED; cells_with_velocity.push_back(point); @@ -76,8 +90,7 @@ cv::Mat compute_measurement_grid_image(const dogm::DOGM& grid_map) { int index = y * grid_map.getGridSize() + x; - const dogm::MeasurementCell& cell = meas_cells[index]; - float occ = pignistic_transformation(cell.free_mass, cell.occ_mass); + float occ = pignistic_transformation(meas_cells.free_mass[index], meas_cells.occ_mass[index]); auto temp = static_cast(occ * 255.0f); row_ptr[x] = cv::Vec3b(255 - temp, 255 - temp, 255 - temp); @@ -97,9 +110,8 @@ cv::Mat compute_raw_measurement_grid_image(const dogm::DOGM& grid_map) for (int x = 0; x < grid_map.getGridSize(); x++) { int index = y * grid_map.getGridSize() + x; - const dogm::MeasurementCell& cell = meas_cells[index]; - auto red = static_cast(cell.occ_mass * 255.0f); - auto green = static_cast(cell.free_mass * 255.0f); + auto red = static_cast(meas_cells.occ_mass[index] * 255.0f); + auto green = static_cast(meas_cells.free_mass[index] * 255.0f); int blue = 255 - red - green; row_ptr[x] = cv::Vec3b(blue, green, red); @@ -120,8 +132,7 @@ cv::Mat compute_dogm_image(const dogm::DOGM& grid_map, const std::vector(floor(occ * 255)); row_ptr[x] = cv::Vec3b(grayscale_value, grayscale_value, grayscale_value); diff --git a/dogm/include/dogm/cuda_utils.h b/dogm/include/dogm/cuda_utils.h index 3efdbeb..6611a7e 100644 --- a/dogm/include/dogm/cuda_utils.h +++ b/dogm/include/dogm/cuda_utils.h @@ -10,18 +10,15 @@ #define GPU_LAMBDA [=] __host__ __device__ -#define CHECK_ERROR(ans) \ - { \ - checkError((ans), __FILE__, __LINE__); \ - } - -inline void checkError(cudaError_t code, const char* file, int line) -{ - if (code != cudaSuccess) - { - printf("GPU Kernel Error: %s %s %d\n", cudaGetErrorString(code), file, line); - } +#ifndef CUDA_CALL +#define CUDA_CALL(call)\ +{\ + auto status = static_cast(call);\ + if (status != cudaSuccess)\ + fprintf(stderr, "ERROR: CUDA RT call \"%s\" in line %d of file %s failed with %s (%d).\n",\ + #call, __LINE__, __FILE__, cudaGetErrorString(status), status);\ } +#endif inline int divUp(int total, int grain) { diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h index 4a04a74..ee34ced 100644 --- a/dogm/include/dogm/dogm.h +++ b/dogm/include/dogm/dogm.h @@ -76,7 +76,7 @@ class DOGM * @param dt delta time since the last update. * @param device whether the measurement grid resides in GPU memory (default: true). */ - void updateGrid(MeasurementCell* measurement_grid, float new_x, float new_y, float new_yaw, float dt, + void updateGrid(MeasurementCellsSoA measurement_grid, float new_x, float new_y, float dt, bool device = true); /** @@ -84,14 +84,14 @@ class DOGM * * @return grid map. */ - std::vector getGridCells() const; + GridCellsSoA getGridCells() const; /** * Returns the measurement grid map in the host memory. * * @return measurement grid map. */ - std::vector getMeasurementCells() const; + MeasurementCellsSoA getMeasurementCells() const; /** * Returns the persistent particles of the particle filter. @@ -141,7 +141,7 @@ class DOGM void initialize(); void updatePose(float new_x, float new_y, float new_yaw); - void updateMeasurementGrid(MeasurementCell* measurement_grid, bool device); + void updateMeasurementGrid(MeasurementCellsSoA measurement_grid, bool device); public: void initializeParticles(); @@ -157,11 +157,11 @@ class DOGM public: Params params; - GridCell* grid_cell_array; + GridCellsSoA grid_cell_array; ParticlesSoA particle_array; ParticlesSoA particle_array_next; ParticlesSoA birth_particle_array; - MeasurementCell* meas_cell_array; + MeasurementCellsSoA meas_cell_array; float* weight_array; float* birth_weight_array; diff --git a/dogm/include/dogm/dogm_types.h b/dogm/include/dogm/dogm_types.h index 3c250ba..e856232 100644 --- a/dogm/include/dogm/dogm_types.h +++ b/dogm/include/dogm/dogm_types.h @@ -48,6 +48,262 @@ struct Particle glm::vec4 state; }; +struct GridCellsSoA +{ + int* start_idx; + int* end_idx; + float* new_born_occ_mass; + float* pers_occ_mass; + float* free_mass; + float* occ_mass; + float* pred_occ_mass; + float* mu_A; + float* mu_UA; + + float* w_A; + float* w_UA; + + float* mean_x_vel; + float* mean_y_vel; + float* var_x_vel; + float* var_y_vel; + float* covar_xy_vel; + + int size; + bool device; + + GridCellsSoA() : size(0), device(true) {} + + GridCellsSoA(int new_size, bool is_device) { init(new_size, is_device); } + + void init(int new_size, bool is_device) + { + size = new_size; + device = is_device; + if (device) + { + CUDA_CALL(cudaMalloc((void**)&start_idx, size * sizeof(int))); + CUDA_CALL(cudaMalloc((void**)&end_idx, size * sizeof(int))); + CUDA_CALL(cudaMalloc((void**)&new_born_occ_mass, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&pers_occ_mass, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&free_mass, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&occ_mass, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&pred_occ_mass, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&mu_A, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&mu_UA, size * sizeof(float))); + + CUDA_CALL(cudaMalloc((void**)&w_A, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&w_UA, size * sizeof(float))); + + CUDA_CALL(cudaMalloc((void**)&mean_x_vel, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&mean_y_vel, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&var_x_vel, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&var_y_vel, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&covar_xy_vel, size * sizeof(float))); + } + else + { + start_idx = (int*)malloc(size * sizeof(int)); + end_idx = (int*)malloc(size * sizeof(int)); + new_born_occ_mass = (float*)malloc(size * sizeof(float)); + pers_occ_mass = (float*)malloc(size * sizeof(float)); + free_mass = (float*)malloc(size * sizeof(float)); + occ_mass = (float*)malloc(size * sizeof(float)); + pred_occ_mass = (float*)malloc(size * sizeof(float)); + mu_A = (float*)malloc(size * sizeof(float)); + mu_UA = (float*)malloc(size * sizeof(float)); + + w_A = (float*)malloc(size * sizeof(float)); + w_UA = (float*)malloc(size * sizeof(float)); + + mean_x_vel = (float*)malloc(size * sizeof(float)); + mean_y_vel = (float*)malloc(size * sizeof(float)); + var_x_vel = (float*)malloc(size * sizeof(float)); + var_y_vel = (float*)malloc(size * sizeof(float)); + covar_xy_vel = (float*)malloc(size * sizeof(float)); + } + } + + void free() + { + if (device) + { + CUDA_CALL(cudaFree(start_idx)); + CUDA_CALL(cudaFree(end_idx)); + CUDA_CALL(cudaFree(new_born_occ_mass)); + CUDA_CALL(cudaFree(pers_occ_mass)); + CUDA_CALL(cudaFree(free_mass)); + CUDA_CALL(cudaFree(occ_mass)); + CUDA_CALL(cudaFree(pred_occ_mass)); + CUDA_CALL(cudaFree(mu_A)); + CUDA_CALL(cudaFree(mu_UA)); + + CUDA_CALL(cudaFree(w_A)); + CUDA_CALL(cudaFree(w_UA)); + + CUDA_CALL(cudaFree(mean_x_vel)); + CUDA_CALL(cudaFree(mean_y_vel)); + CUDA_CALL(cudaFree(var_x_vel)); + CUDA_CALL(cudaFree(var_y_vel)); + CUDA_CALL(cudaFree(covar_xy_vel)); + } + else + { + ::free(start_idx); + ::free(end_idx); + ::free(new_born_occ_mass); + ::free(pers_occ_mass); + ::free(free_mass); + ::free(occ_mass); + ::free(pred_occ_mass); + ::free(mu_A); + ::free(mu_UA); + + ::free(w_A); + ::free(w_UA); + + ::free(mean_x_vel); + ::free(mean_y_vel); + ::free(var_x_vel); + ::free(var_y_vel); + ::free(covar_xy_vel); + } + } + + void copy(const GridCellsSoA& other, cudaMemcpyKind kind) + { + CUDA_CALL(cudaMemcpy(start_idx, other.start_idx, size * sizeof(int), kind)); + CUDA_CALL(cudaMemcpy(end_idx, other.end_idx, size * sizeof(int), kind)); + CUDA_CALL(cudaMemcpy(new_born_occ_mass, other.new_born_occ_mass, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(pers_occ_mass, other.pers_occ_mass, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(free_mass, other.free_mass, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(occ_mass, other.occ_mass, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(pred_occ_mass, other.pred_occ_mass, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(mu_A, other.mu_A, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(mu_UA, other.mu_UA, size * sizeof(float), kind)); + + CUDA_CALL(cudaMemcpy(w_A, other.w_A, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(w_UA, other.w_UA, size * sizeof(float), kind)); + + CUDA_CALL(cudaMemcpy(mean_x_vel, other.mean_x_vel, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(mean_y_vel, other.mean_y_vel, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(var_x_vel, other.var_x_vel, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(var_y_vel, other.var_y_vel, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(covar_xy_vel, other.covar_xy_vel, size * sizeof(float), kind)); + } + + GridCellsSoA& operator=(const GridCellsSoA& other) + { + if (this != &other) + { + copy(other, cudaMemcpyDeviceToDevice); + } + + return *this; + } + + __device__ void copy(const GridCellsSoA& other, int index, int other_index) + { + start_idx[index] = other.start_idx[other_index]; + end_idx[index] = other.end_idx[other_index]; + new_born_occ_mass[index] = other.new_born_occ_mass[other_index]; + pers_occ_mass[index] = other.pers_occ_mass[other_index]; + free_mass[index] = other.free_mass[other_index]; + occ_mass[index] = other.occ_mass[other_index]; + pred_occ_mass[index] = other.pred_occ_mass[other_index]; + mu_A[index] = other.mu_A[other_index]; + mu_UA[index] = other.mu_UA[other_index]; + + w_A[index] = other.w_A[other_index]; + w_UA[index] = other.w_UA[other_index]; + + mean_x_vel[index] = other.mean_x_vel[other_index]; + mean_y_vel[index] = other.mean_y_vel[other_index]; + var_x_vel[index] = other.var_x_vel[other_index]; + var_y_vel[index] = other.var_y_vel[other_index]; + covar_xy_vel[index] = other.covar_xy_vel[other_index]; + } +}; + +struct MeasurementCellsSoA +{ + float* free_mass; + float* occ_mass; + float* likelihood; + float* p_A; + + int size; + bool device; + + MeasurementCellsSoA() : size(0), device(true) {} + + MeasurementCellsSoA(int new_size, bool is_device) { init(new_size, is_device); } + + void init(int new_size, bool is_device) + { + size = new_size; + device = is_device; + if (device) + { + CUDA_CALL(cudaMalloc((void**)&free_mass, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&occ_mass, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&likelihood, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&p_A, size * sizeof(float))); + } + else + { + free_mass = (float*)malloc(size * sizeof(float)); + occ_mass = (float*)malloc(size * sizeof(float)); + likelihood = (float*)malloc(size * sizeof(float)); + p_A = (float*)malloc(size * sizeof(float)); + } + } + + void free() + { + if (device) + { + CUDA_CALL(cudaFree(free_mass)); + CUDA_CALL(cudaFree(occ_mass)); + CUDA_CALL(cudaFree(likelihood)); + CUDA_CALL(cudaFree(p_A)); + } + else + { + ::free(free_mass); + ::free(occ_mass); + ::free(likelihood); + ::free(p_A); + } + } + + void copy(const MeasurementCellsSoA& other, cudaMemcpyKind kind) + { + CUDA_CALL(cudaMemcpy(free_mass, other.free_mass, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(occ_mass, other.occ_mass, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(likelihood, other.likelihood, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(p_A, other.p_A, size * sizeof(float), kind)); + } + + MeasurementCellsSoA& operator=(const MeasurementCellsSoA& other) + { + if (this != &other) + { + copy(other, cudaMemcpyDeviceToDevice); + } + + return *this; + } + + __device__ void copy(const MeasurementCellsSoA& other, int index, int other_index) + { + free_mass[index] = other.free_mass[other_index]; + occ_mass[index] = other.occ_mass[other_index]; + likelihood[index] = other.likelihood[other_index]; + p_A[index] = other.p_A[other_index]; + } +}; + struct ParticlesSoA { glm::vec4* state; @@ -68,10 +324,10 @@ struct ParticlesSoA device = is_device; if (device) { - CHECK_ERROR(cudaMalloc((void**)&state, size * sizeof(glm::vec4))); - CHECK_ERROR(cudaMalloc((void**)&grid_cell_idx, size * sizeof(int))); - CHECK_ERROR(cudaMalloc((void**)&weight, size * sizeof(float))); - CHECK_ERROR(cudaMalloc((void**)&associated, size * sizeof(bool))); + CUDA_CALL(cudaMalloc((void**)&state, size * sizeof(glm::vec4))); + CUDA_CALL(cudaMalloc((void**)&grid_cell_idx, size * sizeof(int))); + CUDA_CALL(cudaMalloc((void**)&weight, size * sizeof(float))); + CUDA_CALL(cudaMalloc((void**)&associated, size * sizeof(bool))); } else { @@ -86,10 +342,10 @@ struct ParticlesSoA { if (device) { - CHECK_ERROR(cudaFree(state)); - CHECK_ERROR(cudaFree(grid_cell_idx)); - CHECK_ERROR(cudaFree(weight)); - CHECK_ERROR(cudaFree(associated)); + CUDA_CALL(cudaFree(state)); + CUDA_CALL(cudaFree(grid_cell_idx)); + CUDA_CALL(cudaFree(weight)); + CUDA_CALL(cudaFree(associated)); } else { @@ -102,10 +358,10 @@ struct ParticlesSoA void copy(const ParticlesSoA& other, cudaMemcpyKind kind) { - CHECK_ERROR(cudaMemcpy(grid_cell_idx, other.grid_cell_idx, size * sizeof(int), kind)); - CHECK_ERROR(cudaMemcpy(weight, other.weight, size * sizeof(float), kind)); - CHECK_ERROR(cudaMemcpy(associated, other.associated, size * sizeof(bool), kind)); - CHECK_ERROR(cudaMemcpy(state, other.state, size * sizeof(glm::vec4), kind)); + CUDA_CALL(cudaMemcpy(grid_cell_idx, other.grid_cell_idx, size * sizeof(int), kind)); + CUDA_CALL(cudaMemcpy(weight, other.weight, size * sizeof(float), kind)); + CUDA_CALL(cudaMemcpy(associated, other.associated, size * sizeof(bool), kind)); + CUDA_CALL(cudaMemcpy(state, other.state, size * sizeof(glm::vec4), kind)); } ParticlesSoA& operator=(const ParticlesSoA& other) diff --git a/dogm/include/dogm/kernel/ego_motion_compensation.h b/dogm/include/dogm/kernel/ego_motion_compensation.h index 144b56c..0397e79 100644 --- a/dogm/include/dogm/kernel/ego_motion_compensation.h +++ b/dogm/include/dogm/kernel/ego_motion_compensation.h @@ -10,12 +10,11 @@ namespace dogm { -struct GridCell; -struct ParticlesSoA; +__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count, + float resolution, int grid_size); -__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count); - -__global__ void moveMapKernel(GridCell* __restrict__ grid_cell_array, const GridCell* __restrict__ old_grid_cell_array, +__global__ void moveMapKernel(GridCellsSoA grid_cell_array, GridCellsSoA old_grid_cell_array, + MeasurementCellsSoA meas_cell_array, ParticlesSoA particle_array, int x_move, int y_move, int grid_size); } /* namespace dogm */ diff --git a/dogm/include/dogm/kernel/init.h b/dogm/include/dogm/kernel/init.h index b143215..dd273f7 100644 --- a/dogm/include/dogm/kernel/init.h +++ b/dogm/include/dogm/kernel/init.h @@ -22,9 +22,9 @@ __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __ __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, curandState* __restrict__ global_state, float velocity, int grid_size, int particle_count); -__global__ void initGridCellsKernel(GridCell* __restrict__ grid_cell_array, - MeasurementCell* __restrict__ meas_cell_array, int grid_size, int cell_count); +__global__ void initGridCellsKernel(GridCellsSoA grid_cell_array, + MeasurementCellsSoA meas_cell_array, int grid_size, int cell_count); -__global__ void reinitGridParticleIndices(GridCell* __restrict__ grid_cell_array, int cell_count); +__global__ void reinitGridParticleIndices(GridCellsSoA grid_cell_array, int cell_count); } /* namespace dogm */ diff --git a/dogm/include/dogm/kernel/init_new_particles.h b/dogm/include/dogm/kernel/init_new_particles.h index cecfafc..cdb87f2 100644 --- a/dogm/include/dogm/kernel/init_new_particles.h +++ b/dogm/include/dogm/kernel/init_new_particles.h @@ -16,24 +16,23 @@ struct Particle; void normalize_particle_orders(float* particle_orders_array_accum, int particle_orders_count, int v_B); -__global__ void copyMassesKernel(const MeasurementCell* __restrict__ meas_cell_array, float* __restrict__ masses, +__global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, float* __restrict__ masses, int cell_count); -__global__ void initParticlesKernel1(GridCell* __restrict__ grid_cell_array, - const MeasurementCell* __restrict__ meas_cell_array, ParticlesSoA particle_array, +__global__ void initParticlesKernel1(ParticlesSoA particle_array, const float* __restrict__ particle_orders_array_accum, int cell_count); __global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell* __restrict__ grid_cell_array, curandState* __restrict__ global_state, float velocity, int grid_size, float new_weight, int particle_count); -__global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array, - const MeasurementCell* __restrict__ meas_cell_array, +__global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array, + const MeasurementCellsSoA meas_cell_array, const float* __restrict__ weight_array, const float* __restrict__ born_masses_array, ParticlesSoA birth_particle_array, const float* __restrict__ particle_orders_array_accum, int cell_count); -__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCell* __restrict__ grid_cell_array, +__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array, curandState* __restrict__ global_state, float stddev_velocity, float max_velocity, int grid_size, int particle_count); diff --git a/dogm/include/dogm/kernel/mass_update.h b/dogm/include/dogm/kernel/mass_update.h index 67f16f9..d45546e 100644 --- a/dogm/include/dogm/kernel/mass_update.h +++ b/dogm/include/dogm/kernel/mass_update.h @@ -9,14 +9,10 @@ namespace dogm { -struct GridCell; -struct MeasurementCell; -struct Particle; - -__global__ void gridCellPredictionUpdateKernel(GridCell* __restrict__ grid_cell_array, ParticlesSoA particle_array, +__global__ void gridCellPredictionUpdateKernel(GridCellsSoA grid_cell_array, ParticlesSoA particle_array, float* __restrict__ weight_array, const float* __restrict__ weight_array_accum, - const MeasurementCell* __restrict__ meas_cell_array, + const MeasurementCellsSoA meas_cell_array, float* __restrict__ born_masses_array, float p_B, int cell_count); } /* namespace dogm */ diff --git a/dogm/include/dogm/kernel/particle_to_grid.h b/dogm/include/dogm/kernel/particle_to_grid.h index ef03861..26ccdc5 100644 --- a/dogm/include/dogm/kernel/particle_to_grid.h +++ b/dogm/include/dogm/kernel/particle_to_grid.h @@ -9,10 +9,7 @@ namespace dogm { -struct GridCell; -struct Particle; - -__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCell* __restrict__ grid_cell_array, +__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCellsSoA grid_cell_array, float* __restrict__ weight_array, int particle_count); } /* namespace dogm */ diff --git a/dogm/include/dogm/kernel/statistical_moments.h b/dogm/include/dogm/kernel/statistical_moments.h index 8f715a1..396cb56 100644 --- a/dogm/include/dogm/kernel/statistical_moments.h +++ b/dogm/include/dogm/kernel/statistical_moments.h @@ -9,16 +9,13 @@ namespace dogm { -struct GridCell; -struct Particle; - __global__ void statisticalMomentsKernel1(const ParticlesSoA particle_array, const float* __restrict__ weight_array, float* __restrict__ vel_x_array, float* __restrict__ vel_y_array, float* __restrict__ vel_x_squared_array, float* __restrict__ vel_y_squared_array, float* __restrict__ vel_xy_array, int particle_count); -__global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array, +__global__ void statisticalMomentsKernel2(GridCellsSoA grid_cell_array, const float* __restrict__ vel_x_array_accum, const float* __restrict__ vel_y_array_accum, const float* __restrict__ vel_x_squared_array_accum, diff --git a/dogm/include/dogm/kernel/update_persistent_particles.h b/dogm/include/dogm/kernel/update_persistent_particles.h index 9b309fa..f417d9d 100644 --- a/dogm/include/dogm/kernel/update_persistent_particles.h +++ b/dogm/include/dogm/kernel/update_persistent_particles.h @@ -9,20 +9,16 @@ namespace dogm { -struct GridCell; -struct MeasurementCell; -struct Particle; - __global__ void updatePersistentParticlesKernel1(const ParticlesSoA particle_array, - const MeasurementCell* __restrict__ meas_cell_array, + const MeasurementCellsSoA meas_cell_array, float* __restrict__ weight_array, int particle_count); -__global__ void updatePersistentParticlesKernel2(GridCell* __restrict__ grid_cell_array, +__global__ void updatePersistentParticlesKernel2(GridCellsSoA grid_cell_array, const float* __restrict__ weight_array_accum, int cell_count); __global__ void updatePersistentParticlesKernel3(const ParticlesSoA particle_array, - const MeasurementCell* __restrict__ meas_cell_array, - const GridCell* __restrict__ grid_cell_array, + const MeasurementCellsSoA meas_cell_array, + const GridCellsSoA grid_cell_array, float* __restrict__ weight_array, int particle_count); } /* namespace dogm */ diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu index 48f8146..17c6f9d 100644 --- a/dogm/src/dogm.cu +++ b/dogm/src/dogm.cu @@ -50,22 +50,22 @@ DOGM::DOGM(const Params& params) particle_array_next.init(particle_count, true); birth_particle_array.init(new_born_particle_count, true); - CHECK_ERROR(cudaMalloc(&grid_cell_array, grid_cell_count * sizeof(GridCell))); - CHECK_ERROR(cudaMalloc(&meas_cell_array, grid_cell_count * sizeof(MeasurementCell))); + grid_cell_array.init(grid_cell_count, true); + meas_cell_array.init(grid_cell_count, true); - CHECK_ERROR(cudaMalloc(&weight_array, particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&birth_weight_array, new_born_particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&born_masses_array, grid_cell_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&weight_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&birth_weight_array, new_born_particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&born_masses_array, grid_cell_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&vel_x_array, particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&vel_y_array, particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&vel_x_squared_array, particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&vel_y_squared_array, particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&vel_xy_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&vel_x_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&vel_y_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&vel_x_squared_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&vel_y_squared_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&vel_xy_array, particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&rand_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&rand_array, particle_count * sizeof(float))); - CHECK_ERROR(cudaMalloc(&rng_states, particles_grid.x * block_dim.x * sizeof(curandState))); + CUDA_CALL(cudaMalloc(&rng_states, particles_grid.x * block_dim.x * sizeof(curandState))); initialize(); } @@ -76,40 +76,37 @@ DOGM::~DOGM() particle_array_next.free(); birth_particle_array.free(); - CHECK_ERROR(cudaFree(grid_cell_array)); - CHECK_ERROR(cudaFree(meas_cell_array)); + grid_cell_array.free(); + meas_cell_array.free(); - CHECK_ERROR(cudaFree(weight_array)); - CHECK_ERROR(cudaFree(birth_weight_array)); - CHECK_ERROR(cudaFree(born_masses_array)); + CUDA_CALL(cudaFree(weight_array)); + CUDA_CALL(cudaFree(birth_weight_array)); + CUDA_CALL(cudaFree(born_masses_array)); - CHECK_ERROR(cudaFree(vel_x_array)); - CHECK_ERROR(cudaFree(vel_y_array)); - CHECK_ERROR(cudaFree(vel_x_squared_array)); - CHECK_ERROR(cudaFree(vel_y_squared_array)); - CHECK_ERROR(cudaFree(vel_xy_array)); + CUDA_CALL(cudaFree(vel_x_array)); + CUDA_CALL(cudaFree(vel_y_array)); + CUDA_CALL(cudaFree(vel_x_squared_array)); + CUDA_CALL(cudaFree(vel_y_squared_array)); + CUDA_CALL(cudaFree(vel_xy_array)); - CHECK_ERROR(cudaFree(rng_states)); + CUDA_CALL(cudaFree(rand_array)); + + CUDA_CALL(cudaFree(rng_states)); } void DOGM::initialize() { cudaStream_t particles_stream, grid_stream; - CHECK_ERROR(cudaStreamCreate(&particles_stream)); - CHECK_ERROR(cudaStreamCreate(&grid_stream)); - - setupRandomStatesKernel<<>>(rng_states, 123456, particles_grid.x * block_dim.x); + CUDA_CALL(cudaStreamCreate(&particles_stream)); + CUDA_CALL(cudaStreamCreate(&grid_stream)); - CHECK_ERROR(cudaGetLastError()); - CHECK_ERROR(cudaDeviceSynchronize()); + setupRandomStatesKernel<<>>(rng_states, particles_grid.x * block_dim.x); initGridCellsKernel<<>>(grid_cell_array, meas_cell_array, grid_size, grid_cell_count); - CHECK_ERROR(cudaGetLastError()); - - CHECK_ERROR(cudaStreamDestroy(particles_stream)); - CHECK_ERROR(cudaStreamDestroy(grid_stream)); + CUDA_CALL(cudaStreamDestroy(particles_stream)); + CUDA_CALL(cudaStreamDestroy(grid_stream)); } void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_y, float new_yaw, float dt, bool device) @@ -132,22 +129,18 @@ void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_ iteration++; } -std::vector DOGM::getGridCells() const +GridCellsSoA DOGM::getGridCells() const { - std::vector grid_cells(static_cast::size_type>(grid_cell_count)); - - CHECK_ERROR( - cudaMemcpy(grid_cells.data(), grid_cell_array, grid_cell_count * sizeof(GridCell), cudaMemcpyDeviceToHost)); + GridCellsSoA grid_cells(grid_cell_count, false); + grid_cells.copy(grid_cell_array, cudaMemcpyDeviceToHost); return grid_cells; } -std::vector DOGM::getMeasurementCells() const +MeasurementCellsSoA DOGM::getMeasurementCells() const { - std::vector meas_cells(static_cast::size_type>(grid_cell_count)); - - CHECK_ERROR(cudaMemcpy(meas_cells.data(), meas_cell_array, grid_cell_count * sizeof(MeasurementCell), - cudaMemcpyDeviceToHost)); + MeasurementCellsSoA meas_cells(grid_cell_count, false); + meas_cells.copy(meas_cell_array, cudaMemcpyDeviceToHost); return meas_cells; } @@ -176,26 +169,19 @@ void DOGM::updatePose(float new_x, float new_y, float new_yaw) if (fabsf(x_diff) > params.resolution || fabsf(y_diff) > params.resolution) { - const int x_move = -static_cast(x_diff / params.resolution); - const int y_move = -static_cast(y_diff / params.resolution); - - GridCell* old_grid_cell_array; - CHECK_ERROR(cudaMalloc(&old_grid_cell_array, grid_cell_count * sizeof(GridCell))); - - CHECK_ERROR(cudaMemcpy(old_grid_cell_array, grid_cell_array, grid_cell_count * sizeof(GridCell), - cudaMemcpyDeviceToDevice)); - CHECK_ERROR(cudaMemset(grid_cell_array, 0, grid_cell_count * sizeof(GridCell))); + moveParticlesKernel<<>>(particle_array, x_move, y_move, particle_count, + params.resolution, grid_size); dim3 dim_block(32, 32); dim3 grid_dim(divUp(grid_size, dim_block.x), divUp(grid_size, dim_block.y)); - moveParticlesKernel<<>>(particle_array, x_move, y_move, particle_count); - CHECK_ERROR(cudaGetLastError()); + GridCellsSoA old_grid_cell_array(grid_cell_count, true); + old_grid_cell_array.copy(grid_cell_array, cudaMemcpyDeviceToDevice); - moveMapKernel<<>>(grid_cell_array, old_grid_cell_array, x_move, y_move, grid_size); - CHECK_ERROR(cudaGetLastError()); + moveMapKernel<<>>(grid_cell_array, old_grid_cell_array, meas_cell_array, particle_array, + x_move, y_move, grid_size); - CHECK_ERROR(cudaFree(old_grid_cell_array)); + old_grid_cell_array.free(); position_x = new_x; position_y = new_y; @@ -204,10 +190,10 @@ void DOGM::updatePose(float new_x, float new_y, float new_yaw) } } -void DOGM::updateMeasurementGrid(MeasurementCell* measurement_grid, bool device) +void DOGM::updateMeasurementGrid(MeasurementCellsSoA measurement_grid, bool device) { cudaMemcpyKind kind = device ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice; - CHECK_ERROR(cudaMemcpy(meas_cell_array, measurement_grid, grid_cell_count * sizeof(MeasurementCell), kind)); + meas_cell_array.copy(measurement_grid, kind); if (!first_measurement_received) { @@ -245,13 +231,12 @@ void DOGM::initializeParticles() void DOGM::particlePrediction(float dt) { - // std::cout << "DOGM::particlePrediction" << std::endl; - + // glm uses column major, we need row major // clang-format off - glm::mat4x4 transition_matrix(1, 0, dt, 0, - 0, 1, 0, dt, - 0, 0, 1, 0, - 0, 0, 0, 1); + glm::mat4x4 transition_matrix(1, 0, 0, 0, + 0, 1, 0, 0, + dt, 0, 1, 0, + 0, dt, 0, 1); // clang-format on // FIXME: glm uses column major, we need row major diff --git a/dogm/src/kernel/ego_motion_compensation.cu b/dogm/src/kernel/ego_motion_compensation.cu index 5f833fe..e447203 100644 --- a/dogm/src/kernel/ego_motion_compensation.cu +++ b/dogm/src/kernel/ego_motion_compensation.cu @@ -22,7 +22,8 @@ __global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int } } -__global__ void moveMapKernel(GridCell* __restrict__ grid_cell_array, const GridCell* __restrict__ old_grid_cell_array, +__global__ void moveMapKernel(GridCellsSoA grid_cell_array, GridCellsSoA old_grid_cell_array, + MeasurementCellsSoA meas_cell_array, ParticlesSoA particle_array, int x_move, int y_move, int grid_size) { const int x = blockIdx.x * blockDim.x + threadIdx.x; diff --git a/dogm/src/kernel/init.cu b/dogm/src/kernel/init.cu index 02eb196..6230e17 100644 --- a/dogm/src/kernel/init.cu +++ b/dogm/src/kernel/init.cu @@ -69,29 +69,29 @@ __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, cura // global_state[thread_id] = local_state; } -__global__ void initGridCellsKernel(GridCell* __restrict__ grid_cell_array, - MeasurementCell* __restrict__ meas_cell_array, int grid_size, int cell_count) +__global__ void initGridCellsKernel(GridCellsSoA grid_cell_array, + MeasurementCellsSoA meas_cell_array, int grid_size, int cell_count) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x) { - grid_cell_array[i].free_mass = 0.0f; - grid_cell_array[i].occ_mass = 0.0f; - grid_cell_array[i].start_idx = -1; - grid_cell_array[i].end_idx = -1; - - meas_cell_array[i].occ_mass = 0.0f; - meas_cell_array[i].free_mass = 0.0f; - meas_cell_array[i].likelihood = 1.0f; - meas_cell_array[i].p_A = 1.0f; + grid_cell_array.free_mass[i] = 0.0f; + grid_cell_array.occ_mass[i] = 0.0f; + grid_cell_array.start_idx[i] = -1; + grid_cell_array.end_idx[i] = -1; + + meas_cell_array.occ_mass[i] = 0.0f; + meas_cell_array.free_mass[i] = 0.0f; + meas_cell_array.likelihood[i] = 1.0f; + meas_cell_array.p_A[i] = 1.0f; } } -__global__ void reinitGridParticleIndices(GridCell* __restrict__ grid_cell_array, int cell_count) +__global__ void reinitGridParticleIndices(GridCellsSoA grid_cell_array, int cell_count) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x) { - grid_cell_array[i].start_idx = -1; - grid_cell_array[i].end_idx = -1; + grid_cell_array.start_idx[i] = -1; + grid_cell_array.end_idx[i] = -1; } } diff --git a/dogm/src/kernel/init_new_particles.cu b/dogm/src/kernel/init_new_particles.cu index e761a8a..90517f7 100644 --- a/dogm/src/kernel/init_new_particles.cu +++ b/dogm/src/kernel/init_new_particles.cu @@ -57,10 +57,10 @@ __device__ float calc_weight_unassoc(int nu_UA, float p_A, float born_mass) return nu_UA > 0 ? ((1.0 - p_A) * born_mass) / nu_UA : 0.0; } -__device__ void store_weights(float w_A, float w_UA, GridCell* __restrict__ grid_cell_array, int j) +__device__ void store_weights(float w_A, float w_UA, GridCellsSoA grid_cell_array, int j) { - grid_cell_array[j].w_A = w_A; - grid_cell_array[j].w_UA = w_UA; + grid_cell_array.w_A[j] = w_A; + grid_cell_array.w_UA[j] = w_UA; } void normalize_particle_orders(float* particle_orders_array_accum, int particle_orders_count, int v_B) @@ -69,21 +69,21 @@ void normalize_particle_orders(float* particle_orders_array_accum, int particle_ float max = 1.0f; cudaMemcpy(&max, &particle_orders_array_accum[particle_orders_count - 1], sizeof(float), cudaMemcpyDeviceToHost); - thrust::transform(particle_orders_accum, particle_orders_accum + particle_orders_count, particle_orders_accum, - GPU_LAMBDA(float x) { return x * (v_B / max); }); + thrust::transform( + particle_orders_accum, particle_orders_accum + particle_orders_count, particle_orders_accum, + GPU_LAMBDA(float x) { return x * (v_B / max); }); } -__global__ void copyMassesKernel(const MeasurementCell* __restrict__ meas_cell_array, float* __restrict__ masses, +__global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, float* __restrict__ masses, int cell_count) { for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < cell_count; j += blockDim.x * gridDim.x) { - masses[j] = meas_cell_array[j].occ_mass; + masses[j] = meas_cell_array.occ_mass[j]; } } -__global__ void initParticlesKernel1(GridCell* __restrict__ grid_cell_array, - const MeasurementCell* __restrict__ meas_cell_array, ParticlesSoA particle_array, +__global__ void initParticlesKernel1(ParticlesSoA particle_array, const float* __restrict__ particle_orders_array_accum, int cell_count) { for (int j = blockIdx.x * blockDim.x + threadIdx.x; j < cell_count; j += blockDim.x * gridDim.x) @@ -123,8 +123,8 @@ __global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell global_state[thread_id] = local_state; } -__global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array, - const MeasurementCell* __restrict__ meas_cell_array, +__global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array, + const MeasurementCellsSoA meas_cell_array, const float* __restrict__ weight_array, const float* __restrict__ born_masses_array, ParticlesSoA birth_particle_array, const float* __restrict__ particle_orders_array_accum, int cell_count) @@ -137,7 +137,7 @@ __global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array, // printf("Start idx: %d, End idx: %d\n", start_idx, end_idx); int num_new_particles = start_idx <= end_idx ? end_idx - start_idx + 1 : 0; - float p_A = meas_cell_array[j].p_A; + float p_A = meas_cell_array.p_A[j]; int nu_A = calc_num_assoc(num_new_particles, p_A); int nu_UA = num_new_particles - nu_A; float w_A = calc_weight_assoc(nu_A, p_A, born_masses_array[j]); @@ -158,7 +158,7 @@ __global__ void initNewParticlesKernel1(GridCell* __restrict__ grid_cell_array, } } -__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCell* __restrict__ grid_cell_array, +__global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array, curandState* __restrict__ global_state, float stddev_velocity, float max_velocity, int grid_size, int particle_count) { @@ -182,7 +182,7 @@ __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const float vel_x = curand_normal(&local_state, 0.0f, stddev_velocity); float vel_y = curand_normal(&local_state, 0.0f, stddev_velocity); - birth_particle_array.weight[i] = grid_cell.w_A; + birth_particle_array.weight[i] = grid_cell_array.w_A[cell_idx]; birth_particle_array.state[i] = glm::vec4(x, y, vel_x, vel_y); } else @@ -190,7 +190,7 @@ __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const float vel_x = curand_normal(&local_state, 0.0f, stddev_velocity); float vel_y = curand_normal(&local_state, 0.0f, stddev_velocity); - birth_particle_array.weight[i] = grid_cell.w_UA; + birth_particle_array.weight[i] = grid_cell_array.w_UA[cell_idx]; birth_particle_array.state[i] = glm::vec4(x, y, vel_x, vel_y); } } diff --git a/dogm/src/kernel/mass_update.cu b/dogm/src/kernel/mass_update.cu index 77d1924..d021704 100644 --- a/dogm/src/kernel/mass_update.cu +++ b/dogm/src/kernel/mass_update.cu @@ -13,21 +13,21 @@ namespace dogm { -__device__ float predict_free_mass(const GridCell& grid_cell, float m_occ_pred, float alpha = 0.9) +__device__ float predict_free_mass(float grid_cell_free_mass, float m_occ_pred, float alpha = 0.9) { - return min(alpha * grid_cell.free_mass, 1.0f - m_occ_pred); + return min(alpha * grid_cell_free_mass, 1.0f - m_occ_pred); } -__device__ float2 update_masses(float m_occ_pred, float m_free_pred, const MeasurementCell& meas_cell) +__device__ float2 update_masses(float m_occ_pred, float m_free_pred, const MeasurementCellsSoA meas_cells, int meas_idx) { float unknown_pred = 1.0 - m_occ_pred - m_free_pred; - float meas_unknown = 1.0 - meas_cell.free_mass - meas_cell.occ_mass; - float K = m_free_pred * meas_cell.occ_mass + m_occ_pred * meas_cell.free_mass; + float meas_unknown = 1.0 - meas_cells.free_mass[meas_idx] - meas_cells.occ_mass[meas_idx]; + float K = m_free_pred * meas_cells.occ_mass[meas_idx] + m_occ_pred * meas_cells.free_mass[meas_idx]; float occ_mass = - (m_occ_pred * meas_unknown + unknown_pred * meas_cell.occ_mass + m_occ_pred * meas_cell.occ_mass) / (1.0 - K); + (m_occ_pred * meas_unknown + unknown_pred * meas_cells.occ_mass[meas_idx] + m_occ_pred * meas_cells.occ_mass[meas_idx]) / (1.0 - K); float free_mass = - (m_free_pred * meas_unknown + unknown_pred * meas_cell.free_mass + m_free_pred * meas_cell.free_mass) / + (m_free_pred * meas_unknown + unknown_pred * meas_cells.free_mass[meas_idx] + m_free_pred * meas_cells.free_mass[meas_idx]) / (1.0 - K); return make_float2(occ_mass, free_mass); @@ -39,13 +39,13 @@ __device__ float separate_newborn_part(float m_occ_pred, float m_occ_up, float p } __device__ void store_values(float rho_b, float rho_p, float m_free_up, float m_occ_up, float m_occ_pred, - GridCell* __restrict__ grid_cell_array, int i) + GridCellsSoA grid_cell_array, int i) { - grid_cell_array[i].pers_occ_mass = rho_p; - grid_cell_array[i].new_born_occ_mass = rho_b; - grid_cell_array[i].free_mass = m_free_up; - grid_cell_array[i].occ_mass = m_occ_up; - grid_cell_array[i].pred_occ_mass = m_occ_pred; + grid_cell_array.pers_occ_mass[i] = rho_p; + grid_cell_array.new_born_occ_mass[i] = rho_b; + grid_cell_array.free_mass[i] = m_free_up; + grid_cell_array.occ_mass[i] = m_occ_up; + grid_cell_array.pred_occ_mass[i] = m_occ_pred; } __device__ void normalize_weights(const ParticlesSoA& particle_array, float* __restrict__ weight_array, int start_idx, @@ -58,16 +58,16 @@ __device__ void normalize_weights(const ParticlesSoA& particle_array, float* __r } } -__global__ void gridCellPredictionUpdateKernel(GridCell* __restrict__ grid_cell_array, ParticlesSoA particle_array, +__global__ void gridCellPredictionUpdateKernel(GridCellsSoA grid_cell_array, ParticlesSoA particle_array, float* __restrict__ weight_array, const float* __restrict__ weight_array_accum, - const MeasurementCell* __restrict__ meas_cell_array, + const MeasurementCellsSoA meas_cell_array, float* __restrict__ born_masses_array, float p_B, int cell_count) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x) { - int start_idx = grid_cell_array[i].start_idx; - int end_idx = grid_cell_array[i].end_idx; + int start_idx = grid_cell_array.start_idx[i]; + int end_idx = grid_cell_array.end_idx[i]; if (start_idx != -1) { @@ -75,26 +75,23 @@ __global__ void gridCellPredictionUpdateKernel(GridCell* __restrict__ grid_cell_ if (m_occ_pred > 1.0f) { - // printf("Predicted mass greater 1. Mass is: %f\n", m_occ_pred); normalize_weights(particle_array, weight_array, start_idx, end_idx, m_occ_pred); m_occ_pred = 1.0f; } - float m_free_pred = predict_free_mass(grid_cell_array[i], m_occ_pred); - float2 masses_up = update_masses(m_occ_pred, m_free_pred, meas_cell_array[i]); + float m_free_pred = predict_free_mass(grid_cell_array.free_mass[i], m_occ_pred); + float2 masses_up = update_masses(m_occ_pred, m_free_pred, meas_cell_array, i); float rho_b = separate_newborn_part(m_occ_pred, masses_up.x, p_B); float rho_p = masses_up.x - rho_b; born_masses_array[i] = rho_b; - // printf("Rho B: %f\n", rho_b); - store_values(rho_b, rho_p, masses_up.y, masses_up.x, m_occ_pred, grid_cell_array, i); } else { - float m_occ = grid_cell_array[i].occ_mass; - float m_free = predict_free_mass(grid_cell_array[i], m_occ); - float2 masses_up = update_masses(m_occ, m_free, meas_cell_array[i]); + float m_occ = grid_cell_array.occ_mass[i]; + float m_free = predict_free_mass(grid_cell_array.free_mass[i], m_occ); + float2 masses_up = update_masses(m_occ, m_free, meas_cell_array, i); born_masses_array[i] = 0.0f; store_values(0.0f, masses_up.x, masses_up.y, masses_up.x, 0.0f, grid_cell_array, i); } diff --git a/dogm/src/kernel/particle_to_grid.cu b/dogm/src/kernel/particle_to_grid.cu index ac339af..3d24f7f 100644 --- a/dogm/src/kernel/particle_to_grid.cu +++ b/dogm/src/kernel/particle_to_grid.cu @@ -24,7 +24,7 @@ __device__ bool is_last_particle(const ParticlesSoA& particle_array, int particl return i == particle_count - 1 || particle_array.grid_cell_idx[i] != particle_array.grid_cell_idx[i + 1]; } -__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCell* __restrict__ grid_cell_array, +__global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCellsSoA grid_cell_array, float* __restrict__ weight_array, int particle_count) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x) @@ -33,15 +33,13 @@ __global__ void particleToGridKernel(const ParticlesSoA particle_array, GridCell if (is_first_particle(particle_array, i)) { - grid_cell_array[j].start_idx = i; + grid_cell_array.start_idx[j] = i; } if (is_last_particle(particle_array, particle_count, i)) { - grid_cell_array[j].end_idx = i; + grid_cell_array.end_idx[j] = i; } - // printf("Cell: %d, Start idx: %d, End idx: %d\n", j, grid_cell_array[j].start_idx, - // grid_cell_array[j].end_idx); weight_array[i] = particle_array.weight[i]; } } diff --git a/dogm/src/kernel/statistical_moments.cu b/dogm/src/kernel/statistical_moments.cu index 436428b..287d85c 100644 --- a/dogm/src/kernel/statistical_moments.cu +++ b/dogm/src/kernel/statistical_moments.cu @@ -45,14 +45,14 @@ __device__ float calc_covariance(const float* __restrict__ vel_xy_array_accum, i return 0.0f; } -__device__ void store(GridCell* __restrict__ grid_cell_array, int j, float mean_x_vel, float mean_y_vel, +__device__ void store(GridCellsSoA grid_cell_array, int j, float mean_x_vel, float mean_y_vel, float var_x_vel, float var_y_vel, float covar_xy_vel) { - grid_cell_array[j].mean_x_vel = mean_x_vel; - grid_cell_array[j].mean_y_vel = mean_y_vel; - grid_cell_array[j].var_x_vel = var_x_vel; - grid_cell_array[j].var_y_vel = var_y_vel; - grid_cell_array[j].covar_xy_vel = covar_xy_vel; + grid_cell_array.mean_x_vel[j] = mean_x_vel; + grid_cell_array.mean_y_vel[j] = mean_y_vel; + grid_cell_array.var_x_vel[j] = var_x_vel; + grid_cell_array.var_y_vel[j] = var_y_vel; + grid_cell_array.covar_xy_vel[j] = covar_xy_vel; } __global__ void statisticalMomentsKernel1(const ParticlesSoA particle_array, const float* __restrict__ weight_array, @@ -71,12 +71,10 @@ __global__ void statisticalMomentsKernel1(const ParticlesSoA particle_array, con vel_x_squared_array[i] = weight * vel_x * vel_x; vel_y_squared_array[i] = weight * vel_y * vel_y; vel_xy_array[i] = weight * vel_x * vel_y; - - // printf("vx: %f, vy: %f\n", vel_x_array[i], vel_y_array[i]); } } -__global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array, +__global__ void statisticalMomentsKernel2(GridCellsSoA grid_cell_array, const float* __restrict__ vel_x_array_accum, const float* __restrict__ vel_y_array_accum, const float* __restrict__ vel_x_squared_array_accum, @@ -85,10 +83,9 @@ __global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x) { - int start_idx = grid_cell_array[i].start_idx; - int end_idx = grid_cell_array[i].end_idx; - float rho_p = grid_cell_array[i].pers_occ_mass; - // printf("rho p: %f\n", rho_p); + int start_idx = grid_cell_array.start_idx[i]; + int end_idx = grid_cell_array.end_idx[i]; + float rho_p = grid_cell_array.pers_occ_mass[i]; if (start_idx != -1) { @@ -97,7 +94,6 @@ __global__ void statisticalMomentsKernel2(GridCell* __restrict__ grid_cell_array float var_x_vel = calc_variance(vel_x_squared_array_accum, start_idx, end_idx, rho_p, mean_x_vel); float var_y_vel = calc_variance(vel_y_squared_array_accum, start_idx, end_idx, rho_p, mean_y_vel); float covar_xy_vel = calc_covariance(vel_xy_array_accum, start_idx, end_idx, rho_p, mean_x_vel, mean_y_vel); - // printf("x: %f, y: %f\n", mean_x_vel, mean_y_vel); store(grid_cell_array, i, mean_x_vel, mean_y_vel, var_x_vel, var_y_vel, covar_xy_vel); } diff --git a/dogm/src/kernel/update_persistent_particles.cu b/dogm/src/kernel/update_persistent_particles.cu index b0d8aa9..e952fac 100644 --- a/dogm/src/kernel/update_persistent_particles.cu +++ b/dogm/src/kernel/update_persistent_particles.cu @@ -18,36 +18,34 @@ __device__ float calc_norm_assoc(float occ_accum, float rho_p) return occ_accum > 0.0f ? rho_p / occ_accum : 0.0f; } -__device__ float calc_norm_unassoc(const GridCell& grid_cell) +__device__ float calc_norm_unassoc(float pred_occ_mass, float pers_occ_mass) { - float pred_occ_mass = grid_cell.pred_occ_mass; - return pred_occ_mass > 0.0f ? grid_cell.pers_occ_mass / pred_occ_mass : 0.0f; + return pred_occ_mass > 0.0f ? pers_occ_mass / pred_occ_mass : 0.0f; } -__device__ void set_normalization_components(GridCell* __restrict__ grid_cell_array, int i, float mu_A, float mu_UA) +__device__ void set_normalization_components(GridCellsSoA grid_cell_array, int i, float mu_A, float mu_UA) { - grid_cell_array[i].mu_A = mu_A; - grid_cell_array[i].mu_UA = mu_UA; + grid_cell_array.mu_A[i] = mu_A; + grid_cell_array.mu_UA[i] = mu_UA; } __device__ float update_unnorm(const ParticlesSoA& particle_array, int i, - const MeasurementCell* __restrict__ meas_cell_array) + const MeasurementCellsSoA meas_cell_array) { - return meas_cell_array[particle_array.grid_cell_idx[i]].likelihood * particle_array.weight[i]; + return meas_cell_array.likelihood[particle_array.grid_cell_idx[i]] * particle_array.weight[i]; } -__device__ float normalize(const ParticlesSoA& particle, int i, const GridCell* __restrict__ grid_cell_array, - const MeasurementCell* __restrict__ meas_cell_array, float weight) +__device__ float normalize(const ParticlesSoA& particle, int i, const GridCellsSoA grid_cell_array, + const MeasurementCellsSoA meas_cell_array, float weight) { const int cell_idx = particle.grid_cell_idx[i]; - const GridCell& cell = grid_cell_array[cell_idx]; - const MeasurementCell& meas_cell = meas_cell_array[cell_idx]; + const float p_A = meas_cell_array.p_A[cell_idx]; - return meas_cell.p_A * cell.mu_A * weight + (1.0f - meas_cell.p_A) * cell.mu_UA * particle.weight[i]; + return p_A * grid_cell_array.mu_A[cell_idx] * weight + (1.0f - p_A) * grid_cell_array.mu_UA[cell_idx] * particle.weight[i]; } __global__ void updatePersistentParticlesKernel1(const ParticlesSoA particle_array, - const MeasurementCell* __restrict__ meas_cell_array, + const MeasurementCellsSoA meas_cell_array, float* __restrict__ weight_array, int particle_count) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x) @@ -56,29 +54,28 @@ __global__ void updatePersistentParticlesKernel1(const ParticlesSoA particle_arr } } -__global__ void updatePersistentParticlesKernel2(GridCell* __restrict__ grid_cell_array, +__global__ void updatePersistentParticlesKernel2(GridCellsSoA grid_cell_array, const float* __restrict__ weight_array_accum, int cell_count) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < cell_count; i += blockDim.x * gridDim.x) { - int start_idx = grid_cell_array[i].start_idx; - int end_idx = grid_cell_array[i].end_idx; + int start_idx = grid_cell_array.start_idx[i]; + int end_idx = grid_cell_array.end_idx[i]; if (start_idx != -1) { float m_occ_accum = subtract(weight_array_accum, start_idx, end_idx); - float rho_p = grid_cell_array[i].pers_occ_mass; + float rho_p = grid_cell_array.pers_occ_mass[i]; float mu_A = calc_norm_assoc(m_occ_accum, rho_p); - float mu_UA = calc_norm_unassoc(grid_cell_array[i]); + float mu_UA = calc_norm_unassoc(grid_cell_array.pred_occ_mass[i], grid_cell_array.pers_occ_mass[i]); set_normalization_components(grid_cell_array, i, mu_A, mu_UA); - // printf("mu_A: %f, mu_UA: %f\n", mu_A, mu_UA); } } } __global__ void updatePersistentParticlesKernel3(const ParticlesSoA particle_array, - const MeasurementCell* __restrict__ meas_cell_array, - const GridCell* __restrict__ grid_cell_array, + const MeasurementCellsSoA meas_cell_array, + const GridCellsSoA grid_cell_array, float* __restrict__ weight_array, int particle_count) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x) From 77760043117e140e73d4c335b8a2897ae4f5a772 Mon Sep 17 00:00:00 2001 From: ShepelIlya Date: Wed, 18 May 2022 11:56:53 +0300 Subject: [PATCH 2/5] memory leak fix --- dogm/src/kernel/init_new_particles.cu | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dogm/src/kernel/init_new_particles.cu b/dogm/src/kernel/init_new_particles.cu index 90517f7..82e5929 100644 --- a/dogm/src/kernel/init_new_particles.cu +++ b/dogm/src/kernel/init_new_particles.cu @@ -146,12 +146,12 @@ __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array, // printf("w_A: %f, w_UA: %f\n", w_A, w_UA); - for (int i = start_idx; i < start_idx + nu_A + 1; i++) + for (int i = start_idx; i < start_idx + nu_A; i++) { set_cell_idx_A(birth_particle_array, i, j); } - for (int i = start_idx + nu_A + 1; i < end_idx + 1; i++) + for (int i = start_idx + nu_A; i < end_idx + 1; i++) { set_cell_idx_UA(birth_particle_array, i, j); } From 28d13ef63b48ac6eeb2c1a2a92cb302313ab5f50 Mon Sep 17 00:00:00 2001 From: ShepelIlya Date: Wed, 18 May 2022 12:01:42 +0300 Subject: [PATCH 3/5] fixed bug with resolution and translation. --- dogm/include/dogm/dogm.h | 3 +- dogm/include/dogm/kernel/init.h | 8 +- dogm/include/dogm/kernel/init_new_particles.h | 13 +-- dogm/include/dogm/kernel/predict.h | 6 +- dogm/src/dogm.cu | 93 ++++--------------- dogm/src/kernel/ego_motion_compensation.cu | 47 ++++++++-- dogm/src/kernel/init.cu | 18 +--- dogm/src/kernel/init_new_particles.cu | 18 ++-- dogm/src/kernel/predict.cu | 10 +- 9 files changed, 83 insertions(+), 133 deletions(-) diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h index ee34ced..0ac54ff 100644 --- a/dogm/include/dogm/dogm.h +++ b/dogm/include/dogm/dogm.h @@ -140,7 +140,7 @@ class DOGM private: void initialize(); - void updatePose(float new_x, float new_y, float new_yaw); + void updatePose(float new_x, float new_y); void updateMeasurementGrid(MeasurementCellsSoA measurement_grid, bool device); public: @@ -196,7 +196,6 @@ class DOGM bool first_measurement_received; float position_x; float position_y; - float yaw; }; } /* namespace dogm */ diff --git a/dogm/include/dogm/kernel/init.h b/dogm/include/dogm/kernel/init.h index dd273f7..07d1280 100644 --- a/dogm/include/dogm/kernel/init.h +++ b/dogm/include/dogm/kernel/init.h @@ -10,14 +10,10 @@ namespace dogm { -struct GridCell; -struct MeasurementCell; -struct Particle; - -__global__ void setupRandomStatesKernel(curandState* __restrict__ states, unsigned long long seed, int count); +__global__ void setupRandomStatesKernel(curandState* __restrict__ states, int count); __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity, - int grid_size, int particle_count); + int grid_size, int particle_count, float resolution); __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, curandState* __restrict__ global_state, float velocity, int grid_size, int particle_count); diff --git a/dogm/include/dogm/kernel/init_new_particles.h b/dogm/include/dogm/kernel/init_new_particles.h index cdb87f2..17b71c6 100644 --- a/dogm/include/dogm/kernel/init_new_particles.h +++ b/dogm/include/dogm/kernel/init_new_particles.h @@ -10,10 +10,6 @@ namespace dogm { -struct GridCell; -struct MeasurementCell; -struct Particle; - void normalize_particle_orders(float* particle_orders_array_accum, int particle_orders_count, int v_B); __global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, float* __restrict__ masses, @@ -22,9 +18,9 @@ __global__ void copyMassesKernel(const MeasurementCellsSoA meas_cell_array, floa __global__ void initParticlesKernel1(ParticlesSoA particle_array, const float* __restrict__ particle_orders_array_accum, int cell_count); -__global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell* __restrict__ grid_cell_array, - curandState* __restrict__ global_state, float velocity, int grid_size, - float new_weight, int particle_count); +__global__ void initParticlesKernel2(ParticlesSoA particle_array, curandState* __restrict__ global_state, + float velocity, int grid_size, float new_weight, int particle_count, + float resolution); __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array, const MeasurementCellsSoA meas_cell_array, @@ -34,7 +30,8 @@ __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array, __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array, curandState* __restrict__ global_state, float stddev_velocity, - float max_velocity, int grid_size, int particle_count); + float max_velocity, int grid_size, int particle_count, + float resolution); __global__ void copyBirthWeightKernel(const ParticlesSoA birth_particle_array, float* __restrict__ birth_weight_array, int particle_count); diff --git a/dogm/include/dogm/kernel/predict.h b/dogm/include/dogm/kernel/predict.h index bfec921..69bddf9 100644 --- a/dogm/include/dogm/kernel/predict.h +++ b/dogm/include/dogm/kernel/predict.h @@ -11,10 +11,8 @@ namespace dogm { -struct Particle; - __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity, - int grid_size, float p_S, const glm::mat4x4 transition_matrix, - float process_noise_position, float process_noise_velocity, int particle_count); + int grid_size, float p_S, const glm::mat4x4 transition_matrix, float process_noise_position, + float process_noise_velocity, int particle_count, float resolution); } /* namespace dogm */ diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu index 17c6f9d..d1d1119 100644 --- a/dogm/src/dogm.cu +++ b/dogm/src/dogm.cu @@ -34,13 +34,13 @@ DOGM::DOGM(const Params& params) : params(params), grid_size(static_cast(params.size / params.resolution)), particle_count(params.particle_count), grid_cell_count(grid_size * grid_size), new_born_particle_count(params.new_born_particle_count), block_dim(BLOCK_SIZE), first_pose_received(false), - first_measurement_received(false), position_x(0.0f), position_y(0.0f) + first_measurement_received(false), position_x(0.0f), position_y(0.0f), iteration(0) { int device; - CHECK_ERROR(cudaGetDevice(&device)); + CUDA_CALL(cudaGetDevice(&device)); cudaDeviceProp device_prop; - CHECK_ERROR(cudaGetDeviceProperties(&device_prop, device)); + CUDA_CALL(cudaGetDeviceProperties(&device_prop, device)); int blocks_per_sm = device_prop.maxThreadsPerMultiProcessor / block_dim.x; dim3 dim(device_prop.multiProcessorCount * blocks_per_sm); @@ -109,10 +109,10 @@ void DOGM::initialize() CUDA_CALL(cudaStreamDestroy(grid_stream)); } -void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_y, float new_yaw, float dt, bool device) +void DOGM::updateGrid(MeasurementCellsSoA measurement_grid, float new_x, float new_y, float dt, bool device) { updateMeasurementGrid(measurement_grid, device); - updatePose(new_x, new_y, new_yaw); + updatePose(new_x, new_y); particlePrediction(dt); particleAssignment(); @@ -124,8 +124,6 @@ void DOGM::updateGrid(MeasurementCell* measurement_grid, float new_x, float new_ particle_array = particle_array_next; - CHECK_ERROR(cudaDeviceSynchronize()); - iteration++; } @@ -153,21 +151,19 @@ ParticlesSoA DOGM::getParticles() const return particles; } -void DOGM::updatePose(float new_x, float new_y, float new_yaw) +void DOGM::updatePose(float new_x, float new_y) { if (!first_pose_received) { position_x = new_x; position_y = new_y; - yaw = new_yaw; first_pose_received = true; } else { - const float x_diff = new_x - position_x; - const float y_diff = new_y - position_y; - - if (fabsf(x_diff) > params.resolution || fabsf(y_diff) > params.resolution) + const int x_move = std::nearbyint((new_x - position_x) / params.resolution); + const int y_move = std::nearbyint((new_y - position_y) / params.resolution); + if (x_move != 0 || y_move != 0) { moveParticlesKernel<<>>(particle_array, x_move, y_move, particle_count, params.resolution, grid_size); @@ -185,7 +181,6 @@ void DOGM::updatePose(float new_x, float new_y, float new_yaw) position_x = new_x; position_y = new_y; - yaw = new_yaw; } } } @@ -206,27 +201,22 @@ void DOGM::initializeParticles() { copyMassesKernel<<>>(meas_cell_array, born_masses_array, grid_cell_count); - CHECK_ERROR(cudaGetLastError()); - CHECK_ERROR(cudaDeviceSynchronize()); - thrust::device_vector particle_orders_accum(grid_cell_count); accumulate(born_masses_array, particle_orders_accum); float* particle_orders_array_accum = thrust::raw_pointer_cast(particle_orders_accum.data()); // TODO: particle_orders_accum.back() / particle_count is correct but leads to inferior results - float new_weight = 1.0f / particle_count; + // TODO: or back to 1 / particle_count + float new_weight = particle_orders_accum.back() / particle_count; normalize_particle_orders(particle_orders_array_accum, grid_cell_count, particle_count); - initParticlesKernel1<<>>(grid_cell_array, meas_cell_array, particle_array, + initParticlesKernel1<<>>(particle_array, particle_orders_array_accum, grid_cell_count); - CHECK_ERROR(cudaGetLastError()); - initParticlesKernel2<<>>( - particle_array, grid_cell_array, rng_states, params.init_max_velocity, grid_size, new_weight, particle_count); - - CHECK_ERROR(cudaGetLastError()); + particle_array, rng_states, params.init_max_velocity, grid_size, new_weight, particle_count, + params.resolution); } void DOGM::particlePrediction(float dt) @@ -239,25 +229,15 @@ void DOGM::particlePrediction(float dt) 0, dt, 0, 1); // clang-format on - // FIXME: glm uses column major, we need row major - transition_matrix = glm::transpose(transition_matrix); - predictKernel<<>>( particle_array, rng_states, params.stddev_velocity, grid_size, params.persistence_prob, transition_matrix, - params.stddev_process_noise_position, params.stddev_process_noise_velocity, particle_count); - - CHECK_ERROR(cudaGetLastError()); + params.stddev_process_noise_position, params.stddev_process_noise_velocity, particle_count, params.resolution); } void DOGM::particleAssignment() { - // std::cout << "DOGM::particleAssignment" << std::endl; - reinitGridParticleIndices<<>>(grid_cell_array, grid_cell_count); - CHECK_ERROR(cudaGetLastError()); - // CHECK_ERROR(cudaDeviceSynchronize()); - // sort particles thrust::device_ptr grid_index_ptr(particle_array.grid_cell_idx); thrust::device_ptr weight_ptr(particle_array.weight); @@ -268,16 +248,10 @@ void DOGM::particleAssignment() thrust::sort_by_key(grid_index_ptr, grid_index_ptr + particle_count, it); particleToGridKernel<<>>(particle_array, grid_cell_array, weight_array, particle_count); - - CHECK_ERROR(cudaGetLastError()); } void DOGM::gridCellOccupancyUpdate() { - // std::cout << "DOGM::gridCellOccupancyUpdate" << std::endl; - - // CHECK_ERROR(cudaDeviceSynchronize()); - thrust::device_vector weights_accum(particle_count); accumulate(weight_array, weights_accum); float* weight_array_accum = thrust::raw_pointer_cast(weights_accum.data()); @@ -285,20 +259,13 @@ void DOGM::gridCellOccupancyUpdate() gridCellPredictionUpdateKernel<<>>(grid_cell_array, particle_array, weight_array, weight_array_accum, meas_cell_array, born_masses_array, params.birth_prob, grid_cell_count); - - CHECK_ERROR(cudaGetLastError()); } void DOGM::updatePersistentParticles() { - // std::cout << "DOGM::updatePersistentParticles" << std::endl; - updatePersistentParticlesKernel1<<>>(particle_array, meas_cell_array, weight_array, particle_count); - CHECK_ERROR(cudaGetLastError()); - // CHECK_ERROR(cudaDeviceSynchronize()); - thrust::device_vector weights_accum(particle_count); accumulate(weight_array, weights_accum); float* weight_array_accum = thrust::raw_pointer_cast(weights_accum.data()); @@ -306,24 +273,15 @@ void DOGM::updatePersistentParticles() updatePersistentParticlesKernel2<<>>( grid_cell_array, weight_array_accum, grid_cell_count); - CHECK_ERROR(cudaGetLastError()); - updatePersistentParticlesKernel3<<>>(particle_array, meas_cell_array, grid_cell_array, weight_array, particle_count); - - CHECK_ERROR(cudaGetLastError()); } void DOGM::initializeNewParticles() { - // std::cout << "DOGM::initializeNewParticles" << std::endl; - initBirthParticlesKernel<<>>( birth_particle_array, rng_states, params.stddev_velocity, grid_size, new_born_particle_count); - CHECK_ERROR(cudaGetLastError()); - // CHECK_ERROR(cudaDeviceSynchronize()); - thrust::device_vector particle_orders_accum(grid_cell_count); accumulate(born_masses_array, particle_orders_accum); float* particle_orders_array_accum = thrust::raw_pointer_cast(particle_orders_accum.data()); @@ -334,26 +292,18 @@ void DOGM::initializeNewParticles() born_masses_array, birth_particle_array, particle_orders_array_accum, grid_cell_count); - CHECK_ERROR(cudaGetLastError()); - initNewParticlesKernel2<<>>(birth_particle_array, grid_cell_array, rng_states, params.stddev_velocity, params.init_max_velocity, - grid_size, new_born_particle_count); + grid_size, new_born_particle_count, params.resolution); - CHECK_ERROR(cudaGetLastError()); } void DOGM::statisticalMoments() { - // std::cout << "DOGM::statisticalMoments" << std::endl; - statisticalMomentsKernel1<<>>(particle_array, weight_array, vel_x_array, vel_y_array, vel_x_squared_array, vel_y_squared_array, vel_xy_array, particle_count); - CHECK_ERROR(cudaGetLastError()); - // CHECK_ERROR(cudaDeviceSynchronize()); - thrust::device_vector vel_x_accum(particle_count); accumulate(vel_x_array, vel_x_accum); float* vel_x_array_accum = thrust::raw_pointer_cast(vel_x_accum.data()); @@ -377,16 +327,10 @@ void DOGM::statisticalMoments() statisticalMomentsKernel2<<>>(grid_cell_array, vel_x_array_accum, vel_y_array_accum, vel_x_squared_array_accum, vel_y_squared_array_accum, vel_xy_array_accum, grid_cell_count); - - CHECK_ERROR(cudaGetLastError()); } void DOGM::resampling() { - // std::cout << "DOGM::resampling" << std::endl; - - // CHECK_ERROR(cudaDeviceSynchronize()); - thrust::device_ptr persistent_weights(weight_array); thrust::device_ptr new_born_weights(birth_particle_array.weight); @@ -402,9 +346,6 @@ void DOGM::resampling() resamplingGenerateRandomNumbersKernel<<>>(rand_array, rng_states, joint_max, particle_count); - CHECK_ERROR(cudaGetLastError()); - // CHECK_ERROR(cudaDeviceSynchronize()); - thrust::device_ptr rand_ptr(rand_array); thrust::device_vector rand_vector(rand_ptr, rand_ptr + particle_count); @@ -416,8 +357,6 @@ void DOGM::resampling() float new_weight = joint_max / particle_count; - // printf("joint_max: %f\n", joint_max); - resamplingKernel<<>>(particle_array, particle_array_next, birth_particle_array, idx_array_resampled, new_weight, particle_count); diff --git a/dogm/src/kernel/ego_motion_compensation.cu b/dogm/src/kernel/ego_motion_compensation.cu index e447203..e26b557 100644 --- a/dogm/src/kernel/ego_motion_compensation.cu +++ b/dogm/src/kernel/ego_motion_compensation.cu @@ -13,12 +13,16 @@ namespace dogm { -__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count) +__global__ void moveParticlesKernel(ParticlesSoA particle_array, int x_move, int y_move, int particle_count, + float resolution, int grid_size) { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x) { - particle_array.state[i][0] -= x_move; - particle_array.state[i][1] -= y_move; + particle_array.state[i][0] -= (x_move * resolution); + particle_array.state[i][1] -= (y_move * resolution); + + particle_array.grid_cell_idx[i] = static_cast(particle_array.state[i][1] / resolution) * grid_size + + static_cast(particle_array.state[i][0] / resolution); } } @@ -28,17 +32,44 @@ __global__ void moveMapKernel(GridCellsSoA grid_cell_array, GridCellsSoA old_gri { const int x = blockIdx.x * blockDim.x + threadIdx.x; const int y = blockIdx.y * blockDim.y + threadIdx.y; + float eps = 0.001f; if (x < grid_size && y < grid_size) { int index = x + grid_size * y; - int new_y = y + y_move; - int new_x = x + x_move; - int new_index = new_x + grid_size * new_y; + int old_y = y + y_move; + int old_x = x + x_move; + int old_index = old_x + grid_size * old_y; - if (new_x > 0 && new_x < grid_size && new_y > 0 && new_y < grid_size) + if (old_x >= 0 && old_x < grid_size && old_y >= 0 && old_y < grid_size && meas_cell_array.occ_mass[index] > eps) + { + grid_cell_array.copy(old_grid_cell_array, index, old_index); + } + else { - grid_cell_array[index] = old_grid_cell_array[new_index]; + // delete particles on old cells? looks like it break something + // for (int i = old_grid_cell_array.start_idx[old_index]; i < old_grid_cell_array.end_idx[old_index]; ++i) + // particle_array.weight[i] = 0; + grid_cell_array.start_idx[index] = -1; + grid_cell_array.end_idx[index] = -1; + grid_cell_array.new_born_occ_mass[index] = 0.0f; + grid_cell_array.pers_occ_mass[index] = 0.0f; + grid_cell_array.free_mass[index] = 0.0f; + grid_cell_array.occ_mass[index] = 0.0f; + grid_cell_array.pred_occ_mass[index] = 0.0f; + + grid_cell_array.mu_A[index] = 0.0f; + grid_cell_array.mu_UA[index] = 0.0f; + + grid_cell_array.w_A[index] = 0.0f; + grid_cell_array.w_UA[index] = 0.0f; + + grid_cell_array.mean_x_vel[index] = 0.0f; + grid_cell_array.mean_y_vel[index] = 0.0f; + grid_cell_array.var_x_vel[index] = 0.0f; + grid_cell_array.var_y_vel[index] = 0.0f; + grid_cell_array.covar_xy_vel[index] = 0.0f; + } } } diff --git a/dogm/src/kernel/init.cu b/dogm/src/kernel/init.cu index 6230e17..1435b10 100644 --- a/dogm/src/kernel/init.cu +++ b/dogm/src/kernel/init.cu @@ -13,8 +13,9 @@ namespace dogm { -__global__ void setupRandomStatesKernel(curandState* __restrict__ states, unsigned long long seed, int count) +__global__ void setupRandomStatesKernel(curandState* __restrict__ states, int count) { + long long int seed = clock64(); for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < count; i += blockDim.x * gridDim.x) { curand_init(seed, i, 0, &states[i]); @@ -22,7 +23,7 @@ __global__ void setupRandomStatesKernel(curandState* __restrict__ states, unsign } __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity, - int grid_size, int particle_count) + int grid_size, int particle_count, float resolution) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -31,8 +32,8 @@ __global__ void initParticlesKernel(ParticlesSoA particle_array, curandState* __ for (int i = thread_id; i < particle_count; i += stride) { - float x = curand_uniform(&local_state, 0.0f, grid_size - 1); - float y = curand_uniform(&local_state, 0.0f, grid_size - 1); + float x = curand_uniform(&local_state, 0.0f, (grid_size - 1) * resolution); + float y = curand_uniform(&local_state, 0.0f, (grid_size - 1) * resolution); float vel_x = curand_uniform(&local_state, -velocity, velocity); float vel_y = curand_uniform(&local_state, -velocity, velocity); @@ -52,21 +53,12 @@ __global__ void initBirthParticlesKernel(ParticlesSoA birth_particle_array, cura int thread_id = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; - // curandState local_state = global_state[thread_id]; - for (int i = thread_id; i < particle_count; i += stride) { - // float x = curand_uniform(&local_state, 0.0f, grid_size - 1); - // float y = curand_uniform(&local_state, 0.0f, grid_size - 1); - // float vel_x = curand_normal(&local_state, 0.0f, velocity); - // float vel_y = curand_normal(&local_state, 0.0f, velocity); - birth_particle_array.weight[i] = 0.0f; birth_particle_array.associated[i] = false; birth_particle_array.state[i] = glm::vec4(0.0f, 0.0f, 0.0f, 0.0f); } - - // global_state[thread_id] = local_state; } __global__ void initGridCellsKernel(GridCellsSoA grid_cell_array, diff --git a/dogm/src/kernel/init_new_particles.cu b/dogm/src/kernel/init_new_particles.cu index 82e5929..d6511af 100644 --- a/dogm/src/kernel/init_new_particles.cu +++ b/dogm/src/kernel/init_new_particles.cu @@ -98,9 +98,9 @@ __global__ void initParticlesKernel1(ParticlesSoA particle_array, } } -__global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell* __restrict__ grid_cell_array, - curandState* __restrict__ global_state, float velocity, int grid_size, - float new_weight, int particle_count) +__global__ void initParticlesKernel2(ParticlesSoA particle_array, curandState* __restrict__ global_state, + float velocity, int grid_size, float new_weight, int particle_count, + float resolution) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -111,8 +111,8 @@ __global__ void initParticlesKernel2(ParticlesSoA particle_array, const GridCell { int cell_idx = particle_array.grid_cell_idx[i]; - float x = cell_idx % grid_size + 0.5f; - float y = cell_idx / grid_size + 0.5f; + float x = (cell_idx % grid_size + 0.5f) * resolution; + float y = (cell_idx / grid_size + 0.5f) * resolution; float vel_x = curand_uniform(&local_state, -velocity, velocity); float vel_y = curand_uniform(&local_state, -velocity, velocity); @@ -160,7 +160,8 @@ __global__ void initNewParticlesKernel1(GridCellsSoA grid_cell_array, __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const GridCellsSoA grid_cell_array, curandState* __restrict__ global_state, float stddev_velocity, - float max_velocity, int grid_size, int particle_count) + float max_velocity, int grid_size, int particle_count, + float resolution) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -170,11 +171,10 @@ __global__ void initNewParticlesKernel2(ParticlesSoA birth_particle_array, const for (int i = thread_id; i < particle_count; i += stride) { int cell_idx = birth_particle_array.grid_cell_idx[i]; - const GridCell& grid_cell = grid_cell_array[cell_idx]; bool associated = birth_particle_array.associated[i]; - float x = cell_idx % grid_size + 0.5f; - float y = cell_idx / static_cast(grid_size) + 0.5f; + float x = (cell_idx % grid_size + 0.5f) * resolution; + float y = (cell_idx / grid_size + 0.5f) * resolution; if (associated) { diff --git a/dogm/src/kernel/predict.cu b/dogm/src/kernel/predict.cu index 1dcdd38..1ca06fa 100644 --- a/dogm/src/kernel/predict.cu +++ b/dogm/src/kernel/predict.cu @@ -14,8 +14,8 @@ namespace dogm { __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restrict__ global_state, float velocity, - int grid_size, float p_S, const glm::mat4x4 transition_matrix, - float process_noise_position, float process_noise_velocity, int particle_count) + int grid_size, float p_S, const glm::mat4x4 transition_matrix, float process_noise_position, + float process_noise_velocity, int particle_count, float resolution) { int thread_id = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -34,8 +34,8 @@ __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restri particle_array.weight[i] = p_S * particle_array.weight[i]; glm::vec4 state = particle_array.state[i]; - float x = state[0]; - float y = state[1]; + float x = state[0] / resolution; + float y = state[1] / resolution; // Particle out of grid so decrease its chance of being resampled if ((x > grid_size - 1 || x < 0) || (y > grid_size - 1 || y < 0)) @@ -46,8 +46,6 @@ __global__ void predictKernel(ParticlesSoA particle_array, curandState* __restri int pos_x = clamp(static_cast(x), 0, grid_size - 1); int pos_y = clamp(static_cast(y), 0, grid_size - 1); particle_array.grid_cell_idx[i] = pos_x + grid_size * pos_y; - - // printf("X: %d, Y: %d, Cell index: %d\n", pos_x, pos_y, (pos_x + grid_size * pos_y)); } global_state[thread_id] = local_state; From b21f1b8da51eb2dab7e5d7a0b5783222caad532d Mon Sep 17 00:00:00 2001 From: ShepelIlya Date: Wed, 18 May 2022 12:09:20 +0300 Subject: [PATCH 4/5] custom commit - merge this with this care! added some debug functions, changed cmeke files for building with ros --- dogm/CMakeLists.txt | 56 +++++++++++++++++++------ dogm/demo/CMakeLists.txt | 7 +++- dogm/demo/simulator/CMakeLists.txt | 14 +++---- dogm/demo/utils/CMakeLists.txt | 8 ++-- dogm/dogm-config.cmake.in | 14 +++++++ dogm/include/dogm/dogm.h | 13 +++--- dogm/package.xml | 16 +++++++ dogm/src/dogm.cu | 67 +++++++++++++++++++++++++++++- 8 files changed, 160 insertions(+), 35 deletions(-) create mode 100644 dogm/dogm-config.cmake.in create mode 100644 dogm/package.xml diff --git a/dogm/CMakeLists.txt b/dogm/CMakeLists.txt index 2bc88cd..1f63cca 100644 --- a/dogm/CMakeLists.txt +++ b/dogm/CMakeLists.txt @@ -1,9 +1,10 @@ cmake_minimum_required(VERSION 3.9) -project(DOGM LANGUAGES CXX CUDA) +project(dogm LANGUAGES CXX CUDA) set(CMAKE_CXX_STANDARD 14) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -option(BUILD_TESTS "Build unit tests" ON) +option(BUILD_TESTS "Build unit tests" OFF) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake/modules/") find_package(OpenGL REQUIRED) @@ -27,13 +28,11 @@ if (NOT GLEW_FOUND) set(GLEW_LIBRARIES "" CACHE FILEPATH "" ) endif() -LIST (APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) -INCLUDE(GoogleTestDownloadAndBuild) +find_package(OpenCV REQUIRED) +find_package(Eigen3 REQUIRED) -#if(WIN32) -# install(FILES "${OpenCV_DIR}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/bin/opencv_world${OpenCV_VERSION_MAJOR}${OpenCV_VERSION_MINOR}${OpenCV_VERSION_PATCH}d.dll" DESTINATION ${CMAKE_INSTALL_BINARY}/debug CONFIGURATIONS Debug) -# install(FILES "${OpenCV_DIR}/${OpenCV_ARCH}/${OpenCV_RUNTIME}/bin/opencv_world${OpenCV_VERSION_MAJOR}${OpenCV_VERSION_MINOR}${OpenCV_VERSION_PATCH}.dll" DESTINATION ${CMAKE_INSTALL_BINARY} CONFIGURATIONS RelWithDebInfo Release) -#endif(WIN32) +LIST (APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) +#INCLUDE(GoogleTestDownloadAndBuild) include_directories( SYSTEM @@ -41,6 +40,8 @@ include_directories( ${GLFW3_INCLUDE_DIR} ${GLEW_INCLUDE_DIRS} ${GLM_INCLUDE_DIR} + ${OpenCV_INCLUDE_DIRS} + ${EIGEN3_INCLUDE_DIR} ) set(HEADER_FILES @@ -76,7 +77,7 @@ set(SRC_FILES source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SRC_FILES}) -add_library(dogm STATIC +add_library(dogm SHARED ${HEADER_FILES} ${SRC_FILES} ) @@ -91,12 +92,13 @@ string(APPEND CMAKE_CUDA_FLAGS " --expt-extended-lambda") #set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored") set(CUDA_PROPAGATE_HOST_FLAGS ON) -set_target_properties(dogm PROPERTIES PUBLIC_HEADER "include/dogm.h;include/dogm_types.h") +set_target_properties(dogm PROPERTIES PUBLIC_HEADER "include/dogm/dogm.h;include/dogm/dogm_types.h") target_link_libraries(dogm - PRIVATE ${OPENGL_LIBRARY} - PRIVATE ${GLFW3_LIBRARY} - PRIVATE ${GLEW_LIBRARIES} + ${OPENGL_LIBRARY} + ${GLFW3_LIBRARY} + ${GLEW_LIBRARIES} + ${OpenCV_LIBS} ) #INSTALL(TARGETS dogm @@ -111,3 +113,31 @@ if(BUILD_TESTS) endif() add_subdirectory(demo) + +# Install package.xml for catkin +install(FILES package.xml DESTINATION share/${PROJECT_NAME}/) + +set(CONF_INCLUDE_DIRS "${CMAKE_INSTALL_PREFIX}/include") +set(DOGM_CMAKE_DIR share/${PROJECT_NAME}/cmake) +include(CMakePackageConfigHelpers) +configure_package_config_file( + dogm-config.cmake.in + ${PROJECT_BINARY_DIR}/cmake/${PROJECT_NAME}/dogm-config.cmake + PATH_VARS DOGM_CMAKE_DIR + INSTALL_DESTINATION ${CMAKE_INSTALL_PREFIX}/share/${PROJECT_NAME} +) + +install( + FILES ${PROJECT_BINARY_DIR}/cmake/${PROJECT_NAME}/dogm-config.cmake + DESTINATION share/${PROJECT_NAME}/ +) + +install(TARGETS dogm + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib +) + +install( + DIRECTORY include/ + DESTINATION include/ +) diff --git a/dogm/demo/CMakeLists.txt b/dogm/demo/CMakeLists.txt index 12aea53..a00b670 100644 --- a/dogm/demo/CMakeLists.txt +++ b/dogm/demo/CMakeLists.txt @@ -4,6 +4,8 @@ if (NOT OpenCV_FOUND) set(OpenCV_LIBS "" CACHE FILEPATH "" ) endif() +find_package(CUDA REQUIRED) + add_definitions(-D_USE_MATH_DEFINES) # Required to make M_PI from cmath available in MSVC add_executable(demo @@ -14,9 +16,10 @@ target_include_directories(demo PUBLIC ${OpenCV_INCLUDE_DIRS} ${GLM_INCLUDE_DIR} target_link_libraries(demo dogm - utils - simulator + dogm_utils + dogm_simulator ${OpenCV_LIBS} + ${CUDA_CUDART_LIBRARY} ) add_subdirectory(utils) diff --git a/dogm/demo/simulator/CMakeLists.txt b/dogm/demo/simulator/CMakeLists.txt index 256317d..6659081 100644 --- a/dogm/demo/simulator/CMakeLists.txt +++ b/dogm/demo/simulator/CMakeLists.txt @@ -1,10 +1,5 @@ find_package(OpenGL REQUIRED) -find_package(GLM) -if (NOT GLM_FOUND) - set(GLM_INCLUDE_DIR "" CACHE PATH "") -endif() - find_package(GLFW3) if (NOT GLFW3_FOUND) set(GLFW3_INCLUDE_DIR "" CACHE PATH "") @@ -45,13 +40,18 @@ SET(SRC_FILES source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} FILES ${SRC_FILES}) -ADD_LIBRARY(simulator STATIC ${SRC_FILES} ${HEADER_FILES}) +ADD_LIBRARY(dogm_simulator SHARED ${SRC_FILES} ${HEADER_FILES}) -TARGET_INCLUDE_DIRECTORIES(simulator +TARGET_INCLUDE_DIRECTORIES(dogm_simulator PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include) +TARGET_LINK_LIBRARIES(dogm_simulator + ${GLFW3_LIBRARY} + ${GLEW_LIBRARIES} +) + if(BUILD_TESTS) add_subdirectory(test) endif() diff --git a/dogm/demo/utils/CMakeLists.txt b/dogm/demo/utils/CMakeLists.txt index cd8359c..a5efed3 100644 --- a/dogm/demo/utils/CMakeLists.txt +++ b/dogm/demo/utils/CMakeLists.txt @@ -25,18 +25,18 @@ SET(SRC_FILES timer.cpp ) -ADD_LIBRARY(utils STATIC ${SRC_FILES} ${HEADER_FILES}) +ADD_LIBRARY(dogm_utils SHARED ${SRC_FILES} ${HEADER_FILES}) -TARGET_COMPILE_FEATURES(utils PUBLIC cxx_std_17) +TARGET_COMPILE_FEATURES(dogm_utils PUBLIC cxx_std_17) -TARGET_INCLUDE_DIRECTORIES(utils +TARGET_INCLUDE_DIRECTORIES(dogm_utils PUBLIC simulator ${CMAKE_CURRENT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include ${OpenCV_INCLUDE_DIRS}) -TARGET_LINK_LIBRARIES(utils PRIVATE simulator ${OpenCV_LIBS}) +TARGET_LINK_LIBRARIES(dogm_utils dogm_simulator ${OpenCV_LIBS}) if(BUILD_TESTS) add_subdirectory(test) diff --git a/dogm/dogm-config.cmake.in b/dogm/dogm-config.cmake.in new file mode 100644 index 0000000..731cbc5 --- /dev/null +++ b/dogm/dogm-config.cmake.in @@ -0,0 +1,14 @@ +get_filename_component(DOGM_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +set(DOGM_INCLUDE_DIRS "@CONF_INCLUDE_DIRS@" "@CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES@") + +find_library(DOGM_LIBRARY NAMES dogm NO_DEFAULT_PATH HINTS ${DOGM_CMAKE_DIR}/../../lib/ REQUIRED) +#find_library(DOGM_SIMULATOR_LIBRARY NAMES dogm_simulator NO_DEFAULT_PATH HINTS ${DOGM_CMAKE_DIR}/../../lib/dogm/ REQUIRED) +#find_library(DOGM_UTILS_LIBRARY NAMES dogm_utils NO_DEFAULT_PATH HINTS ${DOGM_CMAKE_DIR}/../../lib/dogm/ REQUIRED) +find_package(CUDA REQUIRED) + +SET(DOGM_LIBRARIES + ${DOGM_LIBRARY} +# ${DOGM_SIMULATOR_LIBRARY} +# ${DOGM_UTILS_LIBRARY} + ${CUDA_CUDART_LIBRARY} +) diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h index 0ac54ff..7783712 100644 --- a/dogm/include/dogm/dogm.h +++ b/dogm/include/dogm/dogm.h @@ -13,6 +13,7 @@ #include #include +#include namespace dogm { @@ -72,7 +73,6 @@ class DOGM * @param measurement_grid new measurement grid map. * @param new_x new x pose. * @param new_y new y pose. - * @param new_yaw new yaw. * @param dt delta time since the last update. * @param device whether the measurement grid resides in GPU memory (default: true). */ @@ -121,13 +121,6 @@ class DOGM */ float getPositionX() const { return position_x; } - /** - * Returns the vehicles yaw. - * - * @return yaw. - */ - float getYaw() const { return yaw; } - /** * Returns the y position. * @@ -137,6 +130,10 @@ class DOGM int getIteration() const { return iteration; } + cv::Mat getPredOccMassImage(GridCellsSoA& grid_cells) const; + cv::Mat getNewBornOccMassImage(GridCellsSoA& grid_cells) const; + cv::Mat getPersOccMassImage(GridCellsSoA& grid_cells) const; + cv::Mat getOccupancyImage(GridCellsSoA& grid_cells) const; private: void initialize(); diff --git a/dogm/package.xml b/dogm/package.xml new file mode 100644 index 0000000..58004df --- /dev/null +++ b/dogm/package.xml @@ -0,0 +1,16 @@ + + dogm + 0.0.0 + The dogm package + + user + + TODO + + cmake + + + + cmake + + diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu index d1d1119..a9233ab 100644 --- a/dogm/src/dogm.cu +++ b/dogm/src/dogm.cu @@ -24,6 +24,13 @@ #include #include +#include +#include + +#include +#include +#include + namespace dogm { @@ -359,8 +366,66 @@ void DOGM::resampling() resamplingKernel<<>>(particle_array, particle_array_next, birth_particle_array, idx_array_resampled, new_weight, particle_count); +} + +cv::Mat DOGM::getPredOccMassImage(GridCellsSoA& grid_cells) const +{ + cv::Mat image(grid_size, grid_size, CV_8UC3); + for (int i = 0; i < grid_cell_count; i++) + { + cv::Vec3b color; + int x = i % grid_size; + int y = i / grid_size; + color[0] = color[1] = color[2] = uchar(grid_cells.pred_occ_mass[i] * 255); + image.at(grid_size - x - 1, grid_size - y - 1) = color; + } + return image; +} + +cv::Mat DOGM::getNewBornOccMassImage(GridCellsSoA& grid_cells) const +{ + cv::Mat image(grid_size, grid_size, CV_8UC3); + for (int i = 0; i < grid_cell_count; i++) + { + cv::Vec3b color; + int x = i % grid_size; + int y = i / grid_size; + color[0] = color[1] = color[2] = uchar(grid_cells.new_born_occ_mass[i] * 255); + image.at(grid_size - x - 1, grid_size - y - 1) = color; + } + return image; +} + +cv::Mat DOGM::getPersOccMassImage(GridCellsSoA& grid_cells) const +{ + cv::Mat image(grid_size, grid_size, CV_8UC3); + for (int i = 0; i < grid_cell_count; i++) + { + cv::Vec3b color; + int x = i % grid_size; + int y = i / grid_size; + color[0] = color[1] = color[2] = uchar(grid_cells.pers_occ_mass[i] * 255); + image.at(grid_size - x - 1, grid_size - y - 1) = color; + } + return image; +} + +cv::Mat DOGM::getOccupancyImage(GridCellsSoA& grid_cells) const +{ + cv::Mat image(grid_size, grid_size, CV_8UC3); + for (int i = 0; i < grid_cell_count; i++) + { + const auto occ_mass = grid_cells.occ_mass[i]; + const auto free_mass = grid_cells.free_mass[i]; + cv::Vec3b color; + int x = i % grid_size; + int y = i / grid_size; + color[0] = color[1] = color[2] = uchar((occ_mass + (1 - occ_mass - free_mass) / 2) * 255); + image.at(grid_size - x - 1, grid_size - y - 1) = color; + } + return image; +} - CHECK_ERROR(cudaGetLastError()); } } /* namespace dogm */ From 0e91442c9ae5a3e29636a8f1540639288ca333f0 Mon Sep 17 00:00:00 2001 From: ShepelIlya Date: Wed, 18 May 2022 17:22:29 +0300 Subject: [PATCH 5/5] debug commit for parallel systematic resampling --- dogm/CMakeLists.txt | 2 + dogm/demo/CMakeLists.txt | 5 +- dogm/include/dogm/dogm.h | 5 +- dogm/include/dogm/kernel/resampling.h | 2 - .../include/dogm/kernel/resampling_parallel.h | 44 ++++ dogm/src/dogm.cu | 72 +++++- dogm/src/kernel/resampling_parallel.cu | 233 ++++++++++++++++++ 7 files changed, 353 insertions(+), 10 deletions(-) create mode 100644 dogm/include/dogm/kernel/resampling_parallel.h create mode 100644 dogm/src/kernel/resampling_parallel.cu diff --git a/dogm/CMakeLists.txt b/dogm/CMakeLists.txt index 1f63cca..307f204 100644 --- a/dogm/CMakeLists.txt +++ b/dogm/CMakeLists.txt @@ -51,6 +51,7 @@ set(HEADER_FILES include/dogm/kernel/particle_to_grid.h include/dogm/kernel/predict.h include/dogm/kernel/resampling.h + include/dogm/kernel/resampling_parallel.h include/dogm/kernel/statistical_moments.h include/dogm/kernel/update_persistent_particles.h include/dogm/kernel/ego_motion_compensation.h @@ -69,6 +70,7 @@ set(SRC_FILES src/kernel/particle_to_grid.cu src/kernel/predict.cu src/kernel/resampling.cu + src/kernel/resampling_parallel.cu src/kernel/statistical_moments.cu src/kernel/update_persistent_particles.cu src/kernel/ego_motion_compensation.cu diff --git a/dogm/demo/CMakeLists.txt b/dogm/demo/CMakeLists.txt index a00b670..e2c8cfa 100644 --- a/dogm/demo/CMakeLists.txt +++ b/dogm/demo/CMakeLists.txt @@ -12,7 +12,10 @@ add_executable(demo main.cpp ) -target_include_directories(demo PUBLIC ${OpenCV_INCLUDE_DIRS} ${GLM_INCLUDE_DIR}) +target_include_directories(demo PUBLIC + ${OpenCV_INCLUDE_DIRS} + ${GLM_INCLUDE_DIR} +) target_link_libraries(demo dogm diff --git a/dogm/include/dogm/dogm.h b/dogm/include/dogm/dogm.h index 7783712..e35a3c0 100644 --- a/dogm/include/dogm/dogm.h +++ b/dogm/include/dogm/dogm.h @@ -11,7 +11,6 @@ #include #include #include - #include #include @@ -150,6 +149,7 @@ class DOGM void initializeNewParticles(); void statisticalMoments(); void resampling(); + void resampling_parallel_ns(); public: Params params; @@ -172,6 +172,8 @@ class DOGM float* vel_xy_array; float* rand_array; + int* idx_array_up; + int* idx_array_down; curandState* rng_states; @@ -181,6 +183,7 @@ class DOGM int particle_count; int new_born_particle_count; + cudaDeviceProp device_prop; dim3 block_dim; dim3 particles_grid; dim3 birth_particles_grid; diff --git a/dogm/include/dogm/kernel/resampling.h b/dogm/include/dogm/kernel/resampling.h index 665f0e5..b69019c 100644 --- a/dogm/include/dogm/kernel/resampling.h +++ b/dogm/include/dogm/kernel/resampling.h @@ -13,8 +13,6 @@ namespace dogm { -struct Particle; - __global__ void resamplingGenerateRandomNumbersKernel(float* __restrict__ rand_array, curandState* __restrict__ global_state, float max, int particle_count); diff --git a/dogm/include/dogm/kernel/resampling_parallel.h b/dogm/include/dogm/kernel/resampling_parallel.h new file mode 100644 index 0000000..797a029 --- /dev/null +++ b/dogm/include/dogm/kernel/resampling_parallel.h @@ -0,0 +1,44 @@ +// Copyright (c) 2020 Michael Koesel and respective contributors +// SPDX-License-Identifier: MIT +// See accompanying LICENSE file for detailed information + +#pragma once + +#include +#include +#include + +#include + +#include + +namespace cg = cooperative_groups; + +constexpr auto kTRI {256}; +constexpr auto kWarpSize {32}; +constexpr bool systematic {true}; + +namespace dogm +{ + +__global__ void resampleIndexKernel(const ParticlesSoA particle_array, ParticlesSoA particle_array_next, + const ParticlesSoA birth_particle_array, const int* __restrict__ idx_array_up, + const int* __restrict__ idx_array_down, float new_weight, int particle_count); + +// Systematic / Stratified max optimized + +__global__ void resampleSystematicIndexUp(int const num_particles, + unsigned long long int const seed, int* __restrict__ resampling_index_up, float* __restrict__ prefix_sum); + +__device__ void ResamplingUpPerWarp(cg::thread_block_tile const &tile_32, + unsigned int const &tid, int const &num_particles, float const &distro, + float *shared, float *__restrict__ prefix_sum, int *__restrict__ resampling_index_up); + +__global__ void resampleSystematicIndexDown(int const num_particles, + unsigned long long int const seed, int *__restrict__ resampling_index_down, float *__restrict__ prefix_sum); + +__device__ void ResamplingDownPerWarp(cg::thread_block_tile const &tile_32, + unsigned int const &tid, int const &num_particles, float const &distro, + float *shared, float *__restrict__ prefix_sum, int *__restrict__ resampling_index_down ); + +} /* namespace dogm */ diff --git a/dogm/src/dogm.cu b/dogm/src/dogm.cu index a9233ab..f7deefc 100644 --- a/dogm/src/dogm.cu +++ b/dogm/src/dogm.cu @@ -14,6 +14,7 @@ #include "dogm/kernel/particle_to_grid.h" #include "dogm/kernel/predict.h" #include "dogm/kernel/resampling.h" +#include "dogm/kernel/resampling_parallel.h" #include "dogm/kernel/statistical_moments.h" #include "dogm/kernel/update_persistent_particles.h" @@ -71,6 +72,8 @@ DOGM::DOGM(const Params& params) CUDA_CALL(cudaMalloc(&vel_xy_array, particle_count * sizeof(float))); CUDA_CALL(cudaMalloc(&rand_array, particle_count * sizeof(float))); + CUDA_CALL(cudaMalloc(&idx_array_up, particle_count * sizeof(int))); + CUDA_CALL(cudaMalloc(&idx_array_down, particle_count * sizeof(int))); CUDA_CALL(cudaMalloc(&rng_states, particles_grid.x * block_dim.x * sizeof(curandState))); @@ -99,6 +102,8 @@ DOGM::~DOGM() CUDA_CALL(cudaFree(rand_array)); CUDA_CALL(cudaFree(rng_states)); + CUDA_CALL(cudaFree(idx_array_up)); + CUDA_CALL(cudaFree(idx_array_down)); } void DOGM::initialize() @@ -127,7 +132,9 @@ void DOGM::updateGrid(MeasurementCellsSoA measurement_grid, float new_x, float n updatePersistentParticles(); initializeNewParticles(); statisticalMoments(); - resampling(); + + // resampling(); + resampling_parallel_ns(); particle_array = particle_array_next; @@ -336,6 +343,7 @@ void DOGM::statisticalMoments() vel_xy_array_accum, grid_cell_count); } +// Multinomial origin resampling void DOGM::resampling() { thrust::device_ptr persistent_weights(weight_array); @@ -355,19 +363,73 @@ void DOGM::resampling() thrust::device_ptr rand_ptr(rand_array); thrust::device_vector rand_vector(rand_ptr, rand_ptr + particle_count); - thrust::sort(rand_vector.begin(), rand_vector.end()); - thrust::device_vector idx_resampled(particle_count); calc_resampled_indices(joint_weight_accum, rand_vector, idx_resampled, joint_max); int* idx_array_resampled = thrust::raw_pointer_cast(idx_resampled.data()); - float new_weight = joint_max / particle_count; resamplingKernel<<>>(particle_array, particle_array_next, birth_particle_array, idx_array_resampled, new_weight, particle_count); } +void DOGM::resampling_parallel_ns() +{ + thrust::device_ptr persistent_weights(weight_array); + thrust::device_ptr new_born_weights(birth_particle_array.weight); + + thrust::device_vector joint_weight_array; + joint_weight_array.insert(joint_weight_array.end(), persistent_weights, persistent_weights + particle_count); + joint_weight_array.insert(joint_weight_array.end(), new_born_weights, new_born_weights + new_born_particle_count); + + thrust::device_vector joint_weight_accum(joint_weight_array.size()); + accumulate(joint_weight_array, joint_weight_accum); + + float joint_max = joint_weight_accum.back(); + thrust::transform(joint_weight_accum.begin(), joint_weight_accum.end(), + joint_weight_accum.begin(), thrust::placeholders::_1 /= joint_max); + float new_weight = joint_max / particle_count; + + + unsigned long long int seed {static_cast(clock())}; + // thrust::device_vector up_vec(particle_count, 0); + // thrust::device_vector down_vec(particle_count, 0); + // int* idx_array_up = thrust::raw_pointer_cast(up_vec.data()); + // int* idx_array_down = thrust::raw_pointer_cast(down_vec.data()); + float* accumulated_sum = thrust::raw_pointer_cast(joint_weight_accum.data()); + + // void *args_up[] {const_cast( &particle_count ), &seed, + // &idx_array_up, &accumulated_sum}; + + CUDA_CALL(cudaGetLastError()); + + // CUDA_RT_CALL(cudaLaunchKernel(reinterpret_cast(&resampleSystematicIndexUp), + // particles_grid, block_dim, args_up, 0, cuda_streams[0])); + + // cudaDeviceSynchronize(); + + resampleSystematicIndexUp<<>>(particle_count, + seed, idx_array_up, accumulated_sum); + + // cudaDeviceSynchronize(); + // std::cout << up_vec[0] << "\n"; + + // void *args_down[] {const_cast( &particle_count ), &seed, + // &idx_array_down, &accumulated_sum}; + + // CUDA_RT_CALL(cudaLaunchKernel(reinterpret_cast(&resampleSystematicIndexDown), + // 40, 256, args_down, 0, cuda_streams[1])); + + resampleSystematicIndexDown<<>>(particle_count, + seed, idx_array_down, accumulated_sum); + + // CHECK_ERROR(cudaDeviceSynchronize()); + // cudaDeviceSynchronize(); + + resampleIndexKernel<<>>(particle_array, particle_array_next, + birth_particle_array, idx_array_up, idx_array_down, new_weight, particle_count); +} + cv::Mat DOGM::getPredOccMassImage(GridCellsSoA& grid_cells) const { cv::Mat image(grid_size, grid_size, CV_8UC3); @@ -426,6 +488,4 @@ cv::Mat DOGM::getOccupancyImage(GridCellsSoA& grid_cells) const return image; } -} - } /* namespace dogm */ diff --git a/dogm/src/kernel/resampling_parallel.cu b/dogm/src/kernel/resampling_parallel.cu new file mode 100644 index 0000000..853fcbd --- /dev/null +++ b/dogm/src/kernel/resampling_parallel.cu @@ -0,0 +1,233 @@ +#include "dogm/common.h" +#include "dogm/cuda_utils.h" +#include "dogm/dogm_types.h" +#include "dogm/kernel/resampling_parallel.h" + +#include +#include +#include + +namespace dogm +{ + +__global__ void resampleIndexKernel(const ParticlesSoA particle_array, ParticlesSoA particle_array_next, + const ParticlesSoA birth_particle_array, const int* __restrict__ idx_array_up, + const int* __restrict__ idx_array_down, float new_weight, int particle_count) +{ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < particle_count; i += blockDim.x * gridDim.x) + { + int idx = i + idx_array_up[i] + idx_array_down[i]; + + // if (idx_array_up[i] != 0 || idx_array_down[i] != 0) + // printf("\t(%d, %d,%d, %d)", i, idx_array_up[i], idx_array_down[i], idx); + + if (idx < particle_count) + { + particle_array_next.copy(particle_array, i, idx); + } + else + { + particle_array_next.copy(birth_particle_array, i, idx - particle_count); + // printf("!"); + } + + particle_array_next.weight[i] = new_weight; + } +} + +// Systematic / Stratified max optimized + +__global__ void __launch_bounds__(kTRI) resampleSystematicIndexUp(int const num_particles, + unsigned long long int const seed, int* __restrict__ resampling_index_up, float* __restrict__ prefix_sum) { + auto const tile_32 = cg::tiled_partition(cg::this_thread_block()); + + __shared__ float s_warp_0[kTRI]; // strange diff *2 + __shared__ float s_warp_1[kTRI]; // strange diff *2 + + // Setting prefix_sum[n - 1] in each block versus call a separate kernel + // beforehand. Set last value in prefix-sum to 1.0f + if ( threadIdx.x == 0 ) { + prefix_sum[num_particles - 1] = 1.0f; // + } + + for ( unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < num_particles; + tid += blockDim.x * gridDim.x ) { + + curandStateXORWOW_t local_state {}; + + float distro {}; + + if (systematic) { + curand_init( seed, 0, 0, &local_state ); + distro = curand_uniform( &local_state ); + } else { + curand_init( seed + tid, 0, 0, &local_state ); + distro = curand_uniform( &local_state ); + } + + if ( threadIdx.x < kWarpSize ) { + ResamplingUpPerWarp(tile_32, tid, num_particles, distro, s_warp_0, prefix_sum, resampling_index_up); + } else { + ResamplingUpPerWarp(tile_32, tid, num_particles, distro, s_warp_1, prefix_sum, resampling_index_up); + } + } +} + +__device__ void ResamplingUpPerWarp(cg::thread_block_tile const &tile_32, + unsigned int const &tid, int const &num_particles, float const &distro, + float* shared, float* __restrict__ prefix_sum, int* __restrict__ resampling_index_up) { + + float const tidf { static_cast( tid ) }; + auto const t { tile_32.thread_rank( ) }; + + int l {0}; + int idx {0}; + float a {}; + float b {}; + + bool mask { true }; + + if ( tid < num_particles - kWarpSize - l ) { // strange diff kWarpSize or kTRI + shared[t] = prefix_sum[tid + l]; + shared[t + kWarpSize] = prefix_sum[tid + kWarpSize + l]; // strange diff kWarpSize || kTRI + } + + // Distribution will be the same for each Monte Carlo + float const draw = ( distro + tidf ) / num_particles; + + tile_32.sync(); + + while (tile_32.any(mask)) { + if (tid < num_particles - (kTRI) - l) { // strange diff (+ kWarpSize) + + a = prefix_sum[tid + kWarpSize + l]; + b = prefix_sum[tid + kTRI + l]; // strange diff + kWarpSize + + #pragma unroll // strange diff kWarpSize or kTRI + for ( int i = 0; i < kWarpSize; i++ ) { // strange diff kWarpSize or kTRI + mask = shared[t + i] < draw; + if ( mask ) { + idx++; + } + } + l += kWarpSize; // strange diff kWarpSize or kTRI + shared[t] = a; + shared[t + kWarpSize] = b; // strange diff kWarpSize or kTRI + + tile_32.sync(); + } else { + while ( mask && tid < ( num_particles - l ) ) { + mask = prefix_sum[tid + l] < draw; + if ( mask ) { + idx++; + } + l++; + } + } + + tile_32.sync( ); + } + resampling_index_up[tid] = idx; +} + +__global__ void __launch_bounds__(kTRI) resampleSystematicIndexDown(int const num_particles, + unsigned long long int const seed, int *__restrict__ resampling_index_down, float *__restrict__ prefix_sum) { + + auto const tile_32 = cg::tiled_partition( cg::this_thread_block( ) ); + + __shared__ float s_warp_0[kTRI]; // strange diff *2 + __shared__ float s_warp_1[kTRI]; // strange diff *2 + + // Setting prefix_sum_particle_weights[n - 1] in each block versus call a + // separate kernel beforehand + if ( threadIdx.x == 0 ) { + prefix_sum[num_particles - 1] = 1.0f; + } + + for ( unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x; tid < num_particles; + tid += blockDim.x * gridDim.x ) { + + curandStateXORWOW_t local_state {}; + + float distro {}; + + if ( systematic ) { + curand_init( seed, 0, 0, &local_state ); + distro = curand_uniform( &local_state ); + } else { + curand_init( seed + tid, 0, 0, &local_state ); + distro = curand_uniform( &local_state ); + } + + if ( threadIdx.x < kWarpSize ) { + ResamplingDownPerWarp( tile_32, tid, num_particles, distro, s_warp_0, prefix_sum, resampling_index_down ); + } else { + ResamplingDownPerWarp( tile_32, tid, num_particles, distro, s_warp_1, prefix_sum, resampling_index_down ); + } + } +} + +__device__ void ResamplingDownPerWarp( cg::thread_block_tile const &tile_32, + unsigned int const &tid, int const &num_particles, float const &distro, + float *shared, float *__restrict__ prefix_sum, int *__restrict__ resampling_index_down ) { + + float const tidf { static_cast( tid ) }; + auto const t { tile_32.thread_rank( ) }; + + int l {1}; + int idx {0}; + float a{}; + float b{}; + + bool mask { false }; + + // Preload in into shared memory + if ( tid >= kWarpSize + l ) { // strange diff kWarpSize or kTRI + shared[t] = prefix_sum[tid - kWarpSize - l]; // strange diff kWarpSize or kTRI + shared[t + kWarpSize] = prefix_sum[tid - l]; // strange diff kWarpSize or kTRI + } + + // Distribution will be the same for each Monte Carlo + float const draw = ( distro + tidf ) / num_particles; + + tile_32.sync( ); + + while ( !tile_32.all( mask ) ) { + + if ( tid >= kTRI + l ) { // strange diff + kWarpSize + a = prefix_sum[tid - ( kTRI )-l]; // strange diff + kWarpSize) + b = prefix_sum[tid - kWarpSize - l]; + + #pragma unroll + for ( int i = 1; i < kWarpSize + 1; i++ ) { // strange diff kWarpSize or kTRI + mask = shared[t + kWarpSize - i] < draw; // strange diff kWarpSize or kTRI + if ( !mask ) { + idx--; + } + } + l += kWarpSize; // strange diff kWarpSize or kTRI + shared[t] = a; + shared[t + kWarpSize] = b; // strange diff kWarpSize or kTRI + tile_32.sync( ); + + } else { + + while ( !mask ) { + if ( tid > l ) { + mask = prefix_sum[tid - ( l + 1 )] < draw; + } else { + mask = true; + } + if ( !mask ) { + idx--; + } + l++; + } + } + + tile_32.sync( ); + } + resampling_index_down[tid] = idx; +} + +} \ No newline at end of file