diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index affcd031..c9deb9ba 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -128,6 +128,10 @@ if (KOKKOS)
   add_executable(qr_test test_qr_solve.cpp)
   target_link_libraries(qr_test ${LINKING_LIBRARIES})
 
+  include_directories(pointcloud)
+  add_subdirectory(pointcloud)
+
+
   if (Matar_ENABLE_TRILINOS)
     add_executable(anndistributed ann_distributed.cpp)
     target_link_libraries(anndistributed ${LINKING_LIBRARIES})
diff --git a/examples/laplaceMPI/CMakeLists.txt b/examples/laplaceMPI/CMakeLists.txt
index d9d4ec6c..5b114927 100644
--- a/examples/laplaceMPI/CMakeLists.txt
+++ b/examples/laplaceMPI/CMakeLists.txt
@@ -4,10 +4,10 @@ if (KOKKOS)
   #find_package(Kokkos REQUIRED) #new
   find_package(MPI REQUIRED)
   
-  #add_executable(laplace_mpi laplace_mpi.cpp)
+  add_executable(laplace_mpi laplace_mpi.cpp)
   #add_executable(laplace_mpi simple_mpi.cpp)
   #add_executable(laplace_mpi mpi_mesh_test.cpp)
-  add_executable(laplace_mpi simple_halo.cpp)
+  #add_executable(laplace_mpi simple_halo.cpp)
   add_definitions(-DHAVE_MPI=1)
 
   add_definitions(-DHAVE_KOKKOS=1)
diff --git a/examples/pointcloud/CMakeLists.txt b/examples/pointcloud/CMakeLists.txt
new file mode 100644
index 00000000..aafb20e9
--- /dev/null
+++ b/examples/pointcloud/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.18)
+
+
+if (KOKKOS)
+  #find_package(Kokkos REQUIRED) #new
+
+  add_definitions(-DHAVE_KOKKOS=1)
+  if (CUDA)
+    add_definitions(-DHAVE_CUDA=1)
+  elseif (HIP)
+    add_definitions(-DHAVE_HIP=1)
+  elseif (OPENMP)
+    add_definitions(-DHAVE_OPENMP=1)
+  elseif (THREADS)
+    add_definitions(-DHAVE_THREADS=1)
+  endif()
+
+
+  add_executable(pointcloud-gbl pointcloud-gbl.cpp)
+  add_executable(pointcloud-rk pointcloud-rk.cpp)
+
+  target_link_libraries(pointcloud-gbl ${LINKING_LIBRARIES})
+  target_link_libraries(pointcloud-rk ${LINKING_LIBRARIES})
+endif(KOKKOS)
diff --git a/examples/pointcloud/graphics-reader.h b/examples/pointcloud/graphics-reader.h
new file mode 100755
index 00000000..0b1eb3cb
--- /dev/null
+++ b/examples/pointcloud/graphics-reader.h
@@ -0,0 +1,796 @@
+/**********************************************************************************************
+ © 2020. Triad National Security, LLC. All rights reserved.
+ This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos
+ National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S.
+ Department of Energy/National Nuclear Security Administration. All rights in the program are
+ reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear
+ Security Administration. The Government is granted for itself and others acting on its behalf a
+ nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare
+ derivative works, distribute copies to the public, perform publicly and display publicly, and
+ to permit others to do so.
+ This program is open source under the BSD-3 License.
+ Redistribution and use in source and binary forms, with or without modification, are permitted
+ provided that the following conditions are met:
+ 
+ 1.  Redistributions of source code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+ 
+ 2.  Redistributions in binary form must reproduce the above copyright notice, this list of
+ conditions and the following disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ 
+ 3.  Neither the name of the copyright holder nor the names of its contributors may be used
+ to endorse or promote products derived from this software without specific prior
+ written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ **********************************************************************************************/
+ 
+// -----------------------------------------------
+// routines to read a graphics file
+//
+// -----------------------------------------------
+
+#include <iostream>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <cmath>
+#include <vector>
+
+#include "matar.h"
+
+using namespace mtr;
+
+
+
+// checks to see if a path exists
+bool DoesPathExist(const std::string &s)
+{
+    struct stat buffer;
+    return (stat (s.c_str(), &buffer) == 0);
+}
+
+// Code from stackover flow for string delimiter parsing
+std::vector<std::string> split (std::string s, std::string delimiter) {
+    size_t pos_start = 0, pos_end, delim_len = delimiter.length();
+    std::string token;
+    std::vector<std::string> res;
+
+    while ((pos_end = s.find (delimiter, pos_start)) != std::string::npos) {
+        token = s.substr (pos_start, pos_end - pos_start);
+        pos_start = pos_end + delim_len;
+        res.push_back (token);
+    }
+
+    res.push_back (s.substr (pos_start));
+    return res;
+    
+} // end of split
+
+
+// retrieves multiple values between [ ]
+std::vector<double> extract_list(std::string str) {
+    
+    // replace '[' with a space and ']' with a space
+    std::replace(str.begin(), str.end(), '[', ' ');
+    std::replace(str.begin(), str.end(), ']', ' ');
+    
+    std::vector<std::string> str_values;
+    std::vector<double> values;
+
+    // exact the str values into a vector
+    str_values = split(str, ",");
+    
+    // convert the text values into double values
+    for (auto &word : str_values) {
+        values.push_back( atof(word.c_str()) );
+    } // end for
+    
+    return values;
+    
+}  // end of extract_list
+
+
+
+// from stack overflow on removing blanks in string
+template<typename T, typename P>
+T remove_if(T beg, T end, P pred)
+{
+    T dest = beg;
+    for (T itr = beg;itr != end; ++itr)
+        if (!pred(*itr))
+            *(dest++) = *itr;
+    return dest;
+}
+
+
+
+
+
+
+
+// This function reads a VTK file
+//--------------------------------------------------------
+//
+/*
+ vtk mesh node ordering
+          7--------6
+         /|       /|
+        / |      / |
+       4--------5  |
+       |  |     |  |
+       |  |     |  |
+       |  3-----|--2
+       | /      | /
+       |/       |/
+       0--------1
+ 
+ in the i,j,k format node order is
+ 0 = (i  , j  , k  )
+ 1 = (i+1, j  , k  )
+ 2 = (i+1, j+1, k  )
+ 3 = (i  , j+1, k  )
+ 4 = (i  , j  , k+1)
+ 5 = (i+1, j  , k+1)
+ 6 = (i+1, j+1, k+1)
+ 7 = (i  , j+1, k+1)
+ 
+ The marching cubes ordering is
+ (i  , j  , k  ) = 0
+ (i+1, j  , k  ) = 1
+ (i+1, j  , k+1) = 5
+ (i  , j  , k+1) = 4
+ (i  , j+1, k  ) = 3
+ (i+1, j+1, k  ) = 2
+ (i+1, j+1, k+1) = 6
+ (i  , j+1, k+1) = 7
+
+ */
+void readVTK(char * filename,
+             CArray <double> &pt_coords,
+             CArray <int> &elem_point_list,
+             CArray <double> &pt_values,
+             int &num_points,
+             int &num_elems,
+             int &num_points_in_elem,
+             bool test_vtk_read,
+             bool verbose_vtk)
+{
+   
+    int i;           // used for writing information to file
+    int point_id;    // the global id for the point
+    int elem_id;     // the global id for the elem
+    int this_point;   // a local id for a point in a elem (0:7 for a Hexahedral elem)
+    
+    int num_dims = 3;
+    
+
+    std::string token;
+    
+    bool found = false;
+    
+    std::ifstream in;  // FILE *in;
+    in.open(filename);
+    
+
+    // look for POINTS
+    i = 0;
+    while (found==false) {
+        std::string str;
+        std::string delimiter = " ";
+        std::getline(in, str);
+        std::vector<std::string> v = split (str, delimiter);
+        
+        // looking for the following text:
+        //      POINTS %d float
+        if(v[0] == "POINTS"){
+            num_points = std::stoi(v[1]);
+            printf("Num nodes read in %d\n", num_points);
+            
+            found=true;
+        } // end if
+        
+        
+        if (i>1000){
+            printf("ERROR: Failed to find POINTS \n");
+            break;
+        } // end if
+        
+        i++;
+    } // end while
+    
+    
+    // allocate memory for point coords and values
+    pt_coords = CArray <double> (num_points,3);
+    pt_values = CArray <double> (num_points);
+    
+    
+    // read the point coordinates
+    for (point_id=0; point_id<num_points; point_id++){
+        
+        std::string str;
+        std::getline(in, str);
+        
+        std::string delimiter = " ";
+        std::vector<std::string> v = split (str, delimiter);
+        
+        for (int dim=0; dim<3; dim++){
+            pt_coords(point_id,dim) = std::stod(v[dim]); // double
+            //if num_dims=2 skip the 3rd value
+            
+            // printing all the mesh coordinates
+            if (verbose_vtk) printf(" %f ", pt_coords(point_id,dim));
+        }
+        if (verbose_vtk) printf("\n"); // printing a space for readability
+        
+    } // end for points
+    found=false;
+    
+    
+    if (verbose_vtk)printf("\n");
+    if (verbose_vtk) printf("looking for CELLS \n");
+    
+    // look for CELLS
+    i = 0;
+    while (found==false) {
+        std::string str;
+        std::getline(in, str);
+        
+        std::string delimiter = " ";
+        std::vector<std::string> v = split (str, delimiter);
+        
+        // looking for the following text:
+        //      CELLS num_elems size
+        if(v[0] == "CELLS"){
+            num_elems = std::stoi(v[1]);
+            printf("Num elements read in %d\n", num_elems);
+            
+            found=true;
+        } // end if
+        
+        
+        if (i>1000){
+            printf("ERROR: Failed to find CELLS \n");
+            break;
+        } // end if
+        
+        i++;
+    } // end while
+    
+    
+    
+    // allocate memomry for points in each element
+    elem_point_list = CArray <int> (num_elems,8);   // 8 points in a hex
+    
+    
+    // read the point ids in the element
+    for (elem_id=0; elem_id<num_elems; elem_id++) {
+        
+        std::string str;
+        std::getline(in, str);
+        
+        std::string delimiter = " ";
+        std::vector<std::string> v = split (str, delimiter);
+        num_points_in_elem = std::stoi(v[0]);
+        
+        for (this_point=0; this_point<num_points_in_elem; this_point++){
+            elem_point_list(elem_id,this_point) = std::stod(v[this_point+1]);
+            
+            // printing details on nodes in the element
+            if (verbose_vtk) printf(" %d ", elem_point_list(elem_id,this_point) );
+        }
+        if (verbose_vtk) printf("\n"); // printing a space for readability
+        
+    } // end for
+    found=false;
+
+    if (verbose_vtk) printf("\n"); // printing a space for readability
+    
+    
+    // look for CELL_TYPE
+    i = 0;
+    int elem_type = 0;
+    while (found==false) {
+        std::string str;
+        std::string delimiter = " ";
+        std::getline(in, str);
+        std::vector<std::string> v = split (str, delimiter);
+        
+        // looking for the following text:
+        //      CELLS num_elems size
+        if(v[0] == "CELL_TYPES"){
+
+            std::getline(in, str);
+            elem_type = std::stoi(str);
+            
+            found=true;
+        } // end if
+        
+        
+        if (i>1000){
+            printf("ERROR: Failed to find elem_TYPE \n");
+            break;
+        } // end if
+        
+        i++;
+    } // end while
+    
+    if (verbose_vtk) printf("elem type = %d \n", elem_type);
+    // elem types:
+    // linear hex = 12, linear quad = 9
+    found=false;
+    
+    // verify mesh has hexahedral elements, which have 8 nodes
+    if(num_points_in_elem != 8) {
+        printf("wrong elem type of %d \n", elem_type);
+    }
+    
+    
+    
+    // look for the point_var in the POINT_DATA heading
+    i = 0;
+    while (found==false) {
+        std::string str;
+        std::string delimiter = " ";
+        std::getline(in, str);
+        std::vector<std::string> v = split (str, delimiter);
+        
+        // looking for the following text:
+        //      POINT_DATA num_points
+        if(v[1] == "point_var"){
+            
+            std::getline(in, str);  // read next line -- its LOOKUP_TABLE
+            
+            //
+            for(int point_id=0; point_id<num_points; point_id++){
+                std::getline(in, str);
+                pt_values(point_id) = std::stoi(str);
+                
+                
+                // printing the node values in the mesh
+                if (verbose_vtk) printf("%f \n", pt_values(point_id) );
+            }
+            if (verbose_vtk) printf("\n"); // printing a blank space
+        
+            found=true;
+            
+        } // end if
+        
+        
+        if (i>10000000){
+            printf("ERROR: Failed to find point_var in POINT_DATA \n");
+            break;
+        } // end if
+        
+        i++;
+    } // end while
+
+
+    found=false;
+    
+    
+    
+    // testing the file read by painting a part
+    if(test_vtk_read == true){
+        for(int node_id = 0; node_id<num_points; node_id++){
+            
+            double x = pt_coords(node_id,0);
+            double y = pt_coords(node_id,1);
+            double z = pt_coords(node_id,2);
+            
+            // a simple sphere
+            pt_values(node_id) = sqrt(x*x + y*y + z*z) - 0.5;
+        }
+    } // end of test=true
+    
+    
+    printf("Finished reading mesh \n\n");
+    
+    in.close();
+    
+}
+
+
+// an array to convert the marchint cubes id order to marching cubes id order
+const int marching_cubes_2_vtk[8] =
+{
+    0,
+    1,
+    5,
+    4,
+    3,
+    2,
+    6,
+    7
+};
+
+
+
+void readTechPlot(char * filename,
+             CArray <double> &pt_coords,
+             CArray <int> &elem_point_list,
+             CArray <double> &pt_values,
+             int &num_points,
+             int &num_elems,
+             int &num_points_in_elem,
+             bool test_tecplot_read,
+             bool verbose_vtk)
+{
+   
+    int i;           // used for writing information to file
+    int point_id;    // the global id for the point
+    int elem_id;     // the global id for the elem
+    
+    int num_dims = 3;
+    
+
+    std::string token;
+    
+    bool found = false;
+    
+    std::ifstream in;  // FILE *in;
+    in.open(filename);
+    
+
+    // look for POINTS
+    i = 0;  // lines in the file
+    while (found==false) {
+        std::string str;
+        std::string delimiter = ",";
+        std::getline(in, str);
+        std::vector<std::string> v = split (str, delimiter);
+        
+        bool found_nodes = false;
+        bool found_elems = false;
+        
+        // loop over the parsed text stored in v and
+        // am now looking for the following text:
+        //      NODES %d float
+        for (int text=0; text<v.size(); text++){
+
+            
+            std::string delimiter_words = "=";
+            std::vector<std::string> words = split (v[text], delimiter_words);
+            
+            
+            
+            for(int a_word=0; a_word<words.size(); a_word++){
+                
+                // erase extra spaces from the text, the remaining text are names and numbers
+                words[a_word].erase(std::remove_if(words[a_word].begin(), words[a_word].end(),
+                    [](char c) { return std::isspace(c); } ),
+                                  words[a_word].end());
+                
+                
+                if(words[a_word] == "NODES"){
+                    num_points = std::stoi(words[a_word+1]);
+                    printf("Num nodes to read in %d\n", num_points);
+                    
+                    found_nodes = true;
+                } // end if
+                
+                if(words[a_word] == "ELEMENTS"){
+                    num_elems = std::stoi(words[a_word+1]);
+                    printf("Num elements to read in %d\n", num_elems);
+                    
+                    found_elems = true;
+                }
+                
+                if(found_nodes == true && found_elems == true){
+                    found = true;
+                }
+            } // end loop over all the words in the text within a vector
+            
+        } // end for loop over text in the line
+        
+        if (i>=2) found=true;
+        
+        if (i>1000){
+            printf("ERROR: Failed to find NODES and ELEMENTS \n");
+            break;
+        } // end if
+        
+        i++;
+    } // end while
+    
+    
+    // allocate memory for point coords and values
+    pt_coords = CArray <double> (num_points,3);
+    pt_values = CArray <double> (num_points);
+    
+
+    printf("starting the x,y,z point read and density value \n");
+    
+    // read the point coordinates
+    for (point_id=0; point_id<num_points; point_id++){
+        
+        std::string str;
+        std::getline(in, str);
+        
+        std::string delimiter = " ";
+        std::vector<std::string> v = split (str, delimiter);
+        
+        int column = 0;
+        for (int text=0; text<v.size(); text++){
+            
+            // erase extra spaces from the text, the remaining text is a number
+            v[text].erase(std::remove_if(v[text].begin(), v[text].end(),
+                [](char c) { return std::isspace(c); } ),
+                          v[text].end());
+            
+            
+            // column numbering starts at 0
+            if(v[text].size() > 0 && column==3){
+                pt_values(point_id) = std::stod(v[text]); // double
+                
+                if (verbose_vtk) printf("    %f \n", pt_values(point_id));
+                
+                break; // exit after reading the density on this line
+            } // end if column is density
+            
+            
+            // if there is text, then it is a number
+            if(v[text].size() > 0 && column<3){
+                
+                
+                pt_coords(point_id,column) = std::stod(v[text]); // double
+                //if num_dims=2 skip the 3rd value
+            
+                // printing all the mesh coordinates
+                if (verbose_vtk) printf(" %f ", pt_coords(point_id,column));
+                
+                column++; // this will make the column = 3 after all dims are saved
+            } // end if to save coordinates
+            
+            
+        } // end for over the partitioned test
+        
+    } // end for points
+    found=false;
+    
+    
+
+    
+    // allocate memomry for points in each element
+    num_points_in_elem = 8;
+    elem_point_list = CArray <int> (num_elems,8);   // 8 points in a hex
+    
+    
+    // read the point ids in the element
+    if (verbose_vtk) printf("Reading the nodes in the element\n");
+    for (elem_id=0; elem_id<num_elems; elem_id++) {
+        
+        std::string str;
+        std::getline(in, str);
+        
+        std::string delimiter = " ";
+        std::vector<std::string> v = split (str, delimiter);
+        
+        
+        int column = 0;
+        for (int text=0; text<v.size(); text++){
+            
+            // erase extra spaces from the text, the remaining text is a number
+            v[text].erase(std::remove_if(v[text].begin(), v[text].end(),
+                [](char c) { return std::isspace(c); } ),
+                          v[text].end());
+            
+            
+            
+            if(column==8){
+                if (verbose_vtk) printf("\n"); // printing a space for readability
+                
+                break;  // exit after reading the last node on this line
+            }
+            
+            // if there is text, then it is a number
+            if(v[text].size() > 0 && column<8){
+                
+                
+                elem_point_list(elem_id,column) = std::stod(v[text]) - 1; // Fortran convention
+                //if num_dims=2 skip the 3rd value
+            
+                // printing all the mesh coordinates
+                // printing details on nodes in the element
+                if (verbose_vtk) printf(" %d ", elem_point_list(elem_id,column) );
+                
+                column++; // this will make the column = 8 after all node values are saved
+            } // end if to save points in this element
+            
+            
+        } // end loop over the text
+        
+    } // end for
+    found=false;
+
+    if (verbose_vtk) printf("\n"); // printing a space for readability
+ 
+        
+
+    
+    // testing the file read by painting a part
+    if(test_tecplot_read == true){
+        for(int node_id = 0; node_id<num_points; node_id++){
+            
+            
+            pt_coords(node_id,0) *= 100;
+            pt_coords(node_id,1) *= 100;
+            pt_coords(node_id,2) *= 100;
+            
+            double x = pt_coords(node_id,0);
+            double y = pt_coords(node_id,1);
+            double z = pt_coords(node_id,2);
+            
+            // a simple sphere
+            pt_values(node_id) = sqrt(x*x + y*y + z*z) - 0.5;
+        }
+    } // end of test=true
+     
+
+    
+    printf("Finished reading mesh \n\n");
+    
+    in.close();
+    
+}
+
+
+
+
+
+// -------------------------------------------------------
+// This function write outs the data to a VTK file
+//--------------------------------------------------------
+//
+void VTK(CArray <double> &pt_coords,
+         CArray <int> &elem_point_list,
+         int num_points,
+         int num_elems,
+         int num_points_in_elem,
+         CArray <double> &pt_values)
+{
+    
+    int GraphicsNumber = 0;
+    double Time = 0.0;
+    
+   
+    int i;           // used for writing information to file
+    int point_id;    // the global id for the point
+    int elem_id;     // the global id for the elem
+    int this_point;   // a local id for a point in a elem (0:7 for a Hexahedral elem)
+    
+    
+    FILE *out[20];   // the output files that are written to
+    char name[100];  // char string
+    
+    
+    
+    std::string directory = "vtk";
+    bool path = DoesPathExist(directory);
+    
+    // Create the folders for the ensight files
+    if (path==false) {
+        i=system("mkdir vtk");
+    }
+    
+    
+    
+    
+    /*
+     ---------------------------------------------------------------------------
+     Write the Geometry file
+     ---------------------------------------------------------------------------
+     */
+    
+    
+    snprintf(name, sizeof(name), "vtk/mesh.vtk");  // mesh file
+    
+    
+    out[0]=fopen(name,"w");
+    
+    
+    fprintf(out[0],"# vtk DataFile Version 2.0\n");  // part 2
+    fprintf(out[0],"Mesh for Fierro\n");             // part 2
+    fprintf(out[0],"ASCII \n");                      // part 3
+    fprintf(out[0],"DATASET UNSTRUCTURED_GRID\n\n"); // part 4
+    
+    fprintf(out[0],"POINTS %d float\n", num_points);
+
+    
+    // write all components of the point coordinates
+    for (point_id=0; point_id<num_points; point_id++){
+        fprintf(out[0],
+                "%f %f %f\n",
+                pt_coords(point_id,0),
+                pt_coords(point_id,1),
+                pt_coords(point_id,2));
+    } // end for
+    
+    /*
+     ---------------------------------------------------------------------------
+     Write the elems
+     ---------------------------------------------------------------------------
+     */
+    fprintf(out[0],"\n");
+    fprintf(out[0],"CELLS %d %d\n", num_elems, num_elems+num_elems*8);  // size=all printed values
+    
+    // write all global point numbers for this elem
+    for (elem_id=0; elem_id<num_elems; elem_id++) {
+        
+        fprintf(out[0],"8 "); // num points in this elem
+        for (this_point=0; this_point<num_points_in_elem; this_point++){
+            fprintf(out[0],"%d ", elem_point_list(elem_id,this_point));
+        }
+        fprintf(out[0],"\n");
+        
+    } // end for
+    
+    fprintf(out[0],"\n");
+    fprintf(out[0],"CELL_TYPES %d \n", num_elems);
+    // elem types:
+    // linear hex = 12, linear quad = 9
+    // element types: https://vtk.org/doc/nightly/html/vtkCellType_8h_source.html
+    // element types: https://kitware.github.io/vtk-js/api/Common_DataModel_CellTypes.html
+    // vtk format: https://www.kitware.com//modeling-arbitrary-order-lagrange-finite-elements-in-the-visualization-toolkit/
+    for (elem_id=0; elem_id<num_elems; elem_id++) {
+        fprintf(out[0],"%d \n", 12); // linear hex is type 12
+    }
+    
+    
+    /*
+     ---------------------------------------------------------------------------
+     Write the nodal variable file
+     ---------------------------------------------------------------------------
+     */
+    fprintf(out[0],"\n");
+    fprintf(out[0],"POINT_DATA %d \n", num_points);
+    fprintf(out[0],"SCALARS point_var float 1\n"); // the 1 is number of scalar components [1:4]
+    fprintf(out[0],"LOOKUP_TABLE default\n");
+    for (point_id=0; point_id<num_points; point_id++) {
+        double var=2;
+        fprintf(out[0],"%f\n",pt_values(point_id));
+    }
+    
+    /*
+     ---------------------------------------------------------------------------
+     Write the vector variables to file
+     ---------------------------------------------------------------------------
+     */
+
+    
+    /*
+     ---------------------------------------------------------------------------
+     Write the scalar elem variable to file
+     ---------------------------------------------------------------------------
+     */
+    fprintf(out[0],"\n");
+    fprintf(out[0],"CELL_DATA %d \n", num_elems);
+    fprintf(out[0],"SCALARS elem_var float 1\n"); // the 1 is number of scalar components [1:4]
+    fprintf(out[0],"LOOKUP_TABLE default\n");
+    for (elem_id=0; elem_id<num_elems; elem_id++) {
+        double var=1;
+        fprintf(out[0],"%f\n",var);
+    }
+    
+    fprintf(out[0],"\n");
+    fprintf(out[0],"SCALARS elem_var2 float 1\n"); // the 1 is number of scalar components [1:4]
+    fprintf(out[0],"LOOKUP_TABLE default\n");
+    for (elem_id=0; elem_id<num_elems; elem_id++) {
+        double var=10;
+        fprintf(out[0],"%f\n",var);
+    }
+    
+    fclose(out[0]);
+
+}
+
diff --git a/examples/pointcloud/icosahedron_bin.stl b/examples/pointcloud/icosahedron_bin.stl
new file mode 100644
index 00000000..edb31f4b
Binary files /dev/null and b/examples/pointcloud/icosahedron_bin.stl differ
diff --git a/examples/pointcloud/pointcloud-gbl.cpp b/examples/pointcloud/pointcloud-gbl.cpp
new file mode 100755
index 00000000..442a8dc2
--- /dev/null
+++ b/examples/pointcloud/pointcloud-gbl.cpp
@@ -0,0 +1,1183 @@
+/**********************************************************************************************
+ © 2020. Triad National Security, LLC. All rights reserved.
+ This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos
+ National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S.
+ Department of Energy/National Nuclear Security Administration. All rights in the program are
+ reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear
+ Security Administration. The Government is granted for itself and others acting on its behalf a
+ nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare
+ derivative works, distribute copies to the public, perform publicly and display publicly, and
+ to permit others to do so.
+ This program is open source under the BSD-3 License.
+ Redistribution and use in source and binary forms, with or without modification, are permitted
+ provided that the following conditions are met:
+ 
+ 1.  Redistributions of source code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+ 
+ 2.  Redistributions in binary form must reproduce the above copyright notice, this list of
+ conditions and the following disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ 
+ 3.  Neither the name of the copyright holder nor the names of its contributors may be used
+ to endorse or promote products derived from this software without specific prior
+ written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ **********************************************************************************************/
+
+// -----------------------------------------------
+// pointcloud reconstrution in C++
+//  credit to Andrew Morgan and Nathaniel Morgan
+//
+// To run the code with an external mesh file:
+//    ./a.out graphics-file
+//
+// Requires the matar.h and macros.h libraries from
+// the github LANL/MATAR/src folder
+//
+// The following rountines in this code came from:
+//   https://paulbourke.net/geometry/polygonise/
+//   - Polygonise
+//   - VertexInterp
+//
+//
+// The surface reconstruction method is from
+//   Reconstruction and Representation of 3D Objects with Radial BasisFunctions
+//    J. Carr, R. Beatson, ..., T. Evans
+//    https://www.cs.jhu.edu/~misha/Fall05/Papers/carr01.pdf   
+// -----------------------------------------------
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <stdio.h>
+#include <cmath>
+
+
+#include "matar.h"
+
+#include "lu_solver.hpp"
+
+
+using namespace mtr;
+
+
+// -----------------------------------------------
+// inputs:
+
+
+// the number of nodes in the mesh
+const double dx = 0.01; // resolution
+const double dy = 0.01; // resolution
+const double dz = 0.01; // resolution
+
+
+// the mesh dimensions
+// length of the domain is 5 for crazy shape and 1 for sphere
+const double XMax = 1.0; 
+const double YMax = 1.0; 
+const double ZMax = 1.0; 
+
+const double X0 = 0.0; 
+const double Y0 = 0.0; 
+const double Z0 = 0.0; 
+
+
+const double isoLevel=0.0; // contour to extract
+
+
+//
+// -----------------------------------------------
+
+
+std::tuple<
+    CArray<float>,   // normal
+    CArray<float>, CArray<float>, CArray<float>,   // v1X, v1Y, v1Z
+    CArray<float>, CArray<float>, CArray<float>,   // v2X, v2Y, v2Z
+    CArray<float>, CArray<float>, CArray<float>,   // v3X, v3Y, v3Z
+    size_t // n_facets
+>
+binary_stl_reader(const std::string& path)
+{
+    std::ifstream in(path, std::ios::binary | std::ios::ate);
+    if (!in) { std::perror("open"); std::exit(EXIT_FAILURE); }
+
+    const std::streamoff filesize = in.tellg();
+    if (filesize < 100) {
+        std::cerr << "ERROR: File too small to be a valid STL\n";
+        std::exit(EXIT_FAILURE);
+    }
+    in.seekg(0);
+
+    // ---- check if ASCII -------------------------------------------------
+    char magic[6] = { 0 };
+    in.read(magic, 5);          // read first 5 chars
+    in.seekg(0);               // rewind
+    if (std::strncmp(magic, "solid", 5) == 0) {
+        std::cerr
+            << "ERROR: \"" << path
+            << "\" looks like an **ASCII** STL (starts with \"solid\").\n"
+            << "Re‑export it as *binary* or implement an ASCII parser.\n";
+        std::exit(EXIT_FAILURE);        // or call ascii_stl_reader();
+    }
+
+    // ---- read 80‑byte header + nominal facet count ----------------------
+    char header[80];                in.read(header, 80);
+    size_t n_facets_nominal;  in.read(reinterpret_cast<char*>(&n_facets_nominal), 4);
+
+    // ---- compute expected count from file size to sanity‑check ----------
+    // binary facet record = 50 bytes (12×4 + 12×4 + 12×4 + 2)
+    const size_t n_facets_from_size =
+        static_cast<size_t>((filesize - 84) / 50);
+
+    size_t n_facets = n_facets_nominal;
+    if (n_facets_nominal != n_facets_from_size) {
+        std::cout << "WARNING: facet count in header (" << n_facets_nominal
+            << ") disagrees with file size (" << n_facets_from_size
+            << ").  Using size‑derived value.\n";
+        n_facets = n_facets_from_size;
+    }
+    std::cout << "STL facets: " << n_facets << '\n';
+
+    // ---- allocate MATAR arrays -----------------------------------------
+    CArray<float> normal(n_facets, 3);
+    CArray<float> v1X(n_facets), v1Y(n_facets), v1Z(n_facets);
+    CArray<float> v2X(n_facets), v2Y(n_facets), v2Z(n_facets);
+    CArray<float> v3X(n_facets), v3Y(n_facets), v3Z(n_facets);
+
+    // ---- read facet records --------------------------------------------
+    float nrm[3], v1[3], v2[3], v3[3];
+    for (unsigned int i = 0; i < n_facets; ++i) {
+        in.read(reinterpret_cast<char*>(nrm), 12);
+        in.read(reinterpret_cast<char*>(v1), 12);
+        in.read(reinterpret_cast<char*>(v2), 12);
+        in.read(reinterpret_cast<char*>(v3), 12);
+        in.ignore(2);                        // attribute byte count
+
+        for (int d = 0; d < 3; ++d) normal(i, d) = nrm[d];
+        v1X(i) = v1[0]; v1Y(i) = v1[1]; v1Z(i) = v1[2];
+        v2X(i) = v2[0]; v2Y(i) = v2[1]; v2Z(i) = v2[2];
+        v3X(i) = v3[0]; v3Y(i) = v3[1]; v3Z(i) = v3[2];
+    }
+    return { normal,v1X,v1Y,v1Z,v2X,v2Y,v2Z,v3X,v3Y,v3Z,n_facets };
+}
+
+
+
+
+
+// a vector type with 3 components
+struct vec_t{
+    double x;
+    double y;
+    double z;
+    
+    // default constructor
+    vec_t (){};
+    
+    // overloaded constructor
+    vec_t(double x_in, double y_in, double z_in){
+        x = x_in;
+        y = y_in;
+        z = z_in;
+    };
+    
+}; // end vec_t
+
+
+// a triangle data type
+struct triangle_t {
+    
+    vec_t normal; // surface normal
+    
+    vec_t p[3];   // three nodes with x,y,z coords
+    
+    // default constructor
+    triangle_t(){};
+    
+    // overloaded constructor
+    triangle_t (vec_t p_in[3])
+    {
+        p[0]=p_in[0];
+        p[1]=p_in[1];
+        p[2]=p_in[2];
+    };
+    
+}; // end triangle_t
+
+
+// calculate the surface normal of a triangle
+KOKKOS_INLINE_FUNCTION
+void calc_normal(triangle_t *triangle){
+    
+    //A = p1 - p0;
+    //B = p2 - p0;
+    vec_t A;
+    A.x = triangle->p[1].x - triangle->p[0].x;
+    A.y = triangle->p[1].y - triangle->p[0].y;
+    A.z = triangle->p[1].z - triangle->p[0].z;
+    
+    vec_t B;
+    B.x = triangle->p[2].x - triangle->p[0].x;
+    B.y = triangle->p[2].y - triangle->p[0].y;
+    B.z = triangle->p[2].z - triangle->p[0].z;
+    
+    vec_t N;
+    N.x = A.y * B.z - A.z * B.y;
+    N.y = A.z * B.x - A.x * B.z;
+    N.z = A.x * B.y - A.y * B.x;
+    
+    double mag;
+    mag = sqrt(N.x*N.x + N.y*N.y + N.z*N.z);
+    
+    // save the unit normal
+    triangle->normal.x = N.x/mag;
+    triangle->normal.y = N.y/mag;
+    triangle->normal.z = N.z/mag;
+    
+} // end normal
+
+
+// cross prodcut
+vec_t cross(const vec_t &a, const vec_t &b) {
+    return {a.y*b.z - a.z*b.y,
+            a.z*b.x - a.x*b.z,
+            a.x*b.y - a.y*b.x};
+}
+
+double dot(const vec_t &a, const vec_t &b) {
+    return a.x*b.x + a.y*b.y + a.z*b.z;
+}
+
+
+// calculate the volume of a tet with this triangular face
+double compute_volume(const triangle_t &triangle) {
+    // triangle.p[0] is the first vec_t, being node 0
+    // ...
+    // triangle.p[1] is the third vec_t, being node 2
+    double volume = dot(triangle.p[0], cross(triangle.p[1], triangle.p[2])) / 6.0;
+
+    return volume;
+}
+
+struct gridcell_t {
+    
+    vec_t* p;
+    double* val;
+    
+    // default constructor
+    gridcell_t(){};
+    
+    // overloaded constructor
+    gridcell_t (vec_t p_in[8], double val_in[8])
+    {
+        p=p_in;
+        val=val_in;
+    };
+    
+}; // end gridcell_t
+
+
+/*
+   Linearly interpolate the position where an isosurface cuts
+   an edge between two vertices, each with their own scalar value
+*/
+KOKKOS_INLINE_FUNCTION
+vec_t VertexInterp(double isolevel, vec_t p1, vec_t p2, double valp1, double valp2)
+{
+   double mu;
+   vec_t p;
+
+   if (fabs(isolevel-valp1) < 0.00001)
+      return(p1);
+   if (fabs(isolevel-valp2) < 0.00001)
+      return(p2);
+   if (fabs(valp1-valp2) < 0.00001)
+      return(p1);
+   mu = (isolevel - valp1) / (valp2 - valp1);
+   p.x = p1.x + mu * (p2.x - p1.x);
+   p.y = p1.y + mu * (p2.y - p1.y);
+   p.z = p1.z + mu * (p2.z - p1.z);
+
+   return(p);
+}
+
+/*
+   Given a grid cell and an isolevel, calculate the triangular
+   facets required to represent the isosurface through the cell.
+   Return the number of triangular facets, the array "triangles"
+   will be loaded up with the vertices at most 5 triangular facets.
+    0 will be returned if the grid cell is either totally above
+   of totally below the isolevel.
+*/
+KOKKOS_INLINE_FUNCTION
+int Polygonise(gridcell_t grid, double isolevel, triangle_t *triangles)
+{
+    
+    int i,ntriang;
+    int cubeindex;
+    vec_t vertlist[12];
+
+    int edgeTable[256]={
+        0x0  , 0x109, 0x203, 0x30a, 0x406, 0x50f, 0x605, 0x70c,
+        0x80c, 0x905, 0xa0f, 0xb06, 0xc0a, 0xd03, 0xe09, 0xf00,
+        0x190, 0x99 , 0x393, 0x29a, 0x596, 0x49f, 0x795, 0x69c,
+        0x99c, 0x895, 0xb9f, 0xa96, 0xd9a, 0xc93, 0xf99, 0xe90,
+        0x230, 0x339, 0x33 , 0x13a, 0x636, 0x73f, 0x435, 0x53c,
+        0xa3c, 0xb35, 0x83f, 0x936, 0xe3a, 0xf33, 0xc39, 0xd30,
+        0x3a0, 0x2a9, 0x1a3, 0xaa , 0x7a6, 0x6af, 0x5a5, 0x4ac,
+        0xbac, 0xaa5, 0x9af, 0x8a6, 0xfaa, 0xea3, 0xda9, 0xca0,
+        0x460, 0x569, 0x663, 0x76a, 0x66 , 0x16f, 0x265, 0x36c,
+        0xc6c, 0xd65, 0xe6f, 0xf66, 0x86a, 0x963, 0xa69, 0xb60,
+        0x5f0, 0x4f9, 0x7f3, 0x6fa, 0x1f6, 0xff , 0x3f5, 0x2fc,
+        0xdfc, 0xcf5, 0xfff, 0xef6, 0x9fa, 0x8f3, 0xbf9, 0xaf0,
+        0x650, 0x759, 0x453, 0x55a, 0x256, 0x35f, 0x55 , 0x15c,
+        0xe5c, 0xf55, 0xc5f, 0xd56, 0xa5a, 0xb53, 0x859, 0x950,
+        0x7c0, 0x6c9, 0x5c3, 0x4ca, 0x3c6, 0x2cf, 0x1c5, 0xcc ,
+        0xfcc, 0xec5, 0xdcf, 0xcc6, 0xbca, 0xac3, 0x9c9, 0x8c0,
+        0x8c0, 0x9c9, 0xac3, 0xbca, 0xcc6, 0xdcf, 0xec5, 0xfcc,
+        0xcc , 0x1c5, 0x2cf, 0x3c6, 0x4ca, 0x5c3, 0x6c9, 0x7c0,
+        0x950, 0x859, 0xb53, 0xa5a, 0xd56, 0xc5f, 0xf55, 0xe5c,
+        0x15c, 0x55 , 0x35f, 0x256, 0x55a, 0x453, 0x759, 0x650,
+        0xaf0, 0xbf9, 0x8f3, 0x9fa, 0xef6, 0xfff, 0xcf5, 0xdfc,
+        0x2fc, 0x3f5, 0xff , 0x1f6, 0x6fa, 0x7f3, 0x4f9, 0x5f0,
+        0xb60, 0xa69, 0x963, 0x86a, 0xf66, 0xe6f, 0xd65, 0xc6c,
+        0x36c, 0x265, 0x16f, 0x66 , 0x76a, 0x663, 0x569, 0x460,
+        0xca0, 0xda9, 0xea3, 0xfaa, 0x8a6, 0x9af, 0xaa5, 0xbac,
+        0x4ac, 0x5a5, 0x6af, 0x7a6, 0xaa , 0x1a3, 0x2a9, 0x3a0,
+        0xd30, 0xc39, 0xf33, 0xe3a, 0x936, 0x83f, 0xb35, 0xa3c,
+        0x53c, 0x435, 0x73f, 0x636, 0x13a, 0x33 , 0x339, 0x230,
+        0xe90, 0xf99, 0xc93, 0xd9a, 0xa96, 0xb9f, 0x895, 0x99c,
+        0x69c, 0x795, 0x49f, 0x596, 0x29a, 0x393, 0x99 , 0x190,
+        0xf00, 0xe09, 0xd03, 0xc0a, 0xb06, 0xa0f, 0x905, 0x80c,
+        0x70c, 0x605, 0x50f, 0x406, 0x30a, 0x203, 0x109, 0x0   };
+    
+    int triTable[256][16] =
+    {{-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 1, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 8, 3, 9, 8, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 3, 1, 2, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {9, 2, 10, 0, 2, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {2, 8, 3, 2, 10, 8, 10, 9, 8, -1, -1, -1, -1, -1, -1, -1},
+        {3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 11, 2, 8, 11, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 9, 0, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 11, 2, 1, 9, 11, 9, 8, 11, -1, -1, -1, -1, -1, -1, -1},
+        {3, 10, 1, 11, 10, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 10, 1, 0, 8, 10, 8, 11, 10, -1, -1, -1, -1, -1, -1, -1},
+        {3, 9, 0, 3, 11, 9, 11, 10, 9, -1, -1, -1, -1, -1, -1, -1},
+        {9, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 3, 0, 7, 3, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 1, 9, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 1, 9, 4, 7, 1, 7, 3, 1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 10, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {3, 4, 7, 3, 0, 4, 1, 2, 10, -1, -1, -1, -1, -1, -1, -1},
+        {9, 2, 10, 9, 0, 2, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1},
+        {2, 10, 9, 2, 9, 7, 2, 7, 3, 7, 9, 4, -1, -1, -1, -1},
+        {8, 4, 7, 3, 11, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {11, 4, 7, 11, 2, 4, 2, 0, 4, -1, -1, -1, -1, -1, -1, -1},
+        {9, 0, 1, 8, 4, 7, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1},
+        {4, 7, 11, 9, 4, 11, 9, 11, 2, 9, 2, 1, -1, -1, -1, -1},
+        {3, 10, 1, 3, 11, 10, 7, 8, 4, -1, -1, -1, -1, -1, -1, -1},
+        {1, 11, 10, 1, 4, 11, 1, 0, 4, 7, 11, 4, -1, -1, -1, -1},
+        {4, 7, 8, 9, 0, 11, 9, 11, 10, 11, 0, 3, -1, -1, -1, -1},
+        {4, 7, 11, 4, 11, 9, 9, 11, 10, -1, -1, -1, -1, -1, -1, -1},
+        {9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {9, 5, 4, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 5, 4, 1, 5, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {8, 5, 4, 8, 3, 5, 3, 1, 5, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 10, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {3, 0, 8, 1, 2, 10, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1},
+        {5, 2, 10, 5, 4, 2, 4, 0, 2, -1, -1, -1, -1, -1, -1, -1},
+        {2, 10, 5, 3, 2, 5, 3, 5, 4, 3, 4, 8, -1, -1, -1, -1},
+        {9, 5, 4, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 11, 2, 0, 8, 11, 4, 9, 5, -1, -1, -1, -1, -1, -1, -1},
+        {0, 5, 4, 0, 1, 5, 2, 3, 11, -1, -1, -1, -1, -1, -1, -1},
+        {2, 1, 5, 2, 5, 8, 2, 8, 11, 4, 8, 5, -1, -1, -1, -1},
+        {10, 3, 11, 10, 1, 3, 9, 5, 4, -1, -1, -1, -1, -1, -1, -1},
+        {4, 9, 5, 0, 8, 1, 8, 10, 1, 8, 11, 10, -1, -1, -1, -1},
+        {5, 4, 0, 5, 0, 11, 5, 11, 10, 11, 0, 3, -1, -1, -1, -1},
+        {5, 4, 8, 5, 8, 10, 10, 8, 11, -1, -1, -1, -1, -1, -1, -1},
+        {9, 7, 8, 5, 7, 9, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {9, 3, 0, 9, 5, 3, 5, 7, 3, -1, -1, -1, -1, -1, -1, -1},
+        {0, 7, 8, 0, 1, 7, 1, 5, 7, -1, -1, -1, -1, -1, -1, -1},
+        {1, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {9, 7, 8, 9, 5, 7, 10, 1, 2, -1, -1, -1, -1, -1, -1, -1},
+        {10, 1, 2, 9, 5, 0, 5, 3, 0, 5, 7, 3, -1, -1, -1, -1},
+        {8, 0, 2, 8, 2, 5, 8, 5, 7, 10, 5, 2, -1, -1, -1, -1},
+        {2, 10, 5, 2, 5, 3, 3, 5, 7, -1, -1, -1, -1, -1, -1, -1},
+        {7, 9, 5, 7, 8, 9, 3, 11, 2, -1, -1, -1, -1, -1, -1, -1},
+        {9, 5, 7, 9, 7, 2, 9, 2, 0, 2, 7, 11, -1, -1, -1, -1},
+        {2, 3, 11, 0, 1, 8, 1, 7, 8, 1, 5, 7, -1, -1, -1, -1},
+        {11, 2, 1, 11, 1, 7, 7, 1, 5, -1, -1, -1, -1, -1, -1, -1},
+        {9, 5, 8, 8, 5, 7, 10, 1, 3, 10, 3, 11, -1, -1, -1, -1},
+        {5, 7, 0, 5, 0, 9, 7, 11, 0, 1, 0, 10, 11, 10, 0, -1},
+        {11, 10, 0, 11, 0, 3, 10, 5, 0, 8, 0, 7, 5, 7, 0, -1},
+        {11, 10, 5, 7, 11, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 3, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {9, 0, 1, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 8, 3, 1, 9, 8, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1},
+        {1, 6, 5, 2, 6, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 6, 5, 1, 2, 6, 3, 0, 8, -1, -1, -1, -1, -1, -1, -1},
+        {9, 6, 5, 9, 0, 6, 0, 2, 6, -1, -1, -1, -1, -1, -1, -1},
+        {5, 9, 8, 5, 8, 2, 5, 2, 6, 3, 2, 8, -1, -1, -1, -1},
+        {2, 3, 11, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {11, 0, 8, 11, 2, 0, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1},
+        {0, 1, 9, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1, -1, -1, -1},
+        {5, 10, 6, 1, 9, 2, 9, 11, 2, 9, 8, 11, -1, -1, -1, -1},
+        {6, 3, 11, 6, 5, 3, 5, 1, 3, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 11, 0, 11, 5, 0, 5, 1, 5, 11, 6, -1, -1, -1, -1},
+        {3, 11, 6, 0, 3, 6, 0, 6, 5, 0, 5, 9, -1, -1, -1, -1},
+        {6, 5, 9, 6, 9, 11, 11, 9, 8, -1, -1, -1, -1, -1, -1, -1},
+        {5, 10, 6, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 3, 0, 4, 7, 3, 6, 5, 10, -1, -1, -1, -1, -1, -1, -1},
+        {1, 9, 0, 5, 10, 6, 8, 4, 7, -1, -1, -1, -1, -1, -1, -1},
+        {10, 6, 5, 1, 9, 7, 1, 7, 3, 7, 9, 4, -1, -1, -1, -1},
+        {6, 1, 2, 6, 5, 1, 4, 7, 8, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 5, 5, 2, 6, 3, 0, 4, 3, 4, 7, -1, -1, -1, -1},
+        {8, 4, 7, 9, 0, 5, 0, 6, 5, 0, 2, 6, -1, -1, -1, -1},
+        {7, 3, 9, 7, 9, 4, 3, 2, 9, 5, 9, 6, 2, 6, 9, -1},
+        {3, 11, 2, 7, 8, 4, 10, 6, 5, -1, -1, -1, -1, -1, -1, -1},
+        {5, 10, 6, 4, 7, 2, 4, 2, 0, 2, 7, 11, -1, -1, -1, -1},
+        {0, 1, 9, 4, 7, 8, 2, 3, 11, 5, 10, 6, -1, -1, -1, -1},
+        {9, 2, 1, 9, 11, 2, 9, 4, 11, 7, 11, 4, 5, 10, 6, -1},
+        {8, 4, 7, 3, 11, 5, 3, 5, 1, 5, 11, 6, -1, -1, -1, -1},
+        {5, 1, 11, 5, 11, 6, 1, 0, 11, 7, 11, 4, 0, 4, 11, -1},
+        {0, 5, 9, 0, 6, 5, 0, 3, 6, 11, 6, 3, 8, 4, 7, -1},
+        {6, 5, 9, 6, 9, 11, 4, 7, 9, 7, 11, 9, -1, -1, -1, -1},
+        {10, 4, 9, 6, 4, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 10, 6, 4, 9, 10, 0, 8, 3, -1, -1, -1, -1, -1, -1, -1},
+        {10, 0, 1, 10, 6, 0, 6, 4, 0, -1, -1, -1, -1, -1, -1, -1},
+        {8, 3, 1, 8, 1, 6, 8, 6, 4, 6, 1, 10, -1, -1, -1, -1},
+        {1, 4, 9, 1, 2, 4, 2, 6, 4, -1, -1, -1, -1, -1, -1, -1},
+        {3, 0, 8, 1, 2, 9, 2, 4, 9, 2, 6, 4, -1, -1, -1, -1},
+        {0, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {8, 3, 2, 8, 2, 4, 4, 2, 6, -1, -1, -1, -1, -1, -1, -1},
+        {10, 4, 9, 10, 6, 4, 11, 2, 3, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 2, 2, 8, 11, 4, 9, 10, 4, 10, 6, -1, -1, -1, -1},
+        {3, 11, 2, 0, 1, 6, 0, 6, 4, 6, 1, 10, -1, -1, -1, -1},
+        {6, 4, 1, 6, 1, 10, 4, 8, 1, 2, 1, 11, 8, 11, 1, -1},
+        {9, 6, 4, 9, 3, 6, 9, 1, 3, 11, 6, 3, -1, -1, -1, -1},
+        {8, 11, 1, 8, 1, 0, 11, 6, 1, 9, 1, 4, 6, 4, 1, -1},
+        {3, 11, 6, 3, 6, 0, 0, 6, 4, -1, -1, -1, -1, -1, -1, -1},
+        {6, 4, 8, 11, 6, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {7, 10, 6, 7, 8, 10, 8, 9, 10, -1, -1, -1, -1, -1, -1, -1},
+        {0, 7, 3, 0, 10, 7, 0, 9, 10, 6, 7, 10, -1, -1, -1, -1},
+        {10, 6, 7, 1, 10, 7, 1, 7, 8, 1, 8, 0, -1, -1, -1, -1},
+        {10, 6, 7, 10, 7, 1, 1, 7, 3, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 6, 1, 6, 8, 1, 8, 9, 8, 6, 7, -1, -1, -1, -1},
+        {2, 6, 9, 2, 9, 1, 6, 7, 9, 0, 9, 3, 7, 3, 9, -1},
+        {7, 8, 0, 7, 0, 6, 6, 0, 2, -1, -1, -1, -1, -1, -1, -1},
+        {7, 3, 2, 6, 7, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {2, 3, 11, 10, 6, 8, 10, 8, 9, 8, 6, 7, -1, -1, -1, -1},
+        {2, 0, 7, 2, 7, 11, 0, 9, 7, 6, 7, 10, 9, 10, 7, -1},
+        {1, 8, 0, 1, 7, 8, 1, 10, 7, 6, 7, 10, 2, 3, 11, -1},
+        {11, 2, 1, 11, 1, 7, 10, 6, 1, 6, 7, 1, -1, -1, -1, -1},
+        {8, 9, 6, 8, 6, 7, 9, 1, 6, 11, 6, 3, 1, 3, 6, -1},
+        {0, 9, 1, 11, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {7, 8, 0, 7, 0, 6, 3, 11, 0, 11, 6, 0, -1, -1, -1, -1},
+        {7, 11, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {3, 0, 8, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 1, 9, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {8, 1, 9, 8, 3, 1, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1},
+        {10, 1, 2, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 10, 3, 0, 8, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1},
+        {2, 9, 0, 2, 10, 9, 6, 11, 7, -1, -1, -1, -1, -1, -1, -1},
+        {6, 11, 7, 2, 10, 3, 10, 8, 3, 10, 9, 8, -1, -1, -1, -1},
+        {7, 2, 3, 6, 2, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {7, 0, 8, 7, 6, 0, 6, 2, 0, -1, -1, -1, -1, -1, -1, -1},
+        {2, 7, 6, 2, 3, 7, 0, 1, 9, -1, -1, -1, -1, -1, -1, -1},
+        {1, 6, 2, 1, 8, 6, 1, 9, 8, 8, 7, 6, -1, -1, -1, -1},
+        {10, 7, 6, 10, 1, 7, 1, 3, 7, -1, -1, -1, -1, -1, -1, -1},
+        {10, 7, 6, 1, 7, 10, 1, 8, 7, 1, 0, 8, -1, -1, -1, -1},
+        {0, 3, 7, 0, 7, 10, 0, 10, 9, 6, 10, 7, -1, -1, -1, -1},
+        {7, 6, 10, 7, 10, 8, 8, 10, 9, -1, -1, -1, -1, -1, -1, -1},
+        {6, 8, 4, 11, 8, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {3, 6, 11, 3, 0, 6, 0, 4, 6, -1, -1, -1, -1, -1, -1, -1},
+        {8, 6, 11, 8, 4, 6, 9, 0, 1, -1, -1, -1, -1, -1, -1, -1},
+        {9, 4, 6, 9, 6, 3, 9, 3, 1, 11, 3, 6, -1, -1, -1, -1},
+        {6, 8, 4, 6, 11, 8, 2, 10, 1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 10, 3, 0, 11, 0, 6, 11, 0, 4, 6, -1, -1, -1, -1},
+        {4, 11, 8, 4, 6, 11, 0, 2, 9, 2, 10, 9, -1, -1, -1, -1},
+        {10, 9, 3, 10, 3, 2, 9, 4, 3, 11, 3, 6, 4, 6, 3, -1},
+        {8, 2, 3, 8, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1},
+        {0, 4, 2, 4, 6, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 9, 0, 2, 3, 4, 2, 4, 6, 4, 3, 8, -1, -1, -1, -1},
+        {1, 9, 4, 1, 4, 2, 2, 4, 6, -1, -1, -1, -1, -1, -1, -1},
+        {8, 1, 3, 8, 6, 1, 8, 4, 6, 6, 10, 1, -1, -1, -1, -1},
+        {10, 1, 0, 10, 0, 6, 6, 0, 4, -1, -1, -1, -1, -1, -1, -1},
+        {4, 6, 3, 4, 3, 8, 6, 10, 3, 0, 3, 9, 10, 9, 3, -1},
+        {10, 9, 4, 6, 10, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 9, 5, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 3, 4, 9, 5, 11, 7, 6, -1, -1, -1, -1, -1, -1, -1},
+        {5, 0, 1, 5, 4, 0, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1},
+        {11, 7, 6, 8, 3, 4, 3, 5, 4, 3, 1, 5, -1, -1, -1, -1},
+        {9, 5, 4, 10, 1, 2, 7, 6, 11, -1, -1, -1, -1, -1, -1, -1},
+        {6, 11, 7, 1, 2, 10, 0, 8, 3, 4, 9, 5, -1, -1, -1, -1},
+        {7, 6, 11, 5, 4, 10, 4, 2, 10, 4, 0, 2, -1, -1, -1, -1},
+        {3, 4, 8, 3, 5, 4, 3, 2, 5, 10, 5, 2, 11, 7, 6, -1},
+        {7, 2, 3, 7, 6, 2, 5, 4, 9, -1, -1, -1, -1, -1, -1, -1},
+        {9, 5, 4, 0, 8, 6, 0, 6, 2, 6, 8, 7, -1, -1, -1, -1},
+        {3, 6, 2, 3, 7, 6, 1, 5, 0, 5, 4, 0, -1, -1, -1, -1},
+        {6, 2, 8, 6, 8, 7, 2, 1, 8, 4, 8, 5, 1, 5, 8, -1},
+        {9, 5, 4, 10, 1, 6, 1, 7, 6, 1, 3, 7, -1, -1, -1, -1},
+        {1, 6, 10, 1, 7, 6, 1, 0, 7, 8, 7, 0, 9, 5, 4, -1},
+        {4, 0, 10, 4, 10, 5, 0, 3, 10, 6, 10, 7, 3, 7, 10, -1},
+        {7, 6, 10, 7, 10, 8, 5, 4, 10, 4, 8, 10, -1, -1, -1, -1},
+        {6, 9, 5, 6, 11, 9, 11, 8, 9, -1, -1, -1, -1, -1, -1, -1},
+        {3, 6, 11, 0, 6, 3, 0, 5, 6, 0, 9, 5, -1, -1, -1, -1},
+        {0, 11, 8, 0, 5, 11, 0, 1, 5, 5, 6, 11, -1, -1, -1, -1},
+        {6, 11, 3, 6, 3, 5, 5, 3, 1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 10, 9, 5, 11, 9, 11, 8, 11, 5, 6, -1, -1, -1, -1},
+        {0, 11, 3, 0, 6, 11, 0, 9, 6, 5, 6, 9, 1, 2, 10, -1},
+        {11, 8, 5, 11, 5, 6, 8, 0, 5, 10, 5, 2, 0, 2, 5, -1},
+        {6, 11, 3, 6, 3, 5, 2, 10, 3, 10, 5, 3, -1, -1, -1, -1},
+        {5, 8, 9, 5, 2, 8, 5, 6, 2, 3, 8, 2, -1, -1, -1, -1},
+        {9, 5, 6, 9, 6, 0, 0, 6, 2, -1, -1, -1, -1, -1, -1, -1},
+        {1, 5, 8, 1, 8, 0, 5, 6, 8, 3, 8, 2, 6, 2, 8, -1},
+        {1, 5, 6, 2, 1, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 3, 6, 1, 6, 10, 3, 8, 6, 5, 6, 9, 8, 9, 6, -1},
+        {10, 1, 0, 10, 0, 6, 9, 5, 0, 5, 6, 0, -1, -1, -1, -1},
+        {0, 3, 8, 5, 6, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {10, 5, 6, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {11, 5, 10, 7, 5, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {11, 5, 10, 11, 7, 5, 8, 3, 0, -1, -1, -1, -1, -1, -1, -1},
+        {5, 11, 7, 5, 10, 11, 1, 9, 0, -1, -1, -1, -1, -1, -1, -1},
+        {10, 7, 5, 10, 11, 7, 9, 8, 1, 8, 3, 1, -1, -1, -1, -1},
+        {11, 1, 2, 11, 7, 1, 7, 5, 1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 3, 1, 2, 7, 1, 7, 5, 7, 2, 11, -1, -1, -1, -1},
+        {9, 7, 5, 9, 2, 7, 9, 0, 2, 2, 11, 7, -1, -1, -1, -1},
+        {7, 5, 2, 7, 2, 11, 5, 9, 2, 3, 2, 8, 9, 8, 2, -1},
+        {2, 5, 10, 2, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1},
+        {8, 2, 0, 8, 5, 2, 8, 7, 5, 10, 2, 5, -1, -1, -1, -1},
+        {9, 0, 1, 5, 10, 3, 5, 3, 7, 3, 10, 2, -1, -1, -1, -1},
+        {9, 8, 2, 9, 2, 1, 8, 7, 2, 10, 2, 5, 7, 5, 2, -1},
+        {1, 3, 5, 3, 7, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 7, 0, 7, 1, 1, 7, 5, -1, -1, -1, -1, -1, -1, -1},
+        {9, 0, 3, 9, 3, 5, 5, 3, 7, -1, -1, -1, -1, -1, -1, -1},
+        {9, 8, 7, 5, 9, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {5, 8, 4, 5, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1},
+        {5, 0, 4, 5, 11, 0, 5, 10, 11, 11, 3, 0, -1, -1, -1, -1},
+        {0, 1, 9, 8, 4, 10, 8, 10, 11, 10, 4, 5, -1, -1, -1, -1},
+        {10, 11, 4, 10, 4, 5, 11, 3, 4, 9, 4, 1, 3, 1, 4, -1},
+        {2, 5, 1, 2, 8, 5, 2, 11, 8, 4, 5, 8, -1, -1, -1, -1},
+        {0, 4, 11, 0, 11, 3, 4, 5, 11, 2, 11, 1, 5, 1, 11, -1},
+        {0, 2, 5, 0, 5, 9, 2, 11, 5, 4, 5, 8, 11, 8, 5, -1},
+        {9, 4, 5, 2, 11, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {2, 5, 10, 3, 5, 2, 3, 4, 5, 3, 8, 4, -1, -1, -1, -1},
+        {5, 10, 2, 5, 2, 4, 4, 2, 0, -1, -1, -1, -1, -1, -1, -1},
+        {3, 10, 2, 3, 5, 10, 3, 8, 5, 4, 5, 8, 0, 1, 9, -1},
+        {5, 10, 2, 5, 2, 4, 1, 9, 2, 9, 4, 2, -1, -1, -1, -1},
+        {8, 4, 5, 8, 5, 3, 3, 5, 1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 4, 5, 1, 0, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {8, 4, 5, 8, 5, 3, 9, 0, 5, 0, 3, 5, -1, -1, -1, -1},
+        {9, 4, 5, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 11, 7, 4, 9, 11, 9, 10, 11, -1, -1, -1, -1, -1, -1, -1},
+        {0, 8, 3, 4, 9, 7, 9, 11, 7, 9, 10, 11, -1, -1, -1, -1},
+        {1, 10, 11, 1, 11, 4, 1, 4, 0, 7, 4, 11, -1, -1, -1, -1},
+        {3, 1, 4, 3, 4, 8, 1, 10, 4, 7, 4, 11, 10, 11, 4, -1},
+        {4, 11, 7, 9, 11, 4, 9, 2, 11, 9, 1, 2, -1, -1, -1, -1},
+        {9, 7, 4, 9, 11, 7, 9, 1, 11, 2, 11, 1, 0, 8, 3, -1},
+        {11, 7, 4, 11, 4, 2, 2, 4, 0, -1, -1, -1, -1, -1, -1, -1},
+        {11, 7, 4, 11, 4, 2, 8, 3, 4, 3, 2, 4, -1, -1, -1, -1},
+        {2, 9, 10, 2, 7, 9, 2, 3, 7, 7, 4, 9, -1, -1, -1, -1},
+        {9, 10, 7, 9, 7, 4, 10, 2, 7, 8, 7, 0, 2, 0, 7, -1},
+        {3, 7, 10, 3, 10, 2, 7, 4, 10, 1, 10, 0, 4, 0, 10, -1},
+        {1, 10, 2, 8, 7, 4, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 9, 1, 4, 1, 7, 7, 1, 3, -1, -1, -1, -1, -1, -1, -1},
+        {4, 9, 1, 4, 1, 7, 0, 8, 1, 8, 7, 1, -1, -1, -1, -1},
+        {4, 0, 3, 7, 4, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {4, 8, 7, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {9, 10, 8, 10, 11, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {3, 0, 9, 3, 9, 11, 11, 9, 10, -1, -1, -1, -1, -1, -1, -1},
+        {0, 1, 10, 0, 10, 8, 8, 10, 11, -1, -1, -1, -1, -1, -1, -1},
+        {3, 1, 10, 11, 3, 10, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 2, 11, 1, 11, 9, 9, 11, 8, -1, -1, -1, -1, -1, -1, -1},
+        {3, 0, 9, 3, 9, 11, 1, 2, 9, 2, 11, 9, -1, -1, -1, -1},
+        {0, 2, 11, 8, 0, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {3, 2, 11, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {2, 3, 8, 2, 8, 10, 10, 8, 9, -1, -1, -1, -1, -1, -1, -1},
+        {9, 10, 2, 0, 9, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {2, 3, 8, 2, 8, 10, 0, 1, 8, 1, 10, 8, -1, -1, -1, -1},
+        {1, 10, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {1, 3, 8, 9, 1, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 9, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {0, 3, 8, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1},
+        {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}};
+
+    
+    /*
+      Determine the index into the edge table which
+      tells us which vertices are inside of the surface
+     */
+    cubeindex = 0;
+
+    if (grid.val[0] < isolevel) cubeindex |= 1;
+    if (grid.val[1] < isolevel) cubeindex |= 2;
+    if (grid.val[2] < isolevel) cubeindex |= 4;
+    if (grid.val[3] < isolevel) cubeindex |= 8;
+    if (grid.val[4] < isolevel) cubeindex |= 16;
+    if (grid.val[5] < isolevel) cubeindex |= 32;
+    if (grid.val[6] < isolevel) cubeindex |= 64;
+    if (grid.val[7] < isolevel) cubeindex |= 128;
+    
+    
+    
+    /* Cube is entirely in/out of the surface */
+    if (edgeTable[cubeindex] == 0)
+        return(0);
+    
+    /* Find the vertices where the surface intersects the cube */
+    if (edgeTable[cubeindex] & 1)
+        vertlist[0] =
+         VertexInterp(isolevel,grid.p[0],grid.p[1],grid.val[0],grid.val[1]);
+    if (edgeTable[cubeindex] & 2)
+        vertlist[1] =
+         VertexInterp(isolevel,grid.p[1],grid.p[2],grid.val[1],grid.val[2]);
+    if (edgeTable[cubeindex] & 4)
+        vertlist[2] =
+         VertexInterp(isolevel,grid.p[2],grid.p[3],grid.val[2],grid.val[3]);
+    if (edgeTable[cubeindex] & 8)
+        vertlist[3] =
+         VertexInterp(isolevel,grid.p[3],grid.p[0],grid.val[3],grid.val[0]);
+    if (edgeTable[cubeindex] & 16)
+        vertlist[4] =
+         VertexInterp(isolevel,grid.p[4],grid.p[5],grid.val[4],grid.val[5]);
+    if (edgeTable[cubeindex] & 32)
+        vertlist[5] =
+         VertexInterp(isolevel,grid.p[5],grid.p[6],grid.val[5],grid.val[6]);
+    if (edgeTable[cubeindex] & 64)
+        vertlist[6] =
+         VertexInterp(isolevel,grid.p[6],grid.p[7],grid.val[6],grid.val[7]);
+    if (edgeTable[cubeindex] & 128)
+        vertlist[7] =
+         VertexInterp(isolevel,grid.p[7],grid.p[4],grid.val[7],grid.val[4]);
+    if (edgeTable[cubeindex] & 256)
+        vertlist[8] =
+         VertexInterp(isolevel,grid.p[0],grid.p[4],grid.val[0],grid.val[4]);
+    if (edgeTable[cubeindex] & 512)
+        vertlist[9] =
+         VertexInterp(isolevel,grid.p[1],grid.p[5],grid.val[1],grid.val[5]);
+    if (edgeTable[cubeindex] & 1024)
+        vertlist[10] =
+         VertexInterp(isolevel,grid.p[2],grid.p[6],grid.val[2],grid.val[6]);
+    if (edgeTable[cubeindex] & 2048)
+        vertlist[11] =
+         VertexInterp(isolevel,grid.p[3],grid.p[7],grid.val[3],grid.val[7]);
+    
+    /* Create the triangle */
+    ntriang = 0;
+    for (i=0; triTable[cubeindex][i]!=-1; i+=3) {
+        
+        triangles[ntriang].p[0] = vertlist[triTable[cubeindex][i  ]];
+        triangles[ntriang].p[1] = vertlist[triTable[cubeindex][i+1]];
+        triangles[ntriang].p[2] = vertlist[triTable[cubeindex][i+2]];
+        
+        ntriang++;
+    } // end for i
+    
+    return(ntriang);
+}
+
+
+// Gaussian function part of the RBF
+// rbf = p(x) + lambda_j*exp(-(x - xj)*(x - xj))
+KOKKOS_FUNCTION
+double Gaussian_fcn(double xi_value[3], double xj_value[3]){
+
+    double diff_sqrd = 0.0;
+    for(size_t dim=0; dim<3; dim++){
+        diff_sqrd += (xi_value[dim] - xj_value[dim])*(xi_value[dim] - xj_value[dim]);
+    }
+    return exp(-diff_sqrd);
+}
+
+// biharmonic function part of the RBF
+// rbf = p(x) + lambda_j*sqrt((x - xj)*(x - xj))
+KOKKOS_FUNCTION
+double biharmonic_fcn(double xi_value[3], double xj_value[3]){
+
+    double diff_sqrd = 0.0;
+    for(size_t dim=0; dim<3; dim++){
+        diff_sqrd += (xi_value[dim] - xj_value[dim])*(xi_value[dim] - xj_value[dim]);
+    }
+    return sqrt(diff_sqrd);
+}
+
+
+
+int main(int argc, char *argv[])
+{
+    Kokkos::initialize(argc, argv);
+    {  
+
+        printf("Pointcloud reconstruction \n\n");
+
+        if(argc==1){
+            printf("Please supply an STL file for testing the point cloud surface reconstruction code \n");
+            return 0;
+        }
+        
+        std::string filename = argv[1];
+
+        auto [normal_host, 
+              v1X_host, v1Y_host, v1Z_host, 
+              v2X_host, v2Y_host, v2Z_host, 
+              v3X_host, v3Y_host, v3Z_host, 
+              num_inp_triangles_host] = binary_stl_reader(filename);
+        
+        // Warning on C++ support:
+        // At this time with C++, the contents from a tuple cannot 
+        // be used inside a lambda function.  The parallel loops use 
+        // lambda functions. To overcome this C++ limitation, all 
+        // contents in the tuple will be copied or pointed to (Using 
+        // a MATAR dual view) allowing the data to be used in parallel.
+        const size_t num_inp_triangles = num_inp_triangles_host;
+        DViewCArrayKokkos <float> normal(&normal_host(0,0), num_inp_triangles, 3);
+        DViewCArrayKokkos <float> v1X(&v1X_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v1Y(&v1Y_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v1Z(&v1Z_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v2X(&v2X_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v2Y(&v2Y_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v2Z(&v2Z_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v3X(&v3X_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v3Y(&v3Y_host(0),num_inp_triangles); 
+        DViewCArrayKokkos <float> v3Z(&v3Z_host(0),num_inp_triangles);
+
+        normal.update_device(); 
+        v1X.update_device(); 
+        v1Y.update_device(); 
+        v1Z.update_device(); 
+        v2X.update_device(); 
+        v2Y.update_device(); 
+        v2Z.update_device(); 
+        v3X.update_device(); 
+        v3Y.update_device(); 
+        v3Z.update_device();
+        
+        
+        // define mesh spacing, it is used to create a mesh
+            
+        double LX = (XMax - X0);   // length in x-dir
+        double LY = (YMax - Y0);
+        double LZ = (ZMax - Z0);
+        
+        // the number of nodes in the mesh
+        int num_pt_x = (int)( LX/dx ) + 1;  // there must be at least 2 nodes
+        int num_pt_y = (int)( LY/dy ) + 1;  // there must be at least 2 nodes
+        int num_pt_z = (int)( LZ/dz ) + 1;  // there must be at least 2 nodes
+        
+        
+        // mesh coordinates
+        DCArrayKokkos <double> x(num_pt_x, "pt_x");
+        DCArrayKokkos <double> y(num_pt_y, "pt_y");
+        DCArrayKokkos <double> z(num_pt_z, "pt_z");
+
+        // small distance for moving in the +/- normal directions 
+        double epsilon = 0.1*fmin(fmin(dx, dy), dz);
+
+        
+        // function with isosurface that we want extracted
+        DCArrayKokkos <double> gridValues (num_pt_x,num_pt_y,num_pt_z, "grid_values");
+        
+
+        // define the triangles of extracted surface
+        const size_t num_elems = (num_pt_x-1)*(num_pt_y-1)*(num_pt_z-1);
+        DCArrayKokkos <triangle_t> all_mesh_surf_triangles(num_elems, 5, "mesh_surf_tris"); // max of 5 per elem
+        DCArrayKokkos <size_t> num_triangles_in_elem(num_elems, "num_tris_in_elem");
+        num_triangles_in_elem.set_values(0);
+
+
+        printf("Creating point cloud data from STL file \n\n");
+
+        // define a point cloud
+        size_t num_points = num_inp_triangles*3; // 1 point per triangle plus 2 more in the +/- directions
+        DCArrayKokkos <double> point_positions(num_points, 3, "point_positions");
+        DCArrayKokkos <double> point_signed_distance(num_points, "point_sign_distance"); // this is f in the journal paper
+
+        //for(size_t tri=0; tri<num_inp_triangles; tri++){
+        FOR_ALL(tri, 0, num_inp_triangles, {
+            // point on surface
+            point_positions(tri, 0) =  1.0/3.0*((double)v1X(tri) + (double)v2X(tri) + (double)v3X(tri));
+            point_positions(tri, 1) =  1.0/3.0*((double)v1Y(tri) + (double)v2Y(tri) + (double)v3Y(tri));
+            point_positions(tri, 2) =  1.0/3.0*((double)v1Z(tri) + (double)v2Z(tri) + (double)v3Z(tri));
+
+            point_signed_distance(tri) = 0.0;
+
+            // off surface +normal
+            point_positions(tri+num_inp_triangles, 0) =  point_positions(tri, 0) + epsilon*(double)normal(tri, 0);
+            point_positions(tri+num_inp_triangles, 1) =  point_positions(tri, 1) + epsilon*(double)normal(tri, 1);
+            point_positions(tri+num_inp_triangles, 2) =  point_positions(tri, 2) + epsilon*(double)normal(tri, 2);
+
+            point_signed_distance(tri+num_inp_triangles) = epsilon;
+
+            // off surface -normal
+            point_positions(tri+2*num_inp_triangles, 0) =  point_positions(tri, 0) - epsilon*(double)normal(tri, 0);
+            point_positions(tri+2*num_inp_triangles, 1) =  point_positions(tri, 1) - epsilon*(double)normal(tri, 1);
+            point_positions(tri+2*num_inp_triangles, 2) =  point_positions(tri, 2) - epsilon*(double)normal(tri, 2);
+
+            point_signed_distance(tri+2*num_inp_triangles) = -epsilon;
+
+        }); // end parallel for tri's in the file
+
+
+        // ----------------------------
+        // Reconstruct surface here
+        // ----------------------------
+
+        printf("Reconstructing surface using point cloud data \n\n");
+
+        size_t Pn = 4; // For p(x) = c0 + c1*x + c2*y + c3*z 
+        const size_t N = num_points+Pn;
+        DCArrayKokkos <double> M_matrix(N,N,"M_matrix");
+        DCArrayKokkos <double> b_vector(N, "b_vector");
+
+        // -----------------------
+        // assemble system matrix
+        //    Mx = B
+        // _     _  _      _   _ _
+        // |A   P|  |lambda|   |f|
+        // |P^T 0| *| c    | = |0|
+        // -     -  -      -   - -
+        //
+        // -----------------------
+
+        // initializing to zero
+        M_matrix.set_values(0.0);
+
+        // assemble M
+        FOR_ALL(i, 0, num_points, 
+                j, 0, num_points, {
+
+                // this is the A matrix part of M
+                M_matrix(i, j) = Gaussian_fcn(&point_positions(i,0), &point_positions(j,0));
+        });
+
+        FOR_ALL(i, 0, num_points, {
+            // Polynomial basis: [1, x, y, z]
+            // this the P matrix part of M
+            M_matrix(i, num_points+0) = 1.0;
+            M_matrix(i, num_points+1) = point_positions(i,0); // x coord
+            M_matrix(i, num_points+2) = point_positions(i,1); // y coord
+            M_matrix(i, num_points+3) = point_positions(i,2); // z coord
+        }); // end for i
+
+        FOR_ALL(j, 0, num_points, {
+            // this the P^T matrix part of M
+            M_matrix(num_points+0, j) = 1.0;
+            M_matrix(num_points+1, j) = point_positions(j,0); // x coord
+            M_matrix(num_points+2, j) = point_positions(j,1); // y coord
+            M_matrix(num_points+3, j) = point_positions(j,2); // z coord
+        }); // end for j
+
+        // adding the zeros in the bottom right corner
+        // for (size_t i = num_points; i < num_points+Pn; ++i) {
+        //     for (size_t j = num_points; j < num_points+Pn; ++j) {
+        //         M_matrix(i, j) = 0.0;
+        //     } // end for j
+        // } // end for i
+
+        //for(size_t i = num_points; i < num_points+Pn; ++i){
+        FOR_ALL(i, num_points, N, {
+            b_vector(i) = 0.0;
+        }); // end for
+
+        // assemble RHS vector, b
+        //for (size_t i = 0; i < num_points; ++i) {
+        FOR_ALL(i, 0, num_points, {
+            b_vector(i) = point_signed_distance(i);
+        }); // end for
+        
+
+        M_matrix.update_host();
+        b_vector.update_host();
+
+
+
+        // ----------------------------------
+        // Solve for x in Mx=b
+        // ----------------------------------
+
+        // checking by printing
+        printf("matrix = \n");
+        RUN({
+            for (size_t i = 0; i < N; ++i) {
+                for (size_t j = 0; j < N; ++j) {
+                    printf("%f , ", M_matrix(i, j));
+                } // end for j
+                printf("\n");
+            } // end for i
+
+            printf("\n");
+            printf("b = \n");
+            for (size_t i = 0; i < N; ++i) {
+                printf("%f \n", b_vector(i));
+            }
+        });
+
+        DCArrayKokkos <size_t> perm (N, "perm");
+        perm.set_values(0);
+        CArrayKokkos <double> vv(N, "vv");
+        
+        // used for LU problem
+
+        int singular = 0;
+        int parity = 0;
+        
+        // Get the LU decomposition of the Mass Matrix
+        singular = LU_decompose_host(M_matrix, perm, vv, parity);  // matrix is returned as the LU matrix  
+        if(singular==0){
+            printf("ERROR: matrix to fit point cloud surface is singluar \n");
+            return 0;
+        }
+        // BUG in LU decompose
+
+        // RUN({
+        //     int singular_d = 0; 
+        //     int parity_d = 0;
+        //     singular_d = LU_decompose(M_matrix, perm, vv, parity_d);  // M is returned as the LU matrix  
+        //     if(singular_d==0){
+        //         printf("ERROR: matrix is singluar \n");
+        //     }
+        // });
+        
+        LU_backsub_host(M_matrix, perm, b_vector);  // note: answer is sent back in b_vector
+
+        // RUN({
+        //     LU_backsub(M_matrix, perm, b_vector);  // note: answer is sent back in b
+        // });
+
+        RUN({
+            //lambda coefficients for radial basis function, slice out only lambda values and polynomial values
+            auto lambda = ViewCArrayKokkos <double> (&b_vector(0), num_points);
+            auto coefs  = ViewCArrayKokkos <double> (&b_vector(num_points), Pn);
+            printf("coeffs = \n");
+            for(size_t i=0; i<Pn; i++){
+                printf("%f \n", coefs(i));
+            }
+            printf("lambda = \n");
+            for(size_t i=0; i<num_points; i++){
+                printf("%f \n", lambda(i));
+            }
+            printf("perm = \n");
+            for(size_t i=0; i<N; i++){
+                printf("%zu \n", perm(i));
+            }
+        });
+
+        // ----------------------------------
+        // Evaluate surface function on mesh
+        // ----------------------------------
+
+        printf("Evaluating surf function on mesh \n");
+        FOR_ALL(i, 0, num_pt_x, {
+            x(i) = dx*(double)i + X0;;
+        });
+        FOR_ALL(j, 0, num_pt_y, {
+            y(j) = dy*(double)j + Y0;
+        });
+        FOR_ALL(k, 0, num_pt_z, {
+            z(k) = dz*(double)k + Z0;
+        });
+        Kokkos::fence();
+
+        // save mesh coordinates of the nodes
+        FOR_ALL(k, 0, num_pt_z, 
+                j, 0, num_pt_y,
+                i, 0, num_pt_x, {
+
+                    double x_point[3];
+                    x_point[0] = x(i);
+                    x_point[1] = y(j);
+                    x_point[2] = z(k);
+                    
+                    // lambda coefficients for radial basis function, slice out only lambda values and polynomial values
+                    // lambda = ViewCArrayKokkos <double> (&b_vector(0), num_points);
+                    // coefs  = ViewCArrayKokkos <double> (&b_vector(num_points), Pn);
+
+                    // evaluate the polynomial part
+                    gridValues(i,j,k) = b_vector(0+num_points) + 
+                                        b_vector(1+num_points)*x(i) + 
+                                        b_vector(2+num_points)*y(j) + 
+                                        b_vector(3+num_points)*z(k);
+
+
+                    for (size_t point=0; point<num_points; point++){
+                        gridValues(i,j,k) += b_vector(point)*Gaussian_fcn(&x_point[0], &point_positions(point,0));
+                    } // end for points      
+        
+        }); // end parallel over k,j,i
+
+        x.update_host();
+        y.update_host();
+        z.update_host();
+        gridValues.update_host();
+        
+        
+        
+        
+        // ------------------------------------
+        // Use marching cubes to build surface
+        // ------------------------------------
+
+        printf("Running marching cubes algorithm\n");
+        
+        FOR_ALL(k, 0, num_pt_z-1,
+                j, 0, num_pt_y-1,
+                i, 0, num_pt_x-1, {
+
+                    // elem gid
+                    size_t elem_gid = i + j*(num_pt_x-1) + k*(num_pt_x-1)*(num_pt_y-1);
+        
+                    // extract the x,y,z node coords
+                    // using the index ordering for the cell
+                    vec_t xyzs [8];
+                    xyzs[0] = vec_t(x(i  ), y(j  ), z(k  ));
+                    xyzs[1] = vec_t(x(i+1), y(j  ), z(k  ));
+                    xyzs[2] = vec_t(x(i+1), y(j  ), z(k+1));
+                    xyzs[3] = vec_t(x(i  ), y(j  ), z(k+1));
+                    xyzs[4] = vec_t(x(i  ), y(j+1), z(k  ));
+                    xyzs[5] = vec_t(x(i+1), y(j+1), z(k  ));
+                    xyzs[6] = vec_t(x(i+1), y(j+1), z(k+1));
+                    xyzs[7] = vec_t(x(i  ), y(j+1), z(k+1));
+        
+        
+                    // extract the values at the nodes
+                    // using the index ordering for the cell
+                    double vals [8];
+                    vals[0] = gridValues(i  ,j  ,k  );
+                    vals[1] = gridValues(i+1,j  ,k  );
+                    vals[2] = gridValues(i+1,j  ,k+1);
+                    vals[3] = gridValues(i  ,j  ,k+1);
+                    vals[4] = gridValues(i  ,j+1,k  );
+                    vals[5] = gridValues(i+1,j+1,k  );
+                    vals[6] = gridValues(i+1,j+1,k+1);
+                    vals[7] = gridValues(i  ,j+1,k+1);
+        
+        
+                    // details of the cell, save coords and the values at the nodes
+                    gridcell_t cell(xyzs, vals);
+        
+        
+                    // the most triangles in a cell is 5
+                    triangle_t triangles[5];
+                    num_triangles_in_elem(elem_gid) = Polygonise(cell, isoLevel, triangles);
+        
+                    // save the triangles
+                    for (size_t tri = 0; tri < num_triangles_in_elem(elem_gid); tri++)
+                    {
+                        all_mesh_surf_triangles(elem_gid,tri) = triangles[tri];
+                    } // end for tri
+
+        });  // end parallel for k,j,i
+
+        
+        // calculate the normal vector of triangles
+        FOR_ALL(elem_gid, 0, num_elems, {
+            for (size_t tri = 0; tri < num_triangles_in_elem(elem_gid); tri++){
+                calc_normal(&all_mesh_surf_triangles(elem_gid, tri));
+            }
+        }); // end loop over triangles
+
+        all_mesh_surf_triangles.update_host();
+        num_triangles_in_elem.update_host();
+
+        
+        printf("Marching cubes finished \n\n");
+        
+
+
+        // --------------------------------------------------
+        // volume calculation
+        // --------------------------------------------------
+        double volume = 0.0;
+        double vol_lcl = 0.0;
+        FOR_REDUCE_SUM(elem_gid, 0, num_elems, 
+                       vol_lcl, {
+
+            for (size_t tri = 0; tri < num_triangles_in_elem(elem_gid); tri++){
+                vol_lcl += compute_volume(all_mesh_surf_triangles(elem_gid,tri)); 
+            }
+
+        }, volume);
+        volume = fabs(volume);
+
+        double radius =  0.794651/2.0; // radius of constructured part, based on a small mesh size
+        double PI = 3.14159265358979323846264338327950288419716939937510;
+        double vol_exact = 4.0/3.0*PI*radius*radius*radius;
+        printf("volume = %f, and `exact' sphere volume = %f \n", volume, vol_exact);
+
+        // 0.262744 at 0.001 mesh size  
+
+
+        // --------------------------------------------------
+        // Export STL file using results from marching cubes
+        // --------------------------------------------------
+
+        printf("Exporting STL file for a 3D printer\n");
+        
+        
+        // export triangles as STL file
+        
+        FILE * myfile;
+        myfile=fopen("surface.stl","w");
+        fprintf(myfile,"solid points \n");
+        // a serial file write
+        for(size_t elem_gid=0; elem_gid<num_elems; elem_gid++){
+            for (size_t tri = 0; tri < num_triangles_in_elem.host(elem_gid); tri++){
+        
+                fprintf(myfile,"facet normal %f %f %f\n",
+                        all_mesh_surf_triangles.host(elem_gid,tri).normal.x,
+                        all_mesh_surf_triangles.host(elem_gid,tri).normal.y,
+                        all_mesh_surf_triangles.host(elem_gid,tri).normal.z);
+                
+                fprintf(myfile,"outer loop \n");
+                
+                fprintf(myfile,"vertex %f %f %f\n",
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[0].x,
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[0].y,
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[0].z);
+                
+                fprintf(myfile,"vertex %f %f %f\n",
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[1].x,
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[1].y,
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[1].z);
+                
+                fprintf(myfile,"vertex %f %f %f\n",
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[2].x,
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[2].y,
+                        all_mesh_surf_triangles.host(elem_gid,tri).p[2].z);
+                fprintf(myfile,"endloop \n");
+                fprintf(myfile,"endfacet \n");
+            }   
+        } // end loop over triangles
+        fprintf(myfile,"endsolid points \n");
+        
+        fclose(myfile);
+            
+    
+        printf("Finished \n\n");
+
+    } // end of kokkos scope
+
+
+
+    Kokkos::finalize();
+
+    return 0;
+    
+} // end main
diff --git a/examples/pointcloud/pointcloud-rk.cpp b/examples/pointcloud/pointcloud-rk.cpp
new file mode 100755
index 00000000..54fd49c9
--- /dev/null
+++ b/examples/pointcloud/pointcloud-rk.cpp
@@ -0,0 +1,1760 @@
+/**********************************************************************************************
+ © 2020. Triad National Security, LLC. All rights reserved.
+ This program was produced under U.S. Government contract 89233218CNA000001 for Los Alamos
+ National Laboratory (LANL), which is operated by Triad National Security, LLC for the U.S.
+ Department of Energy/National Nuclear Security Administration. All rights in the program are
+ reserved by Triad National Security, LLC, and the U.S. Department of Energy/National Nuclear
+ Security Administration. The Government is granted for itself and others acting on its behalf a
+ nonexclusive, paid-up, irrevocable worldwide license in this material to reproduce, prepare
+ derivative works, distribute copies to the public, perform publicly and display publicly, and
+ to permit others to do so.
+ This program is open source under the BSD-3 License.
+ Redistribution and use in source and binary forms, with or without modification, are permitted
+ provided that the following conditions are met:
+ 
+ 1.  Redistributions of source code must retain the above copyright notice, this list of
+ conditions and the following disclaimer.
+ 
+ 2.  Redistributions in binary form must reproduce the above copyright notice, this list of
+ conditions and the following disclaimer in the documentation and/or other materials
+ provided with the distribution.
+ 
+ 3.  Neither the name of the copyright holder nor the names of its contributors may be used
+ to endorse or promote products derived from this software without specific prior
+ written permission.
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+ IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ **********************************************************************************************/
+
+// -----------------------------------------------
+// pointcloud reproducing kernels in C++
+//  Nathaniel Morgan
+// -----------------------------------------------
+
+#include <chrono>   // for timing
+
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <stdio.h>
+#include <cmath>
+
+#include <cstdlib> // For rand() and srand()
+
+
+#include "matar.h"
+
+#include "lu_solver.hpp"
+
+
+#include <set> // for unorded map testing 
+
+
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+using namespace mtr;
+
+const double PI = 3.14159265358979323846;
+
+// -----------------------------------------------
+// inputs:
+
+
+#define P2 // P1 or P2
+#define CUBIC_SPLINE // CUBIC_SPLINE or GAUSS kernel
+
+bool RAND_CLOUD = true; // RAND_CLOUD or uniform
+
+const size_t num_1d_x = 5;
+const size_t num_1d_y = 5;
+const size_t num_1d_z = 5;
+
+const double h_kernel = 2.0/5.; // it is always 2/num_1d_x
+const double num_points_fit = 27; // minimum to P2 fit on structured mesh is 3x3x3
+
+const size_t num_points = num_1d_x*num_1d_y*num_1d_z;
+
+// the bin sizes for finding neighboring points
+const double bin_dx = 0.05; // bins in x
+const double bin_dy = 0.05; // bins in y
+const double bin_dz = 0.05; // bins in z
+
+const double X0 = 0.0;   // origin
+const double Y0 = 0.0;
+const double Z0 = 0.0;
+
+// length of the domain 
+const double LX = 1.0;   // length in x-dir
+const double LY = 1.0;
+const double LZ = 1.0;
+
+
+bool check_maps = false; // CPU only!!!!
+
+//
+// -----------------------------------------------
+
+
+
+struct bin_keys_t{
+    size_t i,j,k;
+};
+
+KOKKOS_INLINE_FUNCTION
+size_t get_gid(size_t i, size_t j, size_t k, size_t num_x, size_t num_y){
+    return i + (j + k*num_y)*num_x;
+}
+
+KOKKOS_INLINE_FUNCTION
+bin_keys_t get_bin_keys(const double x_pt, 
+                        const double y_pt, 
+                        const double z_pt){
+            
+
+    double i_dbl = fmax(0, round((x_pt - X0 - bin_dx*0.5)/bin_dx - 1.0e-10)); // x = ih + X0 + dx_bin*0.5
+    double j_dbl = fmax(0, round((y_pt - Y0 - bin_dy*0.5)/bin_dy - 1.0e-10));
+    double k_dbl = fmax(0, round((z_pt - Z0 - bin_dz*0.5)/bin_dz - 1.0e-10));
+
+    bin_keys_t bin_keys; // save i,j,k to the bin keys
+
+    // get the integer for the bins
+    bin_keys.i = (size_t)i_dbl;
+    bin_keys.j = (size_t)j_dbl;
+    bin_keys.k = (size_t)k_dbl;
+
+    return bin_keys;
+
+} // end function
+
+KOKKOS_INLINE_FUNCTION
+size_t get_bin_gid(const double x_pt, 
+                   const double y_pt, 
+                   const double z_pt, 
+                   const size_t num_bins_x,
+                   const size_t num_bins_y,
+                   const size_t num_bins_z){
+            
+
+    double i_dbl = fmin(num_bins_x-1, fmax(0.0, round((x_pt - X0)/bin_dx - 1.0e-8))); // x = ih + X0
+    double j_dbl = fmin(num_bins_y-1, fmax(0.0, round((y_pt - Y0)/bin_dy - 1.0e-8)));
+    double k_dbl = fmin(num_bins_z-1, fmax(0.0, round((z_pt - Z0)/bin_dz - 1.0e-8)));
+
+    // get the integers for the bins
+    size_t i = (size_t)i_dbl;
+    size_t j = (size_t)j_dbl;
+    size_t k = (size_t)k_dbl;
+    
+    // get the 1D index for this bin                               
+    return get_gid(i, j, k, num_bins_x, num_bins_y);
+
+} // end function
+
+
+
+
+#if defined(CUBIC_SPLINE)
+
+KOKKOS_FUNCTION
+double kernel(const double r[3], double h) {
+    
+    double diff_sqrd = 0.0;
+    for(size_t dim=0; dim<3; dim++){
+        diff_sqrd += r[dim]*r[dim];
+    } // dim
+
+    const double radius = sqrt(diff_sqrd);
+    const double q = radius/h;
+    const double alpha = 2.0/(3.0*h);
+    if (q < 0.0) return 0.0; // defensive
+    if (q < 1.0) return (alpha * (1.0 - 1.5*q*q + 0.75*q*q*q));
+    if (q < 2.0) return (alpha * 0.25 * pow(2.0 - q, 3));
+
+    return 0.0;
+}
+
+
+KOKKOS_FUNCTION
+// derivative dW/dx_i = - dW/dr where r = xj-xi
+void grad_kernel(double *grad_W, const double r[3], const double h, const bool derivative_wrt_i) {
+
+    double diff_sqrd = 0.0;
+    for(size_t dim=0; dim<3; dim++){
+        diff_sqrd += r[dim]*r[dim];
+    } // dim
+
+    const double radius = sqrt(diff_sqrd);
+    const double q = radius/h;
+
+    double df_dq = 0.0; // derivative of the dimensionless kernel shape function f(q)
+    if (q < 1.0) {
+        // f(q) = 1 - 1.5 q^2 + 0.75 q^3
+        // f'(q) = -3 q + 2.25 q^2
+        df_dq = -3.0 * q + 2.25 * q * q;
+    } else if (q < 2.0) {
+        // f(q) = 0.25 (2 - q)^3
+        // f'(q) = 0.25 * 3 (2 - q)^2 * (-1) = -0.75 (2 - q)^2
+        const double two_minus_q = 2.0 - q;
+        df_dq = -0.75 * two_minus_q * two_minus_q;
+    } else {
+        df_dq = 0.0;
+    }
+
+    const double alpha = 2.0/(3.0*h);
+    const double dW_dr = alpha*(df_dq / h);
+    // grad W = dW/dr * (rij / radius)
+    const double invr = 1.0 /(radius + 1e-16);
+
+    double drdx = -1.0;  // default is derivative with respect to i
+    if(derivative_wrt_i == false){
+        drdx = 1.0;  // derivative with respect to j
+    } // end if
+
+    for (size_t dim=0; dim<3; ++dim) {
+        grad_W[dim] = dW_dr * r[dim] * drdx * invr;
+    }
+
+    return;
+}
+
+#else 
+
+// Gaussian function part of the RBF
+// rbf = exp(-(xj - x)*(xj - x)/h)
+KOKKOS_FUNCTION
+double kernel(const double r[3], const double h){
+
+    double diff_sqrd = 0.0;
+
+    for(size_t dim=0; dim<3; dim++){
+        diff_sqrd += r[dim]*r[dim];
+    } // dim
+
+    double norm = 1.0; // / (h * h * h * pow(PI, 1.5));
+    return norm * exp(-diff_sqrd / (h * h));
+} // end of function
+
+
+
+// Gradient Gaussian function
+// d/dx rbf = d/dx (exp(-(xj - xi)*(xj - x)/hi^2) 
+KOKKOS_FUNCTION
+void grad_kernel(double *grad_W, const double r[3], const double h, const bool derivative_wrt_i){
+
+    double diff_sqrd = 0.0;
+
+    for(size_t dim=0; dim<3; dim++){
+        diff_sqrd += r[dim]*r[dim];
+    } // dim
+
+    double drdx = -1;
+    if(derivative_wrt_i == false){
+        drdx = 1.0;  // derivative with respect to j
+    } // end if
+
+    const double rbf = kernel(r, h);
+
+    // gradient
+    for (size_t dim=0; dim<3; ++dim) {
+        grad_W[dim] = -2.0/(h*h)*r[dim]*rbf*drdx; 
+    }
+
+    return;
+} // end of function
+
+#endif
+
+// Gaussian function part of the RBF, symmeterized
+// rbf = 0.5*(exp(-(xj - xi)*(xj - xi)/hi^2) + exp(-(xi - xj)*(xi - xj)/hj^2))
+KOKKOS_FUNCTION
+double kernel_syn(const double r[3], const double hi, const double hj){
+
+    double diff_sqrd = 0.0;
+
+    for(size_t dim=0; dim<3; dim++){
+        diff_sqrd += r[dim]*r[dim];
+    } // dim
+
+    const double Wi = exp(-diff_sqrd/(hi*hi)); // use kernel func call
+    const double Wj = exp(-diff_sqrd/(hj*hj));
+
+    return 0.5*(Wi + Wj);
+} // end of function
+
+
+// d/dx rbf = d/dx ( 0.5(exp(-(xj - xi)*(xj - x)/hi^2) + exp(-(xi - xj)*(xi - xj)/hj^2)) ) 
+KOKKOS_FUNCTION
+void grad_kernel_sym(double *gradW, const double r[3], const double hi, const double hj) {
+    double diff_sqrd = 0.0;
+    for (size_t dim=0; dim<3; ++dim){
+        diff_sqrd += r[dim]*r[dim];
+    }
+
+    const double drdxi = -1;
+
+    double Wi = exp(-diff_sqrd/(hi*hi));
+    double Wj = exp(-diff_sqrd/(hj*hj));
+
+    double dWi = -2.0/ (hi*hi) * Wi*drdxi; // it uses xj - xi so a minus one
+    double dWj = -2.0/ (hj*hj) * Wj; // it uses xi - xj so it has a +1 for drdxi
+
+    for (size_t dim=0; dim<3; ++dim) {
+        gradW[dim] = 0.5 * (dWi * r[dim] - dWj * r[dim]); // second term using -r
+    }
+}
+
+#if defined(P2)
+    // Polynomial basis up to quadratic in 3D (10 terms)
+    const size_t num_poly_basis = 10;
+    KOKKOS_INLINE_FUNCTION
+    void poly_basis(const double r[3], double *p) {
+
+        p[0] = 1.0;
+        p[1] = r[0];
+        p[2] = r[1];
+        p[3] = r[2];
+        p[4] = r[0] * r[0];
+        p[5] = r[0] * r[1];
+        p[6] = r[0] * r[2];
+        p[7] = r[1] * r[1];
+        p[8] = r[1] * r[2];
+        p[9] = r[2] * r[2];
+
+        // for high-order will use (x^a y^b z^c)
+
+        return;
+    } // end function
+
+
+    KOKKOS_INLINE_FUNCTION
+    void grad_poly_basis(const double r[3], double (*grad_p)[3], bool derivative_wrt_i) {
+        
+        // definition, r = r_j - r_i
+
+        double drdx = -1.0;  // default is derivative with respect to i
+        double drdy = -1.0;  // default is derivative with respect to i
+        double drdz = -1.0;  // default is derivative with respect to i
+        if(derivative_wrt_i == false){
+            drdx = 1.0;  // derivative with respect to j
+            drdy = 1.0;  // derivative with respect to j
+            drdz = 1.0;  // derivative with respect to j
+        } // end if
+
+        grad_p[0][0] = 0.0;
+        grad_p[1][0] = drdx;
+        grad_p[2][0] = 0.0;
+        grad_p[3][0] = 0.0;
+        grad_p[4][0] = 2.0*r[0]*drdx;
+        grad_p[5][0] = r[1]*drdx;
+        grad_p[6][0] = r[2]*drdx;
+        grad_p[7][0] = 0.0;
+        grad_p[8][0] = 0.0;
+        grad_p[9][0] = 0.0;
+
+        // for high-order will use (x^a y^b z^c)
+
+        grad_p[0][1] = 0.0;
+        grad_p[1][1] = 0.0;
+        grad_p[2][1] = drdy;
+        grad_p[3][1] = 0.0;
+        grad_p[4][1] = 0.0;
+        grad_p[5][1] = r[0]*drdy;
+        grad_p[6][1] = 0.0;
+        grad_p[7][1] = 2.0*r[1]*drdy;
+        grad_p[8][1] = r[2]*drdy;
+        grad_p[9][1] = 0.0;
+
+        // for high-order will use (x^a y^b z^c)
+
+        grad_p[0][2] = 0.0;
+        grad_p[1][2] = 0.0;
+        grad_p[2][2] = 0.0;
+        grad_p[3][2] = drdz;
+        grad_p[4][2] = 0.0;
+        grad_p[5][2] = 0.0;
+        grad_p[6][2] = r[0]*drdz;
+        grad_p[7][2] = 0.0;
+        grad_p[8][2] = r[1]*drdz;
+        grad_p[9][2] = 2.0*r[2]*drdz;
+
+        // for high-order will use (x^a y^b z^c)
+
+        return;
+    } // end function
+#else
+    // Polynomial basis up to quadratic in 3D (10 terms)
+    const size_t num_poly_basis = 4;
+    KOKKOS_INLINE_FUNCTION
+    void poly_basis(const double r[3], double *p) {
+
+        p[0] = 1.0;
+        p[1] = r[0];
+        p[2] = r[1];
+        p[3] = r[2];
+
+        return;
+    } // end function
+
+
+    KOKKOS_INLINE_FUNCTION
+    void grad_poly_basis(const double r[3], double (*grad_p)[3], bool derivative_wrt_i) {
+        
+        double drdx = -1.0;  // default is derivative with respect to i
+        double drdy = -1.0;  // default is derivative with respect to i
+        double drdz = -1.0;  // default is derivative with respect to i
+        if(derivative_wrt_i == false){
+            drdx = 1.0;  // derivative with respect to j
+            drdy = 1.0;  // derivative with respect to j
+            drdz = 1.0;  // derivative with respect to j
+        } // end if
+
+        grad_p[0][0] = 0.0;
+        grad_p[1][0] = drdx;
+        grad_p[2][0] = 0.0;
+        grad_p[3][0] = 0.0;
+
+        grad_p[0][1] = 0.0;
+        grad_p[1][1] = 0.0;
+        grad_p[2][1] = drdy;
+        grad_p[3][1] = 0.0;
+
+
+        grad_p[0][2] = 0.0;
+        grad_p[1][2] = 0.0;
+        grad_p[2][2] = 0.0;
+        grad_p[3][2] = drdz;
+
+        return;
+    } // end function
+#endif
+
+
+void calc_basis_functions(
+    size_t point_gid,
+    const DCArrayKokkos <double>& x,
+    const DCArrayKokkos <size_t> points_num_neighbors, 
+    const DRaggedRightArrayKokkos <size_t> points_in_point,
+    const DCArrayKokkos <double>& vol,
+    const CArrayKokkos <double>& p_coeffs,
+    const DRaggedRightArrayKokkos <double>& basis,
+    const double h)
+{
+
+    //---------------------------------------------
+    // walk over the neighboring points 
+    //---------------------------------------------
+
+    FOR_ALL(neighbor_point_lid, 0, points_num_neighbors(point_gid), {
+
+        size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+
+        double p[num_poly_basis];    // array holding polynomial basis [x, y, z, x^2, y^2, ... , yz]
+        double r[3];    // vecx_j - vecx_i
+        r[0] = x(neighbor_point_gid,0) - x(point_gid,0); // x_j-x_i
+        r[1] = x(neighbor_point_gid,1) - x(point_gid,1); // y_j-y_i
+        r[2] = x(neighbor_point_gid,2) - x(point_gid,2); // z_j-z_i
+
+        double W = kernel(r, h);
+        poly_basis(r,p);
+
+        double correction = 0.0;
+        for (size_t a = 0; a < num_poly_basis; ++a){
+            correction += p_coeffs(point_gid,a) * p[a];
+        } // end for a
+
+        basis(point_gid, neighbor_point_lid) = W * correction;
+
+    }); // neighbor_point_lid
+
+    return;
+    
+} // end function
+
+
+// grad_C = -M^-1 * grad_M * C
+// grad_M[a][b][i] = V (P[a] * grad_p[i][b] W + grad_p[i][a] P[b] W + P[a] P[b] grad_W[i])
+// -M^-1[d][k]*grad_M[k][b][i]*C[b] = [d][i]
+// p[d] grad_C[d][i]
+void calc_basis_and_grad_basis_functions(
+    const DCArrayKokkos <double>& x,
+    const DCArrayKokkos <size_t> points_num_neighbors, 
+    const DRaggedRightArrayKokkos <size_t> points_in_point,
+    const DCArrayKokkos <double>& vol,
+    const CArrayKokkos <double>& p_coeffs,
+    const CArrayKokkos <double>& M_inv,
+    const DRaggedRightArrayKokkos <double>& basis,
+    const DRaggedRightArrayKokkos <double>& grad_basis,
+    const double h,
+    const bool derivative_wrt_i)
+{
+
+    // dir 
+
+    // actual number of points
+    size_t num_points = x.dims(0);
+    
+    // loop over all nodes in the problem
+    FOR_ALL(point_gid, 0, num_points, {
+
+        // --------------
+        // Step 1: assemble grad M, the gradient of the moment matrix
+
+        double grad_m[num_poly_basis][num_poly_basis][3];
+
+        for(size_t a=0;a<num_poly_basis;++a){
+            for(size_t b=0;b<num_poly_basis;++b){
+                for(size_t dim=0;dim<3;++dim){
+                    grad_m[a][b][dim] = 0.0;
+                }
+            } // end for b
+        } // end for a
+
+        // walk over the neighboring points 
+        for(size_t neighbor_point_lid=0; neighbor_point_lid<points_num_neighbors(point_gid); neighbor_point_lid++){
+
+            size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+
+            double r[3];    // vecx_j - vecx_i
+            r[0] = x(neighbor_point_gid,0) - x(point_gid,0); // x_j-x_i
+            r[1] = x(neighbor_point_gid,1) - x(point_gid,1); // y_j-y_i
+            r[2] = x(neighbor_point_gid,2) - x(point_gid,2); // z_j-z_i
+
+
+
+            double W = kernel(r,h);
+            //printf("kernel = %f \n", W);
+            double grad_W[3]; 
+            grad_kernel(grad_W,r,h,derivative_wrt_i);
+            //printf("grad kernel = %f, %f, %f \n", grad_W[0], grad_W[1], grad_W[2]);
+
+            double p[num_poly_basis]; 
+            poly_basis(r,p);
+
+            double grad_p[num_poly_basis][3]; 
+            grad_poly_basis(r, grad_p, derivative_wrt_i);
+
+            double Vj = vol(neighbor_point_gid);
+
+            // grad_M = V (P grad_p W + grad_p P W + P P grad_W)
+            for(size_t a=0;a<num_poly_basis;++a){
+                for(size_t b=0;b<num_poly_basis;++b){
+                    for(size_t dim=0;dim<3;++dim){
+                        grad_m[a][b][dim] += Vj * ( p[a]*grad_p[b][dim]*W + grad_p[a][dim]*p[b]*W + p[a]*p[b]*grad_W[dim] );
+                    }
+                } // end for b
+            } // end for a
+
+        } // end for loop over neighboring points
+
+
+        // -----------
+        // Step 2: calculate basis and grad basis
+
+        // walk over the neighboring points 
+        for(size_t neighbor_point_lid=0; neighbor_point_lid<points_num_neighbors(point_gid); neighbor_point_lid++){
+
+            size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+
+            double r[3];    // vecx_j - vecx_i
+            r[0] = x(neighbor_point_gid,0) - x(point_gid,0); // x_j-x_i
+            r[1] = x(neighbor_point_gid,1) - x(point_gid,1); // y_j-y_i
+            r[2] = x(neighbor_point_gid,2) - x(point_gid,2); // z_j-z_i
+
+            double W = kernel(r, h);
+
+            double p[num_poly_basis];    // array holding polynomial basis [x, y, z, x^2, y^2, ... , yz]
+            poly_basis(r,p);
+            
+            double grad_W[3];
+            grad_kernel(grad_W, r, h, derivative_wrt_i);
+
+            double grad_p[num_poly_basis][3]; // matrix holding grad polynomial basis
+            grad_poly_basis(r, grad_p, derivative_wrt_i);
+
+            // 
+            double correction = 0.0;
+            for (size_t a = 0; a < num_poly_basis; ++a){
+                correction += p_coeffs(point_gid,a) * p[a];
+            } // end for a
+
+            basis(point_gid, neighbor_point_lid) = W * correction;
+
+
+            // --- gradient contributions ---
+
+            // term from grad poly
+            double term[3]; 
+            for(size_t dim=0;dim<3;++dim){
+                term[dim] = 0.0;
+            }
+
+            // calc the grad of poly term
+            for(size_t dim=0;dim<3;++dim){
+                for (size_t a = 0; a < num_poly_basis; ++a){
+                    term[dim] += grad_p[a][dim] * p_coeffs(point_gid,a);
+                } // end for a
+            } // end for dim
+
+            // saving the grad poly term plus the grad kernel term to the grad basis
+            for(size_t dim=0;dim<3;++dim){
+                grad_basis(point_gid,neighbor_point_lid,dim) = term[dim]*W + correction*grad_W[dim];
+            }
+
+            // --- gradient of correction coefficients (grad_C) ---
+
+            // the last contirubtion to grad_basis is from grad_C
+            // sum_a p[a]*W*grad_C[i][a]
+            double grad_C[num_poly_basis][3];
+
+            
+            for (size_t a = 0; a < num_poly_basis; ++a){
+                for(size_t dim=0;dim<3;++dim){
+                    grad_C[a][dim]= 0.0;
+                }
+            } // end for 1
+            
+
+            // grad_C = -M^-1 * grad_M * C
+            for (size_t d = 0; d < num_poly_basis; ++d) {
+                for (size_t b = 0; b < num_poly_basis; ++b) {
+                    double Cb = p_coeffs(point_gid,b);
+                    for (size_t k = 0; k < num_poly_basis; ++k) {
+                        for(size_t dim=0;dim<3;++dim){
+                            grad_C[d][dim] -= M_inv(point_gid,d,k) * grad_m[k][b][dim] * Cb;
+                        } // end dim
+                    } // end for k
+                } // end for b
+            } // end for d
+
+
+            // adding grad_C[i][d]* p[d]*W to the grad basis function
+            for (size_t d = 0; d < num_poly_basis; ++d){
+                for(size_t dim=0;dim<3;++dim){
+                    grad_basis(point_gid,neighbor_point_lid,dim) += grad_C[d][dim]*p[d]*W;
+                } // end dim
+            } // end for d
+
+
+        } // end for over neighbor_point_lid
+
+    }); // end parallel loop over all points 
+
+    return;
+    
+} // end function
+
+
+
+// Build reproducing kernel poly coefficients for all particles in the domain
+void calc_p_coefficients(
+    const DCArrayKokkos <double>& x,
+    const DCArrayKokkos <size_t> points_num_neighbors, 
+    const DRaggedRightArrayKokkos <size_t> points_in_point,
+    const DCArrayKokkos <double>& vol,
+    const CArrayKokkos <double>& p_coeffs,
+    const CArrayKokkos <double>& M_inv,
+    double h)
+{
+
+    // actual number of points
+    size_t num_points = x.dims(0);
+
+    
+    // loop over all nodes in the problem
+    FOR_ALL(point_gid, 0, num_points, {
+
+        double M_1D[num_poly_basis*num_poly_basis]; 
+        ViewCArrayKokkos <double> M(&M_1D[0], num_poly_basis, num_poly_basis);
+        M.set_values(0.0);
+
+        // values in rhs after this function will be accessed as p_coeffs(i,0:N)
+        ViewCArrayKokkos <double> rhs (&p_coeffs(point_gid,0), num_poly_basis);
+        rhs.set_values(0.0);
+        rhs(0) = 1.0;   // enforce reproduction of constant 1, everything else is = 0
+
+        double p[num_poly_basis];    // array holding polynomial basis [x, y, z, x^2, y^2, ... , yz]
+        double r[3];    // vecx_j - vecx_i
+
+        //---------------------------------------------
+        // walk over the neighboring points
+        //---------------------------------------------
+
+        for (size_t neighbor_point_lid=0; neighbor_point_lid<points_num_neighbors(point_gid); neighbor_point_lid++){
+
+            size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+
+            r[0] = x(neighbor_point_gid,0) - x(point_gid,0); // x_j-x_i
+            r[1] = x(neighbor_point_gid,1) - x(point_gid,1); // y_j-y_i
+            r[2] = x(neighbor_point_gid,2) - x(point_gid,2); // z_j-z_i
+
+            double W = kernel(r, h);
+            poly_basis(r,p);
+
+            // assemble matrix
+
+            for (size_t a = 0; a < num_poly_basis; ++a) {
+                for (size_t b = 0; b < num_poly_basis; ++b) {
+                    M(a,b) += vol(neighbor_point_gid) * W * p[a] * p[b]; 
+                } // end for b
+            } // for a
+
+        } // neighbor_point_lid
+
+    
+        // -------------
+        // solve Ax=B
+        // -------------
+
+        size_t perm_1D[num_poly_basis];
+        ViewCArrayKokkos <size_t> perm (&perm_1D[0], num_poly_basis);
+        for (size_t a = 0; a < num_poly_basis; ++a) {
+            perm(a)= 0;
+        } // end a
+
+        double vv_1D[num_poly_basis];
+        ViewCArrayKokkos <double> vv(&vv_1D[0], num_poly_basis);
+        
+        // used for LU problem
+        int singular = 0;
+        int parity = 0;
+        singular = LU_decompose(M, perm, vv, parity);  // M is returned as the LU matrix  
+        if(singular==0){
+            printf("ERROR: matrix is singluar \n");
+        }
+
+
+        // --------------------------------------------------
+        // things needed for gradient of the basis function
+        double col_1D[num_poly_basis];
+        ViewCArrayKokkos <double> col(&col_1D[0], num_poly_basis);
+
+        // making a view, inverting only the matrix at point i
+        ViewCArrayKokkos <double> M_inv_point(&M_inv(point_gid,0,0), num_poly_basis,num_poly_basis);
+
+        LU_invert(M,        // input matrix
+                  perm,     // permutations
+                  M_inv_point, // inverse matrix at point gid
+                  col);     // tmp array
+        // -------------------------------------------------
+        
+        // solve for p_coefs
+        LU_backsub(M, perm, rhs);  // note: answer is sent back in rhs
+
+    }); // end parallel loop
+
+
+    return; 
+} // end function
+
+
+
+
+
+int main(int argc, char *argv[])
+{
+    Kokkos::initialize(argc, argv);
+    {  
+
+        printf("Pointcloud Reproducing Kernels \n\n");
+
+
+        // define a point cloud
+        DCArrayKokkos <double> point_positions(num_points, 3, "point_positions");
+        DCArrayKokkos <double> point_values(num_points, "point_values"); 
+
+        DCArrayKokkos <double> vol(num_points);
+        vol.set_values(0.0);
+
+        // point locations
+        if(RAND_CLOUD){
+            srand(static_cast<unsigned int>(time(0))); // Seed the random number generator
+                for(size_t i=0; i<num_points; i++){
+                    point_positions.host(i, 0) = X0 + LX*static_cast<double>(rand())/static_cast<double>(RAND_MAX);
+                    point_positions.host(i, 1) = Y0 + LY*static_cast<double>(rand())/static_cast<double>(RAND_MAX);
+                    point_positions.host(i, 2) = Z0 + LZ*static_cast<double>(rand())/static_cast<double>(RAND_MAX);
+                }
+                vol.set_values(1.0);
+            }
+        else {
+
+            double dx = (LX-X0)/((double)num_1d_x - 1);
+            double dy = (LY-Y0)/((double)num_1d_y - 1);
+            double dz = (LZ-Z0)/((double)num_1d_z - 1);
+
+            size_t point_gid = 0;  
+            for(size_t k=0; k<num_1d_z; k++){
+                for(size_t j=0; j<num_1d_y; j++){
+                    for(size_t i=0; i<num_1d_x; i++){
+                        point_positions.host(point_gid, 0) = X0 + static_cast<double>(i)*dx;
+                        point_positions.host(point_gid, 1) = Y0 + static_cast<double>(j)*dy;
+                        point_positions.host(point_gid, 2) = Z0 + static_cast<double>(k)*dz;
+                        point_gid++;
+                    } // end i
+                } // end j
+            } // end k
+
+
+
+            const double elem_dx = LX/((double)num_1d_x);
+            const double elem_dy = LY/((double)num_1d_y);
+            const double elem_dz = LZ/((double)num_1d_z); 
+            const double elem_vol = elem_dx*elem_dy*elem_dz;
+
+            const size_t num_cells_1d_x = num_1d_x-1;
+            const size_t num_cells_1d_y = num_1d_y-1;
+            const size_t num_cells_1d_z = num_1d_z-1;
+
+            FOR_ALL(k,0,num_cells_1d_z,
+                    j,0,num_cells_1d_y,
+                    i,0,num_cells_1d_x,{
+
+                for (int kcount=k; kcount<=k+1; kcount++){
+                    for (int jcount=j; jcount<=j+1; jcount++){
+                        for (int icount=i; icount<=i+1; icount++){
+                            size_t point_gid = get_gid(icount, jcount, kcount, num_1d_x, num_1d_y);
+                            Kokkos::atomic_add(&vol(point_gid), elem_vol*0.25);
+                        } // end i
+                    } // end j
+                } // end k
+                        
+            }); // end parallel over k,j,i 
+            
+
+        } // end if
+        vol.update_host();
+        point_positions.update_device();
+        Kokkos::fence();
+
+        // point values
+        FOR_ALL(i, 0, num_points, {
+
+            //printf("point location at i=%d is (%f, %f, %f) \n", i, point_positions(i, 0), point_positions(i, 1), point_positions(i, 2));
+            point_values(i) = sqrt(point_positions(i, 0)*point_positions(i, 0) + 
+                                   point_positions(i, 1)*point_positions(i, 1) +
+                                   point_positions(i, 2)*point_positions(i, 2));
+
+        }); // end parallel for tri's in the file
+        point_values.update_host();
+        Kokkos::fence();
+        //printf("\n");
+
+
+        // ----------------------------
+        // Make bins here
+        // ----------------------------
+
+        printf("making bins \n");
+        
+        // the number of nodes in the mesh
+        size_t num_bins_x = (size_t)( round( (LX - X0)/bin_dx) + 1 );  
+        size_t num_bins_y = (size_t)( round( (LY - Y0)/bin_dy) + 1 );  
+        size_t num_bins_z = (size_t)( round( (LZ - Z0)/bin_dz) + 1 );  
+        //  bin_dx = (LX-X0)/(num_bins_x - 1);
+        //  bin_dy = (LY-Y0)/(num_bins_y - 1);
+        //  bin_dz = (LZ-Z0)/(num_bins_z - 1);
+
+
+        size_t num_bins = num_bins_x*num_bins_y*num_bins_z;
+        printf("num bins x=%zu, y=%zu, z=%zu \n", num_bins_x, num_bins_y, num_bins_z);
+
+        // bins and their connectivity to each other and points
+        DCArrayKokkos <bin_keys_t> keys_in_bin(num_bins, "keys_in_bin"); // mapping from gid to (i,j,k)
+        DCArrayKokkos <size_t> num_points_in_bin(num_bins, "num_bins");
+        num_points_in_bin.set_values(0);
+        DRaggedRightArrayKokkos <size_t> points_in_bin; // allocated later
+        
+
+        // connectivity from points to bins
+        DCArrayKokkos <size_t> points_bin_gid(num_points, "points_in_gid");
+        CArrayKokkos <size_t>  points_bin_lid_storage(num_points, "bin_lid_storage");  // only used to create storage
+        DCArrayKokkos <int> points_bin_stencil(num_points, 6, "bin_stencil");   // how imin,imax,jmin,jmax,kmin,kmax range for bins in stencil
+        DCArrayKokkos <size_t> points_num_neighbors(num_points, "num_neighbors");
+        
+        printf("Starting timers \n\n");
+
+
+        // start timer
+        auto time_1 = std::chrono::high_resolution_clock::now();
+
+        // build reverse mapping between gid and i,j,k
+        FOR_ALL(i, 0, num_bins_x,
+                j, 0, num_bins_y,
+                k, 0, num_bins_z, {
+            
+
+            // get bin gid for this i,j,k
+            size_t bin_gid = get_gid(i, j, k, num_bins_x, num_bins_y);
+
+            // the i,j,k for this bin
+            bin_keys_t bin_keys;
+            bin_keys.i = i;
+            bin_keys.j = j;
+            bin_keys.k = k;
+
+            // save mapping from bin_gid to bin_keys i,j,k
+            keys_in_bin(bin_gid) = bin_keys;
+
+        });
+        Kokkos::fence();
+        keys_in_bin.update_host();
+
+        // end timer
+        auto time_2 = std::chrono::high_resolution_clock::now();
+
+
+        // -------------------------------------------------------------------
+        // below here, these routine must be called every time particles move
+        // -------------------------------------------------------------------
+
+        // start timer
+        auto time_3 = std::chrono::high_resolution_clock::now();
+
+        printf("building neighbor point list \n");
+
+        // save bin id to points
+        FOR_ALL(point_gid, 0, num_points, {
+
+            // get the 1D index for this bin
+            size_t bin_gid = get_bin_gid(point_positions(point_gid,0), 
+                                         point_positions(point_gid,1), 
+                                         point_positions(point_gid,2),
+                                         num_bins_x, 
+                                         num_bins_y,
+                                         num_bins_z);
+
+            size_t storage_lid = Kokkos::atomic_fetch_add(&num_points_in_bin(bin_gid), 1);
+            points_bin_gid(point_gid) = bin_gid; // the id of the bin
+            points_bin_lid_storage(point_gid) = storage_lid; // the storage place in the bin
+
+        }); // end for all
+        Kokkos::fence();
+        points_bin_gid.update_host();
+        num_points_in_bin.update_host();
+
+
+        // allocate points in bin connectivity
+        points_in_bin = DRaggedRightArrayKokkos <size_t> (num_points_in_bin, "num_points_in_bin");
+
+        // save points in bin
+        FOR_ALL(point_gid, 0, num_points, {
+
+            // get bin gid
+            size_t bin_gid = points_bin_gid(point_gid);
+
+            // get it's storage location in the ragged right compressed storage
+            size_t storage_lid = points_bin_lid_storage(point_gid);
+
+            // save the point to this bin
+            points_in_bin(bin_gid, storage_lid) = point_gid;
+
+        }); // end for all
+
+
+        // for(size_t bin_gid=0; bin_gid<num_bins; bin_gid++){
+        //     if(num_points_in_bin.host(bin_gid) > 0){
+        //         size_t i = keys_in_bin(bin_gid).i;
+        //         size_t j = keys_in_bin(bin_gid).j; 
+        //         size_t k = keys_in_bin(bin_gid).k;
+
+        //         double bin_x = ((double)i)*bin_dx;
+        //         double bin_y = ((double)j)*bin_dy;
+        //         double bin_z = ((double)k)*bin_dz;
+        //         //printf("num points in bin = %zu, bin keys = (%zu, %zu, %zu), bin x = (%f, %f, %f) \n", 
+        //         //    num_points_in_bin.host(bin_gid), i, j, k, bin_x, bin_y, bin_z);
+        //     }
+        // } // end for
+
+
+
+        // ------------------------------------------------
+        // Find the neighbors around each point using bins
+        // ------------------------------------------------
+        
+        FOR_ALL(point_gid, 0, num_points, {
+
+            // get bin gid
+            size_t bin_gid = points_bin_gid(point_gid);
+            
+            // get i,j,k for this bin
+            bin_keys_t bin_keys = keys_in_bin(bin_gid);
+            // printf(" keys = %zu, %zu, %zu, bin size = %zu, %zu, %zu \n", 
+            //     bin_keys.i, bin_keys.j, bin_keys.k,
+            //     num_bins_x, num_bins_y, num_bins_z);
+
+            // loop over neighboring bins
+            size_t num_points_found;
+
+            // establish the stencil size to get enough particles
+            for(int stencil=1; stencil<100000; stencil++){
+
+                num_points_found = 0;
+
+                const int i = bin_keys.i;
+                const int j = bin_keys.j;
+                const int k = bin_keys.k;
+
+                const int imin = MAX(0, i-stencil);
+                const int imax = MIN(num_bins_x-1, i+stencil);
+
+                const int jmin = MAX(0, j-stencil);
+                const int jmax = MIN(num_bins_y-1, j+stencil);
+
+                const int kmin = MAX(0, k-stencil);
+                const int kmax = MIN(num_bins_z-1, k+stencil);
+                    
+                for (int kcount=kmin; kcount<=kmax; kcount++){
+                    for (int jcount=jmin; jcount<=jmax; jcount++) {
+                        for (int icount=imin; icount<=imax; icount++){
+
+                            // get bin neighbor gid 
+                            size_t neighbor_bin_gid = get_gid(icount, jcount, kcount, num_bins_x, num_bins_y);
+                            num_points_found += num_points_in_bin(neighbor_bin_gid);
+
+                        } // end for kcount
+                    } // end for jcount
+                } // end for icount
+
+                // the min number of points required to solve the system is num_poly_basis+1, was 2*num_poly_basis
+                if (num_points_found >= num_points_fit  || num_points_found==num_points){
+
+                    const double x_pt_middle = bin_dx*((double)i) + X0; 
+                    const double y_pt_middle = bin_dy*((double)j) + Y0; 
+                    const double z_pt_middle = bin_dz*((double)k) + Z0; 
+
+                    const double x_pt_minus = bin_dx*((double)imin) + X0; 
+                    const double y_pt_minus = bin_dy*((double)jmin) + Y0; 
+                    const double z_pt_minus = bin_dz*((double)kmin) + Z0; 
+                    
+                    const double x_pt_plus = bin_dx*((double)imax) + X0; 
+                    const double y_pt_plus = bin_dy*((double)jmax) + Y0; 
+                    const double z_pt_plus = bin_dz*((double)kmax) + Z0; 
+
+                    const double dist_minus = sqrt( (x_pt_minus - x_pt_middle)*(x_pt_minus - x_pt_middle) +
+                                                    (y_pt_minus - y_pt_middle)*(y_pt_minus - y_pt_middle) +
+                                                    (z_pt_minus - z_pt_middle)*(z_pt_minus - z_pt_middle) );
+
+                    const double dist_plus = sqrt( (x_pt_plus - x_pt_middle)*(x_pt_plus - x_pt_middle) +
+                                                   (y_pt_plus - y_pt_middle)*(y_pt_plus - y_pt_middle) +
+                                                   (z_pt_plus - z_pt_middle)*(z_pt_plus - z_pt_middle) );
+
+                    //printf("h = %f, dist_m = %f, dist_p = %f, num_points=%zu, imin = %d, imax = %d, jmin = %d,  jmax = %d, kmin = %d,  kmax = %d, \n", 
+                    //         h_kernel, dist_minus, dist_plus, num_points_found, imin, imax, jmin, jmax, kmin, kmax);
+
+                    // only exit when we exceed kernel distance
+                    if (dist_minus >= h_kernel || dist_plus >= h_kernel || num_points_found==num_points){
+
+                        //printf("exiting \n\n");
+
+                        points_bin_stencil(point_gid,0) = imin;
+                        points_bin_stencil(point_gid,1) = imax;
+                        points_bin_stencil(point_gid,2) = jmin;
+                        points_bin_stencil(point_gid,3) = jmax;
+                        points_bin_stencil(point_gid,4) = kmin;
+                        points_bin_stencil(point_gid,5) = kmax;
+
+                        points_num_neighbors(point_gid) = num_points_found; // including node_i in the list of neighbors
+                        points_num_neighbors(point_gid) = num_points_found - 1; // the -1 is because we counted point i as a neighbor
+
+                        break;
+                    }
+                    // else increase stencil size
+
+
+                } // end of check
+                
+            } // end for stencil
+
+
+        }); // end for all
+        Kokkos::fence();
+        points_bin_stencil.update_host();
+
+
+
+        // account for stencels not overlapping, fixing assymetry in points connectivity
+        FOR_ALL(point_gid, 0, num_points, {
+
+            // get bin gid for this point
+            size_t bin_gid = points_bin_gid(point_gid);
+                    
+            // get i,j,k for this bin
+            bin_keys_t bin_keys = keys_in_bin(bin_gid);
+
+            const int i = bin_keys.i;
+            const int j = bin_keys.j;
+            const int k = bin_keys.k;
+
+            // walk over the stencil to get neighbors of this bin
+            const int imin = points_bin_stencil(point_gid,0);
+            const int imax = points_bin_stencil(point_gid,1);
+            const int jmin = points_bin_stencil(point_gid,2);
+            const int jmax = points_bin_stencil(point_gid,3);
+            const int kmin = points_bin_stencil(point_gid,4);
+            const int kmax = points_bin_stencil(point_gid,5);
+
+            // loop over my bin stencil
+            for (int kcount=kmin; kcount<=kmax; kcount++){
+                for (int jcount=jmin; jcount<=jmax; jcount++) {
+                    for (int icount=imin; icount<=imax; icount++){
+
+                        // get bin neighbor gid 
+                        size_t neighbor_bin_gid = get_gid(icount, jcount, kcount, num_bins_x, num_bins_y);
+
+                        // save the points in this bin
+                        for(size_t neighbor_pt_lid=0; neighbor_pt_lid<num_points_in_bin(neighbor_bin_gid); neighbor_pt_lid++){
+
+                            size_t neighbor_point_gid = points_in_bin(neighbor_bin_gid, neighbor_pt_lid);
+
+                            // check if the point-point pairs have identical, overlapping stencils, if not, increment the number of neighbors
+                            const int neighbor_imin = points_bin_stencil(neighbor_point_gid,0);
+                            const int neighbor_imax = points_bin_stencil(neighbor_point_gid,1);
+                            const int neighbor_jmin = points_bin_stencil(neighbor_point_gid,2);
+                            const int neighbor_jmax = points_bin_stencil(neighbor_point_gid,3);
+                            const int neighbor_kmin = points_bin_stencil(neighbor_point_gid,4);
+                            const int neighbor_kmax = points_bin_stencil(neighbor_point_gid,5);
+                            
+                            // i,j,k is the bin where point_gid lives
+                            bool inside =
+                                (i >= neighbor_imin && i <= neighbor_imax) &&
+                                (j >= neighbor_jmin && j <= neighbor_jmax) &&
+                                (k >= neighbor_kmin && k <= neighbor_kmax);
+
+                            if(!inside){
+                                Kokkos::atomic_increment(&points_num_neighbors(neighbor_point_gid)); 
+                                // the other stencil didn't see my point because it was smaller, now it does see it
+                            }
+
+                        } // neighbor_point_lid
+
+                    } // end for kcount
+                } // end for jcount
+            } // end for icount        
+
+        }); // end for all
+        Kokkos::fence();
+        points_num_neighbors.update_host();
+        
+        // allocate memory for points in point
+        DRaggedRightArrayKokkos <size_t> points_in_point(points_num_neighbors, "points_in_point");
+        points_num_neighbors.set_values(0);  // this is a num saved counter now
+
+        // ---------------------
+        // Save the neighbors
+        // ---------------------
+
+        // find neighbors using bins
+        FOR_ALL(point_gid, 0, num_points, {
+
+            // get bin gid for this point
+            size_t bin_gid = points_bin_gid(point_gid);
+                    
+            // get i,j,k for this bin
+            bin_keys_t bin_keys = keys_in_bin(bin_gid);
+
+            const int i = bin_keys.i;
+            const int j = bin_keys.j;
+            const int k = bin_keys.k;
+
+            // walk over the stencil to get neighbors
+            int imin = points_bin_stencil(point_gid,0);
+            int imax = points_bin_stencil(point_gid,1);
+            int jmin = points_bin_stencil(point_gid,2);
+            int jmax = points_bin_stencil(point_gid,3);
+            int kmin = points_bin_stencil(point_gid,4);
+            int kmax = points_bin_stencil(point_gid,5);
+
+
+            for (int kcount=kmin; kcount<=kmax; kcount++){
+                for (int jcount=jmin; jcount<=jmax; jcount++) {
+                    for (int icount=imin; icount<=imax; icount++){
+
+                        // get bin neighbor gid 
+                        size_t neighbor_bin_gid = get_gid(icount, jcount, kcount, num_bins_x, num_bins_y);
+
+                        // save the points in this bin
+                        for(size_t neighbor_pt_lid=0; neighbor_pt_lid<num_points_in_bin(neighbor_bin_gid); neighbor_pt_lid++){
+
+                            size_t neighbor_point_gid = points_in_bin(neighbor_bin_gid, neighbor_pt_lid);
+                            
+                            // make sure its a neighbor
+                            if(neighbor_point_gid != point_gid){
+
+                                // save the neighbor
+                                size_t num_saved = Kokkos::atomic_fetch_add(&points_num_neighbors(point_gid), 1);
+                                points_in_point(point_gid, num_saved) = neighbor_point_gid;
+                                
+                                
+                                // if point j's stencil did not see point i, then save i to j's list
+                                const int neighbor_imin = points_bin_stencil(neighbor_point_gid,0);
+                                const int neighbor_imax = points_bin_stencil(neighbor_point_gid,1);
+                                const int neighbor_jmin = points_bin_stencil(neighbor_point_gid,2);
+                                const int neighbor_jmax = points_bin_stencil(neighbor_point_gid,3);
+                                const int neighbor_kmin = points_bin_stencil(neighbor_point_gid,4);
+                                const int neighbor_kmax = points_bin_stencil(neighbor_point_gid,5);
+
+                                // i,j,k is the bin where point_gid lives
+                                bool inside =
+                                    (i >= neighbor_imin && i <= neighbor_imax) &&
+                                    (j >= neighbor_jmin && j <= neighbor_jmax) &&
+                                    (k >= neighbor_kmin && k <= neighbor_kmax);
+
+                                if(!inside){
+
+                                    size_t num_saved_neighbor = Kokkos::atomic_fetch_add(&points_num_neighbors(neighbor_point_gid), 1);
+                                    points_in_point(neighbor_point_gid, num_saved_neighbor) = point_gid;
+                                    // the other stencil didn't see my point because it was smaller, now it does see it
+
+                                } // end if
+
+                            } // end if neighbor != point_gid
+
+                        } // neighbor_point_lid
+
+                    } // end for kcount
+                } // end for jcount
+            } // end for icount        
+
+        }); // end for all
+        Kokkos::fence();
+        points_in_point.update_host();
+
+
+        // build the reverse map
+        DRaggedRightArrayKokkos <size_t> reverse_neighbor_lid(points_num_neighbors); 
+
+        FOR_ALL(point_gid, 0, num_points, {
+                
+            for(int neighbor_point_lid = 0; neighbor_point_lid<points_num_neighbors(point_gid); neighbor_point_lid++){
+                
+                // get the point gid for this neighbor
+                int neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                
+                // loop over the neighbors of my neighbor
+                size_t found = 0;
+                for(int j_lid = 0; j_lid<points_num_neighbors(neighbor_point_gid); j_lid++){
+
+                    // get the neighboring point gid of my neighbor
+                    int j_point_gid = points_in_point(neighbor_point_gid, j_lid);
+                    if (point_gid == j_point_gid){
+                        reverse_neighbor_lid(point_gid, neighbor_point_lid) = j_lid;
+                        found = 1;
+                        //printf("found \n");
+                        break;
+                    }
+                } // end loop over j's neighboring points
+                if(found==0)printf("reverse map for i=%d and j=%d pair not found \n", point_gid, neighbor_point_gid);
+            } // end loop over i's neighboring points
+                
+        });
+
+
+        // end timer
+        auto time_4 = std::chrono::high_resolution_clock::now();
+
+        printf("done building neighbor point list \n");
+
+        // ----------------------------------------
+        // Find basis that reconstructs polynomial 
+        // ----------------------------------------
+
+        printf("Reconstructing basis using point cloud data \n\n");
+
+        auto time_5 = std::chrono::high_resolution_clock::now();
+
+        CArrayKokkos <double> p_coeffs(num_points, num_poly_basis); // reproducing kernel coefficients at each point
+        
+
+
+        CArrayKokkos <double> M_inv(num_points, num_poly_basis, num_poly_basis);
+        CArrayKokkos <double> grad_M(num_points, num_poly_basis, num_poly_basis);
+        
+        DRaggedRightArrayKokkos <double> basis(points_num_neighbors);        // reproducing kernel basis (num_points, num_neighbors)
+        DRaggedRightArrayKokkos <double> grad_basis_i(points_num_neighbors,3); // grad kernel basis j with respect to i (num_points, num_neighbors)
+        DRaggedRightArrayKokkos <double> grad_basis_j(points_num_neighbors,3); // grad kernel basis i with respect to j (num_points, num_neighbors)
+        
+
+
+        double h = h_kernel; // kernel width
+
+
+        printf("building reproducing kernel coefficients \n");
+
+        // build coefficients on basis functions
+        calc_p_coefficients(point_positions, 
+                            points_num_neighbors, 
+                            points_in_point, 
+                            vol, 
+                            p_coeffs, 
+                            M_inv,
+                            h);
+        
+        
+        calc_basis_and_grad_basis_functions(
+                                    point_positions,
+                                    points_num_neighbors, 
+                                    points_in_point,
+                                    vol,
+                                    p_coeffs,
+                                    M_inv,
+                                    basis,
+                                    grad_basis_i,
+                                    h,
+                                    true);
+        
+        calc_basis_and_grad_basis_functions(
+                                    point_positions,
+                                    points_num_neighbors, 
+                                    points_in_point,
+                                    vol,
+                                    p_coeffs,
+                                    M_inv,
+                                    basis,
+                                    grad_basis_j,
+                                    h,
+                                    false);
+
+        // end timer
+        auto time_6 = std::chrono::high_resolution_clock::now();
+
+
+        // -----------------
+        //  Timers
+        // -----------------
+        printf("\n");
+        std::chrono::duration <double, std::milli> ms = time_2 - time_1;
+        std::cout << "runtime to create bins = " << ms.count() << "ms\n\n";
+
+        ms = time_4 - time_3;
+        std::cout << "runtime to find and save neighbors = " << ms.count() << "ms\n\n";
+
+        ms = time_6 - time_5;
+        std::cout << "runtime to calculate basis and grad basis = " << ms.count() << "ms\n\n";
+
+
+
+        printf("Checking gradients at points \n\n");
+
+        // performing checks on p_coeffs, basis, and grad_basis
+        double partion_unity;
+        double partion_unity_lcl;
+
+        double linear_preserving;
+        double linear_preserving_lcl;
+
+        double quadratic_preserving;
+        double quadratic_preserving_lcl;
+
+        double grad_x_p0; 
+        double grad_x_p0_lcl; 
+        double grad_y_p0; 
+        double grad_y_p0_lcl; 
+        double grad_z_p0; 
+        double grad_z_p0_lcl; 
+
+        double grad_x_p1; 
+        double grad_x_p1_lcl; 
+        double grad_y_p1; 
+        double grad_y_p1_lcl; 
+        double grad_z_p1; 
+        double grad_z_p1_lcl; 
+
+        double grad_x_p2; 
+        double grad_x_p2_lcl; 
+        double grad_y_p2; 
+        double grad_y_p2_lcl; 
+        double grad_z_p2; 
+        double grad_z_p2_lcl; 
+
+        // loop over the particles in the domain
+        for(size_t point_gid=0; point_gid<num_points; point_gid++){
+
+            // partition of unity
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), partion_unity_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                partion_unity_lcl += basis(point_gid,neighbor_point_lid)*vol(neighbor_point_gid);
+            }, partion_unity);
+            
+
+            // linear reproducing
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), linear_preserving_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                linear_preserving_lcl += basis(point_gid,neighbor_point_lid)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,0);
+            }, linear_preserving);
+
+
+            // quadratic reproducing
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), quadratic_preserving_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                quadratic_preserving_lcl += basis(point_gid,neighbor_point_lid)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,0)*point_positions(neighbor_point_gid,0);
+            }, quadratic_preserving);
+
+            if(fabs(partion_unity-1.0)>1e-13)
+                printf("partition unity = %f, ", partion_unity);
+
+            if(fabs(linear_preserving-point_positions(point_gid,0))>1e-13)
+                printf("linear fcn error = %f, ", fabs(linear_preserving-point_positions(point_gid,0)));
+
+        #if defined(P2)
+            if(fabs(quadratic_preserving-point_positions(point_gid,0)*point_positions(point_gid,0))>1e-13)
+                printf("quadratic fcn error = %f at i=%zu \n", fabs(quadratic_preserving-point_positions(point_gid,0)*point_positions(point_gid,0)), point_gid);
+        #endif
+
+            // -----------------
+            // gradient checks
+            // -----------------
+
+            // Sum(grad) = [0]; 
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_x_p0_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_x_p0_lcl += grad_basis_i(point_gid,neighbor_point_lid,0)*vol(neighbor_point_gid);
+            }, grad_x_p0);
+
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_y_p0_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_y_p0_lcl += grad_basis_i(point_gid,neighbor_point_lid,1)*vol(neighbor_point_gid);
+            }, grad_y_p0);
+
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_z_p0_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_z_p0_lcl += grad_basis_i(point_gid,neighbor_point_lid,2)*vol(neighbor_point_gid);
+            }, grad_z_p0);
+
+            const double grad_check_P0 = fabs(grad_x_p0)+fabs(grad_y_p0)+fabs(grad_z_p0);
+            if(0.333*fabs(grad_check_P0)>1e-8)
+                printf("error in grad(P0) = %f, %f, %f, \n", grad_x_p0, grad_y_p0, grad_z_p0);
+
+
+            // Sum(grad(P1)) = [1]; 
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_x_p1_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_x_p1_lcl += grad_basis_i(point_gid,neighbor_point_lid,0)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,0);
+            }, grad_x_p1);
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_y_p1_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_y_p1_lcl += grad_basis_i(point_gid,neighbor_point_lid,1)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,1);
+            }, grad_y_p1);
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_z_p1_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_z_p1_lcl += grad_basis_i(point_gid,neighbor_point_lid,2)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,2);
+            }, grad_z_p1);
+
+            const double grad_check_P1 = fabs(grad_x_p1 - 1.0)+fabs(grad_y_p1 - 1.0)+fabs(grad_z_p1 - 1.0);
+            if(0.333*fabs(grad_check_P1)>1e-8){
+                printf("error in grad(P1) = %f, %f, %f, \n", 
+                    fabs(grad_x_p1 - 1.0), 
+                    fabs(grad_y_p1 - 1.0), 
+                    fabs(grad_z_p1 - 1.0));
+            }
+
+
+        #if defined(P2)
+            // Sum(grad(P2)) = [2]; 
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_x_p2_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_x_p2_lcl += grad_basis_i(point_gid,neighbor_point_lid,0)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,0)*point_positions(neighbor_point_gid,0);
+            }, grad_x_p2);
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_y_p2_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_y_p2_lcl += grad_basis_i(point_gid,neighbor_point_lid,1)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,1)*point_positions(neighbor_point_gid,1);
+            }, grad_y_p2);
+            FOR_REDUCE_SUM(neighbor_point_lid, 0, points_num_neighbors.host(point_gid), grad_z_p2_lcl, {
+                // get the point gid for this neighboring
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+                grad_z_p2_lcl += grad_basis_i(point_gid,neighbor_point_lid,2)*vol(neighbor_point_gid)*point_positions(neighbor_point_gid,2)*point_positions(neighbor_point_gid,2);
+            }, grad_z_p2);
+
+            const double grad_check_P2 = fabs(grad_x_p2-2.0*point_positions(point_gid,0)) + fabs(grad_y_p2-2.0*point_positions(point_gid,1)) + fabs(grad_z_p2-2.0*point_positions(point_gid,2));
+            if(0.333*fabs(grad_check_P2)>1e-8){
+                printf("error in grad(P2) = %f, %f, %f, \n", 
+                        fabs(grad_x_p2-2.0*point_positions(point_gid,0)), 
+                        fabs(grad_y_p2-2.0*point_positions(point_gid,1)), 
+                        fabs(grad_z_p2-2.0*point_positions(point_gid,2)));
+            }
+        #endif
+
+        } // end for point gid
+
+
+        if(check_maps){
+            size_t bad = 0;
+            for (size_t i=0; i<num_points; ++i) {
+                for (size_t lid=0; lid<points_num_neighbors(i); ++lid) {
+                    size_t j = points_in_point(i, lid);
+                    size_t rev = reverse_neighbor_lid(i, lid);
+                    if (rev == SIZE_MAX) {
+                    printf("MISSING reverse for pair (%zu,%zu)\n", i, j); ++bad;
+                    if (bad>50) break;
+                    } else {
+                    size_t check = points_in_point(j, rev);
+                    if (check != i) {
+                        printf("WRONG reverse: i=%zu j=%zu rev=%zu check=%zu\n", i, j, rev, check);
+                        ++bad;
+                        if (bad>50) break;
+                    }
+                    }
+                }
+                if (bad>50) break;
+            }
+            if (bad==0) printf("reverse map OK\n");
+
+
+            for (size_t i=0; i<num_points; ++i) {
+                std::set<size_t> seen;
+                for (size_t lid=0; lid<points_num_neighbors(i); ++lid) {
+                    size_t j = points_in_point(i, lid);
+                    //if (j == i) printf("SELF neighbor found at i=%zu lid=%zu\n", i, lid);
+                    if (!seen.insert(j).second) printf("DUPLICATE neighbor %zu in list of %zu\n", j, i);
+                }
+            }
+        }
+
+
+        printf("Building anti-symmetric gradient \n\n");
+
+        // -------------------
+        // Anti-sym gradient
+        // -------------------
+        printf("Testing divergence of vector field u = (x, y, z) \n\n");
+
+        DCArrayKokkos <double> u(num_points, 3);
+        FOR_ALL(i, 0, num_points, {
+            u(i, 0) = point_positions(i, 0);        
+            u(i, 1) = point_positions(i, 1);
+            u(i, 2) = point_positions(i, 2);
+        });
+        u.update_device();
+
+        DCArrayKokkos <double> div(num_points);
+        div.set_values(0.0);
+
+        DCArrayKokkos <double> div_fd(num_points);
+        div_fd.set_values(0.0);
+
+        FOR_ALL(i_gid, 0, num_points, {
+
+            for(size_t j_lid = 0; j_lid<points_num_neighbors(i_gid); j_lid++){     
+
+                size_t j_gid = points_in_point(i_gid, j_lid);
+                size_t i_lid = reverse_neighbor_lid(i_gid, j_lid);
+
+                if(i_gid != points_in_point(j_gid, i_lid)){
+                    printf("CHECK: point i = %d, reverse map point i = %zu for j = %zu \n", i_gid, points_in_point(j_gid, i_lid), j_gid);
+                }
+                //printf("map check: edge points (%d,%zu), rev from j = %zu using i_lid = %zu \n", i_gid, j_gid,  points_in_point(j_gid, i_lid), i_lid);
+
+                double g_ij[3];
+                double g_ji[3];
+                double sum[3];
+                for (int dim=0; dim<3; ++dim) {
+                    g_ij[dim] = grad_basis_i(i_gid,j_lid,dim);
+                    g_ji[dim] = grad_basis_j(i_gid,j_lid,dim);
+                    sum[dim] = g_ij[dim] + g_ji[dim];
+                }
+                double norm = sqrt(sum[0]*sum[0] + sum[1]*sum[1] + sum[2]*sum[2]);
+                //printf("errors: norm=%f, g_ij = (%f, %f, %f), g_ji = (%f, %f, %f) \n", norm, g_ij[0], g_ij[1], g_ij[2], g_ji[0], g_ji[1], g_ji[2]);
+
+                // conservative mesh-free FE
+                double contrib = 0.0;
+                for (int dim=0; dim<3; ++dim) {
+                    contrib += 0.5*(g_ij[dim] - g_ji[dim]) * (u(j_gid, dim) + u(i_gid, dim));
+                }
+                div(i_gid) += vol(i_gid) * vol(j_gid) * contrib;
+
+                // finite difference
+                contrib = 0.0;
+                for (int dim=0; dim<3; ++dim) {
+                    contrib += g_ij[dim]*u(j_gid, dim)*vol(j_gid);
+                }
+                div_fd(i_gid) += contrib;
+
+            } // end loop over neighbors
+
+            div(i_gid) /= vol(i_gid);
+            // remember: finite difference doesn't have the V_i on the right side, so no division
+        });
+        div.update_host();
+        div_fd.update_host();
+
+
+        for(size_t point_gid=0; point_gid<num_points; point_gid++){
+            double error = fabs(div.host(point_gid) - 3.0);
+            if(error > 1e-8){
+                printf("div(u) = %f at point %zu, error = %g, vol = %f\n", div.host(point_gid), point_gid, error, vol.host(point_gid));
+            }
+        } // end for point_gid
+
+        for(size_t point_gid=0; point_gid<num_points; point_gid++){
+            double error = fabs(div_fd.host(point_gid) - 3.0);
+            if(error > 1e-8){
+                printf("div_fd(u) = %f at point %zu, error = %g\n", div_fd.host(point_gid), point_gid, error);
+            }
+        } // end for point_gid
+
+
+        double conserve_check;
+        double conserve_check_lcl;
+        FOR_REDUCE_SUM(point_gid, 0, num_points, 
+                       conserve_check_lcl, {
+
+            for(size_t neighbor_point_lid = 0; neighbor_point_lid<points_num_neighbors(point_gid); neighbor_point_lid++){
+                
+                // get the point gid for this neighbor
+                size_t neighbor_point_gid = points_in_point(point_gid, neighbor_point_lid);
+
+                // get the local id of my neighbor that matches my point_gid
+                size_t neighbor_lid = reverse_neighbor_lid(point_gid, neighbor_point_lid);
+
+                for (size_t dim=0; dim<3; dim++){
+                    // from point i
+                    conserve_check_lcl += 0.5*(grad_basis_i(point_gid,neighbor_point_lid,dim) - grad_basis_j(point_gid,neighbor_point_lid,dim));
+
+                    // from neighbor point
+                    conserve_check_lcl += 0.5*(grad_basis_i(neighbor_point_gid,neighbor_lid,dim) - grad_basis_j(neighbor_point_gid,neighbor_lid,dim));
+                }
+            }
+
+        }, conserve_check);
+        printf("conservation error = %f \n\n", conserve_check);
+
+
+
+        printf("Testing sin(u) of vector field u = (x, y, z) \n\n");
+
+        FOR_ALL(i, 0, num_points, {
+            u(i, 0) = sin(point_positions(i, 0));        
+            u(i, 1) = sin(point_positions(i, 1));
+            u(i, 2) = sin(point_positions(i, 2));
+        });
+        u.update_device();
+
+        DCArrayKokkos <double> div_sinx(num_points);
+        div_sinx.set_values(0.0);
+
+        DCArrayKokkos <double> div_fd_sinx(num_points);
+        div_fd_sinx.set_values(0.0);
+
+        FOR_ALL(i_gid, 0, num_points, {
+
+            for(size_t j_lid = 0; j_lid<points_num_neighbors(i_gid); j_lid++){     
+
+                size_t j_gid = points_in_point(i_gid, j_lid);
+                size_t i_lid = reverse_neighbor_lid(i_gid, j_lid);
+
+                if(i_gid != points_in_point(j_gid, i_lid)){
+                    printf("CHECK: point i = %d, reverse map point i = %zu for j = %zu \n", i_gid, points_in_point(j_gid, i_lid), j_gid);
+                }
+                //printf("map check: edge points (%d,%zu), rev from j = %zu using i_lid = %zu \n", i_gid, j_gid,  points_in_point(j_gid, i_lid), i_lid);
+
+                double g_ij[3];
+                double g_ji[3];
+                double sum[3];
+                for (int dim=0; dim<3; ++dim) {
+                    g_ij[dim] = grad_basis_i(i_gid,j_lid,dim);
+                    g_ji[dim] = grad_basis_j(i_gid,j_lid,dim);
+                    sum[dim] = g_ij[dim] + g_ji[dim];
+                }
+                double norm = sqrt(sum[0]*sum[0] + sum[1]*sum[1] + sum[2]*sum[2]);
+                //printf("errors: norm=%f, g_ij = (%f, %f, %f), g_ji = (%f, %f, %f) \n", norm, g_ij[0], g_ij[1], g_ij[2], g_ji[0], g_ji[1], g_ji[2]);
+
+                // conservative mesh-free FE
+                double contrib = 0.0;
+                for (int dim=0; dim<3; ++dim) {
+                    contrib += 0.5*(g_ij[dim] - g_ji[dim]) * (u(j_gid, dim) + u(i_gid, dim));
+                }
+                div_sinx(i_gid) += vol(i_gid) * vol(j_gid) * contrib;
+
+                // finite difference
+                contrib = 0.0;
+                for (int dim=0; dim<3; ++dim) {
+                    contrib += g_ij[dim]*u(j_gid, dim)*vol(j_gid);
+                }
+                div_fd_sinx(i_gid) += contrib;
+
+            } // end loop over neighbors
+
+            div_sinx(i_gid) /= vol(i_gid);
+            // remember: finite difference doesn't have the V_i on the right side, so no division
+        });
+        div_sinx.update_host();
+        div_fd_sinx.update_host();
+
+
+
+       
+
+
+
+
+
+
+
+        printf("Writing VTK Graphics File \n\n");
+
+        std::ofstream out("cloud.vtk");
+
+        out << "# vtk DataFile Version 3.0\n";
+        out << "3D point cloud\n";
+        out << "ASCII\n";
+        out << "DATASET POLYDATA\n";
+        out << "POINTS " << num_points << " float\n";
+        for (size_t point_gid = 0; point_gid < num_points; ++point_gid) {
+            out << point_positions.host(point_gid,0) << " " 
+                << point_positions.host(point_gid,1) << " " 
+                << point_positions.host(point_gid,2) << "\n";
+        }
+
+        out << "\nPOINT_DATA " << num_points << "\n";
+        out << "SCALARS error_div(x) float 1\n";
+        out << "LOOKUP_TABLE default\n";
+        for (size_t point_gid = 0; point_gid < num_points; ++point_gid) {
+            out << div.host(point_gid)-3 << "\n";
+        }
+
+        out << "SCALARS error_div(sin(x)) float 1\n";
+        out << "LOOKUP_TABLE default\n";
+        for (size_t point_gid = 0; point_gid < num_points; ++point_gid) {
+            // vec[0] = sin(x), dvec[0]/dx = cos(x)
+            // vec[1] = sin(y), dvec[1]/dy = cos(y)
+            // vec[2] = sin(z), dvec[2]/dz = cos(z)
+            double val0 = cos(point_positions.host(point_gid,0));
+            double val1 = cos(point_positions.host(point_gid,1));
+            double val2 = cos(point_positions.host(point_gid,2));
+            double exact_div = val0 + val1 + val2;
+            out << (div_sinx.host(point_gid) - exact_div) << "\n";
+        }
+
+        printf("Finished \n\n");
+
+
+
+    } // end of kokkos scope
+
+
+    Kokkos::finalize();
+
+
+
+    return 0;
+    
+} // end main
diff --git a/solvers/lu_solver.hpp b/solvers/lu_solver.hpp
index f9b611e8..fe095cc7 100644
--- a/solvers/lu_solver.hpp
+++ b/solvers/lu_solver.hpp
@@ -269,6 +269,226 @@ double LU_determinant(
 
 
 
+/////////// ViewKokkos Versions //////////
+
+
+// the function is run on the GPU
+KOKKOS_FUNCTION
+int LU_decompose(
+    const ViewCArrayKokkos <double> &A, // matrix A passed in and is sent out in LU decomp format
+    const ViewCArrayKokkos <size_t> &perm,  // permutations
+    const ViewCArrayKokkos <double> &vv,
+    int &parity) {                 // parity (+1 or -1)
+                          
+    const int n = A.dims(0);  // size of matrix 
+
+    parity = 1;
+
+    // helper variables
+    double temp;
+    
+    // search for the largest element in each row; save the scaling in the 
+    // temporary array vv and return zero if the matrix is singular 
+    for(size_t i = 0; i < n; i++) {
+        
+        double big = 0.;
+        for(size_t j = 0; j < n; j++){
+            if((temp=fabs(A(i,j))) > big){
+                big=temp;
+            }
+        }
+        
+        if(big == 0.0) return(0);
+        
+        vv(i) = big;
+    }
+
+    // the main loop for the Crout's algorithm
+    for(size_t j = 0; j < n; j++) {
+        
+        // this is the part a) of the algorithm except for i==j 
+        for(size_t i=0;i<j;i++) {
+            
+            double sum=A(i,j);
+            
+            for(size_t k=0;k<i;k++){
+                sum -= A(i,k)*A(k,j);
+            }
+
+            A(i,j) = sum;
+        }
+    
+        // initialize for the search for the largest pivot element
+        double big = 0.;
+        size_t imax = j;
+        
+        
+        // this is the part a) for i==j and part b) for i>j + pivot search 
+        for(size_t i = j; i < n; i++) {
+            
+            double sum = A(i,j);
+            
+            for(size_t k=0; k<j; k++){
+                sum -= A(i,k)*A(k,j);
+            }
+            
+            A(i,j) = sum;
+            
+            // is the figure of merit for the pivot better than the best so far?
+            if((temp = vv(i)*fabs(sum)) >= big){
+                big = temp; 
+                imax = i;
+            }
+        } // end for i
+
+        // interchange rows, if needed, change parity and the scale factor
+        if(imax != j) {
+            
+            for(size_t k = 0; k < n; k++){
+                temp = A(imax,k);
+                A(imax,k) = A(j,k);
+                A(j,k) = temp;
+            }
+            
+            parity = -(parity);
+            vv(imax) = vv(j);
+        }
+        
+        // store the index
+        perm(j) = imax;
+        // if the pivot element is zero, the matrix is singular but for some 
+        // applications a tiny number is desirable instead 
+        
+        if(A(j,j) == 0.0){
+            A(j,j) = TINY;
+        }
+        // finally, divide by the pivot element
+        
+        if(j<n-1) {
+            
+            temp=1./A(j,j);
+            for(size_t i = j+1; i < n; i++){
+                A(i,j)*=temp;
+            } // end for i
+        } // end if j
+
+    } // end for j
+    
+    return(1);
+} // end function
+
+
+
+// -------------------------------
+// LU back substitution functions 
+// -------------------------------
+
+// this function is run on the GPU
+KOKKOS_FUNCTION
+void LU_backsub(
+    const ViewCArrayKokkos <double> &A,     // input matrix A in LU decomp format
+    const ViewCArrayKokkos <size_t> &perm,  // permutations
+    const ViewCArrayKokkos <double> &b){          // RHS and is answer x to Ax=B
+
+        const int n = A.dims(0);    // size of matrix
+
+        int ii = -1;
+
+
+        // First step of backsubstitution; the only wrinkle is to unscramble 
+        // the permutation order. Note: the algorithm is optimized for a 
+        // possibility of large amount of zeroes in b
+        
+        for(size_t i = 0; i < n; i++) {
+           
+            size_t ip = perm(i);
+
+            double sum = b(ip);
+            b(ip) = b(i);
+         
+            if(ii >= 0){
+                for(size_t j = ii; j<i; j++){
+                    sum -= A(i,j)*b(j);
+                }
+            }
+            else if(sum>0){
+                ii=i;  // a nonzero element encounted
+            }
+          
+            b(i) = sum;
+        } // end loop i
+        
+        // the second step
+        for(int i=n-1; i>=0; i--) {
+            
+            double sum = b(i);
+            for(size_t j=i+1; j<n; j++){
+                sum-=A(i,j)*b(j);
+            } // end j
+       
+            b(i)=sum/A(i,i);
+        } // end loop i
+
+} // end if
+
+
+// ------------------ 
+// LU invert function 
+// ------------------ 
+KOKKOS_INLINE_FUNCTION
+void LU_invert(
+    ViewCArrayKokkos <double> &A,       // input matrix
+    ViewCArrayKokkos <size_t> &perm,    // permutations
+    ViewCArrayKokkos <double> &inv_mat, // inverse matrix
+    ViewCArrayKokkos <double> &col) {   // tmp array
+
+    const size_t n = A.dims(0);    // size of matrix
+
+
+    for(size_t j = 0; j < n; j++){
+
+        for(size_t i = 0; i < n; i++){
+            col(i) = 0.0;
+        } // end for i
+        
+        col(j) = 1.0;
+        LU_backsub(A, perm, col);
+        
+        for(size_t i = 0; i < n; i++){
+            inv_mat(i,j) = col(i);
+        } // end for i
+
+    } // end for j
+
+    return;
+
+} // end function
+
+// -----------------------
+// LU determinant function 
+//  Input:  A filled in LUPDecompose; N - dimension.
+//  Output: determinate of original A matrix
+// ----------------------- 
+KOKKOS_INLINE_FUNCTION
+double LU_determinant(
+    ViewCArrayKokkos <double> &A,  // input matrix
+    const int parity){          // parity (+1 0r -1)
+
+    const int n = A.dims(0);    // size of matrix
+
+    double res = (double)(parity);
+    
+    for(size_t j=0; j<n; j++){
+        res *= A(j,j);
+    } // end j
+
+    return(res);
+
+} // end function
+
+
+
+
 // ============================================
 //  GPU kernals
 // ============================================