diff --git a/.gitignore b/.gitignore
index 98b65133..282ea97e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
-*.vcxproj
+.DS_Store
+.vs/
 App/App.vcxproj.user
 *.filters
 *.obj
@@ -18,15 +19,24 @@ Bin/Release/x64/App64.exe
 *.exe
 *.opendb
 *.db
-*.sln
 *.d
+*.sln
+*.vcxproj
 
+build/
 *.o
 
 *.a
 
 *.so
 
+*.cmake
+CMakeCache.txt
+CMakeFiles/
+CMakeCache/
+
+packages/
+
 App/Makefile
 
 Bin/Release/x64/
diff --git a/.gitmodules b/.gitmodules
index 43834ac7..e69de29b 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +0,0 @@
-[submodule "Anvil"]
-	path = Anvil
-	url = https://github.com/GPUOpen-LibrariesAndSDKs/Anvil.git
diff --git a/3rdparty/embree/bin/x32/embree.dll b/3rdparty/embree/bin/x32/embree.dll
deleted file mode 100644
index 6d2c2ee5..00000000
Binary files a/3rdparty/embree/bin/x32/embree.dll and /dev/null differ
diff --git a/3rdparty/embree/bin/x64/benchmark.exe b/3rdparty/embree/bin/x64/benchmark.exe
deleted file mode 100644
index a714450a..00000000
Binary files a/3rdparty/embree/bin/x64/benchmark.exe and /dev/null differ
diff --git a/3rdparty/embree/bin/x64/embree.dll b/3rdparty/embree/bin/x64/embree.dll
deleted file mode 100644
index 603b2ca8..00000000
Binary files a/3rdparty/embree/bin/x64/embree.dll and /dev/null differ
diff --git a/3rdparty/embree/bin/x64/tbb.dll b/3rdparty/embree/bin/x64/tbb.dll
deleted file mode 100644
index b0dcda76..00000000
Binary files a/3rdparty/embree/bin/x64/tbb.dll and /dev/null differ
diff --git a/3rdparty/embree/include/embree2/rtcore.h b/3rdparty/embree/include/embree2/rtcore.h
deleted file mode 100644
index fb24757a..00000000
--- a/3rdparty/embree/include/embree2/rtcore.h
+++ /dev/null
@@ -1,257 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_H__
-#define __RTCORE_H__
-
-#include <stddef.h>
-#include <sys/types.h>
-
-#if defined(_WIN32)
-#if defined(_M_X64)
-typedef long long ssize_t;
-#else
-typedef int ssize_t;
-#endif
-#endif
-
-#ifndef RTCORE_API
-#if defined(_WIN32) && !defined(ENABLE_STATIC_LIB)
-#  define RTCORE_API extern "C" __declspec(dllimport) 
-#else
-#  define RTCORE_API extern "C"
-#endif
-#endif
-
-#ifdef _WIN32
-#  define RTCORE_ALIGN(...) __declspec(align(__VA_ARGS__))
-#else
-#  define RTCORE_ALIGN(...) __attribute__((aligned(__VA_ARGS__)))
-#endif
-
-#ifdef __GNUC__
-  #define RTCORE_DEPRECATED __attribute__((deprecated))
-#elif defined(_MSC_VER)
-  #define RTCORE_DEPRECATED __declspec(deprecated)
-#else
-  #define RTCORE_DEPRECATED
-#endif
-
-/*! Embree API version */
-#define RTCORE_VERSION_MAJOR 2
-#define RTCORE_VERSION_MINOR 9
-#define RTCORE_VERSION_PATCH 0
-#define RTCORE_VERSION       20900
-
-/*! \file rtcore.h Defines the Embree Ray Tracing Kernel API for C and C++ 
-
-   This file defines the Embree ray tracing kernel API for C and
-   C++. The user is supposed to include this file, and alternatively
-   the rtcore_ray.h file, but none of the other .h files in this
-   folder. */
-
-/*! \{ */
-
-/*! Axis aligned bounding box representation */
-struct RTCORE_ALIGN(16) RTCBounds
-{
-  float lower_x, lower_y, lower_z, align0;
-  float upper_x, upper_y, upper_z, align1;
-};
-
-/*! \brief Defines an opaque device type */
-typedef struct __RTCDevice {}* RTCDevice;
-
-/*! \brief Creates a new Embree device.
-
-  Creates a new Embree device to be used by the application. An
-  application typically creates only a single Embree device, but it is
-  valid to use multiple devices inside an application. A configuration
-  string can be passed at construction time, that allows to configure
-  implementation specific parameters. If this string is NULL, a
-  default configuration is used. The following configuration flags are
-  supported by the Embree implementation of the API:
-  
-  verbose = num,       // sets verbosity level (default is 0)
-
-  If Embree is started on an unsupported CPU, rtcNewDevice will fail and
-  set the RTC_UNSUPPORTED_CPU error code.
-  
-*/
-RTCORE_API RTCDevice rtcNewDevice(const char* cfg = NULL);
-
-/*! \brief Deletes an Embree device.
-
-  Deletes the Embree device again. After deletion, all scene handles
-  are invalid. The application should invoke this call before
-  terminating. */
-RTCORE_API void rtcDeleteDevice(RTCDevice device);
-
-/*! \brief Initializes the Embree ray tracing core
-
-  WARNING: This function is deprecated, use rtcNewDevice instead.
-
-  Initializes the ray tracing core and passed some configuration
-  string. The configuration string allows to configure implementation
-  specific parameters. If this string is NULL, a default configuration
-  is used. The following configuration flags are supported by the
-  Embree implementation of the API:
-  
-  verbose = num,       // sets verbosity level (default is 0)
-
-  If Embree is started on an unsupported CPU, rtcInit will fail and
-  set the RTC_UNSUPPORTED_CPU error code.
-  
-*/
-RTCORE_API RTCORE_DEPRECATED void rtcInit(const char* cfg = NULL);
-
-/*! \brief Shuts down Embree
-
-  WARNING: This function is deprecated, use rtcDeleteDevice instead.
-
-  Shuts down the ray tracing core. After shutdown, all scene handles
-  are invalid, and invoking any API call except rtcInit is not
-  allowed. The application should invoke this call before
-  terminating. It is safe to call rtcInit again after an rtcExit
-  call. */
-RTCORE_API RTCORE_DEPRECATED void rtcExit();
-
-/*! \brief Parameters that can get configured using the rtcSetParameter functions. */
-enum RTCParameter {
-  RTC_SOFTWARE_CACHE_SIZE = 0,                /*! Configures the software cache size (used
-                                                to cache subdivision surfaces for
-                                                instance). The size is specified as an
-                                                integer number of bytes. The software
-                                                cache cannot be configured during
-                                                rendering. (write only) */
-
-  RTC_CONFIG_INTERSECT1 = 1,                  //!< checks if rtcIntersect1 is supported (read only)
-  RTC_CONFIG_INTERSECT4 = 2,                  //!< checks if rtcIntersect4 is supported (read only)
-  RTC_CONFIG_INTERSECT8 = 3,                  //!< checks if rtcIntersect8 is supported (read only)
-  RTC_CONFIG_INTERSECT16 = 4,                 //!< checks if rtcIntersect16 is supported (read only)
-  RTC_CONFIG_INTERSECTN = 5,                  //!< checks if rtcIntersectN is supported (read only)
-
-  RTC_CONFIG_RAY_MASK = 6,                    //!< checks if ray masks are supported (read only)
-  RTC_CONFIG_BACKFACE_CULLING = 7,            //!< checks if backface culling is supported (read only)
-  RTC_CONFIG_INTERSECTION_FILTER = 8,         //!< checks if intersection filters are enabled (read only)
-  RTC_CONFIG_INTERSECTION_FILTER_RESTORE = 9, //!< checks if intersection filters restores previous hit (read only)
-  RTC_CONFIG_BUFFER_STRIDE = 10,              //!< checks if buffer strides are supported (read only)
-  RTC_CONFIG_IGNORE_INVALID_RAYS = 11,        //!< checks if invalid rays are ignored (read only)
-  RTC_CONFIG_TASKING_SYSTEM = 12,             //!< return used tasking system (0 = INTERNAL, 1 = TBB) (read only)
-
-  RTC_CONFIG_VERSION_MAJOR = 13,           //!< returns Embree major version (read only)
-  RTC_CONFIG_VERSION_MINOR = 14,           //!< returns Embree minor version (read only)
-  RTC_CONFIG_VERSION_PATCH = 15,           //!< returns Embree patch version (read only)
-  RTC_CONFIG_VERSION = 16,                 //!< returns Embree version as integer (e.g. Embree v2.8.2 -> 20802) (read only)
-};
-
-/*! \brief Configures some parameters. 
-    WARNING: This function is deprecated, use rtcDeviceSetParameter1i instead.
-*/
-RTCORE_API RTCORE_DEPRECATED void rtcSetParameter1i(const RTCParameter parm, ssize_t val);
-
-/*! \brief Reads some device parameter. 
-    WARNING: This function is deprecated, use rtcDeviceGetParameter1i instead.
-*/
-RTCORE_API RTCORE_DEPRECATED ssize_t rtcGetParameter1i(const RTCParameter parm);
-
-/*! \brief Configures some device parameters. */
-RTCORE_API void rtcDeviceSetParameter1i(RTCDevice device, const RTCParameter parm, ssize_t val);
-
-/*! \brief Reads some device parameter. */
-RTCORE_API ssize_t rtcDeviceGetParameter1i(RTCDevice device, const RTCParameter parm);
-
-/*! \brief Error codes returned by the rtcGetError function. */
-enum RTCError {
-  RTC_NO_ERROR = 0,          //!< No error has been recorded.
-  RTC_UNKNOWN_ERROR = 1,     //!< An unknown error has occured.
-  RTC_INVALID_ARGUMENT = 2,  //!< An invalid argument is specified
-  RTC_INVALID_OPERATION = 3, //!< The operation is not allowed for the specified object.
-  RTC_OUT_OF_MEMORY = 4,     //!< There is not enough memory left to execute the command.
-  RTC_UNSUPPORTED_CPU = 5,   //!< The CPU is not supported as it does not support SSE2.
-  RTC_CANCELLED = 6,         //!< The user has cancelled the operation through the RTC_PROGRESS_MONITOR_FUNCTION callback
-};
-
-/*! \brief Returns the value of the per-thread error flag. 
-
-  WARNING: This function is deprecated, use rtcDeviceGetError instead.
-
-  If an error occurs this flag is set to an error code if it stores no
-  previous error. The rtcGetError function reads and returns the
-  currently stored error and clears the error flag again. */
-RTCORE_API RTCORE_DEPRECATED RTCError rtcGetError();
-
-/*! \brief Returns the value of the per-thread error flag. 
-
-  If an error occurs this flag is set to an error code if it stores no
-  previous error. The rtcGetError function reads and returns the
-  currently stored error and clears the error flag again. */
-RTCORE_API RTCError rtcDeviceGetError(RTCDevice device);
-
-/*! \brief Type of error callback function. */
-typedef void (*RTCErrorFunc)(const RTCError code, const char* str);
-RTCORE_DEPRECATED typedef RTCErrorFunc RTC_ERROR_FUNCTION;
-
-/*! \brief Sets a callback function that is called whenever an error occurs. 
-   WARNING: This function is deprecated, use rtcDeviceSetErrorFunction instead.
-   */
-RTCORE_API RTCORE_DEPRECATED void rtcSetErrorFunction(RTCErrorFunc func);
-
-/*! \brief Sets a callback function that is called whenever an error occurs. */
-RTCORE_API void rtcDeviceSetErrorFunction(RTCDevice device, RTCErrorFunc func);
-
-/*! \brief Type of memory consumption callback function. */
-typedef bool (*RTCMemoryMonitorFunc)(const ssize_t bytes, const bool post);
-RTCORE_DEPRECATED typedef RTCMemoryMonitorFunc RTC_MEMORY_MONITOR_FUNCTION;
-
-/*! \brief Sets the memory consumption callback function which is
- *  called before or after the library allocates or frees memory. 
-   WARNING: This function is deprecated, use rtcDeviceSetMemoryMonitorFunction instead.
-*/
-RTCORE_API RTCORE_DEPRECATED void rtcSetMemoryMonitorFunction(RTCMemoryMonitorFunc func);
-
-/*! \brief Sets the memory consumption callback function which is
- *  called before or after the library allocates or frees memory. */
-RTCORE_API void rtcDeviceSetMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunc func);
-
-/*! \brief Implementation specific (do not call).
-
-  This function is implementation specific and only for debugging
-  purposes. Do not call it. */
-RTCORE_API RTCORE_DEPRECATED void rtcDebug(); // FIXME: remove
-
-#include "rtcore_scene.h"
-#include "rtcore_geometry.h"
-#include "rtcore_geometry_user.h"
-
-/*! \brief Helper to easily combing scene flags */
-inline RTCSceneFlags operator|(const RTCSceneFlags a, const RTCSceneFlags b) {
-  return (RTCSceneFlags)((size_t)a | (size_t)b);
-}
-
-/*! \brief Helper to easily combing algorithm flags */
-inline RTCAlgorithmFlags operator|(const RTCAlgorithmFlags a, const RTCAlgorithmFlags b) {
-  return (RTCAlgorithmFlags)((size_t)a | (size_t)b);
-}
-
-/*! \brief Helper to easily combing geometry flags */
-inline RTCGeometryFlags operator|(const RTCGeometryFlags a, const RTCGeometryFlags b) {
-  return (RTCGeometryFlags)((size_t)a | (size_t)b);
-}
-
-/*! \} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore.isph b/3rdparty/embree/include/embree2/rtcore.isph
deleted file mode 100644
index f46f0525..00000000
--- a/3rdparty/embree/include/embree2/rtcore.isph
+++ /dev/null
@@ -1,220 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_ISPH__
-#define __RTCORE_ISPH__
-
-#ifdef _WIN32
-#  define RTCORE_ALIGN(...) // FIXME: need to specify alignment
-#else
-#  define RTCORE_ALIGN(...) // FIXME: need to specify alignment
-#endif
-
-#define RTCORE_DEPRECATED // FIXME: deprecation not supported by ISPC
-
-/*! Embree API version */
-#define RTCORE_VERSION_MAJOR 2
-#define RTCORE_VERSION_MINOR 9
-#define RTCORE_VERSION_PATCH 0
-#define RTCORE_VERSION       20900
-
-/*! \file rtcore.isph Defines the Embree Ray Tracing Kernel API for ISPC.
-
-   This file defines the Embree ray tracing kernel API for C and
-   C++. The user is supposed to include this file, and alternatively
-   the rtcore_ray.isph file, but none of the other .isph files in this
-   folder. */
-
-/*! \{ */
-
-/*! Axis aligned bounding box representation */
-RTCORE_ALIGN(16) struct RTCBounds
-{
-  float lower_x, lower_y, lower_z, align0;
-  float upper_x, upper_y, upper_z, align1;
-};
-
-/*! \brief Defines an opaque device type */
-typedef uniform struct __RTCDevice {}* uniform RTCDevice;
-
-/*! \brief Creates a new Embree device.
-
-  Creates a new Embree device to be used by the application. An
-  application typically creates only a single Embree device, but it is
-  valid to use multiple devices inside an application. A configuration
-  string can be passed at construction time, that allows to configure
-  implementation specific parameters. If this string is NULL, a
-  default configuration is used. The following configuration flags are
-  supported by the Embree implementation of the API:
-  
-  verbose = num,       // sets verbosity level (default is 0)
-
-  If Embree is started on an unsupported CPU, rtcNewDevice will fail and
-  set the RTC_UNSUPPORTED_CPU error code.
-  
-*/
-RTCDevice rtcNewDevice(const uniform int8* uniform cfg = NULL);
-
-/*! \brief Deletes an Embree device.
-
-  Deletes the Embree device again. After deletion, all scene handles
-  are invalid. The application should invoke this call before
-  terminating. */
-void rtcDeleteDevice(RTCDevice device);
-
-/*! \brief Initializes the Embree ray tracing core
-
-  WARNING: This function is deprecated, use rtcNewDevice instead.
-
-  Initializes the ray tracing core and passed some configuration
-  string. The configuration string allows to configure implementation
-  specific parameters. If this string is NULL, a default configuration
-  is used. The following configuration flags are supported by the
-  Embree implementation of the API:
-  
-  verbose = num,       // sets verbosity level (default is 0)
-
-  If Embree is started on an unsupported CPU, rtcInit will fail and
-  set the RTC_UNSUPPORTED_CPU error code.
-  
-*/
-RTCORE_DEPRECATED void rtcInit(const uniform int8* uniform cfg = NULL);
-
-/*! \brief Shuts down Embree. 
-
-  WARNING: This function is deprecated, use rtcDeleteDevice instead.
-
-  Shuts down the ray tracing core. After shutdown, all scene handles
-  are invalid, and invoking any API call except rtcInit is not
-  allowed. The application should invoke this call before
-  terminating. It is safe to call rtcInit again after an rtcExit
-  call. */
-RTCORE_DEPRECATED void rtcExit();
-
-/*! \brief Parameters that can get configured using the rtcSetParameter functions. */
-enum RTCParameter {
-  RTC_SOFTWARE_CACHE_SIZE = 0,                /*! Configures the software cache size (used
-                                                to cache subdivision surfaces for
-                                                instance). The size is specified as an
-                                                integer number of bytes. The software
-                                                cache cannot be configured during
-                                                rendering. (write only) */
-
-  RTC_CONFIG_INTERSECT1 = 1,                  //!< checks if rtcIntersect1 is supported (read only)
-  RTC_CONFIG_INTERSECT4 = 2,                  //!< checks if rtcIntersect4 is supported (read only)
-  RTC_CONFIG_INTERSECT8 = 3,                  //!< checks if rtcIntersect8 is supported (read only)
-  RTC_CONFIG_INTERSECT16 = 4,                 //!< checks if rtcIntersect16 is supported (read only)
-  RTC_CONFIG_INTERSECTN = 5,                  //!< checks if rtcIntersectN is supported (read only)
-
-  RTC_CONFIG_RAY_MASK = 6,                    //!< checks if ray masks are supported (read only)
-  RTC_CONFIG_BACKFACE_CULLING = 7,            //!< checks if backface culling is supported (read only)
-  RTC_CONFIG_INTERSECTION_FILTER = 8,         //!< checks if intersection filters are enabled (read only)
-  RTC_CONFIG_INTERSECTION_FILTER_RESTORE = 9, //!< checks if intersection filters restores previous hit (read only)
-  RTC_CONFIG_BUFFER_STRIDE = 10,               //!< checks if buffer strides are supported (read only)
-  RTC_CONFIG_IGNORE_INVALID_RAYS = 11,        //!< checks if invalid rays are ignored (read only)
-  RTC_CONFIG_TASKING_SYSTEM = 12,             //!< return used tasking system (0 = INTERNAL, 1 = TBB) (read only)
-
-
-  RTC_CONFIG_VERSION_MAJOR = 13,           //!< returns Embree major version (read only)
-  RTC_CONFIG_VERSION_MINOR = 14,           //!< returns Embree minor version (read only)
-  RTC_CONFIG_VERSION_PATCH = 15,           //!< returns Embree patch version (read only)
-  RTC_CONFIG_VERSION = 16,                 //!< returns Embree version as integer (e.g. Embree v2.8.2 -> 20802) (read only)
-};
-
-/*! \brief Configures some parameters. 
-   WARNING: This function is deprecated, use rtcDeviceSetParameter1i instead.
-*/
-RTCORE_DEPRECATED void rtcSetParameter1i(const uniform RTCParameter parm, uniform size_t val); // FIXME: should be ssize_t
-
-/*! \brief Reads some parameters.
-   WARNING: This function is deprecated, use rtcDeviceGetParameter1i instead.
-*/
-uniform size_t rtcGetParameter1i(const uniform RTCParameter parm); // FIXME: should return ssize_t
-
-/*! \brief Configures some device parameters.*/
-void rtcDeviceSetParameter1i(RTCDevice device, const uniform RTCParameter parm, uniform size_t val); // FIXME: should be ssize_t
-
-/*! \brief Reads some device parameters. */
-uniform size_t rtcDeviceGetParameter1i(RTCDevice device, const uniform RTCParameter parm); // FIXME: should return ssize_t
-
-/*! \brief Error codes returned by the rtcGetError function. */
-enum RTCError {
-  RTC_NO_ERROR = 0,          //!< No error has been recorded.
-  RTC_UNKNOWN_ERROR = 1,     //!< An unknown error has occured.
-  RTC_INVALID_ARGUMENT = 2,  //!< An invalid argument is specified
-  RTC_INVALID_OPERATION = 3, //!< The operation is not allowed for the specified object.
-  RTC_OUT_OF_MEMORY = 4,     //!< There is not enough memory left to execute the command.
-  RTC_UNSUPPORTED_CPU = 5,   //!< The CPU is not supported as it does not support SSE2.
-  RTC_CANCELLED = 6          //!< The user has cancelled the operation through the RTCProgressMonitorFunc callback
-};
-
-/*! \brief Returns the value of the per-thread error flag. 
-
-  WARNING: This function is deprecated, use rtcDeviceGetError instead.
-
-  If an error occurs this flag is set to an error code if it stores no
-  previous error. The rtcGetError function reads and returns the
-  currently stored error and clears the error flag again. */
-RTCORE_DEPRECATED uniform RTCError rtcGetError();
-
-/*! \brief Returns the value of the per-thread error flag. 
-
-  If an error occurs this flag is set to an error code if it stores no
-  previous error. The rtcGetError function reads and returns the
-  currently stored error and clears the error flag again. */
-uniform RTCError rtcDeviceGetError(RTCDevice device);
-
-/*! \brief Type of error callback function. */
-typedef void (*uniform RTCErrorFunc)(const uniform RTCError code, const uniform int8* uniform str);
-RTCORE_DEPRECATED typedef RTCErrorFunc RTC_ERROR_FUNCTION;
-
-/*! \brief Sets a callback function that is called whenever an error occurs. 
-    WARNING: This function is deprecated, use rtcDeviceSetErrorFunction instead.
-*/
-RTCORE_DEPRECATED void rtcSetErrorFunction(uniform RTCErrorFunc func);
-
-/*! \brief Sets a callback function that is called whenever an error occurs. */
-void rtcDeviceSetErrorFunction(RTCDevice device, uniform RTCErrorFunc func);
-
-/*! \brief Type of memory consumption callback function. */
-typedef uniform bool (*uniform RTCMemoryMonitorFunc)(const uniform size_t bytes, const uniform bool post); // FIXME: should be ssize_t
-RTCORE_DEPRECATED typedef RTCMemoryMonitorFunc RTC_MEMORY_MONITOR_FUNCTION;
-
-/*! \brief Sets the memory consumption callback function which is
- *  called before the library allocates or after the library frees
- *  memory. 
- *  WARNING: This function is deprecated, use rtcDeviceSetMemoryMonitorFunction instead.
-*/
-RTCORE_DEPRECATED void rtcSetMemoryMonitorFunction(RTCMemoryMonitorFunc func);
-
-/*! \brief Sets the memory consumption callback function which is
- *  called before the library allocates or after the library frees
- *  memory. */
-void rtcDeviceSetMemoryMonitorFunction(RTCDevice device, RTCMemoryMonitorFunc func);
-
-/*! \brief Implementation specific (do not call).
-
-  This function is implementation specific and only for debugging
-  purposes. Do not call it. */
-RTCORE_DEPRECATED void rtcDebug();  // FIXME: remove
-
-#include "rtcore_scene.isph"
-#include "rtcore_geometry.isph"
-#include "rtcore_geometry_user.isph"
-
-/*! \} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_geometry.h b/3rdparty/embree/include/embree2/rtcore_geometry.h
deleted file mode 100644
index 5b80130f..00000000
--- a/3rdparty/embree/include/embree2/rtcore_geometry.h
+++ /dev/null
@@ -1,483 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_GEOMETRY_H__
-#define __RTCORE_GEOMETRY_H__
-
-/*! \ingroup embree_kernel_api */
-/*! \{ */
-
-/*! invalid geometry ID */
-#define RTC_INVALID_GEOMETRY_ID ((unsigned)-1)
-
-/*! \brief Specifies the type of buffers when mapping buffers */
-enum RTCBufferType {
-  RTC_INDEX_BUFFER         = 0x01000000,
-  
-  RTC_VERTEX_BUFFER        = 0x02000000,
-  RTC_VERTEX_BUFFER0       = 0x02000000,
-  RTC_VERTEX_BUFFER1       = 0x02000001,
-
-  RTC_USER_VERTEX_BUFFER   = 0x02100000,
-  RTC_USER_VERTEX_BUFFER0  = 0x02100000,
-  RTC_USER_VERTEX_BUFFER1  = 0x02100001,
-
-  RTC_FACE_BUFFER          = 0x03000000,
-  RTC_LEVEL_BUFFER         = 0x04000001,
-
-  RTC_EDGE_CREASE_INDEX_BUFFER = 0x05000000,
-  RTC_EDGE_CREASE_WEIGHT_BUFFER = 0x06000000,
-
-  RTC_VERTEX_CREASE_INDEX_BUFFER = 0x07000000,
-  RTC_VERTEX_CREASE_WEIGHT_BUFFER = 0x08000000,
-
-  RTC_HOLE_BUFFER          = 0x09000001,
-};
-
-/*! \brief Supported types of matrix layout for functions involving matrices */
-enum RTCMatrixType {
-  RTC_MATRIX_ROW_MAJOR = 0,
-  RTC_MATRIX_COLUMN_MAJOR = 1,
-  RTC_MATRIX_COLUMN_MAJOR_ALIGNED16 = 2,
-};
-
-/*! \brief Supported geometry flags to specify handling in dynamic scenes. */
-enum RTCGeometryFlags 
-{
-  RTC_GEOMETRY_STATIC     = 0,    //!< specifies static geometry that will change rarely
-  RTC_GEOMETRY_DEFORMABLE = 1,    //!< specifies dynamic geometry with deformable motion (BVH refit possible)
-  RTC_GEOMETRY_DYNAMIC    = 2,    //!< specifies dynamic geometry with arbitrary motion (BVH refit not possible)
-};
-
-/*! \brief Boundary interpolation mode for subdivision surfaces */
-enum RTCBoundaryMode
-{
-  RTC_BOUNDARY_NONE = 0,               //!< ignores border patches
-  RTC_BOUNDARY_EDGE_ONLY = 1,          //!< soft boundary (default)
-  RTC_BOUNDARY_EDGE_AND_CORNER = 2     //!< boundary corner vertices are sharp vertices
-};
-
-/*! Intersection filter function for single rays. */
-typedef void (*RTCFilterFunc)(void* ptr,           /*!< pointer to user data */
-                              RTCRay& ray          /*!< intersection to filter */);
-
-/*! Intersection filter function for ray packets of size 4. */
-typedef void (*RTCFilterFunc4)(const void* valid,  /*!< pointer to valid mask */
-                               void* ptr,          /*!< pointer to user data */
-                               RTCRay4& ray        /*!< intersection to filter */);
-
-/*! Intersection filter function for ray packets of size 8. */
-typedef void (*RTCFilterFunc8)(const void* valid,  /*!< pointer to valid mask */
-                               void* ptr,          /*!< pointer to user data */
-                               RTCRay8& ray        /*!< intersection to filter */);
-
-/*! Intersection filter function for ray packets of size 16. */
-typedef void (*RTCFilterFunc16)(const void* valid, /*!< pointer to valid mask */
-                                void* ptr,         /*!< pointer to user data */
-                                RTCRay16& ray      /*!< intersection to filter */);
-
-/*! Displacement mapping function. */
-typedef void (*RTCDisplacementFunc)(void* ptr,           /*!< pointer to user data of geometry */
-                                    unsigned geomID,     /*!< ID of geometry to displace */
-                                    unsigned primID,     /*!< ID of primitive of geometry to displace */
-                                    const float* u,      /*!< u coordinates (source) */
-                                    const float* v,      /*!< v coordinates (source) */
-                                    const float* nx,     /*!< x coordinates of normalized normal at point to displace (source) */
-                                    const float* ny,     /*!< y coordinates of normalized normal at point to displace (source) */
-                                    const float* nz,     /*!< z coordinates of normalized normal at point to displace (source) */
-                                    float* px,           /*!< x coordinates of points to displace (source and target) */
-                                    float* py,           /*!< y coordinates of points to displace (source and target) */
-                                    float* pz,           /*!< z coordinates of points to displace (source and target) */
-                                    size_t N             /*!< number of points to displace */ );
-
-/*! \brief Creates a new scene instance. 
-
-  A scene instance contains a reference to a scene to instantiate and
-  the transformation to instantiate the scene with. An implementation
-  will typically transform the ray with the inverse of the provided
-  transformation and continue traversing the ray through the provided
-  scene. If any geometry is hit, the instance ID (instID) member of
-  the ray will get set to the geometry ID of the instance. */
-RTCORE_API unsigned rtcNewInstance (RTCScene target,                  //!< the scene the instance belongs to
-                                    RTCScene source                   //!< the scene to instantiate
-  );
-
-/*! \brief Creates a new scene instance. 
-
-  A scene instance contains a reference to a scene to instantiate and
-  the transformation to instantiate the scene with. For motion blurred
-  instances, a number of timesteps can get specified (currently only 1
-  or 2 timesteps are supported). An implementation will typically
-  transform the ray with the inverse of the provided transformation
-  and continue traversing the ray through the provided scene. If any
-  geometry is hit, the instance ID (instID) member of the ray will get
-  set to the geometry ID of the instance. */
-RTCORE_API unsigned rtcNewInstance2 (RTCScene target,                  //!< the scene the instance belongs to
-                                     RTCScene source,                  //!< the scene to instantiate
-                                     size_t numTimeSteps = 1);         //!< number of timesteps, one matrix per timestep
-
-/*! \brief Sets transformation of the instance */
-RTCORE_API void rtcSetTransform (RTCScene scene,                          //!< scene handle
-                                 unsigned geomID,                         //!< ID of geometry
-                                 RTCMatrixType layout,                    //!< layout of transformation matrix
-                                 const float* xfm                         //!< pointer to transformation matrix
-  );
-
-
-/*! \brief Sets transformation of the instance for specified timestep */
-RTCORE_API void rtcSetTransform2 (RTCScene scene,                         //!< scene handle
-                                  unsigned int geomID,                    //!< ID of geometry 
-                                  RTCMatrixType layout,                   //!< layout of transformation matrix
-                                  const float* xfm,                       //!< pointer to transformation matrix
-                                  size_t timeStep = 0                     //!< timestep to set the matrix for 
-  );
-
-/*! \brief Creates a new triangle mesh. The number of triangles
-  (numTriangles), number of vertices (numVertices), and number of time
-  steps (1 for normal meshes, and 2 for linear motion blur), have to
-  get specified. The triangle indices can be set be mapping and
-  writing to the index buffer (RTC_INDEX_BUFFER) and the triangle
-  vertices can be set by mapping and writing into the vertex buffer
-  (RTC_VERTEX_BUFFER). In case of linear motion blur, two vertex
-  buffers have to get filled (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1),
-  one for each time step. The index buffer has the default layout of
-  three 32 bit integer indices for each triangle. An index points to
-  the ith vertex. The vertex buffer stores single precision x,y,z
-  floating point coordinates aligned to 16 bytes. The value of the 4th
-  float used for alignment can be arbitrary. */
-RTCORE_API unsigned rtcNewTriangleMesh (RTCScene scene,                    //!< the scene the mesh belongs to
-                                        RTCGeometryFlags flags,            //!< geometry flags
-                                        size_t numTriangles,               //!< number of triangles
-                                        size_t numVertices,                //!< number of vertices
-                                        size_t numTimeSteps = 1            //!< number of motion blur time steps
-  );
-
-
-/*! \brief Creates a new quad mesh. The number of quads
-  (numQuads), number of vertices (numVertices), and number of time
-  steps (1 for normal meshes, and 2 for linear motion blur), have to
-  get specified. The quad indices can be set be mapping and
-  writing to the index buffer (RTC_INDEX_BUFFER) and the quad
-  vertices can be set by mapping and writing into the vertex buffer
-  (RTC_VERTEX_BUFFER). In case of linear motion blur, two vertex
-  buffers have to get filled (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1),
-  one for each time step. The index buffer has the default layout of
-  three 32 bit integer indices for each quad. An index points to
-  the ith vertex. The vertex buffer stores single precision x,y,z
-  floating point coordinates aligned to 16 bytes. The value of the 4th
-  float used for alignment can be arbitrary. */
-RTCORE_API unsigned rtcNewQuadMesh (RTCScene scene,                //!< the scene the mesh belongs to
-                                    RTCGeometryFlags flags,        //!< geometry flags
-                                    size_t numQuads,               //!< number of quads
-                                    size_t numVertices,            //!< number of vertices
-                                    size_t numTimeSteps = 1        //!< number of motion blur time steps
-  );
-
-/*! \brief Creates a new subdivision mesh. The number of faces
- (numFaces), edges/indices (numEdges), vertices (numVertices), edge
- creases (numEdgeCreases), vertex creases (numVertexCreases), holes
- (numHoles), and time steps (numTimeSteps) have to get speficied at
- construction time.
-
- The following buffers have to get filled by the application: the face
- buffer (RTC_FACE_BUFFER) contains the number edges/indices (3 or 4)
- of each of the numFaces faces, the index buffer (RTC_INDEX_BUFFER)
- contains multiple (3 or 4) 32bit vertex indices for each face and
- numEdges indices in total, the vertex buffer (RTC_VERTEX_BUFFER)
- stores numVertices vertices as single precision x,y,z floating point
- coordinates aligned to 16 bytes. The value of the 4th float used for
- alignment can be arbitrary.
-
- Optionally, the application can fill the hole buffer
- (RTC_HOLE_BUFFER) with numHoles many 32 bit indices of faces that
- should be considered non-existing.
-
- Optionally, the application can fill the level buffer
- (RTC_LEVEL_BUFFER) with a tessellation level for each of the numEdges
- edges. The subdivision level is a positive floating point value, that
- specifies how many quads along the edge should get generated during
- tessellation. The tessellation level is a lower bound, thus the
- implementation is free to choose a larger level. If no level buffer
- is specified a level of 1 is used.
-
- Optionally, the application can fill the sparse edge crease buffers
- to make some edges appear sharper. The edge crease index buffer
- (RTC_EDGE_CREASE_INDEX_BUFFER) contains numEdgeCreases many pairs of
- 32 bit vertex indices that specify unoriented edges. The edge crease
- weight buffer (RTC_EDGE_CREASE_WEIGHT_BUFFER) stores for each of
- theses crease edges a positive floating point weight. The larger this
- weight, the sharper the edge. Specifying a weight of infinify is
- supported and marks an edge as infinitely sharp. Storing an edge
- multiple times with the same crease weight is allowed, but has lower
- performance. Storing the an edge multiple times with different
- crease weights results in undefined behaviour. For a stored edge
- (i,j), the reverse direction edges (j,i) does not have to get stored,
- as both are considered the same edge.
-
- Optionally, the application can fill the sparse vertex crease buffers
- to make some vertices appear sharper. The vertex crease index buffer
- (RTC_VERTEX_CREASE_INDEX_BUFFER), contains numVertexCreases many 32
- bit vertex indices to speficy a set of vertices. The vertex crease
- weight buffer (RTC_VERTEX_CREASE_WEIGHT_BUFFER) specifies for each of
- these vertices a positive floating point weight. The larger this
- weight, the sharper the vertex. Specifying a weight of infinity is
- supported and makes the vertex infinitely sharp. Storing a vertex
- multiple times with the same crease weight is allowed, but has lower
- performance. Storing a vertex multiple times with different crease
- weights results in undefined behaviour.
-
-*/
-RTCORE_API unsigned rtcNewSubdivisionMesh (RTCScene scene,                //!< the scene the mesh belongs to
-                                           RTCGeometryFlags flags,        //!< geometry flags
-                                           size_t numFaces,               //!< number of faces
-                                           size_t numEdges,               //!< number of edges
-                                           size_t numVertices,            //!< number of vertices
-                                           size_t numEdgeCreases,         //!< number of edge creases
-                                           size_t numVertexCreases,       //!< number of vertex creases
-                                           size_t numHoles,               //!< number of holes
-                                           size_t numTimeSteps = 1        //!< number of motion blur time steps
-  );
-
-/*! \brief Creates a new hair geometry, consisting of multiple hairs
-  represented as cubic bezier curves with varying radii. The number of
-  curves (numCurves), number of vertices (numVertices), and number of
-  time steps (1 for normal curves, and 2 for linear motion blur), have
-  to get specified at construction time. Further, the curve index
-  buffer (RTC_INDEX_BUFFER) and the curve vertex buffer
-  (RTC_VERTEX_BUFFER) have to get set by mapping and writing to the
-  appropiate buffers. In case of linear motion blur, two vertex
-  buffers have to get filled (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1),
-  one for each time step. The index buffer has the default layout of a
-  single 32 bit integer index for each curve, that references the
-  start vertex of the curve. The vertex buffer stores 4 control points
-  per curve, each such control point consists of a single precision
-  (x,y,z) position and radius, stored in that order in
-  memory. Individual hairs are considered to be subpixel sized which
-  allows the implementation to approximate the intersection
-  calculation. This in particular means that zooming onto one hair
-  might show geometric artefacts. */
-RTCORE_API unsigned rtcNewHairGeometry (RTCScene scene,                    //!< the scene the curves belong to
-                                        RTCGeometryFlags flags,            //!< geometry flags
-                                        size_t numCurves,                  //!< number of curves
-                                        size_t numVertices,                //!< number of vertices
-                                        size_t numTimeSteps = 1            //!< number of motion blur time steps
-  );
-
-/*! Sets a uniform tessellation rate for subdiv meshes and hair
- *  geometry. For subdivision meshes the RTC_LEVEL_BUFFER can also be used
- *  optionally to set a different tessellation rate per edge.*/
-RTCORE_API void rtcSetTessellationRate (RTCScene scene, unsigned geomID, float tessellationRate);
-
-/*! \brief Creates a new line segment geometry, consisting of multiple
-  segments with varying radii. The number of line segments (numSegments),
-  number of vertices (numVertices), and number of time steps (1 for
-  normal line segments, and 2 for linear motion blur), have to get
-  specified at construction time. Further, the segment index buffer
-  (RTC_INDEX_BUFFER) and the segment vertex buffer (RTC_VERTEX_BUFFER)
-  have to get set by mapping and writing to the appropiate buffers. In
-  case of linear motion blur, two vertex buffers have to get filled
-  (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1), one for each time step. The
-  index buffer has the default layout of a single 32 bit integer index
-  for each line segment, that references the start vertex of the segment.
-  The vertex buffer stores 2 end points per line segment, each such point
-  consists of a single precision (x,y,z) position and radius, stored in
-  that order in memory. Individual segments are considered to be subpixel
-  sized which allows the implementation to approximate the intersection
-  calculation. This in particular means that zooming onto one line segment
-  might show geometric artefacts. */
-RTCORE_API unsigned rtcNewLineSegments (RTCScene scene,                    //!< the scene the line segments belong to
-                                        RTCGeometryFlags flags,            //!< geometry flags
-                                        size_t numSegments,                //!< number of line segments
-                                        size_t numVertices,                //!< number of vertices
-                                        size_t numTimeSteps = 1            //!< number of motion blur time steps
-  );
-
-/*! \brief Sets 32 bit ray mask. */
-RTCORE_API void rtcSetMask (RTCScene scene, unsigned geomID, int mask);
-
-/*! \brief Sets boundary interpolation mode for subdivision surfaces */                                                                        
-RTCORE_API void rtcSetBoundaryMode(RTCScene scene, unsigned geomID, RTCBoundaryMode mode);
-
-/*! \brief Maps specified buffer. This function can be used to set index and
- *  vertex buffers of geometries. */
-RTCORE_API void* rtcMapBuffer(RTCScene scene, unsigned geomID, RTCBufferType type);
-
-/*! \brief Unmaps specified buffer. 
-
-  A buffer has to be unmapped before the rtcEnable, rtcDisable,
-  rtcUpdate, or rtcDeleteGeometry calls are executed. */
-RTCORE_API void rtcUnmapBuffer(RTCScene scene, unsigned geomID, RTCBufferType type);
-
-/*! \brief Shares a data buffer between the application and
- *  Embree. The passed buffer is used by Embree to store index and
- *  vertex data. It has to remain valid as long as the mesh exists,
- *  and the user is responsible to free the data when the mesh gets
- *  deleted. One can optionally speficy a byte offset and byte stride
- *  of the elements stored inside the buffer. The addresses
- *  ptr+offset+i*stride have to be aligned to 4 bytes on Xeon CPUs and
- *  16 bytes on Xeon Phi accelerators. For vertex buffers, the 4 bytes
- *  after the z-coordinate of the last vertex have to be readable memory,
- *  thus padding is required for some layouts. If this function is not
- *  called, Embree will allocate and manage buffers of the default
- *  layout. */
-RTCORE_API void rtcSetBuffer(RTCScene scene, unsigned geomID, RTCBufferType type, 
-                             const void* ptr, size_t byteOffset, size_t byteStride);
-
-/*! \brief Enable geometry. Enabled geometry can be hit by a ray. */
-RTCORE_API void rtcEnable (RTCScene scene, unsigned geomID);
-
-/*! \brief Update all geometry buffers. 
-
-  Each time geometry buffers got modified, the user has to call some
-  update function to tell the ray tracing engine which buffers got
-  modified. The rtcUpdate function taggs each geometry buffer of the
-  specified geometry as modified. */
-RTCORE_API void rtcUpdate (RTCScene scene, unsigned geomID);
-
-/*! \brief Update spefific geometry buffer. 
-
-  Each time geometry buffers got modified, the user has to call some
-  update function to tell the ray tracing engine which buffers got
-  modified. The rtcUpdateBuffer function taggs a specific buffer of
-  some geometry as modified. */
-RTCORE_API void rtcUpdateBuffer (RTCScene scene, unsigned geomID, RTCBufferType type);
-
-/*! \brief Disable geometry. 
-
-  Disabled geometry is not hit by any ray. Disabling and enabling
-  geometry gives higher performance than deleting and recreating
-  geometry. */
-RTCORE_API void rtcDisable (RTCScene scene, unsigned geomID);
-
-/*! \brief Sets the displacement function. */
-RTCORE_API void rtcSetDisplacementFunction (RTCScene scene, unsigned geomID, RTCDisplacementFunc func, RTCBounds* bounds);
-
-/*! \brief Sets the intersection filter function for single rays. */
-RTCORE_API void rtcSetIntersectionFilterFunction (RTCScene scene, unsigned geomID, RTCFilterFunc func);
-
-/*! \brief Sets the intersection filter function for ray packets of size 4. */
-RTCORE_API void rtcSetIntersectionFilterFunction4 (RTCScene scene, unsigned geomID, RTCFilterFunc4 func);
-
-/*! \brief Sets the intersection filter function for ray packets of size 8. */
-RTCORE_API void rtcSetIntersectionFilterFunction8 (RTCScene scene, unsigned geomID, RTCFilterFunc8 func);
-
-/*! \brief Sets the intersection filter function for ray packets of size 16. */
-RTCORE_API void rtcSetIntersectionFilterFunction16 (RTCScene scene, unsigned geomID, RTCFilterFunc16 func);
-
-/*! \brief Sets the occlusion filter function for single rays. */
-RTCORE_API void rtcSetOcclusionFilterFunction (RTCScene scene, unsigned geomID, RTCFilterFunc func);
-
-/*! \brief Sets the occlusion filter function for ray packets of size 4. */
-RTCORE_API void rtcSetOcclusionFilterFunction4 (RTCScene scene, unsigned geomID, RTCFilterFunc4 func);
-
-/*! \brief Sets the occlusion filter function for ray packets of size 8. */
-RTCORE_API void rtcSetOcclusionFilterFunction8 (RTCScene scene, unsigned geomID, RTCFilterFunc8 func);
-
-/*! \brief Sets the occlusion filter function for ray packets of size 16. */
-RTCORE_API void rtcSetOcclusionFilterFunction16 (RTCScene scene, unsigned geomID, RTCFilterFunc16 func);
-
-/*! Set pointer for user defined data per geometry. Invokations
- *  of the various user intersect and occluded functions get passed
- *  this data pointer when called. */
-RTCORE_API void rtcSetUserData (RTCScene scene, unsigned geomID, void* ptr);
-
-/*! Get pointer for user defined data per geometry based on geomID. */
-RTCORE_API void* rtcGetUserData (RTCScene scene, unsigned geomID);
-
-/*! Interpolates user data to some u/v location. The data buffer
- *  specifies per vertex data to interpolate and can be one of the
- *  RTC_VERTEX_BUFFER0/1 or RTC_USER_VERTEX_BUFFER0/1 and has to
- *  contain numFloats floating point values to interpolate for each
- *  vertex of the geometry. The dP array will get filled with the
- *  interpolated data and the dPdu and dPdv arrays with the u and v
- *  derivative of the interpolation. If the pointers dP is NULL, the
- *  value will not get calculated. If dPdu and dPdv are NULL the
- *  derivatives will not get calculated. Both dPdu and dPdv have to be
- *  either valid or NULL. The buffer has to be padded at the end such
- *  that the last element can be read safely using SSE
- *  instructions. */
-RTCORE_API void rtcInterpolate(RTCScene scene, unsigned geomID, unsigned primID, float u, float v, RTCBufferType buffer, 
-                               float* P, float* dPdu, float* dPdv, size_t numFloats);
-
-/*! Interpolates user data to some u/v location. The data buffer
- *  specifies per vertex data to interpolate and can be one of the
- *  RTC_VERTEX_BUFFER0/1 or RTC_USER_VERTEX_BUFFER0/1 and has to
- *  contain numFloats floating point values to interpolate for each
- *  vertex of the geometry. The P array will get filled with the
- *  interpolated datam the dPdu and dPdv arrays with the u and v
- *  derivative of the interpolation, and the ddPdudu, ddPdvdv, and
- *  ddPdudv arrays with the respective second derivatives. One can
- *  disable 1) the calculation of the interpolated value by setting P
- *  to NULL, 2) the calculation of the 1st order derivatives by
- *  setting dPdu and dPdv to NULL, 3) the calculation of the second
- *  order derivatives by setting ddPdudu, ddPdvdv, and ddPdudv to
- *  NULL. The buffers have to be padded at the end such that the last
- *  element can be read or written safely using SSE instructions. */
-RTCORE_API void rtcInterpolate2(RTCScene scene, unsigned geomID, unsigned primID, float u, float v, RTCBufferType buffer, 
-                                float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, size_t numFloats);
-
-/*! Interpolates user data to an array of u/v locations. The valid
- *  pointer points to an integer array that specified which entries in
- *  the u/v arrays are valid (-1 denotes valid, and 0 invalid). If the
- *  valid pointer is NULL all elements are considers valid. The data
- *  buffer specifies per vertex data to interpolate and can be one of
- *  the RTC_VERTEX_BUFFER0/1 or RTC_USER_VERTEX_BUFFER0/1 and has to
- *  contain numFloats floating point values to interpolate for each
- *  vertex of the geometry. The P array will get filled with the
- *  interpolated data, and the dPdu and dPdv arrays with the u and v
- *  derivative of the interpolation. If the pointers P is NULL, the
- *  value will not get calculated. If dPdu and dPdv are NULL the
- *  derivatives will not get calculated. Both dPdu and dPdv have to be
- *  either valid or NULL. These destination arrays are filled in
- *  structure of array (SoA) layout. The buffer has to be padded at
- *  the end such that the last element can be read safely using SSE
- *  instructions.*/
-RTCORE_API void rtcInterpolateN(RTCScene scene, unsigned geomID, 
-                                const void* valid, const unsigned* primIDs, const float* u, const float* v, size_t numUVs, 
-                                RTCBufferType buffer, 
-                                float* P, float* dPdu, float* dPdv, size_t numFloats);
-
-/*! Interpolates user data to an array of u/v locations. The valid
- *  pointer points to an integer array that specified which entries in
- *  the u/v arrays are valid (-1 denotes valid, and 0 invalid). If the
- *  valid pointer is NULL all elements are considers valid. The data
- *  buffer specifies per vertex data to interpolate and can be one of
- *  the RTC_VERTEX_BUFFER0/1 or RTC_USER_VERTEX_BUFFER0/1 and has to
- *  contain numFloats floating point values to interpolate for each
- *  vertex of the geometry. The P array will get filled with the
- *  interpolated datam the dPdu and dPdv arrays with the u and v
- *  derivative of the interpolation, and the ddPdudu, ddPdvdv, and
- *  ddPdudv arrays with the respective second derivatives. One can
- *  disable 1) the calculation of the interpolated value by setting P
- *  to NULL, 2) the calculation of the 1st order derivatives by
- *  setting dPdu and dPdv to NULL, 3) the calculation of the second
- *  order derivatives by setting ddPdudu, ddPdvdv, and ddPdudv to
- *  NULL. These destination arrays are filled in structure of array
- *  (SoA) layout. The buffer has to be padded at the end such that
- *  the last element can be read safely using SSE
- *  instructions. */
-RTCORE_API void rtcInterpolateN2(RTCScene scene, unsigned geomID, 
-                                const void* valid, const unsigned* primIDs, const float* u, const float* v, size_t numUVs, 
-                                RTCBufferType buffer, 
-                                float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, size_t numFloats);
-
-/*! \brief Deletes the geometry. */
-RTCORE_API void rtcDeleteGeometry (RTCScene scene, unsigned geomID);
-
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_geometry.isph b/3rdparty/embree/include/embree2/rtcore_geometry.isph
deleted file mode 100644
index b2fa68cf..00000000
--- a/3rdparty/embree/include/embree2/rtcore_geometry.isph
+++ /dev/null
@@ -1,405 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_GEOMETRY_ISPH__
-#define __RTCORE_GEOMETRY_ISPH__
-
-/*! \ingroup embree_kernel_api_ispc */
-/*! \{ */
-
-/*! invalid geometry ID */
-#define RTC_INVALID_GEOMETRY_ID ((uniform unsigned int)-1)
-
-/*! \brief Specifies the type of buffers when mapping buffers */
-enum RTCBufferType {
-  RTC_INDEX_BUFFER         = 0x01000000,
-
-  RTC_VERTEX_BUFFER        = 0x02000000,
-  RTC_VERTEX_BUFFER0       = 0x02000000,
-  RTC_VERTEX_BUFFER1       = 0x02000001,
-
-  RTC_USER_VERTEX_BUFFER   = 0x02100000,
-  RTC_USER_VERTEX_BUFFER0  = 0x02100000,
-  RTC_USER_VERTEX_BUFFER1  = 0x02100001,
-
-  RTC_FACE_BUFFER          = 0x03000000,
-  RTC_LEVEL_BUFFER         = 0x04000001,
-
-  RTC_EDGE_CREASE_INDEX_BUFFER = 0x05000000,
-  RTC_EDGE_CREASE_WEIGHT_BUFFER = 0x06000000,
-
-  RTC_VERTEX_CREASE_INDEX_BUFFER = 0x07000000,
-  RTC_VERTEX_CREASE_WEIGHT_BUFFER = 0x08000000,
-
-  RTC_HOLE_BUFFER          = 0x09000001,
-};
-
-/*! \brief Supported types of matrix layout for functions involving matrices */
-enum RTCMatrixType {
-  RTC_MATRIX_ROW_MAJOR = 0,
-  RTC_MATRIX_COLUMN_MAJOR = 1,
-  RTC_MATRIX_COLUMN_MAJOR_ALIGNED16 = 2,
-};
-
-/*! \brief Supported geometry flags to specify handling in dynamic scenes. */
-enum RTCGeometryFlags 
-{
-  RTC_GEOMETRY_STATIC     = 0,    //!< specifies static geometry that will change rarely
-  RTC_GEOMETRY_DEFORMABLE = 1,    //!< specifies dynamic geometry with deformable motion (BVH refit possible)
-  RTC_GEOMETRY_DYNAMIC    = 2,    //!< specifies dynamic geometry with arbitrary motion (BVH refit not possible)
-};
-
-/*! \brief Boundary interpolation mode for subdivision surfaces */
-enum RTCBoundaryMode
-{
-  RTC_BOUNDARY_NONE = 0,               //!< ignores border patches
-  RTC_BOUNDARY_EDGE_ONLY = 1,          //!< soft boundary (default)
-  RTC_BOUNDARY_EDGE_AND_CORNER = 2     //!< boundary corner vertices are sharp vertices
-};
-
-/*! Intersection filter function for uniform rays. */
-typedef void (*uniform RTCFilterFuncUniform)(void* uniform ptr,    /*!< pointer to user data */
-                                             uniform RTCRay1& ray  /*!< intersection to filter */);
-
-/*! Intersection filter function for varying rays. */
-typedef void (*uniform RTCFilterFuncVarying)(void* uniform ptr,   /*!< pointer to user data */
-                                             varying RTCRay& ray  /*!< intersection to filter */);
-
-
-/*! \brief Creates a new scene instance. 
-
-  A scene instance contains a reference to a scene to instantiate and
-  the transformation to instantiate the scene with. An implementation
-  will typically transform the ray with the inverse of the provided
-  transformation and continue traversing the ray through the provided
-  scene. If any geometry is hit, the instance ID (instID) member of
-  the ray will get set to the geometry ID of the instance. */
-uniform unsigned int rtcNewInstance (RTCScene target,           //!< the scene the instance belongs to
-                                     RTCScene source            //!< the geometry to instantiate
-  );
-
-/*! \brief Creates a new scene instance. 
-
-  A scene instance contains a reference to a scene to instantiate and
-  the transformation to instantiate the scene with. For motion blurred
-  instances, a number of timesteps can get specified (currently only 1
-  or 2 timesteps are supported). An implementation will typically
-  transform the ray with the inverse of the provided transformation
-  and continue traversing the ray through the provided scene. If any
-  geometry is hit, the instance ID (instID) member of the ray will get
-  set to the geometry ID of the instance. */
-uniform unsigned rtcNewInstance2 (RTCScene target,                  //!< the scene the instance belongs to
-                                  RTCScene source,                  //!< the scene to instantiate
-                                  uniform size_t numTimeSteps = 1); //!< number of timesteps, one matrix per timestep
-
-
-/*! \brief Sets transformation of the instance */
-void rtcSetTransform (RTCScene scene,                                  //!< scene handle
-                      uniform unsigned int geomID,                     //!< ID of geometry
-                      uniform RTCMatrixType layout,                    //!< layout of transformation matrix
-                      const uniform float* uniform xfm                 //!< pointer to transformation matrix
-                      );
-
-/*! \brief Sets transformation of the instance for specified timestep */
-void rtcSetTransform2 (RTCScene scene,                                 //!< scene handle
-                       uniform unsigned int geomID,                    //!< ID of geometry 
-                       uniform RTCMatrixType layout,                   //!< layout of transformation matrix
-                       const uniform float* uniform xfm,               //!< pointer to transformation matrix
-                       uniform size_t timeStep = 0                     //!< timestep to set the matrix for 
-  );
-
-/*! \brief Creates a new triangle mesh. The number of triangles
-  (numTriangles), number of vertices (numVertices), and number of time
-  steps (1 for normal meshes, and 2 for linear motion blur), have to
-  get specified. The triangle indices can be set be mapping and
-  writing to the index buffer (RTC_INDEX_BUFFER) and the triangle
-  vertices can be set by mapping and writing into the vertex buffer
-  (RTC_VERTEX_BUFFER). In case of linear motion blur, two vertex
-  buffers have to get filled (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1),
-  one for each time step. The index buffer has the default layout of
-  three 32 bit integer indices for each triangle. An index points to
-  the ith vertex. The vertex buffer stores single precision x,y,z
-  floating point coordinates aligned to 16 bytes. The value of the 4th
-  float used for alignment can be arbitrary. */
-uniform unsigned int rtcNewTriangleMesh (RTCScene scene,                  //!< the scene the mesh belongs to
-                                         uniform RTCGeometryFlags flags,  //!< geometry flags
-                                         uniform size_t numTriangles,     //!< number of triangles
-                                         uniform size_t numVertices,      //!< number of vertices
-                                         uniform size_t numTimeSteps = 1  //!< number of motion blur time steps
-  );
-
-/*! \brief Creates a new quad mesh. The number of quads
-  (numQuads), number of vertices (numVertices), and number of time
-  steps (1 for normal meshes, and 2 for linear motion blur), have to
-  get specified. The quad indices can be set be mapping and
-  writing to the index buffer (RTC_INDEX_BUFFER) and the quad
-  vertices can be set by mapping and writing into the vertex buffer
-  (RTC_VERTEX_BUFFER). In case of linear motion blur, two vertex
-  buffers have to get filled (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1),
-  one for each time step. The index buffer has the default layout of
-  three 32 bit integer indices for each quad. An index points to
-  the ith vertex. The vertex buffer stores single precision x,y,z
-  floating point coordinates aligned to 16 bytes. The value of the 4th
-  float used for alignment can be arbitrary. */
-uniform unsigned int rtcNewQuadMesh (RTCScene scene,                  //!< the scene the mesh belongs to
-                                     uniform RTCGeometryFlags flags,  //!< geometry flags
-                                     uniform size_t numQuads,         //!< number of quads
-                                     uniform size_t numVertices,      //!< number of vertices
-                                     uniform size_t numTimeSteps = 1  //!< number of motion blur time steps
-  );
-
-/*! \brief Creates a new subdivision mesh. The number of faces
- (numFaces), edges/indices (numEdges), vertices (numVertices), edge
- creases (numEdgeCreases), vertex creases (numVertexCreases), holes
- (numHoles), and time steps (numTimeSteps) have to get speficied at
- construction time.
-
- The following buffers have to get filled by the application: the face
- buffer (RTC_FACE_BUFFER) contains the number edges/indices (3 or 4)
- of each of the numFaces faces, the index buffer (RTC_INDEX_BUFFER)
- contains multiple (3 or 4) 32bit vertex indices for each face and
- numEdges indices in total, the vertex buffer (RTC_VERTEX_BUFFER)
- stores numVertices vertices as single precision x,y,z floating point
- coordinates aligned to 16 bytes. The value of the 4th float used for
- alignment can be arbitrary.
-
- Optionally, the application can fill the hole buffer
- (RTC_HOLE_BUFFER) with numHoles many 32 bit indices of faces that
- should be considered non-existing.
-
- Optionally, the application can fill the level buffer
- (RTC_LEVEL_BUFFER) with a tessellation level for each of the numEdges
- edges. The subdivision level is a positive floating point value, that
- specifies how many quads along the edge should get generated during
- tessellation. The tessellation level is a lower bound, thus the
- implementation is free to choose a larger level. If no level buffer
- is specified a level of 1 is used.
-
- Optionally, the application can fill the sparse edge crease buffers
- to make some edges appear sharper. The edge crease index buffer
- (RTC_EDGE_CREASE_INDEX_BUFFER) contains numEdgeCreases many pairs of
- 32 bit vertex indices that specify unoriented edges. The edge crease
- weight buffer (RTC_EDGE_CREASE_WEIGHT_BUFFER) stores for each of
- theses crease edges a positive floating point weight. The larger this
- weight, the sharper the edge. Specifying a weight of infinify is
- supported and marks an edge as infinitely sharp. Storing an edge
- multiple times with the same crease weight is allowed, but has lower
- performance. Storing the an edge multiple times with different
- crease weights results in undefined behaviour. For a stored edge
- (i,j), the reverse direction edges (j,i) does not have to get stored,
- as both are considered the same edge.
-
- Optionally, the application can fill the sparse vertex crease buffers
- to make some vertices appear sharper. The vertex crease index buffer
- (RTC_VERTEX_CREASE_INDEX_BUFFER), contains numVertexCreases many 32
- bit vertex indices to speficy a set of vertices. The vertex crease
- weight buffer (RTC_VERTEX_CREASE_WEIGHT_BUFFER) specifies for each of
- these vertices a positive floating point weight. The larger this
- weight, the sharper the vertex. Specifying a weight of infinity is
- supported and makes the vertex infinitely sharp. Storing a vertex
- multiple times with the same crease weight is allowed, but has lower
- performance. Storing a vertex multiple times with different crease
- weights results in undefined behaviour.
-
-*/
-
-uniform unsigned int rtcNewSubdivisionMesh (RTCScene scene,                //!< the scene the mesh belongs to
-                                            uniform RTCGeometryFlags flags,        //!< geometry flags
-                                            uniform size_t numFaces,               //!< number of faces
-                                            uniform size_t numEdges,               //!< number of edges
-                                            uniform size_t numVertices,            //!< number of vertices
-                                            uniform size_t numEdgeCreases,         //!< number of edge creases
-                                            uniform size_t numVertexCreases,       //!< number of vertex creases
-                                            uniform size_t numHoles,               //!< number of holes
-                                            uniform size_t numTimeSteps = 1        //!< number of motion blur time steps
-  );
-
-/*! \brief Creates a new hair geometry, consisting of multiple hairs
-  represented as cubic bezier curves with varying radii. The number of
-  curves (numCurves), number of vertices (numVertices), and number of
-  time steps (1 for normal curves, and 2 for linear motion blur), have
-  to get specified at construction time. Further, the curve index
-  buffer (RTC_INDEX_BUFFER) and the curve vertex buffer
-  (RTC_VERTEX_BUFFER) have to get set by mapping and writing to the
-  appropiate buffers. In case of linear motion blur, two vertex
-  buffers have to get filled (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1),
-  one for each time step. The index buffer has the default layout of a
-  single 32 bit integer index for each curve, that references the
-  start vertex of the curve. The vertex buffer stores 4 control points
-  per curve, each such control point consists of a single precision
-  (x,y,z) position and radius, stored in that order in
-  memory. Individual hairs are considered to be subpixel sized which
-  allows the implementation to approximate the intersection
-  calculation. This in particular means that zooming onto one hair
-  might show geometric artefacts. */
-uniform unsigned int rtcNewHairGeometry (RTCScene scene,                    //!< the scene the curves belong to
-                                         uniform RTCGeometryFlags flags,    //!< geometry flags
-                                         uniform size_t numCurves,          //!< number of curves
-                                         uniform size_t numVertices,        //!< number of vertices
-                                         uniform size_t numTimeSteps = 1    //!< number of motion blur time steps
-  );
-
-/*! Sets a uniform tessellation rate for subdiv meshes and hair
- *  geometry. For subdivision meshes the RTC_LEVEL_BUFFER can also be used
- *  optionally to set a different tessellation rate per edge.*/
-void rtcSetTessellationRate (RTCScene scene, uniform unsigned geomID, uniform float tessellationRate);
-
-/*! \brief Creates a new line segment geometry, consisting of multiple
-  segments with varying radii. The number of line segments (numSegments),
-  number of vertices (numVertices), and number of time steps (1 for
-  normal line segments, and 2 for linear motion blur), have to get
-  specified at construction time. Further, the segment index buffer
-  (RTC_INDEX_BUFFER) and the segment vertex buffer (RTC_VERTEX_BUFFER)
-  have to get set by mapping and writing to the appropiate buffers. In
-  case of linear motion blur, two vertex buffers have to get filled
-  (RTC_VERTEX_BUFFER0, RTC_VERTEX_BUFFER1), one for each time step. The
-  index buffer has the default layout of a single 32 bit integer index
-  for each line segment, that references the start vertex of the segment.
-  The vertex buffer stores 2 end points per line segment, each such point
-  consists of a single precision (x,y,z) position and radius, stored in
-  that order in memory. Individual segments are considered to be subpixel
-  sized which allows the implementation to approximate the intersection
-  calculation. This in particular means that zooming onto one line segment
-  might show geometric artefacts. */
-uniform unsigned int rtcNewLineSegments (RTCScene scene,                    //!< the scene the line segments belong to
-                                         uniform RTCGeometryFlags flags,    //!< geometry flags
-                                         uniform size_t numSegments,        //!< number of line segments
-                                         uniform size_t numVertices,        //!< number of vertices
-                                         uniform size_t numTimeSteps = 1    //!< number of motion blur time steps
-  );
-
-/*! \brief Sets 32 bit ray mask. */
-void rtcSetMask (RTCScene scene, uniform unsigned int geomID, uniform int mask);
-
-/*! \brief Sets boundary interpolation mode for subdivision surfaces */                                                                        
-void rtcSetBoundaryMode(RTCScene scene, uniform unsigned int geomID, uniform RTCBoundaryMode mode);
-
-/*! \brief Maps specified buffer. This function can be used to set index and
- *  vertex buffers of geometries. */
-void* uniform rtcMapBuffer(RTCScene scene, uniform unsigned int geomID, uniform RTCBufferType type);
-
-/*! \brief Unmaps specified buffer. 
-
-  A buffer has to be unmapped before the rtcEnable, rtcDisable,
-  rtcUpdate, or rtcDeleteGeometry calls are executed. */
-void rtcUnmapBuffer(RTCScene scene, uniform unsigned int geomID, uniform RTCBufferType type);
-
-/*! \brief Shares a data buffer between the application and
- *  Embree. The passed buffer is used by Embree to store index and
- *  vertex data. It has to remain valid as long as the mesh exists,
- *  and the user is responsible to free the data when the mesh gets
- *  deleted. One can optionally speficy a byte offset and byte stride
- *  of the elements stored inside the buffer. The addresses
- *  ptr+offset+i*stride have to be aligned to 4 bytes on Xeon CPUs and
- *  16 bytes on Xeon Phi accelerators. For vertex buffers, the 4 bytes
- *  after the z-coordinate of the last vertex have to be readable memory,
- *  thus padding is required for some layouts. If this function is not
- *  called, Embree will allocate and manage buffers of the default
- *  layout. */
-void rtcSetBuffer(RTCScene scene, uniform unsigned int geomID, uniform RTCBufferType type, 
-                  const void* uniform ptr, uniform size_t byteOffset, uniform size_t byteStride);
-
-/*! \brief Enable geometry. Enabled geometry can be hit by a ray. */
-void rtcEnable (RTCScene scene, uniform unsigned int geomID);
-
-/*! \brief Update spefific geometry buffer. 
-
-  Each time geometry buffers got modified, the user has to call some
-  update function to tell the ray tracing engine which buffers got
-  modified. The rtcUpdateBuffer function taggs a specific buffer of
-  some geometry as modified. */
-void rtcUpdate (RTCScene scene, uniform unsigned int geomID);
-
-/*! \brief Update spefific geometry buffer. 
-
-  Each time geometry buffers got modified, the user has to call some
-  update function to tell the ray tracing engine which buffers got
-  modified. The rtcUpdateBuffer function taggs a specific buffer of
-  some geometry as modified. */
-void rtcUpdateBuffer (RTCScene scene, uniform unsigned int geomID, uniform RTCBufferType type);
-
-/*! \brief Disable geometry. 
-
-  Disabled geometry is not hit by any ray. Disabling and enabling
-  geometry gives higher performance than deleting and recreating
-  geometry. */
-void rtcDisable (RTCScene scene, uniform unsigned int geomID);
-
-/*! \brief Sets the intersection filter function for uniform rays. */
-void rtcSetIntersectionFilterFunction1 (RTCScene scene, uniform unsigned int geomID, uniform RTCFilterFuncUniform func);
-
-/*! \brief Sets the intersection filter function for varying rays. */
-void rtcSetIntersectionFilterFunction (RTCScene scene, uniform unsigned int geomID, uniform RTCFilterFuncVarying func);
-
-/*! \brief Sets the occlusion filter function for uniform rays. */
-void rtcSetOcclusionFilterFunction1 (RTCScene scene, uniform unsigned int geomID, uniform RTCFilterFuncUniform func);
-
-/*! \brief Sets the occlusion filter function for varying rays. */
-void rtcSetOcclusionFilterFunction (RTCScene scene, uniform unsigned int geomID, uniform RTCFilterFuncVarying func);
-
-/*! Set pointer for user defined data per geometry. Invokations
- *  of the various user intersect and occluded functions get passed
- *  this data pointer when called. */
-void rtcSetUserData (RTCScene scene, uniform unsigned int geomID, void* uniform ptr);
-
-/*! Get pointer for user defined data per geometry based on geomID. */
-void* uniform rtcGetUserData (RTCScene scene, uniform unsigned int geomID);
-
-/*! Interpolates user data to some varying u/v location. The data
- *  buffer specifies per vertex data to interpolate and can be one of
- *  the RTC_VERTEX_BUFFER0/1 or RTC_USER_VERTEX_BUFFER0/1 and has to contain
- *  numFloats floating point values to interpolate for each vertex of
- *  the geometry. The P array will get filled with the interpolated
- *  data, and the dPdu and dPdv arrays with the u and v derivative of
- *  the interpolation. If the pointers P is NULL, the value will not
- *  get calculated. If dPdu and dPdv are NULL the derivatives will not
- *  get calculated. Both dPdu and dPdv have to be either valid or
- *  NULL. These destination arrays are filled in structure of array
- *  (SoA) layout. The buffer has to be padded at the end such
- *  that the last element can be read safely using SSE
- *  instructions. */
-void rtcInterpolate(RTCScene scene, uniform unsigned int geomID, varying unsigned int primIDs, varying float u, varying float v, 
-                    uniform RTCBufferType buffer,
-                    varying float* uniform P, varying float* uniform dPdu, varying float* uniform dPdv, uniform size_t numFloats);
-
-/*! Interpolates user data to some varying u/v location. The data
- *  buffer specifies per vertex data to interpolate and can be one of
- *  the RTC_VERTEX_BUFFER0/1 or RTC_USER_VERTEX_BUFFER0/1 and has to contain
- *  numFloats floating point values to interpolate for each vertex of
- *  the geometry. The P array will get filled with the
- *  interpolated datam the dPdu and dPdv arrays with the u and v
- *  derivative of the interpolation, and the ddPdudu, ddPdvdv, and
- *  ddPdudv arrays with the respective second derivatives. One can
- *  disable 1) the calculation of the interpolated value by setting P
- *  to NULL, 2) the calculation of the 1st order derivatives by
- *  setting dPdu and dPdv to NULL, 3) the calculation of the second
- *  order derivatives by setting ddPdudu, ddPdvdv, and ddPdudv to
- *  NULL. These destination arrays are filled in structure of array
- *  (SoA) layout. The buffer has to be padded at the end such that
- *  the last element can be read safely using SSE
- *  instructions. */
-void rtcInterpolate2(RTCScene scene, uniform unsigned int geomID, varying unsigned int primIDs, varying float u, varying float v, 
-                    uniform RTCBufferType buffer,
-                    varying float* uniform P, varying float* uniform dPdu, varying float* uniform dPdv,
-                    varying float* uniform ddPdudu, varying float* uniform ddPdvdv, varying float* uniform ddPdudv,
-                    uniform size_t numFloats);
-
-/*! \brief Deletes the geometry. */
-void rtcDeleteGeometry (RTCScene scene, uniform unsigned int geomID);
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_geometry_user.h b/3rdparty/embree/include/embree2/rtcore_geometry_user.h
deleted file mode 100644
index e4b4d4b8..00000000
--- a/3rdparty/embree/include/embree2/rtcore_geometry_user.h
+++ /dev/null
@@ -1,154 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_USER_GEOMETRY_H__
-#define __RTCORE_USER_GEOMETRY_H__
-
-/*! \ingroup embree_kernel_api */
-/*! \{ */
-
-/*! Type of bounding function. */
-typedef void (*RTCBoundsFunc)(void* ptr,              /*!< pointer to user data */
-                              size_t item,            /*!< item to calculate bounds for */
-                              RTCBounds& bounds_o     /*!< returns calculated bounds */);
-
-/*! Type of bounding function. */
-typedef void (*RTCBoundsFunc2)(void* userPtr,         /*!< pointer to user data */
-                               void* geomUserPtr,     /*!< pointer to geometry user data */
-                               size_t item,           /*!< item to calculate bounds for */
-                               RTCBounds* bounds_o    /*!< returns calculated bounds */);
-
-/*! Type of intersect function pointer for single rays. */
-typedef void (*RTCIntersectFunc)(void* ptr,           /*!< pointer to user data */
-                                 RTCRay& ray,         /*!< ray to intersect */
-                                 size_t item          /*!< item to intersect */);
-
-/*! Type of intersect function pointer for ray packets of size 4. */
-typedef void (*RTCIntersectFunc4)(const void* valid,  /*!< pointer to valid mask */
-                                  void* ptr,          /*!< pointer to user data */
-                                  RTCRay4& ray,       /*!< ray packet to intersect */
-                                  size_t item         /*!< item to intersect */);
-
-/*! Type of intersect function pointer for ray packets of size 8. */
-typedef void (*RTCIntersectFunc8)(const void* valid,  /*!< pointer to valid mask */
-                                  void* ptr,          /*!< pointer to user data */
-                                  RTCRay8& ray,       /*!< ray packet to intersect */
-                                  size_t item         /*!< item to intersect */);
-
-/*! Type of intersect function pointer for ray packets of size 16. */
-typedef void (*RTCIntersectFunc16)(const void* valid, /*!< pointer to valid mask */
-                                   void* ptr,         /*!< pointer to user data */
-                                   RTCRay16& ray,     /*!< ray packet to intersect */
-                                   size_t item        /*!< item to intersect */);
-
-/*! Type of occlusion function pointer for single rays. */
-typedef void (*RTCOccludedFunc) (void* ptr,           /*!< pointer to user data */ 
-                                 RTCRay& ray,         /*!< ray to test occlusion */
-                                 size_t item          /*!< item to test for occlusion */);
-
-/*! Type of occlusion function pointer for ray packets of size 4. */
-typedef void (*RTCOccludedFunc4) (const void* valid,  /*! pointer to valid mask */
-                                  void* ptr,          /*!< pointer to user data */
-                                  RTCRay4& ray,       /*!< Ray packet to test occlusion. */
-                                  size_t item         /*!< item to test for occlusion */);
-
-/*! Type of occlusion function pointer for ray packets of size 8. */
-typedef void (*RTCOccludedFunc8) (const void* valid,  /*! pointer to valid mask */
-                                  void* ptr,          /*!< pointer to user data */
-                                  RTCRay8& ray,       /*!< Ray packet to test occlusion. */
-                                  size_t item         /*!< item to test for occlusion */);
-
-/*! Type of occlusion function pointer for ray packets of size 16. */
-typedef void (*RTCOccludedFunc16) (const void* valid, /*! pointer to valid mask */
-                                   void* ptr,         /*!< pointer to user data */
-                                   RTCRay16& ray,     /*!< Ray packet to test occlusion. */
-                                   size_t item        /*!< item to test for occlusion */);
-
-/*! Creates a new user geometry object. This feature makes it possible
- *  to add arbitrary types of geometry to the scene by providing
- *  appropiate bounding, intersect and occluded functions. A user
- *  geometry object is a set of user geometries. As the rtcIntersect
- *  and rtcOccluded functions support different ray packet sizes, the
- *  user also has to provide different versions of intersect and
- *  occluded function pointers for these packet sizes. However, the
- *  ray packet size of the called function pointer always matches the
- *  packet size of the originally invoked rtcIntersect and rtcOccluded
- *  functions. A user data pointer, that points to a user specified
- *  representation of the geometry, is passed to each intersect and
- *  occluded function invokation, as well as the index of the geometry
- *  of the set to intersect. */
-RTCORE_API unsigned rtcNewUserGeometry (RTCScene scene,           /*!< the scene the user geometry set is created in */
-                                        size_t numGeometries      /*!< the number of geometries contained in the set */);
-
-RTCORE_API unsigned rtcNewUserGeometry2 (RTCScene scene,          /*!< the scene the user geometry set is created in */
-                                         size_t numGeometries,    /*!< the number of geometries contained in the set */
-                                         size_t numTimeSteps = 1  /*!< number of motion blur time steps */);
-
-/*! Sets the bounding function to calculate bounding boxes of the user
- *  geometry items when building spatial index structures. The
- *  calculated bounding box have to be conservative and should be
- *  tight. */
-RTCORE_API void rtcSetBoundsFunction (RTCScene scene, unsigned geomID, RTCBoundsFunc bounds);
-
-/*! Sets the bounding function to calculate bounding boxes of the user
- *  geometry items when building spatial index structures. The
- *  calculated bounding box have to be conservative and should be
- *  tight. */
-RTCORE_API void rtcSetBoundsFunction2 (RTCScene scene, unsigned geomID, RTCBoundsFunc2 bounds, void* userPtr);
-
-/*! Set intersect function for single rays. The rtcIntersect function
- *  will call the passed function for intersecting the user
- *  geometry. */
-RTCORE_API void rtcSetIntersectFunction (RTCScene scene, unsigned geomID, RTCIntersectFunc intersect);
-
-/*! Set intersect function for ray packets of size 4. The
- *  rtcIntersect4 function will call the passed function for
- *  intersecting the user geometry. */
-RTCORE_API void rtcSetIntersectFunction4 (RTCScene scene, unsigned geomID, RTCIntersectFunc4 intersect4);
-
-/*! Set intersect function for ray packets of size 8. The
- *  rtcIntersect8 function will call the passed function for
- *  intersecting the user geometry.*/
-RTCORE_API void rtcSetIntersectFunction8 (RTCScene scene, unsigned geomID, RTCIntersectFunc8 intersect8);
-
-/*! Set intersect function for ray packets of size 16. The
- *  rtcIntersect16 function will call the passed function for
- *  intersecting the user geometry. */
-RTCORE_API void rtcSetIntersectFunction16 (RTCScene scene, unsigned geomID, RTCIntersectFunc16 intersect16);
-
-/*! Set occlusion function for single rays. The rtcOccluded function
- *  will call the passed function for intersecting the user
- *  geometry. */
-RTCORE_API void rtcSetOccludedFunction (RTCScene scene, unsigned geomID, RTCOccludedFunc occluded);
-
-/*! Set occlusion function for ray packets of size 4. The rtcOccluded4
- *  function will call the passed function for intersecting the user
- *  geometry. */
-RTCORE_API void rtcSetOccludedFunction4 (RTCScene scene, unsigned geomID, RTCOccludedFunc4 occluded4);
-
-/*! Set occlusion function for ray packets of size 8. The rtcOccluded8
- *  function will call the passed function for intersecting the user
- *  geometry. */
-RTCORE_API void rtcSetOccludedFunction8 (RTCScene scene, unsigned geomID, RTCOccludedFunc8 occluded8);
-
-/*! Set occlusion function for ray packets of size 16. The
- *  rtcOccluded16 function will call the passed function for
- *  intersecting the user geometry. */
-RTCORE_API void rtcSetOccludedFunction16 (RTCScene scene, unsigned geomID, RTCOccludedFunc16 occluded16);
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_geometry_user.isph b/3rdparty/embree/include/embree2/rtcore_geometry_user.isph
deleted file mode 100644
index d89ec1b9..00000000
--- a/3rdparty/embree/include/embree2/rtcore_geometry_user.isph
+++ /dev/null
@@ -1,128 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_USER_GEOMETRY_ISPH__
-#define __RTCORE_USER_GEOMETRY_ISPH__
-
-/*! \ingroup embree_kernel_api_ispc */
-/*! \{ */
-
-/*! Type of bounding function. */
-typedef void (*RTCBoundsFunc)(void* uniform ptr,                 /*!< pointer to user data */
-                              uniform size_t item,               /*!< item to calculate bounds for */
-                              uniform RTCBounds& bounds_o        /*!< returns calculated bounds */);
-
-/*! Type of bounding function. */
-typedef void (*RTCBoundsFunc2)(void* uniform userPtr,            /*!< pointer to user data */
-                               void* uniform geomUserPtr,        /*!< pointer to geometry user data */
-                               uniform size_t item,              /*!< item to calculate bounds for */
-                               RTCBounds* uniform bounds_o       /*!< returns calculated bounds */);
-
-/*! Type of intersect function pointer for uniform rays. */
-typedef void (*RTCIntersectFuncUniform)(void* uniform ptr,       /*!< pointer to user data */
-                                        uniform RTCRay1& ray,    /*!< ray to intersect */
-                                        uniform size_t item      /*< item to intersect */);
-
-/*! Type of intersect function pointer for varying rays. */
-typedef void (*RTCIntersectFuncVarying)(void* uniform ptr,       /*!< pointer to user data */
-                                        varying RTCRay& ray,     /*!< ray to intersect */
-                                        uniform size_t item      /*< item to intersect */);
-
-/*! Type of occlusion function pointer for uniform rays. */
-typedef void (*RTCOccludedFuncUniform) (void* uniform ptr,       /*!< pointer to user data */ 
-                                        uniform RTCRay1& ray,    /*!< ray to test occlusion */
-                                        uniform size_t item      /*< item to test for occlusion */);
-
-
-/*! Type of occlusion function pointer for varying rays. */
-typedef void (*RTCOccludedFuncVarying) (void* uniform ptr,       /*!< pointer to user data */ 
-                                        varying RTCRay& ray,     /*!< ray to test occlusion */
-                                        uniform size_t item      /*< item to test for occlusion */);
-
-
-typedef void (*RTCDisplacementFunc)(void* uniform ptr,               /*!< pointer to user data of geometry */
-                                    uniform unsigned int geomID,     /*!< ID of geometry to displace */
-                                    uniform unsigned int primID,     /*!< ID of primitive of geometry to displace */
-                                    uniform const float* uniform u,  /*!< u coordinates (source) */
-                                    uniform const float* uniform v,  /*!< v coordinates (source) */
-                                    uniform const float* uniform nx, /*!< x coordinates of normal at point to displace (source) */
-                                    uniform const float* uniform ny, /*!< y coordinates of normal at point to displace (source) */
-                                    uniform const float* uniform nz, /*!< z coordinates of normal at point to displace (source) */
-                                    uniform float* uniform px,       /*!< x coordinates of points to displace (source and target) */
-                                    uniform float* uniform py,       /*!< y coordinates of points to displace (source and target) */
-                                    uniform float* uniform pz,       /*!< z coordinates of points to displace (source and target) */
-                                    uniform size_t N                 /*!< number of points to displace */ );
-
-
-/*! Creates a new user geometry object. This feature makes it possible
- *  to add arbitrary types of geometry to the scene by providing
- *  appropiate intersect and occluded functions, as well as a bounding
- *  box of the implemented geometry. As the rtcIntersect and
- *  rtcOccluded functions support different ray packet sizes, the user
- *  also has to provide different versions of intersect and occluded
- *  function pointers for the different packet sized. However, only
- *  rtcIntersect and rtcOccluded functions of specific packet sizes
- *  are called, it is sufficient to provide only the corresponding
- *  function pointer for the user geometry. However, the functions
- *  provided have to intersect the same geometry. A user data pointer,
- *  that points to a user specified representation of the geometry, is
- *  passed to each intersect and occluded function invokation. */
-uniform unsigned int rtcNewUserGeometry (RTCScene scene,                  /*!< the scene the user geometry set is created in */
-                                         uniform size_t numGeometries     /*!< the number of geometries contained in the set */);
-
-uniform unsigned int rtcNewUserGeometry2 (RTCScene scene,                 /*!< the scene the user geometry set is created in */
-                                          uniform size_t numGeometries,    /*!< the number of geometries contained in the set */
-                                          uniform size_t numTimeSteps = 1  /*!< number of motion blur time steps */);
-
-/*! Sets the bounding function to calculate bounding boxes of the user
- *  geometry items when building spatial index structures. The
- *  calculated bounding box have to be conservative and should be
- *  tight.*/
-void rtcSetBoundsFunction (RTCScene scene, uniform unsigned int geomID, uniform RTCBoundsFunc bounds);
-
-/*! Sets the bounding function to calculate bounding boxes of the user
- *  geometry items when building spatial index structures. The
- *  calculated bounding box have to be conservative and should be
- *  tight.*/
-void rtcSetBoundsFunction2 (RTCScene scene, uniform unsigned int geomID, uniform RTCBoundsFunc2 bounds, void* uniform userPtr);
-
-/*! Set intersect function for uniform rays. The rtcIntersect1
- *  function will call the passed function for intersecting the user
- *  geometry. */
-void rtcSetIntersectFunction1 (RTCScene scene, uniform unsigned int geomID, uniform RTCIntersectFuncUniform intersect);
-
-/*! Set intersect function for varying rays. The rtcIntersect function
- *  will call the passed function for intersecting the user
- *  geometry. */
-void rtcSetIntersectFunction (RTCScene scene, uniform unsigned int geomID, uniform RTCIntersectFuncVarying intersect);
-
-/*! Set occlusion function for uniform rays. The rtcOccluded1 function
- *  will call the passed function for intersecting the user
- *  geometry. */
-void rtcSetOccludedFunction1 (RTCScene scene, uniform unsigned int geomID, uniform RTCOccludedFuncUniform occluded);
-
-/*! Set occlusion function for varying rays. The rtcOccluded function
- *  will call the passed function for intersecting the user
- *  geometry. */
-void rtcSetOccludedFunction (RTCScene scene, uniform unsigned int geomID, uniform RTCOccludedFuncVarying occluded);
-
-
-/*! \brief Sets the displacement function. */
-void rtcSetDisplacementFunction (RTCScene scene, uniform unsigned int geomID, uniform RTCDisplacementFunc func, uniform RTCBounds *uniform bounds);
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_ray.h b/3rdparty/embree/include/embree2/rtcore_ray.h
deleted file mode 100644
index f20b11b5..00000000
--- a/3rdparty/embree/include/embree2/rtcore_ray.h
+++ /dev/null
@@ -1,195 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_RAY_H__
-#define __RTCORE_RAY_H__
-
-/*! \ingroup embree_kernel_api */
-/*! \{ */
-
-/*! \brief Ray structure for an individual ray */
-struct RTCORE_ALIGN(16)  RTCRay
-{
-  /* ray data */
-public:
-  float org[3];      //!< Ray origin
-  float align0;
-  
-  float dir[3];      //!< Ray direction
-  float align1;
-  
-  float tnear;       //!< Start of ray segment
-  float tfar;        //!< End of ray segment (set to hit distance)
-
-  float time;        //!< Time of this ray for motion blur
-  unsigned mask;        //!< Used to mask out objects during traversal
-  
-  /* hit data */
-public:
-  float Ng[3];       //!< Unnormalized geometry normal
-  float align2;
-  
-  float u;           //!< Barycentric u coordinate of hit
-  float v;           //!< Barycentric v coordinate of hit
-
-  unsigned geomID;        //!< geometry ID
-  unsigned primID;        //!< primitive ID
-  unsigned instID;        //!< instance ID
-};
-
-/*! Ray structure for packets of 4 rays. */
-struct RTCORE_ALIGN(16) RTCRay4
-{
-  /* ray data */
-public:
-  float orgx[4];  //!< x coordinate of ray origin
-  float orgy[4];  //!< y coordinate of ray origin
-  float orgz[4];  //!< z coordinate of ray origin
-  
-  float dirx[4];  //!< x coordinate of ray direction
-  float diry[4];  //!< y coordinate of ray direction
-  float dirz[4];  //!< z coordinate of ray direction
-  
-  float tnear[4]; //!< Start of ray segment 
-  float tfar[4];  //!< End of ray segment (set to hit distance)
-
-  float time[4];  //!< Time of this ray for motion blur
-  unsigned mask[4];  //!< Used to mask out objects during traversal
-  
-  /* hit data */
-public:
-  float Ngx[4];   //!< x coordinate of geometry normal
-  float Ngy[4];   //!< y coordinate of geometry normal
-  float Ngz[4];   //!< z coordinate of geometry normal
-  
-  float u[4];     //!< Barycentric u coordinate of hit
-  float v[4];     //!< Barycentric v coordinate of hit
-  
-  unsigned geomID[4];  //!< geometry ID
-  unsigned primID[4];  //!< primitive ID
-  unsigned instID[4];  //!< instance ID
-};
-
-/*! Ray structure for packets of 8 rays. */
-struct RTCORE_ALIGN(32) RTCRay8
-{
-  /* ray data */
-public:
-  float orgx[8];  //!< x coordinate of ray origin
-  float orgy[8];  //!< y coordinate of ray origin
-  float orgz[8];  //!< z coordinate of ray origin
-  
-  float dirx[8];  //!< x coordinate of ray direction
-  float diry[8];  //!< y coordinate of ray direction
-  float dirz[8];  //!< z coordinate of ray direction
-  
-  float tnear[8]; //!< Start of ray segment 
-  float tfar[8];  //!< End of ray segment (set to hit distance)
-
-  float time[8];  //!< Time of this ray for motion blur
-  unsigned mask[8];  //!< Used to mask out objects during traversal
-  
-  /* hit data */
-public:
-  float Ngx[8];   //!< x coordinate of geometry normal
-  float Ngy[8];   //!< y coordinate of geometry normal
-  float Ngz[8];   //!< z coordinate of geometry normal
-  
-  float u[8];     //!< Barycentric u coordinate of hit
-  float v[8];     //!< Barycentric v coordinate of hit
-  
-  unsigned geomID[8];  //!< geometry ID
-  unsigned primID[8];  //!< primitive ID
-  unsigned instID[8];  //!< instance ID
-};
-
-/*! \brief Ray structure for packets of 16 rays. */
-struct RTCORE_ALIGN(64) RTCRay16
-{
-  /* ray data */
-public:
-  float orgx[16];  //!< x coordinate of ray origin
-  float orgy[16];  //!< y coordinate of ray origin
-  float orgz[16];  //!< z coordinate of ray origin
-  
-  float dirx[16];  //!< x coordinate of ray direction
-  float diry[16];  //!< y coordinate of ray direction
-  float dirz[16];  //!< z coordinate of ray direction
-  
-  float tnear[16]; //!< Start of ray segment 
-  float tfar[16];  //!< End of ray segment (set to hit distance)
-
-  float time[16];  //!< Time of this ray for motion blur
-  unsigned mask[16];  //!< Used to mask out objects during traversal
-  
-  /* hit data */
-public:
-  float Ngx[16];   //!< x coordinate of geometry normal
-  float Ngy[16];   //!< y coordinate of geometry normal
-  float Ngz[16];   //!< z coordinate of geometry normal
-  
-  float u[16];     //!< Barycentric u coordinate of hit
-  float v[16];     //!< Barycentric v coordinate of hit
-  
-  unsigned geomID[16];  //!< geometry ID
-  unsigned primID[16];  //!< primitive ID
-  unsigned instID[16];  //!< instance ID
-};
-
-
-/*! \brief Ray structure template for packets of N rays in SOA layout. */
-struct RTCRaySOA
-{
-  /* ray data */
-public:
-
-  float* orgx;  //!< x coordinate of ray origin
-  float* orgy;  //!< y coordinate of ray origin
-  float* orgz;  //!< z coordinate of ray origin
-
-  float* dirx;  //!< x coordinate of ray direction
-  float* diry;  //!< y coordinate of ray direction
-  float* dirz;  //!< z coordinate of ray direction
-
-  float* tnear; //!< Start of ray segment (optional)
-  float* tfar;  //!< End of ray segment (set to hit distance)
-
- 
-  float* time;  //!< Time of this ray for motion blur (optional)
-  unsigned* mask;  //!< Used to mask out objects during traversal (optional)
-
-  /* hit data */
-
-public:
-
-  float* Ngx;   //!< x coordinate of geometry normal (optional)
-  float* Ngy;   //!< y coordinate of geometry normal (optional)
-  float* Ngz;   //!< z coordinate of geometry normal (optional)
-
- 
-
-  float* u;     //!< Barycentric u coordinate of hit
-  float* v;     //!< Barycentric v coordinate of hit
-
- 
-  unsigned* geomID;  //!< geometry ID
-  unsigned* primID;  //!< primitive ID
-  unsigned* instID;  //!< instance ID (optional)
-};
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_ray.isph b/3rdparty/embree/include/embree2/rtcore_ray.isph
deleted file mode 100644
index f1bf4d26..00000000
--- a/3rdparty/embree/include/embree2/rtcore_ray.isph
+++ /dev/null
@@ -1,117 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_RAY_ISPH__
-#define __RTCORE_RAY_ISPH__
-
-/*! \ingroup embree_kernel_api_ispc */
-/*! \{ */
-
-/*! Ray structure for uniform (single) rays. */
-struct RTCRay1 
-{
-  /* ray data */
-  float org[3];      //!< Ray origin
-  float align0;      //!< unused member to force alignment of following members
-  
-  float dir[3];      //!< Ray direction
-  float align1;      //!< unused member to force alignment of following members
-  
-  float tnear;       //!< Start of ray segment
-  float tfar;        //!< End of ray segment (set to hit distance)
-  float time;        //!< Time of this ray for motion blur
-  unsigned   mask;        //!< Used to mask out objects during traversal
-  
-  /* hit data */
-  float Ng[3];       //!< Unnormalized geometry normal
-  float align2;
-  
-  float u;           //!< Barycentric u coordinate of hit
-  float v;           //!< Barycentric v coordinate of hit
-
-  unsigned geomID;        //!< geometry ID
-  unsigned primID;        //!< primitive ID
-  unsigned instID;        //!< instance ID
-  varying unsigned align[0];  //!< aligns ray on stack to at least 16 bytes
-};
-
-/*! Ray structure for packets of 4 rays. */
-struct RTCRay
-{
-  /* ray data */
-  float orgx;  //!< x coordinate of ray origin
-  float orgy;  //!< y coordinate of ray origin
-  float orgz;  //!< z coordinate of ray origin
-  
-  float dirx;  //!< x coordinate of ray direction
-  float diry;  //!< y coordinate of ray direction
-  float dirz;  //!< z coordinate of ray direction
-  
-  float tnear; //!< Start of ray segment 
-  float tfar;  //!< End of ray segment   
-  float time;  //!< Time of this ray for motion blur
-  unsigned mask;  //!< Used to mask out objects during traversal
-  
-  /* hit data */
-  float Ngx;   //!< x coordinate of geometry normal
-  float Ngy;   //!< y coordinate of geometry normal
-  float Ngz;   //!< z coordinate of geometry normal
-  
-  float u;     //!< Barycentric u coordinate of hit
-  float v;     //!< Barycentric v coordinate of hit
-  
-  unsigned geomID;     //!< geometry ID
-  unsigned primID;     //!< primitive ID
-  unsigned instID;     //!< instance ID
-};
-
-
-struct RTCRaySOA
-{
-  /* ray data */
-
-  uniform float* uniform orgx;  //!< x coordinate of ray origin
-  uniform float* uniform orgy;  //!< y coordinate of ray origin
-  uniform float* uniform orgz;  //!< z coordinate of ray origin
-
-  uniform float* uniform dirx;  //!< x coordinate of ray direction
-  uniform float* uniform diry;  //!< y coordinate of ray direction
-  uniform float* uniform dirz;  //!< z coordinate of ray direction
-
-  uniform float* uniform tnear; //!< Start of ray segment (optional)
-  uniform float* uniform tfar;  //!< End of ray segment (set to hit distance)
- 
-  uniform float* uniform time;  //!< Time of this ray for motion blur (optional)
-  uniform unsigned* uniform mask;  //!< Used to mask out objects during traversal (optional)
-
-  /* hit data */
-
-  uniform float* uniform Ngx;   //!< x coordinate of geometry normal (optional)
-  uniform float* uniform Ngy;   //!< y coordinate of geometry normal (optional)
-  uniform float* uniform Ngz;   //!< z coordinate of geometry normal (optional)
-
-  uniform float* uniform u;     //!< Barycentric u coordinate of hit
-  uniform float* uniform v;     //!< Barycentric v coordinate of hit
- 
-  uniform unsigned* uniform geomID;  //!< geometry ID
-  uniform unsigned* uniform primID;  //!< primitive ID
-  uniform unsigned* uniform instID;  //!< instance ID (optional)
-};
-
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_scene.h b/3rdparty/embree/include/embree2/rtcore_scene.h
deleted file mode 100644
index df04e0a2..00000000
--- a/3rdparty/embree/include/embree2/rtcore_scene.h
+++ /dev/null
@@ -1,187 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_SCENE_H__
-#define __RTCORE_SCENE_H__
-
-/*! \ingroup embree_kernel_api */
-/*! \{ */
-
-/*! forward declarations for ray structures */
-struct RTCRay;
-struct RTCRay4;
-struct RTCRay8;
-struct RTCRay16;
-struct RTCRaySOA;
-
-/*! scene flags */
-enum RTCSceneFlags 
-{
-  /* dynamic type flags */
-  RTC_SCENE_STATIC     = (0 << 0),    //!< specifies static scene
-  RTC_SCENE_DYNAMIC    = (1 << 0),    //!< specifies dynamic scene
-
-  /* acceleration structure flags */
-  RTC_SCENE_COMPACT    = (1 << 8),    //!< use memory conservative data structures
-  RTC_SCENE_COHERENT   = (1 << 9),    //!< optimize data structures for coherent rays
-  RTC_SCENE_INCOHERENT = (1 << 10),    //!< optimize data structures for in-coherent rays (enabled by default)
-  RTC_SCENE_HIGH_QUALITY = (1 << 11),  //!< create higher quality data structures
-
-  /* traversal algorithm flags */
-  RTC_SCENE_ROBUST     = (1 << 16)     //!< use more robust traversal algorithms
-};
-
-/*! enabled algorithm flags */
-enum RTCAlgorithmFlags 
-{
-  RTC_INTERSECT1 = (1 << 0),    //!< enables the rtcIntersect1 and rtcOccluded1 functions for this scene
-  RTC_INTERSECT4 = (1 << 1),    //!< enables the rtcIntersect4 and rtcOccluded4 functions for this scene
-  RTC_INTERSECT8 = (1 << 2),    //!< enables the rtcIntersect8 and rtcOccluded8 functions for this scene
-  RTC_INTERSECT16 = (1 << 3),   //!< enables the rtcIntersect16 and rtcOccluded16 functions for this scene
-  RTC_INTERPOLATE = (1 << 4),   //!< enables the rtcInterpolate function for this scene
-
-  RTC_INTERSECTN = (1 << 5),    //!< enables the rtcIntersectN and rtcOccludedN functions for this scene  
-};
-
-/*! layout flags for ray streams */
-enum RTCRayNFlags
-{
-  RTC_RAYN_DEFAULT = (1 << 0)
-};
-
-
-/*! \brief Defines an opaque scene type */
-typedef struct __RTCScene {}* RTCScene;
-
-/*! Creates a new scene. 
-   WARNING: This function is deprecated, use rtcDeviceNewScene instead.
-*/
-RTCORE_API RTCORE_DEPRECATED RTCScene rtcNewScene (RTCSceneFlags flags, RTCAlgorithmFlags aflags);
-
-/*! Creates a new scene. */
-RTCORE_API RTCScene rtcDeviceNewScene (RTCDevice device, RTCSceneFlags flags, RTCAlgorithmFlags aflags);
-
-/*! \brief Type of progress callback function. */
-typedef bool (*RTCProgressMonitorFunc)(void* ptr, const double n);
-RTCORE_DEPRECATED typedef RTCProgressMonitorFunc RTC_PROGRESS_MONITOR_FUNCTION;
-
-/*! \brief Sets the progress callback function which is called during hierarchy build of this scene. */
-RTCORE_API void rtcSetProgressMonitorFunction(RTCScene scene, RTCProgressMonitorFunc func, void* ptr);
-
-/*! Commits the geometry of the scene. After initializing or modifying
- *  geometries, commit has to get called before tracing
- *  rays. */
-RTCORE_API void rtcCommit (RTCScene scene);
-
-/*! Commits the geometry of the scene. The calling threads will be
- *  used internally as a worker threads on some implementations. The
- *  function will wait until 'numThreads' threads have called this
- *  function and all threads return from the function after the scene
- *  commit is finished. The application threads will not be used as
- *  worker threads when the TBB tasking system is enabled (which is
- *  the default). On CPUs, we recommend also using TBB inside your
- *  application to share threads. We recommend using the
- *  rtcCommitThread feature to share threads on the Xeon Phi
- *  coprocessor. */
-RTCORE_API void rtcCommitThread(RTCScene scene, unsigned int threadID, unsigned int numThreads);
-
-/*! Returns to AABB of the scene. rtcCommit has to get called
- *  previously to this function. */
-RTCORE_API void rtcGetBounds(RTCScene scene, RTCBounds& bounds_o);
-
-/*! Intersects a single ray with the scene. The ray has to be aligned
- *  to 16 bytes. This function can only be called for scenes with the
- *  RTC_INTERSECT1 flag set. */
-RTCORE_API void rtcIntersect (RTCScene scene, RTCRay& ray);
-
-/*! Intersects a packet of 4 rays with the scene. The valid mask and
- *  ray have both to be aligned to 16 bytes. This function can only be
- *  called for scenes with the RTC_INTERSECT4 flag set. */
-RTCORE_API void rtcIntersect4 (const void* valid, RTCScene scene, RTCRay4& ray);
-
-/*! Intersects a packet of 8 rays with the scene. The valid mask and
- *  ray have both to be aligned to 32 bytes. This function can only be
- *  called for scenes with the RTC_INTERSECT8 flag set. For performance
- *  reasons, the rtcIntersect8 function should only get called if the
- *  CPU supports AVX. */
-RTCORE_API void rtcIntersect8 (const void* valid, RTCScene scene, RTCRay8& ray);
-
-/*! Intersects a packet of 16 rays with the scene. The valid mask and
- *  ray have both to be aligned to 64 bytes. This function can only be
- *  called for scenes with the RTC_INTERSECT16 flag set. For
- *  performance reasons, the rtcIntersect16 function should only get
- *  called if the CPU supports the 16-wide SIMD instructions. */
-RTCORE_API void rtcIntersect16 (const void* valid, RTCScene scene, RTCRay16& ray);
-
-/*! Intersects a stream of N rays in AOS layout with the scene. This
- *  function can only be called for scenes with the RTC_INTERSECTN
- *  flag set. The stride specifies the offset between rays in
- *  bytes. */
-RTCORE_API void rtcIntersectN (RTCScene scene, RTCRay* rayN, const size_t N, const size_t stride, const size_t flags = RTC_RAYN_DEFAULT);
-
-/*! Intersects one or multiple streams of N rays in compact SOA layout
- *  with the scene. This function can only be called for scenes with
- *  the RTC_INTERSECTN flag set. 'streams' specifies the number of
- *  dense SOA ray streams, and 'stride' the offset in bytes between
- *  those. */
-RTCORE_API void rtcIntersectN_SOA (RTCScene scene, RTCRaySOA& rayN, const size_t N, const size_t streams, const size_t stride, const size_t flags = RTC_RAYN_DEFAULT);
-
-
-/*! Tests if a single ray is occluded by the scene. The ray has to be
- *  aligned to 16 bytes. This function can only be called for scenes
- *  with the RTC_INTERSECT1 flag set. */
-RTCORE_API void rtcOccluded (RTCScene scene, RTCRay& ray);
-
-/*! Tests if a packet of 4 rays is occluded by the scene. This
- *  function can only be called for scenes with the RTC_INTERSECT4
- *  flag set. The valid mask and ray have both to be aligned to 16
- *  bytes. */
-RTCORE_API void rtcOccluded4 (const void* valid, RTCScene scene, RTCRay4& ray);
-
-/*! Tests if a packet of 8 rays is occluded by the scene. The valid
- *  mask and ray have both to be aligned to 32 bytes. This function
- *  can only be called for scenes with the RTC_INTERSECT8 flag
- *  set. For performance reasons, the rtcOccluded8 function should
- *  only get called if the CPU supports AVX. */
-RTCORE_API void rtcOccluded8 (const void* valid, RTCScene scene, RTCRay8& ray);
-
-/*! Tests if a packet of 16 rays is occluded by the scene. The valid
- *  mask and ray have both to be aligned to 64 bytes. This function
- *  can only be called for scenes with the RTC_INTERSECT16 flag
- *  set. For performance reasons, the rtcOccluded16 function should
- *  only get called if the CPU supports the 16-wide SIMD
- *  instructions. */
-RTCORE_API void rtcOccluded16 (const void* valid, RTCScene scene, RTCRay16& ray);
-
-/*! Tests if a stream of N rays on AOS layout is occluded by the
- *  scene. This function can only be called for scenes with the
- *  RTC_INTERSECTN flag set. The stride specifies the offset between
- *  rays in bytes.*/
-RTCORE_API void rtcOccludedN (RTCScene scene, RTCRay* rayN, const size_t N, const size_t stride, const size_t flags = RTC_RAYN_DEFAULT);
-
-/*! Intersects one or multiple streams of N rays in compact SOA layout
- *  with the scene. This function can only be called for scenes with
- *  the RTC_INTERSECTN flag set. 'streams' specifies the number of
- *  dense SOA ray streams, and 'stride' the offset in bytes between
- *  those. */
-RTCORE_API void rtcOccludedN_SOA (RTCScene scene, RTCRaySOA& rayN, const size_t N, const size_t streams, const size_t stride, const size_t flags = RTC_RAYN_DEFAULT);
-
-/*! Deletes the scene. All contained geometry get also destroyed. */
-RTCORE_API void rtcDeleteScene (RTCScene scene);
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/include/embree2/rtcore_scene.isph b/3rdparty/embree/include/embree2/rtcore_scene.isph
deleted file mode 100644
index a85eee21..00000000
--- a/3rdparty/embree/include/embree2/rtcore_scene.isph
+++ /dev/null
@@ -1,152 +0,0 @@
-// ======================================================================== //
-// Copyright 2009-2015 Intel Corporation                                    //
-//                                                                          //
-// Licensed under the Apache License, Version 2.0 (the "License");          //
-// you may not use this file except in compliance with the License.         //
-// You may obtain a copy of the License at                                  //
-//                                                                          //
-//     http://www.apache.org/licenses/LICENSE-2.0                           //
-//                                                                          //
-// Unless required by applicable law or agreed to in writing, software      //
-// distributed under the License is distributed on an "AS IS" BASIS,        //
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
-// See the License for the specific language governing permissions and      //
-// limitations under the License.                                           //
-// ======================================================================== //
-
-#ifndef __RTCORE_SCENE_ISPH__
-#define __RTCORE_SCENE_ISPH__
-
-/*! \ingroup embree_kernel_api */
-/*! \{ */
-
-/*! forward declarations for ray structures */
-struct RTCRay1;
-struct RTCRay;
-struct RTCRaySOA;
-
-/*! scene flags */
-enum RTCSceneFlags 
-{
-  /* dynamic type flags */
-  RTC_SCENE_STATIC     = (0 << 0),    //!< specifies static scene
-  RTC_SCENE_DYNAMIC    = (1 << 0),    //!< specifies dynamic scene
-
-  /* acceleration structure flags */
-  RTC_SCENE_COMPACT    = (1 << 8),    //!< use memory conservative data structures
-  RTC_SCENE_COHERENT   = (1 << 9),    //!< optimize data structures for coherent rays (enabled by default)
-  RTC_SCENE_INCOHERENT = (1 << 10),    //!< optimize data structures for in-coherent rays
-  RTC_SCENE_HIGH_QUALITY = (1 << 11),  //!< create higher quality data structures
-
-  /* traversal algorithm flags */
-  RTC_SCENE_ROBUST     = (1 << 16)     //!< use more robust traversal algorithms
-};
-
-/*! enabled algorithm flags */
-enum RTCAlgorithmFlags 
-{
-  RTC_INTERSECT_UNIFORM = (1 << 0),    //!< enables the uniform rtcIntersect1 and uniform rtcOccluded1 functions for this scene
-  RTC_INTERSECT_VARYING = (1 << 1),    //!< enables the varying rtcIntersect and varying rtcOccluded functions for this scene
-  RTC_INTERPOLATE       = (1 << 4)     //!< enables the rtcInterpolate function for this scene
-};
-
-/*! layout flags for ray streams */
-enum RTCRayNFlags
-{
-  RTC_RAYN_DEFAULT = (1 << 0)
-};
-
-
-/*! \brief Defines an opaque scene type */
-typedef uniform struct __RTCScene {}* uniform RTCScene;
-
-/*! Creates a new scene. 
-     WARNING: This function is deprecated, use rtcDeviceNewScene instead.
-*/
-RTCORE_DEPRECATED RTCScene rtcNewScene (uniform RTCSceneFlags flags, uniform RTCAlgorithmFlags aflags);
-
-/*! Creates a new scene. */
-RTCScene rtcDeviceNewScene (RTCDevice device, uniform RTCSceneFlags flags, uniform RTCAlgorithmFlags aflags);
-
-/*! \brief Type of progress callback function. */
-typedef uniform bool (*uniform RTC_PROGRESS_MONITOR_FUNCTION)(void* uniform ptr, const uniform double n);
-
-/*! \brief Sets the progress callback function which is called during hierarchy build. */
-void rtcSetProgressMonitorFunction(RTCScene scene, RTC_PROGRESS_MONITOR_FUNCTION func, void* uniform ptr);
-
-/*! Commits the geometry of the scene. After initializing or modifying
- *  geometries, commit has to get called before tracing
- *  rays. */
-void rtcCommit (RTCScene scene); 
-
-/*! Commits the geometry of the scene. The calling threads will be
- *  used internally as a worker threads on some implementations. The
- *  function will wait until 'numThreads' threads have called this
- *  function and all threads return from the function after the scene
- *  commit is finished. The application threads will not be used as
- *  worker threads when the TBB tasking system is enabled (which is
- *  the default). On CPUs, we recommend also using TBB inside your
- *  application to share threads. We recommend using the
- *  rtcCommitThread feature to share threads on the Xeon Phi
- *  coprocessor. */
-void rtcCommitThread(RTCScene scene, uniform unsigned int threadID, uniform unsigned int numThreads);
-
-/*! Returns to AABB of the scene. rtcCommit has to get called
- *  previously to this function. */
-void rtcGetBounds(RTCScene scene, uniform RTCBounds& bounds_o);
-
-/*! Intersects a uniform ray with the scene. This function can only be
- *  called for scenes with the RTC_INTERSECT_UNIFORM flag set. The ray
- *  has to be aligned to 16 bytes. */
-void rtcIntersect1 (RTCScene scene, uniform RTCRay1& ray);
-
-/*! Intersects a varying ray with the scene. This function can only be
- *  called for scenes with the RTC_INTERSECT_VARYING flag set. The
- *  valid mask and ray have both to be aligned to sizeof(varing float)
- *  bytes. */
-void rtcIntersect (RTCScene scene, varying RTCRay& ray);
-
-
-/*! Intersects a stream of N rays in AOS layout with the scene. This
- *  function can only be called for scenes with the RTC_INTERSECTN
- *  flag set. The stride specifies the offset between rays in
- *  bytes. */
-void rtcIntersectN (RTCScene scene, uniform RTCRay* uniform rayN, const uniform size_t N, const uniform size_t stride, const uniform size_t flags);
-
-/*! Intersects one or multiple streams of N rays in compact SOA layout with the scene. This
- *  function can only be called for scenes with the RTC_INTERSECTN
- *  flag set. 'streams' specifies the number of dense SOA ray
- *  streams, and 'stride' the offset in bytes between those. */
-void rtcIntersectN_SOA (RTCScene scene, uniform RTCRaySOA& rayN, const uniform size_t N, const uniform size_t streams, const uniform size_t offset, const uniform size_t flags);
-
-
-/*! Tests if a uniform ray is occluded by the scene. This function can
- *  only be called for scenes with the RTC_INTERSECT_UNIFORM flag
- *  set. The ray has to be aligned to 16 bytes. */
-void rtcOccluded1 (RTCScene scene, uniform RTCRay1& ray);
-
-/*! Tests if a varying ray is occluded by the scene. This function can
- *  only be called for scenes with the RTC_INTERSECT_VARYING flag
- *  set. The valid mask and ray have both to be aligned to
- *  sizeof(varing float) bytes. */
-void rtcOccluded (RTCScene scene, varying RTCRay& ray);
-
-
-/*! Tests if a stream of N rays on AOS layout is occluded by the
- *  scene. This function can only be called for scenes with the
- *  RTC_INTERSECTN flag set. The stride specifies the offset between
- *  rays in bytes.*/
-void rtcOccludedN (RTCScene scene, uniform RTCRay* uniform rayN, const uniform size_t N, const uniform size_t stride, const uniform size_t flags);
-
-/*! Intersects one or multiple streams of N rays in compact SOA layout with the scene. This
- *  function can only be called for scenes with the RTC_INTERSECTN
- *  flag set. 'streams' specifies the number of dense SOA ray
- *  streams, and 'stride' the offset in bytes between those. */
-void rtcOccludedN_SOA (RTCScene scene, uniform RTCRaySOA& rayN, const uniform size_t N, const uniform size_t streams, const uniform size_t offset, const uniform size_t flags);
-
-/*! Deletes the geometry again. */
-void rtcDeleteScene (RTCScene scene);
-
-/*! @} */
-
-#endif
diff --git a/3rdparty/embree/lib/x32/embree.lib b/3rdparty/embree/lib/x32/embree.lib
deleted file mode 100644
index 5a03b9d1..00000000
Binary files a/3rdparty/embree/lib/x32/embree.lib and /dev/null differ
diff --git a/3rdparty/embree/lib/x64/embree.lib b/3rdparty/embree/lib/x64/embree.lib
deleted file mode 100644
index 7c4614f7..00000000
Binary files a/3rdparty/embree/lib/x64/embree.lib and /dev/null differ
diff --git a/3rdparty/embree/lib/x64/libembree.2.dylib b/3rdparty/embree/lib/x64/libembree.2.dylib
deleted file mode 100644
index 17e21b07..00000000
Binary files a/3rdparty/embree/lib/x64/libembree.2.dylib and /dev/null differ
diff --git a/3rdparty/embree/lib/x64/libtbb.dylib b/3rdparty/embree/lib/x64/libtbb.dylib
deleted file mode 100644
index 964b0bc8..00000000
Binary files a/3rdparty/embree/lib/x64/libtbb.dylib and /dev/null differ
diff --git a/3rdparty/embree/lib/x64/libtbbmalloc.dylib b/3rdparty/embree/lib/x64/libtbbmalloc.dylib
deleted file mode 100644
index 74c36196..00000000
Binary files a/3rdparty/embree/lib/x64/libtbbmalloc.dylib and /dev/null differ
diff --git a/Anvil b/Anvil
deleted file mode 160000
index 84d22865..00000000
--- a/Anvil
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 84d22865842d45fe7762a3d68ef2837e60a8e9f5
diff --git a/CLW/CLWBuffer.h b/CLW/CLWBuffer.h
index 54519d66..93a82fde 100644
--- a/CLW/CLWBuffer.h
+++ b/CLW/CLWBuffer.h
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include <cassert>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWCommandQueue.h b/CLW/CLWCommandQueue.h
index 9c0da59b..d36f4906 100644
--- a/CLW/CLWCommandQueue.h
+++ b/CLW/CLWCommandQueue.h
@@ -26,7 +26,7 @@ THE SOFTWARE.
 #include <memory>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWContext.h b/CLW/CLWContext.h
index d7504dd4..16275ee6 100644
--- a/CLW/CLWContext.h
+++ b/CLW/CLWContext.h
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include <string>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWDevice.h b/CLW/CLWDevice.h
index 087bb4b3..55162fd6 100644
--- a/CLW/CLWDevice.h
+++ b/CLW/CLWDevice.h
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include <string>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWEvent.h b/CLW/CLWEvent.h
index 0237a9e8..ea366d71 100644
--- a/CLW/CLWEvent.h
+++ b/CLW/CLWEvent.h
@@ -29,7 +29,7 @@ THE SOFTWARE.
 #include <string>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWExcept.h b/CLW/CLWExcept.h
index e1222fac..a8b357a7 100644
--- a/CLW/CLWExcept.h
+++ b/CLW/CLWExcept.h
@@ -26,7 +26,7 @@ THE SOFTWARE.
 #include <stdexcept>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWImage2D.h b/CLW/CLWImage2D.h
index d7c0742d..68264a67 100644
--- a/CLW/CLWImage2D.h
+++ b/CLW/CLWImage2D.h
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include <cassert>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #include <OpenGL/OpenGL.h>
 #else
 #include <CL/cl.h>
diff --git a/CLW/CLWKernel.h b/CLW/CLWKernel.h
index ca1c6e49..642f69b5 100644
--- a/CLW/CLWKernel.h
+++ b/CLW/CLWKernel.h
@@ -27,7 +27,7 @@ THE SOFTWARE.
 #include "ReferenceCounter.h"
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWPlatform.h b/CLW/CLWPlatform.h
index 29fb7f02..a5448e74 100644
--- a/CLW/CLWPlatform.h
+++ b/CLW/CLWPlatform.h
@@ -27,7 +27,7 @@ THE SOFTWARE.
 #include <memory>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/CLWProgram.h b/CLW/CLWProgram.h
index 44696d3a..c3a66da1 100644
--- a/CLW/CLWProgram.h
+++ b/CLW/CLWProgram.h
@@ -30,7 +30,7 @@ THE SOFTWARE.
 #include <cstdint>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/ParameterHolder.h b/CLW/ParameterHolder.h
index 37f56863..03fa00a2 100644
--- a/CLW/ParameterHolder.h
+++ b/CLW/ParameterHolder.h
@@ -25,7 +25,7 @@ THE SOFTWARE.
 #include <iostream>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/CLW/ReferenceCounter.h b/CLW/ReferenceCounter.h
index 139fe6f8..38c1aab8 100644
--- a/CLW/ReferenceCounter.h
+++ b/CLW/ReferenceCounter.h
@@ -26,7 +26,7 @@ THE SOFTWARE.
 
 #ifdef __APPLE__
 #define STDCALL
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #elif WIN32
 #define STDCALL __stdcall
 #include <CL/cl.h>
diff --git a/CLW/clwkernels_cl.h b/CLW/clwkernels_cl.h
new file mode 100644
index 00000000..cc6368be
--- /dev/null
+++ b/CLW/clwkernels_cl.h
@@ -0,0 +1,1611 @@
+/* This is an auto-generated file. Do not edit manually*/
+
+static const char g_CLW_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable \n"\
+"#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable \n"\
+" \n"\
+"// --------------------- CONSTANTS ------------------------ \n"\
+"// add neutral elements \n"\
+"__constant int neutral_add_int = 0; \n"\
+"__constant float neutral_add_float = 0; \n"\
+"__constant float3 neutral_add_float3 = (float3)(0.0, 0.0, 0.0); \n"\
+"// max neutral elements (lowest finite values; FLT_MIN is the smallest positive float, so -FLT_MAX is used) \n"\
+"__constant int neutral_max_int = INT_MIN; \n"\
+"__constant float neutral_max_float = -FLT_MAX; \n"\
+"__constant float3 neutral_max_float3 = (float3)(-FLT_MAX, -FLT_MAX, -FLT_MAX); \n"\
+"// min neutral elements \n"\
+"__constant int neutral_min_int = INT_MAX; \n"\
+"__constant float neutral_min_float = FLT_MAX; \n"\
+"__constant float3 neutral_min_float3 = (float3)(FLT_MAX, FLT_MAX, FLT_MAX); \n"\
+" \n"\
+"__constant float epsilon = .00001f; \n"\
+" \n"\
+"// --------------------- HELPERS ------------------------ \n"\
+"//#define INT_MAX 0x7FFFFFFF \n"\
+" \n"\
+"// -------------------- MACRO -------------------------- \n"\
+"// Apple's OpenCL compiler provides these make_ helpers by default, \n"\
+"// so the instantiations below are wrapped in #ifndef APPLE \n"\
+"#define DEFINE_MAKE_4(type)\\ \n"\
+"    type##4 make_##type##4(type x, type y, type z, type w)\\ \n"\
+"{\\ \n"\
+"    type##4 res;\\ \n"\
+"    res.x = x;\\ \n"\
+"    res.y = y;\\ \n"\
+"    res.z = z;\\ \n"\
+"    res.w = w;\\ \n"\
+"    return res;\\ \n"\
+"} \n"\
+" \n"\
+"// Multitype macros to handle parallel primitives \n"\
+"#define DEFINE_SAFE_LOAD_4(type)\\ \n"\
+"    type##4 safe_load_##type##4(__global type##4* source, uint idx, uint sizeInTypeUnits)\\ \n"\
+"{\\ \n"\
+"    type##4 res = make_##type##4(0, 0, 0, 0);\\ \n"\
+"    if (((idx + 1) << 2)  <= sizeInTypeUnits)\\ \n"\
+"    res = source[idx];\\ \n"\
+"    else\\ \n"\
+"    {\\ \n"\
+"    if ((idx << 2) < sizeInTypeUnits) res.x = source[idx].x;\\ \n"\
+"    if ((idx << 2) + 1 < sizeInTypeUnits) res.y = source[idx].y;\\ \n"\
+"    if ((idx << 2) + 2 < sizeInTypeUnits) res.z = source[idx].z;\\ \n"\
+"    }\\ \n"\
+"    return res;\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_SAFE_STORE_4(type)\\ \n"\
+"    void safe_store_##type##4(type##4 val, __global type##4* dest, uint idx, uint sizeInTypeUnits)\\ \n"\
+"{\\ \n"\
+"    if ((idx + 1) * 4  <= sizeInTypeUnits)\\ \n"\
+"    dest[idx] = val;\\ \n"\
+"    else\\ \n"\
+"    {\\ \n"\
+"    if (idx*4 < sizeInTypeUnits) dest[idx].x = val.x;\\ \n"\
+"    if (idx*4 + 1 < sizeInTypeUnits) dest[idx].y = val.y;\\ \n"\
+"    if (idx*4 + 2 < sizeInTypeUnits) dest[idx].z = val.z;\\ \n"\
+"    }\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_GROUP_SCAN_EXCLUSIVE(type)\\ \n"\
+"    void group_scan_exclusive_##type(int localId, int groupSize, __local type* shmem)\\ \n"\
+"{\\ \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1)\\ \n"\
+"    {\\ \n"\
+"    if (localId < groupSize/(2*stride))\\ \n"\
+"        {\\ \n"\
+"        shmem[2*(localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1] + shmem[(2*localId + 1)*stride-1];\\ \n"\
+"        }\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"    if (localId == 0)\\ \n"\
+"    shmem[groupSize - 1] = 0;\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1)\\ \n"\
+"    {\\ \n"\
+"    if (localId < groupSize/(2*stride))\\ \n"\
+"        {\\ \n"\
+"        type temp = shmem[(2*localId + 1)*stride-1];\\ \n"\
+"        shmem[(2*localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1];\\ \n"\
+"        shmem[2*(localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1] + temp;\\ \n"\
+"        }\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_GROUP_SCAN_EXCLUSIVE_SUM(type)\\ \n"\
+"    void group_scan_exclusive_sum_##type(int localId, int groupSize, __local type* shmem, type* sum)\\ \n"\
+"{\\ \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1)\\ \n"\
+"    {\\ \n"\
+"    if (localId < groupSize/(2*stride))\\ \n"\
+"        {\\ \n"\
+"        shmem[2*(localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1] + shmem[(2*localId + 1)*stride-1];\\ \n"\
+"        }\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"    *sum = shmem[groupSize - 1];\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    if (localId == 0){\\ \n"\
+"    shmem[groupSize - 1] = 0;}\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1)\\ \n"\
+"    {\\ \n"\
+"    if (localId < groupSize/(2*stride))\\ \n"\
+"        {\\ \n"\
+"        type temp = shmem[(2*localId + 1)*stride-1];\\ \n"\
+"        shmem[(2*localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1];\\ \n"\
+"        shmem[2*(localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1] + temp;\\ \n"\
+"        }\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"} \n"\
+" \n"\
+" \n"\
+"#define DEFINE_GROUP_SCAN_EXCLUSIVE_PART(type)\\ \n"\
+"    type group_scan_exclusive_part_##type( int localId, int groupSize, __local type* shmem)\\ \n"\
+"{\\ \n"\
+"    type sum = 0;\\ \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1)\\ \n"\
+"    {\\ \n"\
+"    if (localId < groupSize/(2*stride))\\ \n"\
+"        {\\ \n"\
+"        shmem[2*(localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1] + shmem[(2*localId + 1)*stride-1];\\ \n"\
+"        }\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"    if (localId == 0)\\ \n"\
+"    {\\ \n"\
+"    sum = shmem[groupSize - 1];\\ \n"\
+"    shmem[groupSize - 1] = 0;\\ \n"\
+"    }\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1)\\ \n"\
+"    {\\ \n"\
+"    if (localId < groupSize/(2*stride))\\ \n"\
+"        {\\ \n"\
+"        type temp = shmem[(2*localId + 1)*stride-1];\\ \n"\
+"        shmem[(2*localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1];\\ \n"\
+"        shmem[2*(localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1] + temp;\\ \n"\
+"        }\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"    return sum;\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_SCAN_EXCLUSIVE(type)\\ \n"\
+"    __kernel void scan_exclusive_##type(__global type const* in_array, __global type* out_array, __local type* shmem)\\ \n"\
+"{\\ \n"\
+"    int globalId  = get_global_id(0);\\ \n"\
+"    int localId   = get_local_id(0);\\ \n"\
+"    int groupSize = get_local_size(0);\\ \n"\
+"    int groupId   = get_group_id(0);\\ \n"\
+"    shmem[localId] = in_array[2*globalId] + in_array[2*globalId + 1];\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    group_scan_exclusive_##type(localId, groupSize, shmem);\\ \n"\
+"    out_array[2 * globalId + 1] = shmem[localId] + in_array[2*globalId];\\ \n"\
+"    out_array[2 * globalId] = shmem[localId];\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_SCAN_EXCLUSIVE_4(type)\\ \n"\
+"    __attribute__((reqd_work_group_size(64, 1, 1)))\\ \n"\
+"    __kernel void scan_exclusive_##type##4(__global type##4 const* in_array, __global type##4* out_array, uint numElems, __local type* shmem)\\ \n"\
+"{\\ \n"\
+"    int globalId  = get_global_id(0);\\ \n"\
+"    int localId   = get_local_id(0);\\ \n"\
+"    int groupSize = get_local_size(0);\\ \n"\
+"    type##4 v1 = safe_load_##type##4(in_array, 2*globalId, numElems);\\ \n"\
+"    type##4 v2 = safe_load_##type##4(in_array, 2*globalId + 1, numElems);\\ \n"\
+"    v1.y += v1.x; v1.w += v1.z; v1.w += v1.y;\\ \n"\
+"    v2.y += v2.x; v2.w += v2.z; v2.w += v2.y;\\ \n"\
+"    v2.w += v1.w;\\ \n"\
+"    shmem[localId] = v2.w;\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    group_scan_exclusive_##type(localId, groupSize, shmem);\\ \n"\
+"    v2.w = shmem[localId];\\ \n"\
+"    type t = v1.w; v1.w = v2.w; v2.w += t;\\ \n"\
+"    t = v1.y; v1.y = v1.w; v1.w += t;\\ \n"\
+"    t = v2.y; v2.y = v2.w; v2.w += t;\\ \n"\
+"    t = v1.x; v1.x = v1.y; v1.y += t;\\ \n"\
+"    t = v2.x; v2.x = v2.y; v2.y += t;\\ \n"\
+"    t = v1.z; v1.z = v1.w; v1.w += t;\\ \n"\
+"    t = v2.z; v2.z = v2.w; v2.w += t;\\ \n"\
+"    safe_store_##type##4(v2, out_array, 2 * globalId + 1, numElems);\\ \n"\
+"    safe_store_##type##4(v1, out_array, 2 * globalId, numElems);\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_SCAN_EXCLUSIVE_4_V1(type)\\ \n"\
+"    __attribute__((reqd_work_group_size(64, 1, 1)))\\ \n"\
+"    __kernel void scan_exclusive_##type##4##_v1(__global type##4 const* in_array, __global type##4* out_array, uint numElems, __local type* shmem)\\ \n"\
+"{\\ \n"\
+"    int globalId  = get_global_id(0);\\ \n"\
+"    int localId   = get_local_id(0);\\ \n"\
+"    int groupSize = get_local_size(0);\\ \n"\
+"    type##4 v1 = safe_load_##type##4(in_array, 2*globalId, numElems);\\ \n"\
+"    type##4 v2 = safe_load_##type##4(in_array, 2*globalId + 1, numElems);\\ \n"\
+"    shmem[localId] = v1.x + v1.y + v1.z + v1.w + v2.x + v2.y + v2.z + v2.w;\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    group_scan_exclusive_##type(localId, groupSize, shmem);\\ \n"\
+"    type offset = shmem[localId];\\ \n"\
+"    type t = v1.x; v1.x = offset; offset += t;\\ \n"\
+"    t = v1.y; v1.y = offset; offset += t;\\ \n"\
+"    t = v1.z; v1.z = offset; offset += t;\\ \n"\
+"    t = v1.w; v1.w = offset; offset += t;\\ \n"\
+"    t = v2.x; v2.x = offset; offset += t;\\ \n"\
+"    t = v2.y; v2.y = offset; offset += t;\\ \n"\
+"    t = v2.z; v2.z = offset; offset += t;\\ \n"\
+"    v2.w = offset;\\ \n"\
+"    safe_store_##type##4(v2, out_array, 2 * globalId + 1, numElems);\\ \n"\
+"    safe_store_##type##4(v1, out_array, 2 * globalId, numElems);\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_SCAN_EXCLUSIVE_PART_4(type)\\ \n"\
+"    __attribute__((reqd_work_group_size(64, 1, 1)))\\ \n"\
+"    __kernel void scan_exclusive_part_##type##4(__global type##4 const* in_array, __global type##4* out_array, uint numElems, __global type* out_sums, __local type* shmem)\\ \n"\
+"{\\ \n"\
+"    int globalId  = get_global_id(0);\\ \n"\
+"    int localId   = get_local_id(0);\\ \n"\
+"    int groupId   = get_group_id(0);\\ \n"\
+"    int groupSize = get_local_size(0);\\ \n"\
+"    type##4 v1 = safe_load_##type##4(in_array, 2*globalId, numElems);\\ \n"\
+"    type##4 v2 = safe_load_##type##4(in_array, 2*globalId + 1, numElems);\\ \n"\
+"    v1.y += v1.x; v1.w += v1.z; v1.w += v1.y;\\ \n"\
+"    v2.y += v2.x; v2.w += v2.z; v2.w += v2.y;\\ \n"\
+"    v2.w += v1.w;\\ \n"\
+"    shmem[localId] = v2.w;\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    type sum = group_scan_exclusive_part_##type(localId, groupSize, shmem);\\ \n"\
+"    if (localId == 0) out_sums[groupId] = sum;\\ \n"\
+"    v2.w = shmem[localId];\\ \n"\
+"    type t = v1.w; v1.w = v2.w; v2.w += t;\\ \n"\
+"    t = v1.y; v1.y = v1.w; v1.w += t;\\ \n"\
+"    t = v2.y; v2.y = v2.w; v2.w += t;\\ \n"\
+"    t = v1.x; v1.x = v1.y; v1.y += t;\\ \n"\
+"    t = v2.x; v2.x = v2.y; v2.y += t;\\ \n"\
+"    t = v1.z; v1.z = v1.w; v1.w += t;\\ \n"\
+"    t = v2.z; v2.z = v2.w; v2.w += t;\\ \n"\
+"    safe_store_##type##4(v2, out_array, 2 * globalId + 1, numElems);\\ \n"\
+"    safe_store_##type##4(v1, out_array, 2 * globalId, numElems);\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_GROUP_REDUCE(type)\\ \n"\
+"    void group_reduce_##type(int localId, int groupSize, __local type* shmem)\\ \n"\
+"{\\ \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1)\\ \n"\
+"    {\\ \n"\
+"    if (localId < groupSize/(2*stride))\\ \n"\
+"        {\\ \n"\
+"        shmem[2*(localId + 1)*stride-1] = shmem[2*(localId + 1)*stride-1] + shmem[(2*localId + 1)*stride-1];\\ \n"\
+"        }\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"} \n"\
+" \n"\
+"#define DEFINE_DISTRIBUTE_PART_SUM_4(type)\\ \n"\
+"    __kernel void distribute_part_sum_##type##4( __global type* in_sums, __global type##4* inout_array, uint numElems)\\ \n"\
+"{\\ \n"\
+"    int globalId  = get_global_id(0);\\ \n"\
+"    int groupId   = get_group_id(0);\\ \n"\
+"    type##4 v1 = safe_load_##type##4(inout_array, globalId, numElems);\\ \n"\
+"    type    sum = in_sums[groupId >> 1];\\ \n"\
+"    v1.xyzw += sum;\\ \n"\
+"    safe_store_##type##4(v1, inout_array, globalId, numElems);\\ \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// These are already defined in Apple OCL runtime \n"\
+"#ifndef APPLE \n"\
+"DEFINE_MAKE_4(int) \n"\
+"DEFINE_MAKE_4(float) \n"\
+"#endif \n"\
+" \n"\
+"DEFINE_SAFE_LOAD_4(int) \n"\
+"DEFINE_SAFE_LOAD_4(float) \n"\
+" \n"\
+"DEFINE_SAFE_STORE_4(int) \n"\
+"DEFINE_SAFE_STORE_4(float) \n"\
+" \n"\
+"DEFINE_GROUP_SCAN_EXCLUSIVE(int) \n"\
+"DEFINE_GROUP_SCAN_EXCLUSIVE(uint) \n"\
+"DEFINE_GROUP_SCAN_EXCLUSIVE(float) \n"\
+"DEFINE_GROUP_SCAN_EXCLUSIVE(short) \n"\
+" \n"\
+"DEFINE_GROUP_SCAN_EXCLUSIVE_SUM(uint) \n"\
+" \n"\
+"DEFINE_GROUP_SCAN_EXCLUSIVE_PART(int) \n"\
+"DEFINE_GROUP_SCAN_EXCLUSIVE_PART(float) \n"\
+" \n"\
+"DEFINE_SCAN_EXCLUSIVE(int) \n"\
+"DEFINE_SCAN_EXCLUSIVE(float) \n"\
+" \n"\
+"DEFINE_SCAN_EXCLUSIVE_4(int) \n"\
+"DEFINE_SCAN_EXCLUSIVE_4(float) \n"\
+" \n"\
+"DEFINE_SCAN_EXCLUSIVE_PART_4(int) \n"\
+"DEFINE_SCAN_EXCLUSIVE_PART_4(float) \n"\
+" \n"\
+"DEFINE_DISTRIBUTE_PART_SUM_4(int) \n"\
+"DEFINE_DISTRIBUTE_PART_SUM_4(float) \n"\
+" \n"\
+"/// Specific function for radix-sort needs \n"\
+"/// Group exclusive add multiscan on 4 arrays of shorts in parallel \n"\
+"/// with 4x reduction in registers \n"\
+"void group_scan_short_4way(int localId, int groupSize, \n"\
+"    short4 mask0, \n"\
+"    short4 mask1, \n"\
+"    short4 mask2, \n"\
+"    short4 mask3, \n"\
+"    __local short* shmem0, \n"\
+"    __local short* shmem1, \n"\
+"    __local short* shmem2, \n"\
+"    __local short* shmem3, \n"\
+"    short4* offset0, \n"\
+"    short4* offset1, \n"\
+"    short4* offset2, \n"\
+"    short4* offset3, \n"\
+"    short4* histogram) \n"\
+"{ \n"\
+"    short4 v1 = mask0; \n"\
+"    v1.y += v1.x; v1.w += v1.z; v1.w += v1.y; \n"\
+"    shmem0[localId] = v1.w; \n"\
+" \n"\
+"    short4 v2 = mask1; \n"\
+"    v2.y += v2.x; v2.w += v2.z; v2.w += v2.y; \n"\
+"    shmem1[localId] = v2.w; \n"\
+" \n"\
+"    short4 v3 = mask2; \n"\
+"    v3.y += v3.x; v3.w += v3.z; v3.w += v3.y; \n"\
+"    shmem2[localId] = v3.w; \n"\
+" \n"\
+"    short4 v4 = mask3; \n"\
+"    v4.y += v4.x; v4.w += v4.z; v4.w += v4.y; \n"\
+"    shmem3[localId] = v4.w; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            shmem0[2 * (localId + 1)*stride - 1] = shmem0[2 * (localId + 1)*stride - 1] + shmem0[(2 * localId + 1)*stride - 1]; \n"\
+"            shmem1[2 * (localId + 1)*stride - 1] = shmem1[2 * (localId + 1)*stride - 1] + shmem1[(2 * localId + 1)*stride - 1]; \n"\
+"            shmem2[2 * (localId + 1)*stride - 1] = shmem2[2 * (localId + 1)*stride - 1] + shmem2[(2 * localId + 1)*stride - 1]; \n"\
+"            shmem3[2 * (localId + 1)*stride - 1] = shmem3[2 * (localId + 1)*stride - 1] + shmem3[(2 * localId + 1)*stride - 1]; \n"\
+"        } \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+" \n"\
+"    short4 total; \n"\
+"    total.s0 = shmem0[groupSize - 1]; \n"\
+"    total.s1 = shmem1[groupSize - 1]; \n"\
+"    total.s2 = shmem2[groupSize - 1]; \n"\
+"    total.s3 = shmem3[groupSize - 1]; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    if (localId == 0) \n"\
+"    { \n"\
+"        shmem0[groupSize - 1] = 0; \n"\
+"        shmem1[groupSize - 1] = 0; \n"\
+"        shmem2[groupSize - 1] = 0; \n"\
+"        shmem3[groupSize - 1] = 0; \n"\
+"    } \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            int temp = shmem0[(2 * localId + 1)*stride - 1]; \n"\
+"            shmem0[(2 * localId + 1)*stride - 1] = shmem0[2 * (localId + 1)*stride - 1]; \n"\
+"            shmem0[2 * (localId + 1)*stride - 1] = shmem0[2 * (localId + 1)*stride - 1] + temp; \n"\
+" \n"\
+"            temp = shmem1[(2 * localId + 1)*stride - 1]; \n"\
+"            shmem1[(2 * localId + 1)*stride - 1] = shmem1[2 * (localId + 1)*stride - 1]; \n"\
+"            shmem1[2 * (localId + 1)*stride - 1] = shmem1[2 * (localId + 1)*stride - 1] + temp; \n"\
+" \n"\
+"            temp = shmem2[(2 * localId + 1)*stride - 1]; \n"\
+"            shmem2[(2 * localId + 1)*stride - 1] = shmem2[2 * (localId + 1)*stride - 1]; \n"\
+"            shmem2[2 * (localId + 1)*stride - 1] = shmem2[2 * (localId + 1)*stride - 1] + temp; \n"\
+" \n"\
+"            temp = shmem3[(2 * localId + 1)*stride - 1]; \n"\
+"            shmem3[(2 * localId + 1)*stride - 1] = shmem3[2 * (localId + 1)*stride - 1]; \n"\
+"            shmem3[2 * (localId + 1)*stride - 1] = shmem3[2 * (localId + 1)*stride - 1] + temp; \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+" \n"\
+"    v1.w = shmem0[localId]; \n"\
+" \n"\
+"    short t = v1.y; v1.y = v1.w; v1.w += t; \n"\
+"    t = v1.x; v1.x = v1.y; v1.y += t; \n"\
+"    t = v1.z; v1.z = v1.w; v1.w += t; \n"\
+"    *offset0 = v1; \n"\
+" \n"\
+"    v2.w = shmem1[localId]; \n"\
+" \n"\
+"    t = v2.y; v2.y = v2.w; v2.w += t; \n"\
+"    t = v2.x; v2.x = v2.y; v2.y += t; \n"\
+"    t = v2.z; v2.z = v2.w; v2.w += t; \n"\
+"    *offset1 = v2; \n"\
+" \n"\
+"    v3.w = shmem2[localId]; \n"\
+" \n"\
+"    t = v3.y; v3.y = v3.w; v3.w += t; \n"\
+"    t = v3.x; v3.x = v3.y; v3.y += t; \n"\
+"    t = v3.z; v3.z = v3.w; v3.w += t; \n"\
+"    *offset2 = v3; \n"\
+" \n"\
+"    v4.w = shmem3[localId]; \n"\
+" \n"\
+"    t = v4.y; v4.y = v4.w; v4.w += t; \n"\
+"    t = v4.x; v4.x = v4.y; v4.y += t; \n"\
+"    t = v4.z; v4.z = v4.w; v4.w += t; \n"\
+"    *offset3 = v4; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    *histogram = total; \n"\
+"} \n"\
+" \n"\
+"// Build a per-component 0/1 flag vector: a component is 1 when the 2-bit digit \n"\
+"// of the matching component of val, taken at bit position offset, equals digit. \n"\
+"short4 radix_mask(int offset, uchar digit, int4 val) \n"\
+"{ \n"\
+"    // Isolate the 2-bit digit of every component first \n"\
+"    const int dx = (val.x >> offset) & 3; \n"\
+"    const int dy = (val.y >> offset) & 3; \n"\
+"    const int dz = (val.z >> offset) & 3; \n"\
+"    const int dw = (val.w >> offset) & 3; \n"\
+" \n"\
+"    short4 flags; \n"\
+"    flags.x = (dx == digit) ? 1 : 0; \n"\
+"    flags.y = (dy == digit) ? 1 : 0; \n"\
+"    flags.z = (dz == digit) ? 1 : 0; \n"\
+"    flags.w = (dw == digit) ? 1 : 0; \n"\
+"    return flags; \n"\
+"} \n"\
+" \n"\
+"// Pick the scanned offset that belongs to the 2-bit digit of val and add the \n"\
+"// histogram entries of all smaller digits to it. \n"\
+"short offset_4way(int val, int offset, short offset0, short offset1, short offset2, short offset3, short4 hist) \n"\
+"{ \n"\
+"    const int digit = (val >> offset) & 3; \n"\
+" \n"\
+"    if (digit == 0) \n"\
+"    { \n"\
+"        return offset0; \n"\
+"    } \n"\
+" \n"\
+"    if (digit == 1) \n"\
+"    { \n"\
+"        return offset1 + hist.x; \n"\
+"    } \n"\
+" \n"\
+"    if (digit == 2) \n"\
+"    { \n"\
+"        return offset2 + hist.x + hist.y; \n"\
+"    } \n"\
+" \n"\
+"    // Masking with 3 leaves digit == 3 as the only remaining case \n"\
+"    return offset3 + hist.x + hist.y + hist.z; \n"\
+"} \n"\
+" \n"\
+" \n"\
+" \n"\
+"// Perform group split using 2-bits pass. \n"\
+"// For the four keys held in val this computes, via a 4-way group scan of the \n"\
+"// per-digit flags, one offset per key (stored to *localOffset). *histogram is \n"\
+"// filled by group_scan_short_4way (presumably with the per-digit totals of the \n"\
+"// group - confirm against that function). \n"\
+"void group_split_radix_2bits( \n"\
+"    int localId, \n"\
+"    int groupSize, \n"\
+"    int offset, \n"\
+"    int4 val, \n"\
+"    __local short* shmem, \n"\
+"    int4* localOffset, \n"\
+"    short4* histogram) \n"\
+"{ \n"\
+"    /// Carve the scratch buffer into four consecutive groupSize-sized regions, one per digit \n"\
+"    __local short* scratch0 = shmem; \n"\
+"    __local short* scratch1 = shmem + groupSize; \n"\
+"    __local short* scratch2 = shmem + 2 * groupSize; \n"\
+"    __local short* scratch3 = shmem + 3 * groupSize; \n"\
+" \n"\
+"    /// Scan results, one short4 per digit value \n"\
+"    short4 scan0; \n"\
+"    short4 scan1; \n"\
+"    short4 scan2; \n"\
+"    short4 scan3; \n"\
+" \n"\
+"    group_scan_short_4way(localId, groupSize, \n"\
+"        radix_mask(offset, 0, val), radix_mask(offset, 1, val), \n"\
+"        radix_mask(offset, 2, val), radix_mask(offset, 3, val), \n"\
+"        scratch0, scratch1, scratch2, scratch3, \n"\
+"        &scan0, &scan1, &scan2, &scan3, \n"\
+"        histogram); \n"\
+" \n"\
+"    /// Resolve every key to the offset of its own digit \n"\
+"    const short4 hist = *histogram; \n"\
+"    int4 slots; \n"\
+"    slots.x = offset_4way(val.x, offset, scan0.x, scan1.x, scan2.x, scan3.x, hist); \n"\
+"    slots.y = offset_4way(val.y, offset, scan0.y, scan1.y, scan2.y, scan3.y, hist); \n"\
+"    slots.z = offset_4way(val.z, offset, scan0.z, scan1.z, scan2.z, scan3.z, hist); \n"\
+"    slots.w = offset_4way(val.w, offset, scan0.w, scan1.w, scan2.w, scan3.w, hist); \n"\
+"    *localOffset = slots; \n"\
+"} \n"\
+" \n"\
+"// Load the idx-th int4 from source, where the buffer length is given in ints. \n"\
+"// Components located at or beyond sizeInInts are returned as INT_MAX. \n"\
+"// The pointer is const-qualified because the function only reads; this lets \n"\
+"// callers that hold const buffers pass them without discarding the qualifier. \n"\
+"int4 safe_load_int4_intmax(__global int4 const* source, uint idx, uint sizeInInts) \n"\
+"{ \n"\
+"    int4 res = make_int4(INT_MAX, INT_MAX, INT_MAX, INT_MAX); \n"\
+"    if (((idx + 1) << 2) <= sizeInInts) \n"\
+"    { \n"\
+"        // The whole vector is inside the buffer \n"\
+"        res = source[idx]; \n"\
+"    } \n"\
+"    else \n"\
+"    { \n"\
+"        // Partial tail: read the valid components through a scalar view so that \n"\
+"        // no access depends on a vector-wide load of source[idx] \n"\
+"        __global int const* scalars = (__global int const*)source; \n"\
+"        const uint base = idx << 2; \n"\
+"        if (base < sizeInInts) res.x = scalars[base]; \n"\
+"        if (base + 1 < sizeInInts) res.y = scalars[base + 1]; \n"\
+"        if (base + 2 < sizeInInts) res.z = scalars[base + 2]; \n"\
+"        // The w component can never be valid here, otherwise the full load above applies \n"\
+"    } \n"\
+"    return res; \n"\
+"} \n"\
+" \n"\
+"// Store val to dest[idx]; indices at or beyond sizeInInts are silently ignored \n"\
+"void safe_store_int(int val, __global int* dest, uint idx, uint sizeInInts) \n"\
+"{ \n"\
+"    if (idx >= sizeInInts) \n"\
+"    { \n"\
+"        return; \n"\
+"    } \n"\
+" \n"\
+"    dest[idx] = val; \n"\
+"} \n"\
+" \n"\
+"// Split kernel launcher: every work item owns one int4 (four keys), the work \n"\
+"// group reorders its keys by the 2-bit digit found at bitshift and writes the \n"\
+"// reordered keys plus the per-group digit histograms to global memory. \n"\
+"__kernel void split4way(int bitshift, __global int4* in_array, uint numElems, __global int* out_histograms, __global int4* out_array, \n"\
+"    __global int* out_local_histograms, \n"\
+"    __global int4* out_debug_offset, \n"\
+"    __local short* shmem) \n"\
+"{ \n"\
+"    int gid = get_global_id(0); \n"\
+"    int lid = get_local_id(0); \n"\
+"    int wgSize = get_local_size(0); \n"\
+"    int wgId = get_group_id(0); \n"\
+"    int wgCount = get_global_size(0) / wgSize; \n"\
+" \n"\
+"    /// Fetch this work item's four keys; out-of-range keys read as INT_MAX \n"\
+"    int4 keys = safe_load_int4_intmax(in_array, gid, numElems); \n"\
+" \n"\
+"    int4 dstSlot; \n"\
+"    short4 digitCounts; \n"\
+"    group_split_radix_2bits(lid, wgSize, bitshift, keys, shmem, &dstSlot, &digitCounts); \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    /// Reuse the scan scratch buffer as int storage for the reordered keys \n"\
+"    __local int* sorted = (__local int*)shmem; \n"\
+"    __local int4* sorted4 = (__local int4*)shmem; \n"\
+" \n"\
+"    sorted[dstSlot.x] = keys.x; \n"\
+"    sorted[dstSlot.y] = keys.y; \n"\
+"    sorted[dstSlot.z] = keys.z; \n"\
+"    sorted[dstSlot.w] = keys.w; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    /// Write back: whole vectors in one go, the partial tail component by component \n"\
+"    int firstElem = gid << 2; \n"\
+"    if (((gid + 1) << 2) <= numElems) \n"\
+"    { \n"\
+"        out_array[gid] = sorted4[lid]; \n"\
+"        out_debug_offset[gid] = dstSlot; \n"\
+"    } \n"\
+"    else \n"\
+"    { \n"\
+"        if (firstElem < numElems) out_array[gid].x = sorted4[lid].x; \n"\
+"        if (firstElem + 1 < numElems) out_array[gid].y = sorted4[lid].y; \n"\
+"        if (firstElem + 2 < numElems) out_array[gid].z = sorted4[lid].z; \n"\
+"    } \n"\
+" \n"\
+"    /// One work item per group publishes the histograms in column layout \n"\
+"    if (lid == 0) \n"\
+"    { \n"\
+"        out_histograms[wgId] = digitCounts.x; \n"\
+"        out_histograms[wgId + wgCount] = digitCounts.y; \n"\
+"        out_histograms[wgId + 2 * wgCount] = digitCounts.z; \n"\
+"        out_histograms[wgId + 3 * wgCount] = digitCounts.w; \n"\
+" \n"\
+"        /// Exclusive running sum of the digit counts \n"\
+"        int running = 0; \n"\
+"        out_local_histograms[wgId] = running; \n"\
+"        running += digitCounts.x; \n"\
+"        out_local_histograms[wgId + wgCount] = running; \n"\
+"        running += digitCounts.y; \n"\
+"        out_local_histograms[wgId + 2 * wgCount] = running; \n"\
+"        running += digitCounts.z; \n"\
+"        out_local_histograms[wgId + 3 * wgCount] = running; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Work group size required by BitHistogram, ScatterKeys and ScatterKeysAndValues \n"\
+"#define GROUP_SIZE 64 \n"\
+"// Upper bound on the number of GROUP_SIZE-wide int4 blocks one work group iterates over \n"\
+"#define NUMBER_OF_BLOCKS_PER_GROUP 8 \n"\
+"// Number of histogram bins, one per value of a 4-bit digit \n"\
+"#define NUM_BINS 16 \n"\
+" \n"\
+"// The kernel computes a 16 bins histogram per work group. \n"\
+"// The bin of a key is determined by (key >> bitshift) & 0xF. \n"\
+"// Each work group walks up to NUMBER_OF_BLOCKS_PER_GROUP blocks; in every block \n"\
+"// each of the GROUP_SIZE work items handles one int4, i.e. four keys. \n"\
+"__kernel \n"\
+"__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) \n"\
+"void BitHistogram( \n"\
+"    // Number of bits to shift \n"\
+"    int bitshift, \n"\
+"    // Input array \n"\
+"    __global int const* restrict in_array, \n"\
+"    // Number of elements in input array \n"\
+"    uint numelems, \n"\
+"    // Output histograms in column layout \n"\
+"    // [bin0_group0, bin0_group1, ... bin0_groupN, bin1_group0, bin1_group1, ... bin1_groupN, ...] \n"\
+"    __global int* restrict out_histogram \n"\
+"    ) \n"\
+"{ \n"\
+"    // Histogram storage: NUM_BINS rows of GROUP_SIZE counters, \n"\
+"    // work item i only ever touches column i while counting \n"\
+"    __local int histogram[NUM_BINS * GROUP_SIZE]; \n"\
+" \n"\
+"    int localid = get_local_id(0); \n"\
+"    int groupsize = get_local_size(0); \n"\
+"    int groupid = get_group_id(0); \n"\
+"    int numgroups = get_global_size(0) / groupsize; \n"\
+" \n"\
+"    /// Clear local histogram \n"\
+"    for (int i = 0; i < NUM_BINS; ++i) \n"\
+"    { \n"\
+"        histogram[i*GROUP_SIZE + localid] = 0; \n"\
+"    } \n"\
+" \n"\
+"    // Make sure everything is up to date \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    const int numblocks_per_group = NUMBER_OF_BLOCKS_PER_GROUP; \n"\
+"    const int numelems_per_group = numblocks_per_group * GROUP_SIZE; \n"\
+" \n"\
+"    // A block covers GROUP_SIZE int4 values, hence GROUP_SIZE * 4 keys \n"\
+"    int numblocks_total = (numelems + GROUP_SIZE * 4 - 1) / (GROUP_SIZE * 4); \n"\
+"    // Blocks left for this group; can be <= 0 for trailing groups, the loop then does not run \n"\
+"    int maxblocks = numblocks_total - groupid * numblocks_per_group; \n"\
+" \n"\
+"    // loadidx is measured in int4 units \n"\
+"    int loadidx = groupid * numelems_per_group + localid; \n"\
+"    for (int block = 0; block < min(numblocks_per_group, maxblocks); ++block, loadidx += GROUP_SIZE) \n"\
+"    { \n"\
+"        /// Load single int4 value. \n"\
+"        /// in_array is declared as a pointer to int while the loader takes a pointer \n"\
+"        /// to int4, so the reinterpretation is spelled out instead of relying on an \n"\
+"        /// implicit conversion between incompatible pointer types. \n"\
+"        int4 value = safe_load_int4_intmax((__global int4*)in_array, loadidx, numelems); \n"\
+" \n"\
+"        /// Handle value adding histogram bins \n"\
+"        /// for all 4 elements \n"\
+"        int4 bin = ((value >> bitshift) & 0xF); \n"\
+"        atom_inc(&histogram[bin.x*GROUP_SIZE + localid]); \n"\
+"        atom_inc(&histogram[bin.y*GROUP_SIZE + localid]); \n"\
+"        atom_inc(&histogram[bin.z*GROUP_SIZE + localid]); \n"\
+"        atom_inc(&histogram[bin.w*GROUP_SIZE + localid]); \n"\
+"    } \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    // The first NUM_BINS work items each reduce one row of counters and publish it \n"\
+"    if (localid < NUM_BINS) \n"\
+"    { \n"\
+"        int sum = 0; \n"\
+"        for (int i = 0; i < GROUP_SIZE; ++i) \n"\
+"        { \n"\
+"            sum += histogram[localid * GROUP_SIZE + i]; \n"\
+"        } \n"\
+" \n"\
+"        out_histogram[numgroups*localid + groupid] = sum; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Scatter pass of a 4-bit radix sort step. \n"\
+"// Each work group walks up to NUMBER_OF_BLOCKS_PER_GROUP blocks of GROUP_SIZE int4 \n"\
+"// values. Inside a block the keys are reordered in local memory by the 4-bit digit \n"\
+"// (key >> bitshift) & 0xF using two 2-bit passes, then written to out_keys at \n"\
+"// positions derived from the scanned histograms in in_histograms. \n"\
+"__kernel \n"\
+"__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) \n"\
+"void ScatterKeys(// Number of bits to shift \n"\
+"    int bitshift, \n"\
+"    // Input keys \n"\
+"    __global int4 const* restrict in_keys, \n"\
+"    // Number of input keys \n"\
+"    uint           numelems, \n"\
+"    // Scanned histograms \n"\
+"    __global int const* restrict  in_histograms, \n"\
+"    // Output keys \n"\
+"    __global int* restrict  out_keys \n"\
+"    ) \n"\
+"{ \n"\
+"    // Local memory for offsets counting \n"\
+"    __local int  keys[GROUP_SIZE * 4]; \n"\
+"    __local int  scanned_histogram[NUM_BINS]; \n"\
+" \n"\
+"    int globalid = get_global_id(0); \n"\
+"    int localid = get_local_id(0); \n"\
+"    int groupsize = get_local_size(0); \n"\
+"    int groupid = get_group_id(0); \n"\
+"    int numgroups = get_global_size(0) / groupsize; \n"\
+" \n"\
+"    // histogram is an unsigned view of the same local storage as keys \n"\
+"    __local uint* histogram = (__local uint*)keys; \n"\
+" \n"\
+"    int numblocks_per_group = NUMBER_OF_BLOCKS_PER_GROUP; \n"\
+"    int numelems_per_group = numblocks_per_group * GROUP_SIZE; \n"\
+"    // A block covers GROUP_SIZE int4 values, hence GROUP_SIZE * 4 keys \n"\
+"    int numblocks_total = (numelems + GROUP_SIZE * 4 - 1) / (GROUP_SIZE * 4); \n"\
+"    int maxblocks = numblocks_total - groupid * numblocks_per_group; \n"\
+" \n"\
+"    // Copy scanned histogram for the group to local memory for fast indexing \n"\
+"    if (localid < NUM_BINS) \n"\
+"    { \n"\
+"        scanned_histogram[localid] = in_histograms[groupid + localid * numgroups]; \n"\
+"    } \n"\
+" \n"\
+"    // Make sure everything is up to date \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    // loadidx is measured in int4 units \n"\
+"    int loadidx = groupid * numelems_per_group + localid; \n"\
+"    for (int block = 0; block < min(numblocks_per_group, maxblocks); ++block, loadidx += GROUP_SIZE) \n"\
+"    { \n"\
+"        // Load single int4 value \n"\
+"        int4 localvals = safe_load_int4_intmax(in_keys, loadidx, numelems); \n"\
+" \n"\
+"        // Clear the histogram \n"\
+"        histogram[localid] = 0; \n"\
+" \n"\
+"        // Make sure everything is up to date \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Do 2 bits per pass \n"\
+"        for (int bit = 0; bit <= 2; bit += 2) \n"\
+"        { \n"\
+"            // Count histogram \n"\
+"            int4 b = ((localvals >> bitshift) >> bit) & 0x3; \n"\
+" \n"\
+"            // One 8-bit counter per 2-bit digit value: digit d occupies byte d of the word \n"\
+"            int4 p; \n"\
+"            p.x = 1 << (8 * b.x); \n"\
+"            p.y = 1 << (8 * b.y); \n"\
+"            p.z = 1 << (8 * b.z); \n"\
+"            p.w = 1 << (8 * b.w); \n"\
+" \n"\
+"            // Pack the histogram \n"\
+"            uint packed_key = (uint)(p.x + p.y + p.z + p.w); \n"\
+" \n"\
+"            // Put into LDS \n"\
+"            histogram[localid] = packed_key; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Scan the histogram in LDS with 4-way plus scan \n"\
+"            // NOTE(review): total is presumably set to the sum over the whole group - \n"\
+"            // confirm against group_scan_exclusive_sum_uint \n"\
+"            uint total = 0; \n"\
+"            group_scan_exclusive_sum_uint(localid, GROUP_SIZE, histogram, &total); \n"\
+" \n"\
+"            // Load value back \n"\
+"            packed_key = histogram[localid]; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Scan total histogram (4 chars): afterwards byte k holds the sum of \n"\
+"            // the bytes below k of the original total, byte 0 holds 0 \n"\
+"            total = (total << 8) + (total << 16) + (total << 24); \n"\
+"            uint offset = total + packed_key; \n"\
+" \n"\
+"            int4 newoffset; \n"\
+" \n"\
+"            // Exclusive running sum over this work item's own four packed counters: \n"\
+"            // p becomes (0, x, x + y, x + y + z) \n"\
+"            int t = p.y + p.x; \n"\
+"            p.w = p.z + t; \n"\
+"            p.z = t; \n"\
+"            p.y = p.x; \n"\
+"            p.x = 0; \n"\
+" \n"\
+"            // Add the group-wide part and pick the byte that belongs to each digit \n"\
+"            p += (int)offset; \n"\
+"            newoffset = (p >> (b * 8)) & 0xFF; \n"\
+" \n"\
+"            keys[newoffset.x] = localvals.x; \n"\
+"            keys[newoffset.y] = localvals.y; \n"\
+"            keys[newoffset.z] = localvals.z; \n"\
+"            keys[newoffset.w] = localvals.w; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Reload values back to registers for the second bit pass \n"\
+"            localvals.x = keys[localid << 2]; \n"\
+"            localvals.y = keys[(localid << 2) + 1]; \n"\
+"            localvals.z = keys[(localid << 2) + 2]; \n"\
+"            localvals.w = keys[(localid << 2) + 3]; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"        } \n"\
+" \n"\
+"        // Clear LDS \n"\
+"        histogram[localid] = 0; \n"\
+" \n"\
+"        // Make sure everything is up to date \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Reconstruct 16 bins histogram \n"\
+"        int4 bin = (localvals >> bitshift) & 0xF; \n"\
+"        atom_inc(&histogram[bin.x]); \n"\
+"        atom_inc(&histogram[bin.y]); \n"\
+"        atom_inc(&histogram[bin.z]); \n"\
+"        atom_inc(&histogram[bin.w]); \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Remember the per-bin count of this block before the scan overwrites it \n"\
+"        int sum = 0; \n"\
+"        if (localid < NUM_BINS) \n"\
+"        { \n"\
+"            sum = histogram[localid]; \n"\
+"        } \n"\
+" \n"\
+"        // Make sure everything is up to date \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Scan reconstructed histogram \n"\
+"        group_scan_exclusive_uint(localid, 16, histogram); \n"\
+" \n"\
+"        // Put data back to global memory: \n"\
+"        // destination = scanned_histogram[bin] + index within the block - histogram[bin] \n"\
+"        int offset = scanned_histogram[bin.x] + (localid << 2) - histogram[bin.x]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localvals.x; \n"\
+"        } \n"\
+" \n"\
+"        offset = scanned_histogram[bin.y] + (localid << 2) + 1 - histogram[bin.y]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localvals.y; \n"\
+"        } \n"\
+" \n"\
+"        offset = scanned_histogram[bin.z] + (localid << 2) + 2 - histogram[bin.z]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localvals.z; \n"\
+"        } \n"\
+" \n"\
+"        offset = scanned_histogram[bin.w] + (localid << 2) + 3 - histogram[bin.w]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localvals.w; \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Advance the per-bin base offsets by what this block has written \n"\
+"        if (localid < NUM_BINS) \n"\
+"        { \n"\
+"            scanned_histogram[localid] += sum; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+" \n"\
+" \n"\
+"// Scatter pass of a 4-bit radix sort step for key/value pairs. \n"\
+"// Each work group walks up to NUMBER_OF_BLOCKS_PER_GROUP blocks of GROUP_SIZE int4 \n"\
+"// values. Inside a block keys and values are reordered in local memory by the \n"\
+"// 4-bit digit (key >> bitshift) & 0xF using two 2-bit passes, then written to \n"\
+"// out_keys / out_values at positions derived from the scanned histograms. \n"\
+"__kernel \n"\
+"__attribute__((reqd_work_group_size(GROUP_SIZE, 1, 1))) \n"\
+"void ScatterKeysAndValues(// Number of bits to shift \n"\
+"    int bitshift, \n"\
+"    // Input keys \n"\
+"    __global int4 const* restrict in_keys, \n"\
+"    // Input values \n"\
+"    __global int4 const* restrict in_values, \n"\
+"    // Number of input keys \n"\
+"    uint           numelems, \n"\
+"    // Scanned histograms \n"\
+"    __global int const* restrict  in_histograms, \n"\
+"    // Output keys \n"\
+"    __global int* restrict  out_keys, \n"\
+"    // Output values \n"\
+"    __global int* restrict  out_values \n"\
+"    ) \n"\
+"{ \n"\
+"    // Local memory for offsets counting \n"\
+"    __local int  keys[GROUP_SIZE * 4]; \n"\
+"    __local int  scanned_histogram[NUM_BINS]; \n"\
+" \n"\
+"    int globalid = get_global_id(0); \n"\
+"    int localid = get_local_id(0); \n"\
+"    int groupsize = get_local_size(0); \n"\
+"    int groupid = get_group_id(0); \n"\
+"    int numgroups = get_global_size(0) / groupsize; \n"\
+" \n"\
+"    // histogram is an unsigned view of the same local storage as keys \n"\
+"    __local uint* histogram = (__local uint*)keys; \n"\
+" \n"\
+"    int numblocks_per_group = NUMBER_OF_BLOCKS_PER_GROUP; \n"\
+"    int numelems_per_group = numblocks_per_group * GROUP_SIZE; \n"\
+"    // A block covers GROUP_SIZE int4 values, hence GROUP_SIZE * 4 keys \n"\
+"    int numblocks_total = (numelems + GROUP_SIZE * 4 - 1) / (GROUP_SIZE * 4); \n"\
+"    int maxblocks = numblocks_total - groupid * numblocks_per_group; \n"\
+" \n"\
+"    // Copy scanned histogram for the group to local memory for fast indexing \n"\
+"    if (localid < NUM_BINS) \n"\
+"    { \n"\
+"        scanned_histogram[localid] = in_histograms[groupid + localid * numgroups]; \n"\
+"    } \n"\
+" \n"\
+"    // Make sure everything is up to date \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    // loadidx is measured in int4 units \n"\
+"    int loadidx = groupid * numelems_per_group + localid; \n"\
+"    for (int block = 0; block < min(numblocks_per_group, maxblocks); ++block, loadidx += GROUP_SIZE) \n"\
+"    { \n"\
+"        // Load single int4 value \n"\
+"        int4 localkeys = safe_load_int4_intmax(in_keys, loadidx, numelems); \n"\
+"        int4 localvals = safe_load_int4_intmax(in_values, loadidx, numelems); \n"\
+" \n"\
+"        // Clear the histogram \n"\
+"        histogram[localid] = 0; \n"\
+" \n"\
+"        // Make sure everything is up to date \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Do 2 bits per pass \n"\
+"        for (int bit = 0; bit <= 2; bit += 2) \n"\
+"        { \n"\
+"            // Count histogram \n"\
+"            int4 b = ((localkeys >> bitshift) >> bit) & 0x3; \n"\
+" \n"\
+"            // One 8-bit counter per 2-bit digit value: digit d occupies byte d of the word \n"\
+"            int4 p; \n"\
+"            p.x = 1 << (8 * b.x); \n"\
+"            p.y = 1 << (8 * b.y); \n"\
+"            p.z = 1 << (8 * b.z); \n"\
+"            p.w = 1 << (8 * b.w); \n"\
+" \n"\
+"            // Pack the histogram \n"\
+"            uint packed_key = (uint)(p.x + p.y + p.z + p.w); \n"\
+" \n"\
+"            // Put into LDS \n"\
+"            histogram[localid] = packed_key; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Scan the histogram in LDS with 4-way plus scan \n"\
+"            // NOTE(review): total is presumably set to the sum over the whole group - \n"\
+"            // confirm against group_scan_exclusive_sum_uint \n"\
+"            uint total = 0; \n"\
+"            group_scan_exclusive_sum_uint(localid, GROUP_SIZE, histogram, &total); \n"\
+" \n"\
+"            // Load value back \n"\
+"            packed_key = histogram[localid]; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Scan total histogram (4 chars): afterwards byte k holds the sum of \n"\
+"            // the bytes below k of the original total, byte 0 holds 0 \n"\
+"            total = (total << 8) + (total << 16) + (total << 24); \n"\
+"            uint offset = total + packed_key; \n"\
+" \n"\
+"            int4 newoffset; \n"\
+" \n"\
+"            // Exclusive running sum over this work item's own four packed counters: \n"\
+"            // p becomes (0, x, x + y, x + y + z) \n"\
+"            int t = p.y + p.x; \n"\
+"            p.w = p.z + t; \n"\
+"            p.z = t; \n"\
+"            p.y = p.x; \n"\
+"            p.x = 0; \n"\
+" \n"\
+"            // Add the group-wide part and pick the byte that belongs to each digit \n"\
+"            p += (int)offset; \n"\
+"            newoffset = (p >> (b * 8)) & 0xFF; \n"\
+" \n"\
+"            // Permute the keys through local memory \n"\
+"            keys[newoffset.x] = localkeys.x; \n"\
+"            keys[newoffset.y] = localkeys.y; \n"\
+"            keys[newoffset.z] = localkeys.z; \n"\
+"            keys[newoffset.w] = localkeys.w; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Reload keys back to registers for the second bit pass \n"\
+"            localkeys.x = keys[localid << 2]; \n"\
+"            localkeys.y = keys[(localid << 2) + 1]; \n"\
+"            localkeys.z = keys[(localid << 2) + 2]; \n"\
+"            localkeys.w = keys[(localid << 2) + 3]; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Apply the same permutation to the values, reusing the same local buffer \n"\
+"            keys[newoffset.x] = localvals.x; \n"\
+"            keys[newoffset.y] = localvals.y; \n"\
+"            keys[newoffset.z] = localvals.z; \n"\
+"            keys[newoffset.w] = localvals.w; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"            // Reload values back to registers for the second bit pass \n"\
+"            localvals.x = keys[localid << 2]; \n"\
+"            localvals.y = keys[(localid << 2) + 1]; \n"\
+"            localvals.z = keys[(localid << 2) + 2]; \n"\
+"            localvals.w = keys[(localid << 2) + 3]; \n"\
+" \n"\
+"            // Make sure everything is up to date \n"\
+"            barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"        } \n"\
+" \n"\
+"        // Clear LDS \n"\
+"        histogram[localid] = 0; \n"\
+" \n"\
+"        // Make sure everything is up to date \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Reconstruct 16 bins histogram \n"\
+"        int4 bin = (localkeys >> bitshift) & 0xF; \n"\
+"        atom_inc(&histogram[bin.x]); \n"\
+"        atom_inc(&histogram[bin.y]); \n"\
+"        atom_inc(&histogram[bin.z]); \n"\
+"        atom_inc(&histogram[bin.w]); \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Remember the per-bin count of this block before the scan overwrites it \n"\
+"        int sum = 0; \n"\
+"        if (localid < NUM_BINS) \n"\
+"        { \n"\
+"            sum = histogram[localid]; \n"\
+"        } \n"\
+" \n"\
+"        // Make sure everything is up to date \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Scan reconstructed histogram \n"\
+"        group_scan_exclusive_uint(localid, 16, histogram); \n"\
+" \n"\
+"        // Put data back to global memory: \n"\
+"        // destination = scanned_histogram[bin] + index within the block - histogram[bin] \n"\
+"        int offset = scanned_histogram[bin.x] + (localid << 2) - histogram[bin.x]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localkeys.x; \n"\
+"            out_values[offset] = localvals.x; \n"\
+"        } \n"\
+" \n"\
+"        offset = scanned_histogram[bin.y] + (localid << 2) + 1 - histogram[bin.y]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localkeys.y; \n"\
+"            out_values[offset] = localvals.y; \n"\
+"        } \n"\
+" \n"\
+"        offset = scanned_histogram[bin.z] + (localid << 2) + 2 - histogram[bin.z]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localkeys.z; \n"\
+"            out_values[offset] = localvals.z; \n"\
+"        } \n"\
+" \n"\
+"        offset = scanned_histogram[bin.w] + (localid << 2) + 3 - histogram[bin.w]; \n"\
+"        if (offset < numelems) \n"\
+"        { \n"\
+"            out_keys[offset] = localkeys.w; \n"\
+"            out_values[offset] = localvals.w; \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"        // Advance the per-bin base offsets by what this block has written. \n"\
+"        // scanned_histogram has only NUM_BINS entries, so work items with \n"\
+"        // localid >= NUM_BINS must not touch it (same guard as in ScatterKeys). \n"\
+"        if (localid < NUM_BINS) \n"\
+"        { \n"\
+"            scanned_histogram[localid] += sum; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Compaction: for every index i below in_size whose predicate is non-zero, \n"\
+"// copy in_input[i] to out_output[in_address[i]]. \n"\
+"__kernel void compact_int(__global int* in_predicate, __global int* in_address, \n"\
+"    __global int* in_input, uint in_size, \n"\
+"    __global int* out_output) \n"\
+"{ \n"\
+"    const int idx = get_global_id(0); \n"\
+" \n"\
+"    // Nothing to do for padding work items or rejected elements \n"\
+"    if (idx >= in_size || !in_predicate[idx]) \n"\
+"    { \n"\
+"        return; \n"\
+"    } \n"\
+" \n"\
+"    out_output[in_address[idx]] = in_input[idx]; \n"\
+"} \n"\
+" \n"\
+"// Compaction that also reports the number of elements written. \n"\
+"// For every index i below in_size whose predicate is non-zero, in_input[i] is \n"\
+"// copied to out_output[in_address[i]]. Work item 0 additionally stores \n"\
+"// in_address[last] + in_predicate[last] to *out_size (presumably in_address is \n"\
+"// an exclusive scan of in_predicate, which makes this the compacted count - \n"\
+"// confirm against the caller). For an empty input *out_size is set to 0. \n"\
+"__kernel void compact_int_1(__global int* in_predicate, __global int* in_address, \n"\
+"    __global int* in_input, uint in_size, \n"\
+"    __global int* out_output, \n"\
+"    __global int* out_size) \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+" \n"\
+"    if (global_id < in_size) \n"\
+"    { \n"\
+"        if (in_predicate[global_id]) \n"\
+"        { \n"\
+"            out_output[in_address[global_id]] = in_input[global_id]; \n"\
+"        } \n"\
+"    } \n"\
+" \n"\
+"    if (global_id == 0) \n"\
+"    { \n"\
+"        // in_size is unsigned: in_size - 1 would wrap around for an empty input \n"\
+"        // and index far outside both arrays \n"\
+"        if (in_size > 0) \n"\
+"        { \n"\
+"            *out_size = in_address[in_size - 1] + in_predicate[in_size - 1]; \n"\
+"        } \n"\
+"        else \n"\
+"        { \n"\
+"            *out_size = 0; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Copy one int4 per work item from in_input to out_output. in_size is passed \n"\
+"// on unchanged as the bounds argument of the safe load / store helpers. \n"\
+"__kernel void copy(__global int4* in_input, \n"\
+"    uint  in_size, \n"\
+"    __global int4* out_output) \n"\
+"{ \n"\
+"    const int idx = get_global_id(0); \n"\
+"    safe_store_int4(safe_load_int4(in_input, idx, in_size), out_output, idx, in_size); \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Accessors for the per-element flag bytes used by the segmented scans. \n"\
+"// All three expect a __local char* named flags to be in scope. \n"\
+"// FLAG(x): bit 0 of flags[x] \n"\
+"#define FLAG(x) (flags[(x)] & 0x1) \n"\
+"// FLAG_COMBINED(x): the whole flag byte, usable as an lvalue \n"\
+"#define FLAG_COMBINED(x) (flags[(x)]) \n"\
+"// FLAG_ORIG(x): bit 1 of flags[x] \n"\
+"#define FLAG_ORIG(x) ((flags[(x)] >> 1) & 0x1) \n"\
+" \n"\
+"// In-place segmented exclusive scan (sum) of shmem[0 .. groupSize) performed by \n"\
+"// the whole work group. flags[i] holds the flag byte of element i; bit 0 is \n"\
+"// combined and cleared while scanning, bit 1 is only read (FLAG_ORIG). \n"\
+"// Every work item of the group must call this function because it uses barriers. \n"\
+"void group_segmented_scan_exclusive_int( \n"\
+"    int localId, \n"\
+"    int groupSize, \n"\
+"    __local int* shmem, \n"\
+"    __local char* flags \n"\
+"    ) \n"\
+"{ \n"\
+"    // Up-sweep: add the left partial sum into the right one unless the right flag is set \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            if (FLAG(hi) == 0) \n"\
+"            { \n"\
+"                shmem[hi] += shmem[lo]; \n"\
+"            } \n"\
+" \n"\
+"            FLAG_COMBINED(hi) |= FLAG(lo); \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+" \n"\
+"    // Seed the down-sweep with the identity element \n"\
+"    if (localId == 0) \n"\
+"    { \n"\
+"        shmem[groupSize - 1] = 0; \n"\
+"    } \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    // Down-sweep \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            const int carried = shmem[lo]; \n"\
+"            shmem[lo] = shmem[hi]; \n"\
+" \n"\
+"            // Decide what the right child receives \n"\
+"            int next; \n"\
+"            if (FLAG_ORIG(lo + 1) == 1) \n"\
+"            { \n"\
+"                // bit 1 is set on the element right after lo: restart from zero \n"\
+"                next = 0; \n"\
+"            } \n"\
+"            else if (FLAG(lo) == 1) \n"\
+"            { \n"\
+"                next = carried; \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                next = shmem[hi] + carried; \n"\
+"            } \n"\
+"            shmem[hi] = next; \n"\
+" \n"\
+"            // Drop the working bit, keep bit 1 \n"\
+"            FLAG_COMBINED(lo) &= 2; \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Variant of the in-place segmented exclusive scan over shmem[0 .. groupSize) \n"\
+"// whose down-sweep consults only bit 0 of the flags (no FLAG_ORIG test). \n"\
+"// Every work item of the group must call this function because it uses barriers. \n"\
+"void group_segmented_scan_exclusive_int_nocut( \n"\
+"    int localId, \n"\
+"    int groupSize, \n"\
+"    __local int* shmem, \n"\
+"    __local char* flags \n"\
+"    ) \n"\
+"{ \n"\
+"    // Up-sweep: add the left partial sum into the right one unless the right flag is set \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            if (FLAG(hi) == 0) \n"\
+"            { \n"\
+"                shmem[hi] += shmem[lo]; \n"\
+"            } \n"\
+" \n"\
+"            FLAG_COMBINED(hi) |= FLAG(lo); \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+" \n"\
+"    // Seed the down-sweep with the identity element \n"\
+"    if (localId == 0) \n"\
+"    { \n"\
+"        shmem[groupSize - 1] = 0; \n"\
+"    } \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    // Down-sweep \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            const int carried = shmem[lo]; \n"\
+"            shmem[lo] = shmem[hi]; \n"\
+" \n"\
+"            // A set flag on the left child replaces the right value, otherwise accumulate \n"\
+"            int next; \n"\
+"            if (FLAG(lo) == 1) \n"\
+"            { \n"\
+"                next = carried; \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                next = shmem[hi] + carried; \n"\
+"            } \n"\
+"            shmem[hi] = next; \n"\
+" \n"\
+"            // Drop the working bit, keep bit 1 \n"\
+"            FLAG_COMBINED(lo) &= 2; \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// In-place segmented exclusive scan (sum) of shmem[0 .. groupSize) that also \n"\
+"// publishes the result of the up-sweep for this group: the value and bit 0 of \n"\
+"// the flag of the last element go to part_sums[groupId] / part_flags[groupId]. \n"\
+"// Every work item of the group must call this function because it uses barriers. \n"\
+"void group_segmented_scan_exclusive_int_part( \n"\
+"    int localId, \n"\
+"    int groupId, \n"\
+"    int groupSize, \n"\
+"    __local int* shmem, \n"\
+"    __local char* flags, \n"\
+"    __global int* part_sums, \n"\
+"    __global int* part_flags \n"\
+"    ) \n"\
+"{ \n"\
+"    // Up-sweep: add the left partial sum into the right one unless the right flag is set \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            if (FLAG(hi) == 0) \n"\
+"            { \n"\
+"                shmem[hi] += shmem[lo]; \n"\
+"            } \n"\
+" \n"\
+"            FLAG_COMBINED(hi) |= FLAG(lo); \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+" \n"\
+"    // Export the group result, then seed the down-sweep with the identity element \n"\
+"    if (localId == 0) \n"\
+"    { \n"\
+"        const int last = groupSize - 1; \n"\
+"        part_sums[groupId] = shmem[last]; \n"\
+"        part_flags[groupId] = FLAG(last); \n"\
+"        shmem[last] = 0; \n"\
+"    } \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    // Down-sweep \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            const int carried = shmem[lo]; \n"\
+"            shmem[lo] = shmem[hi]; \n"\
+" \n"\
+"            // Decide what the right child receives \n"\
+"            int next; \n"\
+"            if (FLAG_ORIG(lo + 1) == 1) \n"\
+"            { \n"\
+"                // bit 1 is set on the element right after lo: restart from zero \n"\
+"                next = 0; \n"\
+"            } \n"\
+"            else if (FLAG(lo) == 1) \n"\
+"            { \n"\
+"                next = carried; \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                next = shmem[hi] + carried; \n"\
+"            } \n"\
+"            shmem[hi] = next; \n"\
+" \n"\
+"            // Drop the working bit, keep bit 1 \n"\
+"            FLAG_COMBINED(lo) &= 2; \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Variant of the segmented exclusive scan over shmem[0 .. groupSize) whose \n"\
+"// down-sweep consults only bit 0 of the flags (no FLAG_ORIG test). It also \n"\
+"// publishes the result of the up-sweep for this group: the value and bit 0 of \n"\
+"// the flag of the last element go to part_sums[groupId] / part_flags[groupId]. \n"\
+"// Every work item of the group must call this function because it uses barriers. \n"\
+"void group_segmented_scan_exclusive_int_nocut_part( \n"\
+"    int localId, \n"\
+"    int groupId, \n"\
+"    int groupSize, \n"\
+"    __local int* shmem, \n"\
+"    __local char* flags, \n"\
+"    __global int* part_sums, \n"\
+"    __global int* part_flags \n"\
+"    ) \n"\
+"{ \n"\
+"    // Up-sweep: add the left partial sum into the right one unless the right flag is set \n"\
+"    for (int stride = 1; stride <= (groupSize >> 1); stride <<= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            if (FLAG(hi) == 0) \n"\
+"            { \n"\
+"                shmem[hi] += shmem[lo]; \n"\
+"            } \n"\
+" \n"\
+"            FLAG_COMBINED(hi) |= FLAG(lo); \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+" \n"\
+"    // Export the group result, then seed the down-sweep with the identity element \n"\
+"    if (localId == 0) \n"\
+"    { \n"\
+"        const int last = groupSize - 1; \n"\
+"        part_sums[groupId] = shmem[last]; \n"\
+"        part_flags[groupId] = FLAG(last); \n"\
+"        shmem[last] = 0; \n"\
+"    } \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    // Down-sweep \n"\
+"    for (int stride = (groupSize >> 1); stride > 0; stride >>= 1) \n"\
+"    { \n"\
+"        if (localId < groupSize / (2 * stride)) \n"\
+"        { \n"\
+"            const int lo = (2 * localId + 1)*stride - 1; \n"\
+"            const int hi = 2 * (localId + 1)*stride - 1; \n"\
+" \n"\
+"            const int carried = shmem[lo]; \n"\
+"            shmem[lo] = shmem[hi]; \n"\
+" \n"\
+"            // A set flag on the left child replaces the right value, otherwise accumulate \n"\
+"            int next; \n"\
+"            if (FLAG(lo) == 1) \n"\
+"            { \n"\
+"                next = carried; \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                next = shmem[hi] + carried; \n"\
+"            } \n"\
+"            shmem[hi] = next; \n"\
+" \n"\
+"            // Drop the working bit, keep bit 1 \n"\
+"            FLAG_COMBINED(lo) &= 2; \n"\
+"        } \n"\
+" \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE); \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Per-work-group segmented scan: stages keys and head flags in local memory, scans them, writes the keys back. \n"\
+"__kernel void segmented_scan_exclusive_int_nocut(__global int const* in_array, \n"\
+"    __global int const* in_segment_heads_array, \n"\
+"    int numelems, \n"\
+"    __global int* out_array, \n"\
+"    __local int* shmem) \n"\
+"{ \n"\
+"    int globalId = get_global_id(0); \n"\
+"    int localId = get_local_id(0); \n"\
+"    int groupSize = get_local_size(0); \n"\
+"    int groupId = get_group_id(0); \n"\
+" \n"\
+"    __local int* keys = shmem; \n"\
+"    __local char* flags = (__local char*)(keys + groupSize); \n"\
+"    // shmem layout: groupSize ints (keys) followed by groupSize chars (flags); padding items load 0. \n"\
+"    keys[localId] = globalId < numelems ? in_array[globalId] : 0; \n"\
+"    flags[localId] = globalId < numelems ? (in_segment_heads_array[globalId] ? 3 : 0) : 0; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    group_segmented_scan_exclusive_int_nocut(localId, groupSize, keys, flags); \n"\
+"    // Padding work-items (globalId >= numelems) skip the store; out_array is presumed to hold only numelems ints. \n"\
+"    if (globalId < numelems) out_array[globalId] = keys[localId]; \n"\
+"} \n"\
+"// Per-work-group segmented scan: stages keys and head flags in local memory, scans them, writes the keys back. \n"\
+"__kernel void segmented_scan_exclusive_int(__global int const* in_array, \n"\
+"    __global int const* in_segment_heads_array, \n"\
+"    int numelems, \n"\
+"    __global int* out_array, \n"\
+"    __local int* shmem) \n"\
+"{ \n"\
+"    int globalId = get_global_id(0); \n"\
+"    int localId = get_local_id(0); \n"\
+"    int groupSize = get_local_size(0); \n"\
+"    int groupId = get_group_id(0); \n"\
+" \n"\
+"    __local int* keys = shmem; \n"\
+"    __local char* flags = (__local char*)(keys + groupSize); \n"\
+"    // shmem layout: groupSize ints (keys) followed by groupSize chars (flags); padding items load 0. \n"\
+"    keys[localId] = globalId < numelems ? in_array[globalId] : 0; \n"\
+"    flags[localId] = globalId < numelems ? (in_segment_heads_array[globalId] ? 3 : 0) : 0; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    group_segmented_scan_exclusive_int(localId, groupSize, keys, flags); \n"\
+"    // Padding work-items (globalId >= numelems) skip the store; out_array is presumed to hold only numelems ints. \n"\
+"    if (globalId < numelems) out_array[globalId] = keys[localId]; \n"\
+"} \n"\
+"// Per-work-group segmented scan; the group helper also stores this group's entry in out_part_sums/out_part_flags. \n"\
+"__kernel void segmented_scan_exclusive_int_part(__global int const* in_array, \n"\
+"    __global int const* in_segment_heads_array, \n"\
+"    int numelems, \n"\
+"    __global int* out_array, \n"\
+"    __global int* out_part_sums, \n"\
+"    __global int* out_part_flags, \n"\
+"    __local int* shmem) \n"\
+"{ \n"\
+"    int globalId = get_global_id(0); \n"\
+"    int localId = get_local_id(0); \n"\
+"    int groupSize = get_local_size(0); \n"\
+"    int groupId = get_group_id(0); \n"\
+" \n"\
+"    __local int* keys = shmem; \n"\
+"    __local char* flags = (__local char*)(keys + groupSize); \n"\
+"    // shmem layout: groupSize ints (keys) followed by groupSize chars (flags); padding items load 0. \n"\
+"    keys[localId] = globalId < numelems ? in_array[globalId] : 0; \n"\
+"    flags[localId] = globalId < numelems ? (in_segment_heads_array[globalId] ? 3 : 0) : 0; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    group_segmented_scan_exclusive_int_part(localId, groupId, groupSize, keys, flags, out_part_sums, out_part_flags); \n"\
+"    // Padding work-items (globalId >= numelems) skip the store; out_array is presumed to hold only numelems ints. \n"\
+"    if (globalId < numelems) out_array[globalId] = keys[localId]; \n"\
+"} \n"\
+"// Per-work-group segmented scan; the group helper also stores this group's entry in out_part_sums/out_part_flags. \n"\
+"__kernel void segmented_scan_exclusive_int_nocut_part(__global int const* in_array, \n"\
+"    __global int const* in_segment_heads_array, \n"\
+"    int numelems, \n"\
+"    __global int* out_array, \n"\
+"    __global int* out_part_sums, \n"\
+"    __global int* out_part_flags, \n"\
+"    __local int* shmem) \n"\
+"{ \n"\
+"    int globalId = get_global_id(0); \n"\
+"    int localId = get_local_id(0); \n"\
+"    int groupSize = get_local_size(0); \n"\
+"    int groupId = get_group_id(0); \n"\
+" \n"\
+"    __local int* keys = shmem; \n"\
+"    __local char* flags = (__local char*)(keys + groupSize); \n"\
+"    // shmem layout: groupSize ints (keys) followed by groupSize chars (flags); padding items load 0. \n"\
+"    keys[localId] = globalId < numelems ? in_array[globalId] : 0; \n"\
+"    flags[localId] = globalId < numelems ? (in_segment_heads_array[globalId] ? 3 : 0) : 0; \n"\
+" \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE); \n"\
+" \n"\
+"    group_segmented_scan_exclusive_int_nocut_part(localId, groupId, groupSize, keys, flags, out_part_sums, out_part_flags); \n"\
+"    // Padding work-items (globalId >= numelems) skip the store; out_array is presumed to hold only numelems ints. \n"\
+"    if (globalId < numelems) out_array[globalId] = keys[localId]; \n"\
+"} \n"\
+" \n"\
+"// Adds in_sums[groupId] to this group's leading elements, stopping at the first nonzero flag or at numelems. \n"\
+"__kernel void segmented_distribute_part_sum_int( \n"\
+"    __global int* inout_array, \n"\
+"    __global int* in_flags, \n"\
+"    int numelems, \n"\
+"    __global int* in_sums \n"\
+"    ) \n"\
+"{ \n"\
+"    int globalId = get_global_id(0); \n"\
+"    int localId = get_local_id(0); \n"\
+"    int groupSize = get_local_size(0); \n"\
+"    int groupId = get_group_id(0); \n"\
+" \n"\
+"    int sum = in_sums[groupId]; \n"\
+"    // Only work-item 0 of each group walks the group's elements, one after another. \n"\
+" \n"\
+"    if (localId == 0) \n"\
+"    { \n"\
+"        // Both bounds are tested before in_flags is read, so the loop never reads past the group or past numelems. \n"\
+"        for (int i = 0; \n"\
+"             i < groupSize && globalId + i < numelems && in_flags[globalId + i] == 0; \n"\
+"             ++i) \n"\
+"        { \n"\
+"            inout_array[globalId + i] += sum; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"__kernel void segmented_distribute_part_sum_int_nocut( \n"\
+"    __global int* inout_array, \n"\
+"    __global int* in_flags, \n"\
+"    int numelems, \n"\
+"    __global int* in_sums \n"\
+"    ) \n"\
+"{ \n"\
+"    int globalId = get_global_id(0); \n"\
+"    int localId = get_local_id(0); \n"\
+"    int groupSize = get_local_size(0); \n"\
+"    int groupId = get_group_id(0); \n"\
+" \n"\
+"    int sum = in_sums[groupId]; \n"\
+"    bool stop = false; \n"\
+"    //inout_array[globalId] += sum; \n"\
+" \n"\
+"    if (localId == 0) \n"\
+"    { \n"\
+"        for (int i = 0; i < groupSize; ++i) \n"\
+"        { \n"\
+"            if (globalId + i < numelems) \n"\
+"            { \n"\
+"                if (in_flags[globalId + i] == 0) \n"\
+"                { \n"\
+"                    inout_array[globalId + i] += sum; \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    if (stop) \n"\
+"                    { \n"\
+"                        break; \n"\
+"                    } \n"\
+"                    else \n"\
+"                    { \n"\
+"                        inout_array[globalId + i] += sum; \n"\
+"                        stop = true; \n"\
+"                    } \n"\
+"                } \n"\
+"            } \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// --------------------- ATOMIC OPERTIONS ------------------------ \n"\
+" \n"\
+"#define DEFINE_ATOMIC(operation)\\ \n"\
+"    __attribute__((always_inline)) void atomic_##operation##_float(volatile __global float* addr, float value)\\ \n"\
+"    {\\ \n"\
+"        union\\ \n"\
+"        {\\ \n"\
+"        unsigned int u32;\\ \n"\
+"        float        f32;\\ \n"\
+"        } next, expected, current;\\ \n"\
+"        current.f32 = *addr;\\ \n"\
+"        do\\ \n"\
+"        {\\ \n"\
+"            expected.f32 = current.f32;\\ \n"\
+"            next.f32 = operation(expected.f32, value);\\ \n"\
+"            current.u32 = atomic_cmpxchg((volatile __global unsigned int *)addr,\\ \n"\
+"                expected.u32, next.u32);\\ \n"\
+"        } while (current.u32 != expected.u32);\\ \n"\
+"    } \n"\
+" \n"\
+"#define DEFINE_ATOMIC_FLOAT3(operation)\\ \n"\
+"    __attribute__((always_inline)) void atomic_##operation##_float3(volatile __global float3* addr, float3 value)\\ \n"\
+"    {\\ \n"\
+"        volatile __global float* p = (volatile __global float*)addr;\\ \n"\
+"        atomic_##operation##_float(p, value.x);\\ \n"\
+"        atomic_##operation##_float(p + 1, value.y);\\ \n"\
+"        atomic_##operation##_float(p + 2, value.z);\\ \n"\
+"    } \n"\
+" \n"\
+"__attribute__((always_inline)) void atomic_max_int(volatile __global int* addr, int value) \n"\
+"{ \n"\
+"    atomic_max(addr, value); \n"\
+"} \n"\
+" \n"\
+"__attribute__((always_inline)) void atomic_min_int(volatile __global int* addr, int value) \n"\
+"{ \n"\
+"    atomic_min(addr, value); \n"\
+"} \n"\
+" \n"\
+"// --------------------- REDUCTION ------------------------ \n"\
+" \n"\
+"#define DEFINE_REDUCTION(bin_op, type)\\ \n"\
+"__kernel void reduction_##bin_op##_##type(const __global type* buffer,\\ \n"\
+"                                          int count,\\ \n"\
+"                                          __local type* shared_mem,\\ \n"\
+"                                          __global type* out,\\ \n"\
+"                                          int /* in elements */ out_offset)\\ \n"\
+"{\\ \n"\
+"    int global_id = get_global_id(0);\\ \n"\
+"    int group_id = get_group_id(0);\\ \n"\
+"    int local_id = get_local_id(0);\\ \n"\
+"    int group_size = get_local_size(0);\\ \n"\
+"    if (global_id < count)\\ \n"\
+"        shared_mem[local_id] = buffer[global_id];\\ \n"\
+"    else\\ \n"\
+"        shared_mem[local_id] = neutral_##bin_op##_##type;\\ \n"\
+"    barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    for (int i = group_size / 2; i > 0; i >>= 1)\\ \n"\
+"    {\\ \n"\
+"        if (local_id < i)\\ \n"\
+"            shared_mem[local_id] = bin_op(shared_mem[local_id], shared_mem[local_id + i]);\\ \n"\
+"        barrier(CLK_LOCAL_MEM_FENCE);\\ \n"\
+"    }\\ \n"\
+"    if (local_id == 0)\\ \n"\
+"        atomic_##bin_op##_##type(out + out_offset, shared_mem[0]);\\ \n"\
+"} \n"\
+" \n"\
+"// --------------------- NORMALIZATION ------------------------ \n"\
+" \n"\
+"#define DEFINE_BUFFER_NORMALIZATION(type)\\ \n"\
+"__kernel void buffer_normalization_##type(const __global type* input,\\ \n"\
+"                                          __global type* output,\\ \n"\
+"                                          int count,\\ \n"\
+"                                          const __global type* storage)\\ \n"\
+"{\\ \n"\
+"    type norm_coef = storage[0] - storage[1];\\ \n"\
+"    int global_id = get_global_id(0);\\ \n"\
+"    if (global_id < count)\\ \n"\
+"        output[global_id] = (input[global_id] - storage[1]) / norm_coef;\\ \n"\
+"} \n"\
+" \n"\
+"// Do not change the order \n"\
+"DEFINE_ATOMIC(min) \n"\
+"DEFINE_ATOMIC(max) \n"\
+"DEFINE_ATOMIC_FLOAT3(min) \n"\
+"DEFINE_ATOMIC_FLOAT3(max) \n"\
+" \n"\
+"DEFINE_REDUCTION(min, int) \n"\
+"DEFINE_REDUCTION(min, float) \n"\
+"DEFINE_REDUCTION(min, float3) \n"\
+"DEFINE_REDUCTION(max, int) \n"\
+"DEFINE_REDUCTION(max, float) \n"\
+"DEFINE_REDUCTION(max, float3) \n"\
+" \n"\
+"DEFINE_BUFFER_NORMALIZATION(int) \n"\
+"DEFINE_BUFFER_NORMALIZATION(float) \n"\
+"DEFINE_BUFFER_NORMALIZATION(float3) \n"\
+;
diff --git a/CMakeLists.txt b/CMakeLists.txt
index f7d96316..00daf866 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.8)
 
 project(RadeonRaysSDK CXX)
 
-option(RR_EMBED_KERNELS "Embed CL kernels into binary module" OFF)
+option(RR_EMBED_KERNELS "Embed CL kernels into binary module" ON)
 option(RR_ALLOW_CPU_DEVICES "Allows CPU Devices" OFF)
 option(RR_USE_OPENCL "Use OpenCL for GPU hit testing" ON)
 option(RR_USE_EMBREE "Use Intel(R) Embree for CPU hit testing" OFF)
diff --git a/Calc/inc/calc_cl.h b/Calc/inc/calc_cl.h
index 1c18da3c..4c70b447 100644
--- a/Calc/inc/calc_cl.h
+++ b/Calc/inc/calc_cl.h
@@ -27,7 +27,7 @@ THE SOFTWARE.
 #include <cstdint>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/Calc/inc/device_cl.h b/Calc/inc/device_cl.h
index 0e229655..4b0a593e 100644
--- a/Calc/inc/device_cl.h
+++ b/Calc/inc/device_cl.h
@@ -31,7 +31,7 @@ THE SOFTWARE.
 #include "device.h"
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/README.md b/README.md
index 86acf3af..b328d4ca 100644
--- a/README.md
+++ b/README.md
@@ -1,76 +1,25 @@
-# Important
-Sample renderer has been moved from RadeonRays repo into a separate one here: [Baikal](https://github.com/GPUOpen-LibrariesAndSDKs/RadeonProRender-Baikal)
-
 # Summary
 Radeon Rays is ray intersection acceleration library provided by AMD which makes the most of the hardware and allows for efficient ray queries. Three backends support a range of use cases.
 
-# Description
-Radeon Rays has three backends, 
-- OpenCL
-- Vulkan
-- Embree
-
-OpenCL uses GPUs and CPUs that support at least OpenCL 1.2
-Vulkan supports GPUs with Vulkan 1.0 or greater
-Embree uses Intels Optimized CPU ray casting software for x86 and x64 devices
-
-The source tree consist of the following subdirectories:
-
-- Radeon Rays: library binaries
-
-- App: Standalone sample/application featuring Radeon Rays OpenCL to implement a path tracer.
-
-# Preliminaries
-## System requirements
-The library is cross-platform and the following compilers are supported:
-
-- Visual Studio 2015
-
-- Xcode 4 and later
-
-- GCC 4.8 and later
-
-- CMake 3.8 and later
-
-- Python (for --embed_kernels option only)
-
-- [Anvil](https://github.com/GPUOpen-LibrariesAndSDKs/Anvil) for Vulkan backend only
-
-- [Embree](https://github.com/embree/embree) for Embree backend only
-
-- AMD OpenCL APP SDK 2.0+ is also required for the standalone app build.  
+*This fork modifies Radeon Rays (adding and removing features) to work with AViS. Use it instead of the official repository if you are building for AViS.*
 
 ## Set up OpenCL
 Set environmental variable.  GPU_MAX_ALLOC_PERCENT = 100. This is necessary to allocate a large buffers.
 
-## Set up Vulkan
-Anvil is set as a submodule and will be downloaded by using `git submodule update --init --recursive` from the command line.
-Some gui clients (github app for example) may do this automatically for you
-
-## Multiple Backends
-You can either choose a particular backend (OpenCL, Vulkan or Embree) or compile any combination of them and pick at run-time. By default OpenCL only will be compiled in (see Options below to enable other backends).
-At runtime OpenCL devices will appear first, then Vulkan devices (if enabled) with the Embree device last (if enabled).
-
-If the default behaviour is not what you want, an API call `IntersectionApi::SetPlatform( backend )` takes a backend argument bitfield allows you to specify exactly which backends device will be enumurated.
-
 ## Build                                                                                       
 
 ### Windows
-- Create Visual Studio 2015 Solution
 
-`cmake -G "Visual Studio 14 2015 Win64"`
+Run the following commands instead of the ones in the official guide:
 
-### OSX
-- Install Homebrew
+`cmake -G "Visual Studio 15 2017 Win64"`
 
-`/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"`
+`cmake --build . --config Release`
 
-- Install OpenImageIO
+### OSX
 
 `brew install homebrew/science/openimageio`
 
-- Make build folder and generate make files
-
 `mkdir build`
 
 `cd build`
@@ -80,77 +29,13 @@ If the default behaviour is not what you want, an API call `IntersectionApi::Set
 `make`
 
 ### Linux
-on Ubuntu:
-install complimentary libraries:
-
-`sudo apt-get install g++`
-
-install build dependencies:
 
 `sudo apt-get install libopenimageio-dev libglew-dev freeglut3-dev`
 
-Also make sure you have the `opencl-dev` headers installed. Then create the Makefile:
-
 `mkdir build`
 
 `cd build`
 
-`cmake -DCMAKE_BUILD_TYPE=<Release ro Debug> ..`
+`cmake -DCMAKE_BUILD_TYPE=<Release or Debug> ..`
 
 `make`
-
-### Options
-Available premake options:
-- `RR_USE_EMBREE` will enable the embree backend. Embree device will be the last one in IntersectionApi device list.
- example of usage : 
- `cmake -DCMAKE_BUILD_TYPE=<Release ro Debug> -DRR_USE_EMBREE=ON ..`
-
-- `RR_USE_OPENCL` will enable the OpenCL backend. If no other option is provided, this is the default
-
-- `RR_SHARED_CALC` will build Calc (Compute Abstraction Layer) as a shared object. This means RadeonRays library does not directly depend on OpenCL and can be used on the systems where OpenCL is not available (with Embree backend). 
-
-## Run unit tests
-They need to be run from the <Radeon Rays_SDK path>/UnitTest path.
-CMake should be runned with the `RR_SAFE_MATH` option.
-
-# Hardware  support
-
-The library has been tested on the following hardware and OSes:
-
-## Linux
- - Ubuntu Linux 14.04
- - AMD FirePro driver 15.201: W9100, W8100, W9000, W7000, W7100, S9300x2, W5100
- - AMD Radeon driver 15.302: R9 Nano, R9 Fury X, R9 290
- - NVIDIA driver 352.79: GeForce GTX970, Titan X
-
-## Windows
- - Windows 7/8.1/10
- - AMD FirePro driver 15.201: W9100, W8100, W9000, W7000, W7100, S9300x2, W5100
- - AMD Radeon driver 16.4: R9 Nano, R9 Fury X, R9 290, Pro Duo
- - NVIDIA driver 364.72: GeForce GTX970, Titan X
-
-## OSX
- - OSX El Capitan 10.11.4
- - Mac Pro (Late 2013) AMD FirePro D500 x2
- - Macbook Pro Retina 13" (Early 2013) Intel HD 4300
- - Macbook 12" (Early 2015) Intel HD 5300
-
----
-# Known Issues
-## Vulkan
-
- - Currently the public Anvil has a [bug](https://github.com/GPUOpen-LibrariesAndSDKs/Anvil/issues/3) that causes a crash on exit. Its already been fixed in an internal branch which will which will move to public when its cooked and ready. For now just comment out the free of the layout manager in Anvil::BasePipelineManager::Pipeline::release_vulkan_objects()
-
-## Windows
-
-## OSX
-
-## Linux
-
- - If <CL/cl.h> is missing try to specify OpenCL SDK location.
- - If your are experiencing problems creating your CL context with a default config chances are CL-GL interop is broken on your system, try running the sample app with -interop 0 command line option (expect performance drop). 
- 
-AMD:
-`export $AMDAPPSDKROOT=<SDK_PATH>`
-NVIDIA:
-`export $CUDA_PATH=<SDK_PATH>`
diff --git a/RadeonRays/include/radeon_rays.h b/RadeonRays/include/radeon_rays.h
index 17d26b16..61191dfc 100644
--- a/RadeonRays/include/radeon_rays.h
+++ b/RadeonRays/include/radeon_rays.h
@@ -225,10 +225,14 @@ namespace RadeonRays
         // Create an instance of a shape with its own transform (set via Shape interface).
         // The call is blocking, so the returned value is ready upon return.
         virtual Shape* CreateInstance(Shape const* shape) const = 0;
+        // *EDIT* Preallocate the shape container for `size` entries (optional: the default implementation does nothing)
+        virtual void AllocShapes(size_t const size) {}
         // Delete the shape (to simplify DLL boundary crossing
         virtual void DeleteShape(Shape const* shape) = 0;
         // Attach shape to participate in intersection process
         virtual void AttachShape(Shape const* shape) = 0;
+        // *EDIT* Attach shape without error checking (the default implementation simply forwards to AttachShape)
+        virtual void AttachShapeUnchecked(Shape const* shape) { AttachShape(shape); }
         // Detach shape, i.e. it is not going to be considered part of the scene anymore
         virtual void DetachShape(Shape const* shape) = 0;
         // Detach all objects
diff --git a/RadeonRays/include/radeon_rays_cl.h b/RadeonRays/include/radeon_rays_cl.h
index 751760b1..9ddbb44e 100644
--- a/RadeonRays/include/radeon_rays_cl.h
+++ b/RadeonRays/include/radeon_rays_cl.h
@@ -28,7 +28,7 @@ THE SOFTWARE.
 
 #if USE_OPENCL
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/RadeonRays/kernels_cl.h b/RadeonRays/kernels_cl.h
new file mode 100644
index 00000000..1a1754e1
--- /dev/null
+++ b/RadeonRays/kernels_cl.h
@@ -0,0 +1,5752 @@
+/* This is an auto-generated file. Do not edit manually*/
+
+static const char g_build_hlbvh_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+"/** \n"\
+"    \\file build_bvh.cl \n"\
+"    \\author Dmitry Kozlov \n"\
+"    \\version 1.0 \n"\
+"    \\brief HLBVH build implementation \n"\
+" \n"\
+"    IntersectorHlbvh implementation is based on the following paper: \n"\
+"    \"HLBVH: Hierarchical LBVH Construction for Real-Time Ray Tracing\" \n"\
+"    Jacopo Pantaleoni (NVIDIA), David Luebke (NVIDIA), in High Performance Graphics 2010, June 2010 \n"\
+"    https://research.nvidia.com/sites/default/files/publications/HLBVH-final.pdf \n"\
+" \n"\
+"    Pros: \n"\
+"        -Very fast to build and update. \n"\
+"    Cons: \n"\
+"        -Poor BVH quality, slow traversal. \n"\
+" */ \n"\
+"/************************************************************************* \n"\
+"INCLUDES \n"\
+"**************************************************************************/ \n"\
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define PI 3.14159265358979323846f \n"\
+"#define KERNEL __kernel \n"\
+"#define GLOBAL __global \n"\
+"#define INLINE __attribute__((always_inline)) \n"\
+"#define HIT_MARKER 1 \n"\
+"#define MISS_MARKER -1 \n"\
+"#define INVALID_IDX -1 \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 pmin; \n"\
+"    float4 pmax; \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 o; \n"\
+"    float4 d; \n"\
+"    int2 extra; \n"\
+"    int doBackfaceCulling; \n"\
+"    int padding; \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    int shape_id; \n"\
+"    int prim_id; \n"\
+"    int2 padding; \n"\
+" \n"\
+"    float4 uvwt; \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"INLINE \n"\
+"int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.x; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.y; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    return r->o.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    return r->d.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    return r->doBackfaceCulling; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"#ifndef APPLE \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    float4 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    res.w = w; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    float3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    float2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    int2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    int3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"INLINE float min3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_min3(a, b, c); \n"\
+"#else \n"\
+"    return min(min(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"INLINE float max3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_max3(a, b, c); \n"\
+"#else \n"\
+"    return max(max(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Intersect ray against a triangle and return intersection interval value if it is in \n"\
+"// (0, t_max], return t_max otherwise. \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(e1, e2), r.d.xyz) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const s1 = cross(r.d.xyz, e2); \n"\
+" \n"\
+"    float denom = dot(s1, e1); \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invd = 1.f / denom; \n"\
+"#else \n"\
+"    float const invd = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float3 const d = r.o.xyz - v1; \n"\
+"    float const b1 = dot(d, s1) * invd; \n"\
+"    float3 const s2 = cross(d, e1); \n"\
+"    float const b2 = dot(r.d.xyz, s2) * invd; \n"\
+"    float const temp = dot(e2, s2) * invd; \n"\
+" \n"\
+"    if (b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || temp < 0.f || temp > t_max) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"    else \n"\
+"    { \n"\
+"        return temp; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const dirx = r.d.x; \n"\
+"    float const diry = r.d.y; \n"\
+"    float const dirz = r.d.z; \n"\
+"    float const ooeps = 1e-8; \n"\
+"    float3 invdir; \n"\
+"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
+"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
+"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
+"    return invdir; \n"\
+"} \n"\
+" \n"\
+"// Intersect rays vs bbox and return intersection span.  \n"\
+"// Intersection criteria is ret.x <= ret.y \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    float3 const f = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const n = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+"    float3 const tmax = max(f, n); \n"\
+"    float3 const tmin = min(f, n); \n"\
+"    float const t1 = min(min3(tmax.x, tmax.y, tmax.z), t_max); \n"\
+"    float const t0 = max(max3(tmin.x, tmin.y, tmin.z), 0.f); \n"\
+"    return make_float2(t0, t1); \n"\
+"} \n"\
+" \n"\
+"// Given a point in triangle plane, calculate its barycentrics \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+"    float3 const e = p - v1; \n"\
+"    float const d00 = dot(e1, e1); \n"\
+"    float const d01 = dot(e1, e2); \n"\
+"    float const d11 = dot(e2, e2); \n"\
+"    float const d20 = dot(e, e1); \n"\
+"    float const d21 = dot(e, e2); \n"\
+" \n"\
+"    float denom = (d00 * d11 - d01 * d01); \n"\
+"     \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return make_float2(0.f, 0.f); \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invdenom = 1.f / denom; \n"\
+"#else \n"\
+"    float const invdenom = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float const b1 = (d11 * d20 - d01 * d21) * invdenom; \n"\
+"    float const b2 = (d00 * d21 - d01 * d20) * invdenom; \n"\
+"    return make_float2(b1, b2); \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define LEAFIDX(i) ((num_prims-1) + i) \n"\
+"#define NODEIDX(i) (i) \n"\
+"// Shortcut for delta evaluation \n"\
+"#define DELTA(i,j) delta(morton_codes,num_prims,i,j) \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPE DEFINITIONS \n"\
+"**************************************************************************/ \n"\
+"// Node of the binary hierarchy produced by the HLBVH build kernels. \n"\
+"// Internal nodes and leaves share one array (see NODEIDX / LEAFIDX). \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // Index of the parent node; written by emit_hierarchy_main for both children \n"\
+"    int parent; \n"\
+"    // Internal node: index of the left child. Leaf: primitive index \n"\
+"    int left; \n"\
+"    // Internal node: index of the right child. Leaf: same primitive index as left \n"\
+"    int right; \n"\
+"    // Not written by emit_hierarchy_main (its assignments are commented out) \n"\
+"    int next; \n"\
+"} HlbvhNode; \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"// expand_bits and calculate_morton_code are taken from \n"\
+"// http://devblogs.nvidia.com/parallelforall/thinking-parallel-part-iii-tree-construction-gpu/ \n"\
+"// Spreads the low 10 bits of v over 30 bits, leaving two zero bits \n"\
+"// after every original bit. \n"\
+"INLINE uint expand_bits(uint v) \n"\
+"{ \n"\
+"    uint spread = v; \n"\
+" \n"\
+"    // Each step duplicates the value at a shifted position and masks \n"\
+"    // away the bits that landed in the wrong place \n"\
+"    spread *= 0x00010001u; \n"\
+"    spread &= 0xFF0000FFu; \n"\
+" \n"\
+"    spread *= 0x00000101u; \n"\
+"    spread &= 0x0F00F00Fu; \n"\
+" \n"\
+"    spread *= 0x00000011u; \n"\
+"    spread &= 0xC30C30C3u; \n"\
+" \n"\
+"    spread *= 0x00000005u; \n"\
+"    spread &= 0x49249249u; \n"\
+" \n"\
+"    return spread; \n"\
+"} \n"\
+" \n"\
+"// Build a 30-bit Morton code for a point expected to lie in the unit cube. \n"\
+"// Each coordinate is quantised to 10 bits (scaled by 1024 and clamped to \n"\
+"// [0, 1023]) and the three values are bit-interleaved with x in the \n"\
+"// highest position of every triple. \n"\
+"INLINE uint calculate_morton_code(float3 p) \n"\
+"{ \n"\
+"    // Quantise all three axes at once \n"\
+"    float3 const cell = min(max(p * 1024.0f, 0.0f), 1023.0f); \n"\
+" \n"\
+"    uint const bits_x = expand_bits((uint)cell.x); \n"\
+"    uint const bits_y = expand_bits((uint)cell.y); \n"\
+"    uint const bits_z = expand_bits((uint)cell.z); \n"\
+" \n"\
+"    // Interleave: x occupies bit 2, y bit 1 and z bit 0 of every triple \n"\
+"    return (bits_x << 2) + (bits_y << 1) + bits_z; \n"\
+"} \n"\
+" \n"\
+"// Smallest box enclosing both b1 and b2. \n"\
+"// min / max are applied to all four components, so the w lanes of \n"\
+"// pmin / pmax are combined the same way as x, y and z. \n"\
+"INLINE bbox bbox_union(bbox b1, bbox b2) \n"\
+"{ \n"\
+"    bbox merged; \n"\
+"    merged.pmax = max(b1.pmax, b2.pmax); \n"\
+"    merged.pmin = min(b1.pmin, b2.pmin); \n"\
+"    return merged; \n"\
+"} \n"\
+" \n"\
+"// Assign a Morton code to every primitive, based on the position of the \n"\
+"// center of its bounding box inside the scene bounding box. \n"\
+"// One work item handles one primitive. \n"\
+"KERNEL void calculate_morton_code_main( \n"\
+"    // Bounding boxes of the primitives \n"\
+"    GLOBAL bbox const* restrict primitive_bounds, \n"\
+"    // Number of primitives \n"\
+"    int num_primitive_bounds, \n"\
+"    // Scene extents \n"\
+"    GLOBAL bbox const* restrict scene_bound,  \n"\
+"    // Morton codes (output, one per primitive) \n"\
+"    GLOBAL int* morton_codes \n"\
+"    ) \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+" \n"\
+"    if (global_id < num_primitive_bounds) \n"\
+"    { \n"\
+"        // Fetch primitive bound \n"\
+"        bbox bound = primitive_bounds[global_id]; \n"\
+"        // Calculate center and scene extents \n"\
+"        float3 const center = (bound.pmax + bound.pmin).xyz * 0.5f; \n"\
+"        float3 const scene_min = scene_bound->pmin.xyz; \n"\
+"        float3 scene_extents = scene_bound->pmax.xyz - scene_min; \n"\
+" \n"\
+"        // A scene that is flat along an axis has zero extent there. Dividing by \n"\
+"        // it would feed inf / NaN into the min / max clamp of calculate_morton_code, \n"\
+"        // whose result is undefined for NaN. Use a unit extent instead: the \n"\
+"        // numerator is zero on such an axis, so the normalized coordinate becomes 0. \n"\
+"        scene_extents.x = scene_extents.x > 0.f ? scene_extents.x : 1.f; \n"\
+"        scene_extents.y = scene_extents.y > 0.f ? scene_extents.y : 1.f; \n"\
+"        scene_extents.z = scene_extents.z > 0.f ? scene_extents.z : 1.f; \n"\
+" \n"\
+"        // Calculate morton code from the position normalized to the scene box \n"\
+"        morton_codes[global_id] = calculate_morton_code((center - scene_min) / scene_extents); \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+" \n"\
+" \n"\
+"// Length of the common bit prefix of the Morton codes at positions i1 and i2. \n"\
+"// When both codes are equal the indices themselves are compared instead \n"\
+"// (offset by 32), so that duplicated codes still yield distinct keys. \n"\
+"// Returns -1 when either index lies outside [0, num_prims). \n"\
+"INLINE int delta(GLOBAL int const* morton_codes, int num_prims, int i1, int i2) \n"\
+"{ \n"\
+"    // Order the two positions \n"\
+"    int const lo = min(i1, i2); \n"\
+"    int const hi = max(i1, i2); \n"\
+" \n"\
+"    // Out-of-range neighbours compare lower than any real prefix length \n"\
+"    if (lo < 0 || hi >= num_prims) \n"\
+"        return -1; \n"\
+" \n"\
+"    // Bits in which the two codes differ \n"\
+"    int const diff = morton_codes[lo] ^ morton_codes[hi]; \n"\
+" \n"\
+"    if (diff != 0) \n"\
+"        return clz(diff); \n"\
+" \n"\
+"    // Identical codes: fall back to the indices \n"\
+"    return 32 + clz(lo ^ hi); \n"\
+"} \n"\
+" \n"\
+"// Find the range of sorted primitive positions covered by internal node idx. \n"\
+"// Returns (first, last) as an int2 with first <= last; idx is always one of the two ends. \n"\
+"// DELTA expands to delta(morton_codes, num_prims, i, j), so the parameter \n"\
+"// names morton_codes and num_prims are required by the macro. \n"\
+"INLINE int2 find_span(GLOBAL int const* restrict morton_codes, int num_prims, int idx) \n"\
+"{ \n"\
+"    // Direction of the range: +1 if the next position shares a longer prefix \n"\
+"    // with idx than the previous one, -1 in the opposite case \n"\
+"    int d = sign((float)(DELTA(idx, idx+1) - DELTA(idx, idx-1))); \n"\
+" \n"\
+"    // Prefix length shared with the neighbour on the other side: every position \n"\
+"    // inside the range must share strictly more than this with idx \n"\
+"    int delta_min = DELTA(idx, idx-d); \n"\
+" \n"\
+"    // Double the step until the probed position falls out of the range \n"\
+"    // (delta returns -1 out of bounds, which ends the loop as well) \n"\
+"    int lmax = 2; \n"\
+"    while (DELTA(idx,idx + lmax * d) > delta_min) \n"\
+"        lmax *= 2; \n"\
+" \n"\
+"    // Binary search between 0 and lmax for the largest offset l \n"\
+"    // that is still inside the range \n"\
+"    int l = 0; \n"\
+"    int t = lmax; \n"\
+"    do \n"\
+"    { \n"\
+"        t /= 2; \n"\
+"        if(DELTA(idx, idx + (l + t)*d) > delta_min) \n"\
+"        { \n"\
+"            l = l + t; \n"\
+"        } \n"\
+"    } \n"\
+"    while (t > 1); \n"\
+" \n"\
+"    // Pack span so that x is the lower and y the upper end regardless of d \n"\
+"    int2 span; \n"\
+"    span.x = min(idx, idx + l*d); \n"\
+"    span.y = max(idx, idx + l*d); \n"\
+"    return span; \n"\
+"} \n"\
+" \n"\
+"// Find the split position inside span (x = first, y = last position). \n"\
+"// Returns the last position of the left part; the right part starts at the \n"\
+"// returned value + 1. Uses the DELTA macro, which requires the parameter \n"\
+"// names morton_codes and num_prims. \n"\
+"INLINE int find_split(GLOBAL int const* restrict morton_codes, int num_prims, int2 span) \n"\
+"{ \n"\
+"    // Both ends of the span \n"\
+"    int left = span.x; \n"\
+"    int right = span.y; \n"\
+" \n"\
+"    // Prefix length shared by the two ends of the span \n"\
+"    int num_identical = DELTA(left, right); \n"\
+" \n"\
+"    // Bisection: keep [left, right] bracketing the position where the \n"\
+"    // prefix shared with the first element drops to num_identical \n"\
+"    do \n"\
+"    { \n"\
+"        // Proposed split \n"\
+"        int new_split = (right + left) / 2; \n"\
+" \n"\
+"        // If the midpoint shares more leading bits with the left end than the \n"\
+"        // two ends share with each other, the split lies to its right \n"\
+"        if (DELTA(left, new_split) > num_identical) \n"\
+"        { \n"\
+"            left = new_split; \n"\
+"        } \n"\
+"        else \n"\
+"        { \n"\
+"            right = new_split; \n"\
+"        } \n"\
+"    } \n"\
+"    while (right > left + 1); \n"\
+" \n"\
+"    return left; \n"\
+"} \n"\
+" \n"\
+"// Build the node topology: fill leaf nodes and leaf bounds, then link every \n"\
+"// internal node to its two children and set the children's parent index. \n"\
+"// Work item i handles leaf i and, if i < num_prims - 1, internal node i. \n"\
+"KERNEL void emit_hierarchy_main( \n"\
+"    // Sorted Morton codes of the primitives \n"\
+"    GLOBAL int const* restrict morton_codes, \n"\
+"    // Primitive bounds, addressed through indices \n"\
+"    GLOBAL bbox const* restrict bounds, \n"\
+"    // Primitive indices (indices[i] is the primitive at sorted position i) \n"\
+"    GLOBAL int const* restrict indices, \n"\
+"    // Number of primitives \n"\
+"    int num_prims, \n"\
+"    // Nodes (output); internal nodes at NODEIDX, leaves at LEAFIDX \n"\
+"    GLOBAL HlbvhNode* nodes, \n"\
+"    // Leaf bounds (output), written at LEAFIDX positions \n"\
+"    // NOTE(review): presumably sized for all 2 * num_prims - 1 nodes - confirm on the host side \n"\
+"    GLOBAL bbox* bounds_sorted \n"\
+"    ) \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+" \n"\
+"    // Leaf: both child slots hold the primitive index, bounds are copied over \n"\
+"    if (global_id < num_prims) \n"\
+"    { \n"\
+"        nodes[LEAFIDX(global_id)].left = nodes[LEAFIDX(global_id)].right = indices[global_id]; \n"\
+"        bounds_sorted[LEAFIDX(global_id)] = bounds[indices[global_id]]; \n"\
+"    } \n"\
+"     \n"\
+"    // Set internal nodes (there are num_prims - 1 of them) \n"\
+"    if (global_id < num_prims - 1) \n"\
+"    { \n"\
+"        // Find span occupied by the current node \n"\
+"        int2 range = find_span(morton_codes, num_prims, global_id); \n"\
+" \n"\
+"        // Find split position inside the range \n"\
+"        int  split = find_split(morton_codes, num_prims, range); \n"\
+" \n"\
+"        // A child covering a single position is a leaf, otherwise it is the \n"\
+"        // internal node with the same index as the split position \n"\
+"        int c1idx = (split == range.x) ? LEAFIDX(split) : NODEIDX(split); \n"\
+"        int c2idx = (split + 1 == range.y) ? LEAFIDX(split + 1) : NODEIDX(split + 1); \n"\
+" \n"\
+"        nodes[NODEIDX(global_id)].left = c1idx; \n"\
+"        nodes[NODEIDX(global_id)].right = c2idx; \n"\
+"        // Skip-link (next) generation is disabled below \n"\
+"        //nodes[NODEIDX(global_id)].next = (range.y + 1 < num_prims) ? range.y + 1 : -1; \n"\
+"        nodes[c1idx].parent = NODEIDX(global_id); \n"\
+"        //nodes[c1idx].next = c2idx; \n"\
+"        nodes[c2idx].parent = NODEIDX(global_id); \n"\
+"        //nodes[c2idx].next = nodes[NODEIDX(global_id)].next; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Propagate bounds from the leaves up to the root (node 0). \n"\
+"// Every work item starts at one leaf and walks up the parent chain. At each \n"\
+"// internal node the first arriving work item stops, the second one computes \n"\
+"// the node bounds from both children and continues upwards. \n"\
+"KERNEL void refit_bounds_main( \n"\
+"    // Node bounds (leaf entries are read, internal entries are written) \n"\
+"    GLOBAL bbox* bounds, \n"\
+"    // Number of primitives, i.e. number of leaf nodes \n"\
+"    // NOTE(review): assumes num_prims > 1; with a single primitive the leaf is \n"\
+"    // node 0 and no kernel shown here writes its parent - confirm host handling \n"\
+"    int num_prims, \n"\
+"    // Nodes \n"\
+"    GLOBAL HlbvhNode* nodes, \n"\
+"    // Atomic flags, one per internal node \n"\
+"    // NOTE(review): presumably zero-initialized by the host before launch - confirm \n"\
+"    GLOBAL int* flags \n"\
+") \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+" \n"\
+"    // Start from leaf nodes \n"\
+"    if (global_id < num_prims) \n"\
+"    { \n"\
+"        // Get my leaf index \n"\
+"        int idx = LEAFIDX(global_id); \n"\
+" \n"\
+"        do \n"\
+"        { \n"\
+"            // Move to parent node \n"\
+"            idx = nodes[idx].parent; \n"\
+" \n"\
+"            // Atomically swap the flag from 0 to 1 and look at the previous value \n"\
+"            if (atomic_cmpxchg(flags + idx, 0, 1) == 1) \n"\
+"            { \n"\
+"                // The flag was already 1: the other child has been processed and  \n"\
+"                // this work item calculates the bbox for the node \n"\
+"                // NOTE(review): no memory fence separates the sibling's bounds write \n"\
+"                // from this read - confirm visibility on the target devices \n"\
+" \n"\
+"                // Fetch kids \n"\
+"                int lc = nodes[idx].left; \n"\
+"                int rc = nodes[idx].right; \n"\
+" \n"\
+"                // Calculate bounds \n"\
+"                bbox b = bbox_union(bounds[lc], bounds[rc]); \n"\
+" \n"\
+"                // Write bounds \n"\
+"                bounds[idx] = b; \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                // The flag was 0 and is now 1: bail out. \n"\
+"                // The work item arriving from the second child will \n"\
+"                // handle this node. \n"\
+"                break; \n"\
+"            } \n"\
+"        } \n"\
+"        // Node 0 is the root: stop once it has been processed \n"\
+"        while (idx != 0); \n"\
+"    } \n"\
+"} \n"\
+;
+static const char g_common_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"// Pi as a single-precision constant \n"\
+"#define PI 3.14159265358979323846f \n"\
+"// Shorthands for the OpenCL address-space and function qualifiers \n"\
+"#define KERNEL __kernel \n"\
+"#define GLOBAL __global \n"\
+"// Force inlining of helper functions \n"\
+"#define INLINE __attribute__((always_inline)) \n"\
+"// Result markers and the value used for an index that refers to nothing \n"\
+"#define HIT_MARKER 1 \n"\
+"#define MISS_MARKER -1 \n"\
+"#define INVALID_IDX -1 \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box. \n"\
+"// Only the xyz lanes of pmin / pmax are used as coordinates by the box \n"\
+"// intersection helper; the w lanes are free to carry extra data. \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // Minimum corner \n"\
+"    float4 pmin; \n"\
+"    // Maximum corner \n"\
+"    float4 pmax; \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // xyz: origin, w: maximum ray parameter (see ray_get_maxt) \n"\
+"    float4 o; \n"\
+"    // xyz: direction, w: time (see ray_get_time) \n"\
+"    float4 d; \n"\
+"    // x: mask (see ray_get_mask), y: active flag (see ray_is_active) \n"\
+"    int2 extra; \n"\
+"    // Non-zero requests backface culling (see ray_get_doBackfaceCull) \n"\
+"    int doBackfaceCulling; \n"\
+"    int padding; \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // ID of the intersected shape \n"\
+"    int shape_id; \n"\
+"    // ID of the intersected primitive \n"\
+"    int prim_id; \n"\
+"    int2 padding; \n"\
+" \n"\
+"    // NOTE(review): presumably barycentrics in xy and hit distance in w - confirm against the intersect kernels \n"\
+"    float4 uvwt; \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"// Ray mask, stored in extra.x \n"\
+"INLINE \n"\
+"int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.x; \n"\
+"} \n"\
+" \n"\
+"// Active flag, stored in extra.y (non-zero means the ray is processed) \n"\
+"INLINE \n"\
+"int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.y; \n"\
+"} \n"\
+" \n"\
+"// Maximum ray parameter, stored in the w lane of the origin \n"\
+"INLINE \n"\
+"float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    return r->o.w; \n"\
+"} \n"\
+" \n"\
+"// Ray time, stored in the w lane of the direction \n"\
+"INLINE \n"\
+"float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    return r->d.w; \n"\
+"} \n"\
+" \n"\
+"// Backface culling request flag of the ray \n"\
+"INLINE \n"\
+"int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    return r->doBackfaceCulling; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"// Vector construction helpers; compiled out when APPLE is defined \n"\
+"#ifndef APPLE \n"\
+"// Build a float4 from four scalars \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    return (float4)(x, y, z, w); \n"\
+"} \n"\
+"// Build a float3 from three scalars \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    return (float3)(x, y, z); \n"\
+"} \n"\
+"// Build a float2 from two scalars \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    return (float2)(x, y); \n"\
+"} \n"\
+"// Build an int2 from two scalars \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    return (int2)(x, y); \n"\
+"} \n"\
+"// Build an int3 from three scalars \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    return (int3)(x, y, z); \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"// Smallest of three floats; uses the AMD media-ops instruction when available \n"\
+"INLINE float min3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_min3(a, b, c); \n"\
+"#else \n"\
+"    float const smaller_ab = min(a, b); \n"\
+"    return min(smaller_ab, c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"// Largest of three floats; uses the AMD media-ops instruction when available \n"\
+"INLINE float max3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_max3(a, b, c); \n"\
+"#else \n"\
+"    float const larger_ab = max(a, b); \n"\
+"    return max(larger_ab, c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Intersect ray against a triangle and return the ray parameter of the hit if it is in \n"\
+"// [0, t_max] and inside the triangle; return t_max otherwise. \n"\
+"// When RR_BACKFACE_CULL is defined and the ray requests culling, triangles for which \n"\
+"// dot(cross(e1, e2), direction) > 0 are rejected up front. \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    // Triangle edges sharing v1 \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(e1, e2), r.d.xyz) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const s1 = cross(r.d.xyz, e2); \n"\
+" \n"\
+"    // Zero determinant: the ray is parallel to the triangle plane \n"\
+"    float denom = dot(s1, e1); \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invd = 1.f / denom; \n"\
+"#else \n"\
+"    float const invd = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    // b1, b2: barycentric coordinates of the hit; temp: ray parameter of the hit \n"\
+"    float3 const d = r.o.xyz - v1; \n"\
+"    float const b1 = dot(d, s1) * invd; \n"\
+"    float3 const s2 = cross(d, e1); \n"\
+"    float const b2 = dot(r.d.xyz, s2) * invd; \n"\
+"    float const temp = dot(e2, s2) * invd; \n"\
+" \n"\
+"    // Reject hits outside the triangle or outside the [0, t_max] interval \n"\
+"    if (b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || temp < 0.f || temp > t_max) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"    else \n"\
+"    { \n"\
+"        return temp; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Component-wise reciprocal of the ray direction. \n"\
+"// A component whose magnitude does not exceed a small epsilon is replaced by \n"\
+"// an epsilon of the same sign first, so a zero component never causes a \n"\
+"// division by zero. \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const dirx = r.d.x; \n"\
+"    float const diry = r.d.y; \n"\
+"    float const dirz = r.d.z; \n"\
+"    // Smallest direction magnitude that is inverted as is. \n"\
+"    // The f suffix keeps the constant single precision: an unsuffixed 1e-8 is a \n"\
+"    // double literal, which is unwanted in single-precision kernel code. \n"\
+"    float const ooeps = 1e-8f; \n"\
+"    float3 invdir; \n"\
+"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
+"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
+"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
+"    return invdir; \n"\
+"} \n"\
+" \n"\
+"// Slab test of a ray against a box. Returns (t_enter, t_exit) clipped to \n"\
+"// [0, t_max]; the box is hit when t_enter <= t_exit. \n"\
+"// Per-axis plane distances are evaluated as corner * invdir + oxinvdir. \n"\
+"// NOTE(review): oxinvdir is presumably -origin * invdir - confirm at the call sites \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    // Ray parameters at the planes through the two box corners \n"\
+"    float3 const t_at_pmax = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const t_at_pmin = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+" \n"\
+"    // Order them per axis into entry and exit distances \n"\
+"    float3 const t_far = max(t_at_pmax, t_at_pmin); \n"\
+"    float3 const t_near = min(t_at_pmax, t_at_pmin); \n"\
+" \n"\
+"    // Latest entry and earliest exit, clipped to the valid ray interval \n"\
+"    float const t_exit = min(min3(t_far.x, t_far.y, t_far.z), t_max); \n"\
+"    float const t_enter = max(max3(t_near.x, t_near.y, t_near.z), 0.f); \n"\
+" \n"\
+"    return make_float2(t_enter, t_exit); \n"\
+"} \n"\
+" \n"\
+"// Compute the barycentric coordinates (b1, b2) of point p relative to the \n"\
+"// triangle (v1, v2, v3), i.e. p = v1 + b1 * (v2 - v1) + b2 * (v3 - v1). \n"\
+"// p is expected to lie in the triangle plane. Returns (0, 0) when the \n"\
+"// determinant built from the edge dot products is exactly zero. \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    // Triangle edges sharing v1 and the offset of p from v1 \n"\
+"    float3 const edge_a = v2 - v1; \n"\
+"    float3 const edge_b = v3 - v1; \n"\
+"    float3 const rel = p - v1; \n"\
+" \n"\
+"    // Pairwise dot products of the 2x2 linear system \n"\
+"    float const aa = dot(edge_a, edge_a); \n"\
+"    float const ab = dot(edge_a, edge_b); \n"\
+"    float const bb = dot(edge_b, edge_b); \n"\
+"    float const ra = dot(rel, edge_a); \n"\
+"    float const rb = dot(rel, edge_b); \n"\
+" \n"\
+"    float const det = (aa * bb - ab * ab); \n"\
+" \n"\
+"    // Degenerate triangle: nothing meaningful to solve for \n"\
+"    if (det == 0.f) \n"\
+"        return make_float2(0.f, 0.f); \n"\
+" \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const inv_det = 1.f / det; \n"\
+"#else \n"\
+"    float const inv_det = native_recip(det); \n"\
+"#endif \n"\
+" \n"\
+"    // Cramer's rule \n"\
+"    float const weight_a = (bb * ra - ab * rb) * inv_det; \n"\
+"    float const weight_b = (aa * rb - ab * ra) * inv_det; \n"\
+"    return make_float2(weight_a, weight_b); \n"\
+"} \n"\
+;
+static const char g_intersect_bvh2level_skiplinks_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+"/** \n"\
+"    \\file intersect_2level_skiplinks.cl \n"\
+"    \\author Dmitry Kozlov \n"\
+"    \\version 1.0 \n"\
+"    \\brief Intersector implementation based on 2-level BVH with skip links. \n"\
+" \n"\
+"    IntersectorSkipLinks implementation is based on the modification of the following paper: \n"\
+"    \"Efficiency Issues for Ray Tracing\" Brian Smits \n"\
+"    http://www.cse.chalmers.se/edu/year/2016/course/course/TDA361/EfficiencyIssuesForRayTracing.pdf \n"\
+" \n"\
+"    Intersector is using binary BVH with a single bounding box per node. BVH layout guarantees \n"\
+"    that left child of an internal node lies right next to it in memory. Each BVH node has a  \n"\
+"    skip link to the node traversed next. Intersector builds its own BVH for each scene object  \n"\
+"    and then top level BVH across all bottom level BVHs. Top level leafs keep object transforms and \n"\
+"    might reference other leafs making instancing possible. \n"\
+" \n"\
+" \n"\
+"    Pros: \n"\
+"        -Simple and efficient kernel with low VGPR pressure. \n"\
+"        -Can traverse trees of arbitrary depth. \n"\
+"        -Supports motion blur. \n"\
+"        -Supports instancing. \n"\
+"        -Fast to refit. \n"\
+"    Cons: \n"\
+"        -Traversal order is fixed, so poor algorithmic characteristics. \n"\
+"        -Does not benefit from BVH quality optimizations. \n"\
+" */ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"INCLUDES \n"\
+"**************************************************************************/ \n"\
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define PI 3.14159265358979323846f \n"\
+"#define KERNEL __kernel \n"\
+"#define GLOBAL __global \n"\
+"#define INLINE __attribute__((always_inline)) \n"\
+"#define HIT_MARKER 1 \n"\
+"#define MISS_MARKER -1 \n"\
+"#define INVALID_IDX -1 \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 pmin; \n"\
+"    float4 pmax; \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 o; \n"\
+"    float4 d; \n"\
+"    int2 extra; \n"\
+"    int doBackfaceCulling; \n"\
+"    int padding; \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    int shape_id; \n"\
+"    int prim_id; \n"\
+"    int2 padding; \n"\
+" \n"\
+"    float4 uvwt; \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"INLINE \n"\
+"int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.x; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.y; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    return r->o.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    return r->d.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    return r->doBackfaceCulling; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"#ifndef APPLE \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    float4 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    res.w = w; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    float3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    float2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    int2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    int3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"INLINE float min3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_min3(a, b, c); \n"\
+"#else \n"\
+"    return min(min(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"INLINE float max3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_max3(a, b, c); \n"\
+"#else \n"\
+"    return max(max(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Intersect ray against a triangle and return intersection interval value if it is in \n"\
+"// (0, t_max], return t_max otherwise. \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(e1, e2), r.d.xyz) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const s1 = cross(r.d.xyz, e2); \n"\
+" \n"\
+"    float denom = dot(s1, e1); \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invd = 1.f / denom; \n"\
+"#else \n"\
+"    float const invd = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float3 const d = r.o.xyz - v1; \n"\
+"    float const b1 = dot(d, s1) * invd; \n"\
+"    float3 const s2 = cross(d, e1); \n"\
+"    float const b2 = dot(r.d.xyz, s2) * invd; \n"\
+"    float const temp = dot(e2, s2) * invd; \n"\
+" \n"\
+"    if (b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || temp < 0.f || temp > t_max) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"    else \n"\
+"    { \n"\
+"        return temp; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Component-wise reciprocal of the ray direction. \n"\
+"// A component whose magnitude does not exceed a small epsilon is replaced by \n"\
+"// an epsilon of the same sign first, so a zero component never causes a \n"\
+"// division by zero. \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const dirx = r.d.x; \n"\
+"    float const diry = r.d.y; \n"\
+"    float const dirz = r.d.z; \n"\
+"    // Smallest direction magnitude that is inverted as is. \n"\
+"    // The f suffix keeps the constant single precision: an unsuffixed 1e-8 is a \n"\
+"    // double literal, which is unwanted in single-precision kernel code. \n"\
+"    float const ooeps = 1e-8f; \n"\
+"    float3 invdir; \n"\
+"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
+"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
+"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
+"    return invdir; \n"\
+"} \n"\
+" \n"\
+"// Intersect rays vs bbox and return intersection span.  \n"\
+"// Intersection criteria is ret.x <= ret.y \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    float3 const f = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const n = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+"    float3 const tmax = max(f, n); \n"\
+"    float3 const tmin = min(f, n); \n"\
+"    float const t1 = min(min3(tmax.x, tmax.y, tmax.z), t_max); \n"\
+"    float const t0 = max(max3(tmin.x, tmin.y, tmin.z), 0.f); \n"\
+"    return make_float2(t0, t1); \n"\
+"} \n"\
+" \n"\
+"// Given a point in triangle plane, calculate its barycentrics \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+"    float3 const e = p - v1; \n"\
+"    float const d00 = dot(e1, e1); \n"\
+"    float const d01 = dot(e1, e2); \n"\
+"    float const d11 = dot(e2, e2); \n"\
+"    float const d20 = dot(e, e1); \n"\
+"    float const d21 = dot(e, e2); \n"\
+" \n"\
+"    float denom = (d00 * d11 - d01 * d01); \n"\
+"     \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return make_float2(0.f, 0.f); \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invdenom = 1.f / denom; \n"\
+"#else \n"\
+"    float const invdenom = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float const b1 = (d11 * d20 - d01 * d21) * invdenom; \n"\
+"    float const b2 = (d00 * d21 - d01 * d20) * invdenom; \n"\
+"    return make_float2(b1, b2); \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define PI 3.14159265358979323846f \n"\
+"// Node payload accessors. The macro argument is parenthesised everywhere so \n"\
+"// that any expression (for example a dereferenced pointer) can be passed. \n"\
+"// Index stored in pmin.w of a leaf, shifted right by 4 \n"\
+"#define STARTIDX(x)     (((int)((x).pmin.w)) >> 4) \n"\
+"#define SHAPEIDX(x)     (((int)((x).pmin.w)) >> 4) \n"\
+"// A node is a leaf unless pmin.w holds the -1 marker \n"\
+"#define LEAFNODE(x)     (((x).pmin.w) != -1.f) \n"\
+"// Skip link: index of the node to visit next, stored in pmax.w \n"\
+"#define NEXT(x)     ((int)((x).pmax.w)) \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPE DEFINITIONS \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"typedef bbox bvh_node; \n"\
+" \n"\
+"// Per-shape record referenced from the top level of the hierarchy \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // Shape ID \n"\
+"    int id; \n"\
+"    // Shape BVH index (bottom level) \n"\
+"    int bvh_idx; \n"\
+"    // Is the shape disabled? \n"\
+"    unsigned int shapeDisabled; \n"\
+"    int padding1; \n"\
+"    // Transform, one float4 per matrix row (m0..m2 are the rows consumed by \n"\
+"    // transform_point / transform_vector) \n"\
+"    // NOTE(review): direction of the transform (world-to-object or the inverse) is not visible here - confirm \n"\
+"    float4 m0; \n"\
+"    float4 m1; \n"\
+"    float4 m2; \n"\
+"    float4 m3; \n"\
+"    // Motion blur params \n"\
+"    float4 velocity_linear; \n"\
+"    float4 velocity_angular; \n"\
+"} Shape; \n"\
+" \n"\
+"// Triangle record: three vertex indices plus the IDs identifying the primitive \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // Vertex indices \n"\
+"    int idx[3]; \n"\
+"    // Shape ID \n"\
+"    int shape_id; \n"\
+"    // Primitive ID \n"\
+"    int prim_id; \n"\
+"} Face; \n"\
+" \n"\
+" \n"\
+"// Transform a point by the affine matrix whose rows are m0, m1 and m2: \n"\
+"// the xyz lanes of each row are the linear part, the w lane the translation. \n"\
+"// m3 is part of the signature but is not read. \n"\
+"INLINE float3 transform_point(float3 p, float4 m0, float4 m1, float4 m2, float4 m3) \n"\
+"{ \n"\
+"    float const out_x = m0.x * p.x + m0.y * p.y + m0.z * p.z + m0.w; \n"\
+"    float const out_y = m1.x * p.x + m1.y * p.y + m1.z * p.z + m1.w; \n"\
+"    float const out_z = m2.x * p.x + m2.y * p.y + m2.z * p.z + m2.w; \n"\
+"    return (float3)(out_x, out_y, out_z); \n"\
+"} \n"\
+" \n"\
+"// Transform a direction by the linear part (xyz lanes of rows m0, m1, m2) \n"\
+"// of the matrix; the translation in the w lanes is ignored. \n"\
+"// m3 is part of the signature but is not read. \n"\
+"INLINE float3 transform_vector(float3 p, float4 m0, float4 m1, float4 m2, float4 m3) \n"\
+"{ \n"\
+"    float const out_x = m0.x * p.x + m0.y * p.y + m0.z * p.z; \n"\
+"    float const out_y = m1.x * p.x + m1.y * p.y + m1.z * p.z; \n"\
+"    float const out_z = m2.x * p.x + m2.y * p.y + m2.z * p.z; \n"\
+"    return (float3)(out_x, out_y, out_z); \n"\
+"} \n"\
+" \n"\
+"INLINE ray transform_ray(ray r, float4 m0, float4 m1, float4 m2, float4 m3) \n"\
+"{ \n"\
+"    ray res; \n"\
+"    res.o.xyz = transform_point(r.o.xyz, m0, m1, m2, m3); \n"\
+"    res.d.xyz = transform_vector(r.d.xyz, m0, m1, m2, m3); \n"\
+"    res.o.w = r.o.w; \n"\
+"    res.d.w = r.d.w; \n"\
+"    return res; \n"\
+"} \n"\
+" \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void intersect_main( \n"\
+"    // BVH nodes \n"\
+"    GLOBAL bvh_node const* restrict nodes, \n"\
+"    // Vertices \n"\
+"    GLOBAL float3 const* restrict vertices, \n"\
+"    // Faces \n"\
+"    GLOBAL Face const* restrict faces, \n"\
+"    // Shapes \n"\
+"    GLOBAL Shape const* restrict shapes, \n"\
+"    // BVH root index \n"\
+"    int root_idx,               \n"\
+"    // Rays \n"\
+"    GLOBAL ray const* restrict rays, \n"\
+"    // Number of rays in ray buffer \n"\
+"    GLOBAL int const* restrict num_rays, \n"\
+"    // Hits  \n"\
+"    GLOBAL Intersection* hits \n"\
+") \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        // Fetch ray \n"\
+"        ray r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Precompute invdir for bbox testing \n"\
+"            float3 invdir = safe_invdir(r); \n"\
+"            float3 invdirtop = invdir; \n"\
+"            float t_max = r.o.w; \n"\
+" \n"\
+"            // We need to keep original ray around for returns from bottom hierarchy \n"\
+"            ray top_ray = r; \n"\
+"            // Fetch top level BVH index \n"\
+"            int addr = root_idx; \n"\
+" \n"\
+"            // Set top index \n"\
+"            int top_addr = INVALID_IDX; \n"\
+"            // Current shape ID \n"\
+"            int shape_id = INVALID_IDX; \n"\
+"            // Closest shape ID \n"\
+"            int closest_shape_id = INVALID_IDX; \n"\
+"            int closest_prim_id = INVALID_IDX; \n"\
+"            float2 closest_barycentrics; \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node node = nodes[addr]; \n"\
+" \n"\
+"                // Intersect against bbox \n"\
+"                float2 s = fast_intersect_bbox1(node, invdir, -r.o.xyz * invdir, t_max); \n"\
+" \n"\
+"                if (s.x <= s.y) \n"\
+"                { \n"\
+"                    if (LEAFNODE(node)) \n"\
+"                    { \n"\
+"                        // If this is the leaf it can be either a leaf containing primitives (bottom hierarchy) \n"\
+"                        // or containing another BVH (top level hierarchy) \n"\
+"                        if (top_addr != INVALID_IDX) \n"\
+"                        { \n"\
+"                            // Intersect leaf here \n"\
+"                            // \n"\
+"                            int const face_idx = STARTIDX(node); \n"\
+"                            Face const face = faces[face_idx]; \n"\
+"                            float3 const v1 = vertices[face.idx[0]]; \n"\
+"                            float3 const v2 = vertices[face.idx[1]]; \n"\
+"                            float3 const v3 = vertices[face.idx[2]]; \n"\
+" \n"\
+"                            // Intersect triangle \n"\
+"                            float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                            // If hit update closest hit distance and index \n"\
+"                            if (f < t_max) \n"\
+"                            { \n"\
+"                                t_max = f; \n"\
+"                                closest_prim_id = face.prim_id; \n"\
+"                                closest_shape_id = shape_id; \n"\
+" \n"\
+"                                float3 const p = r.o.xyz + r.d.xyz * t_max; \n"\
+"                                // Calculate barycentric coordinates \n"\
+"                                closest_barycentrics = triangle_calculate_barycentrics(p, v1, v2, v3); \n"\
+"                            } \n"\
+" \n"\
+"                            // And goto next node \n"\
+"                            addr = NEXT(node); \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // This is top level hierarchy leaf \n"\
+"                            // Save top node index for return \n"\
+"                            top_addr = addr; \n"\
+"                            // Get shape description struct index \n"\
+"                            int shape_idx = SHAPEIDX(node); \n"\
+"                            // Get shape disabled flag and shape ID \n"\
+"                            unsigned int const shapeDisabled = shapes[shape_idx].shapeDisabled; \n"\
+"                            int const shapeId = shapes[shape_idx].id; \n"\
+"                            // Drill into 2nd level BVH only if the geometry is not masked vs current ray \n"\
+"                            // otherwise skip the subtree \n"\
+"                            if (!shapeDisabled \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                                && ray_get_mask(&r) != shapeId \n"\
+"#endif // RR_RAY_MASK \n"\
+"                                ) \n"\
+"                            { \n"\
+"                                // Fetch bottom level BVH index \n"\
+"                                addr = shapes[shape_idx].bvh_idx; \n"\
+"                                shape_id = shapeId; \n"\
+" \n"\
+"                                // Fetch BVH transform \n"\
+"                                float4 wmi0 = shapes[shape_idx].m0; \n"\
+"                                float4 wmi1 = shapes[shape_idx].m1; \n"\
+"                                float4 wmi2 = shapes[shape_idx].m2; \n"\
+"                                float4 wmi3 = shapes[shape_idx].m3; \n"\
+" \n"\
+"                                r = transform_ray(r, wmi0, wmi1, wmi2, wmi3); \n"\
+"                                // Recalc invdir \n"\
+"                                invdir = safe_invdir(r); \n"\
+"                                // And continue traversal of the bottom level BVH \n"\
+"                                continue; \n"\
+"                            } \n"\
+"                            else \n"\
+"                            { \n"\
+"                                addr = INVALID_IDX; \n"\
+"                            } \n"\
+"                        } \n"\
+"                    } \n"\
+"                    // Traverse child nodes otherwise. \n"\
+"                    else \n"\
+"                    { \n"\
+"                        // This is an internal node, proceed to left child (it is at current + 1 index) \n"\
+"                        addr = addr + 1; \n"\
+"                    } \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // We missed the node, goto next one \n"\
+"                    addr = NEXT(node); \n"\
+"                } \n"\
+" \n"\
+"                // Check whether we have just finished traversing a bottom level BVH: \n"\
+"                // in that case addr == INVALID_IDX while top_addr still holds a valid index \n"\
+"                if (addr == INVALID_IDX && top_addr != INVALID_IDX) \n"\
+"                { \n"\
+"                    //  Proceed to next top level node \n"\
+"                    addr = NEXT(nodes[top_addr]); \n"\
+"                    // Reset top_addr \n"\
+"                    top_addr = INVALID_IDX; \n"\
+"                    // Restore ray here \n"\
+"                    r = top_ray; \n"\
+"                    // Restore invdir \n"\
+"                    invdir = invdirtop; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Check if we have found an intersection \n"\
+"            if (closest_shape_id != INVALID_IDX) \n"\
+"            { \n"\
+"                // Update hit information \n"\
+"                hits[global_id].shape_id = closest_shape_id; \n"\
+"                hits[global_id].prim_id = closest_prim_id; \n"\
+"                hits[global_id].uvwt = make_float4(closest_barycentrics.x, closest_barycentrics.y, 0.f, t_max); \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                // Miss here \n"\
+"                hits[global_id].shape_id = MISS_MARKER; \n"\
+"                hits[global_id].prim_id = MISS_MARKER; \n"\
+"            } \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void occluded_main( \n"\
+"    // BVH nodes \n"\
+"    GLOBAL bvh_node const* restrict nodes, \n"\
+"    // Vertices \n"\
+"    GLOBAL float3 const* restrict vertices, \n"\
+"    // Faces \n"\
+"    GLOBAL Face const* restrict faces, \n"\
+"    // Shapes \n"\
+"    GLOBAL Shape const* restrict shapes, \n"\
+"    // BVH root index \n"\
+"    int root_idx, \n"\
+"    // Rays \n"\
+"    GLOBAL ray const* restrict rays, \n"\
+"    // Number of rays in ray buffer \n"\
+"    GLOBAL int const* restrict num_rays, \n"\
+"    // Hits  \n"\
+"    GLOBAL int* hits \n"\
+") \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        // Fetch ray \n"\
+"        ray r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Precompute invdir for bbox testing \n"\
+"            float3 invdir = safe_invdir(r); \n"\
+"            float3 invdirtop = invdir; \n"\
+"            float const t_max = r.o.w; \n"\
+" \n"\
+"            // We need to keep original ray around for returns from bottom hierarchy \n"\
+"            ray top_ray = r; \n"\
+" \n"\
+"            // Fetch top level BVH index \n"\
+"            int addr = root_idx; \n"\
+"            // Set top index \n"\
+"            int top_addr = INVALID_IDX; \n"\
+" \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node node = nodes[addr]; \n"\
+"                // Intersect against bbox \n"\
+"                float2 s = fast_intersect_bbox1(node, invdir, -r.o.xyz * invdir, t_max); \n"\
+" \n"\
+"                if (s.x <= s.y) \n"\
+"                { \n"\
+"                    if (LEAFNODE(node)) \n"\
+"                    { \n"\
+"                        // If this is the leaf it can be either a leaf containing primitives (bottom hierarchy) \n"\
+"                        // or containing another BVH (top level hierarchy) \n"\
+"                        if (top_addr != INVALID_IDX) \n"\
+"                        { \n"\
+"                            // Intersect leaf here \n"\
+"                            // \n"\
+"                            int const face_idx = STARTIDX(node); \n"\
+"                            Face const face = faces[face_idx]; \n"\
+"                            float3 const v1 = vertices[face.idx[0]]; \n"\
+"                            float3 const v2 = vertices[face.idx[1]]; \n"\
+"                            float3 const v3 = vertices[face.idx[2]]; \n"\
+" \n"\
+"                            // Intersect triangle \n"\
+"                            float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                            // If hit, store the result and bail out \n"\
+"                            if (f < t_max) \n"\
+"                            { \n"\
+"                                hits[global_id] = HIT_MARKER; \n"\
+"                                return; \n"\
+"                            } \n"\
+" \n"\
+"                            // And goto next node \n"\
+"                            addr = NEXT(node); \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // This is top level hierarchy leaf \n"\
+"                            // Save top node index for return \n"\
+"                            top_addr = addr; \n"\
+"                            // Get shape description struct index \n"\
+"                            int shape_idx = SHAPEIDX(node); \n"\
+"                            // Get shape disabled flag (and the shape ID when ray masking is enabled) \n"\
+"                            const unsigned int shapeDisabled = shapes[shape_idx].shapeDisabled; \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                            const int shapeId = shapes[shape_idx].id; \n"\
+"#endif // RR_RAY_MASK \n"\
+"                            // Drill into 2nd level BVH only if the geometry is not masked vs current ray \n"\
+"                            // otherwise skip the subtree \n"\
+"                            if (!shapeDisabled  \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                                && ray_get_mask(&r) != shapeId \n"\
+"#endif // RR_RAY_MASK \n"\
+"                                ) \n"\
+"                            { \n"\
+"                                // Fetch bottom level BVH index \n"\
+"                                addr = shapes[shape_idx].bvh_idx; \n"\
+" \n"\
+"                                // Fetch BVH transform \n"\
+"                                float4 wmi0 = shapes[shape_idx].m0; \n"\
+"                                float4 wmi1 = shapes[shape_idx].m1; \n"\
+"                                float4 wmi2 = shapes[shape_idx].m2; \n"\
+"                                float4 wmi3 = shapes[shape_idx].m3; \n"\
+" \n"\
+"                                r = transform_ray(r, wmi0, wmi1, wmi2, wmi3); \n"\
+"                                // Recalc invdir \n"\
+"                                invdir = safe_invdir(r); \n"\
+"                                // And continue traversal of the bottom level BVH \n"\
+"                                continue; \n"\
+"                            } \n"\
+"                            else \n"\
+"                            { \n"\
+"                                addr = INVALID_IDX; \n"\
+"                            } \n"\
+"                        } \n"\
+"                    } \n"\
+"                    // Traverse child nodes otherwise. \n"\
+"                    else \n"\
+"                    { \n"\
+"                        // This is an internal node, proceed to left child (it is at current + 1 index) \n"\
+"                        addr = addr + 1; \n"\
+"                    } \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // We missed the node, goto next one \n"\
+"                    addr = NEXT(node); \n"\
+"                } \n"\
+" \n"\
+"                // Check whether we have just finished traversing a bottom level BVH: \n"\
+"                // in that case addr == INVALID_IDX while top_addr still holds a valid index \n"\
+"                if (addr == INVALID_IDX && top_addr != INVALID_IDX) \n"\
+"                { \n"\
+"                    //  Proceed to next top level node \n"\
+"                    addr = NEXT(nodes[top_addr]); \n"\
+"                    // Reset top_addr \n"\
+"                    top_addr = INVALID_IDX; \n"\
+"                    // Restore ray here \n"\
+"                    r = top_ray; \n"\
+"                    // Restore invdir \n"\
+"                    invdir = invdirtop; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            hits[global_id] = MISS_MARKER; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+;
+static const char g_intersect_bvh2_bittrail_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+"/** \n"\
+"    \\file intersect_bvh2_bittrail.cl \n"\
+"    \\author Dmitry Kozlov \n"\
+"    \\version 1.0 \n"\
+"    \\brief Intersector implementation based on BVH stackless traversal using bit trail and perfect hashing. \n"\
+" \n"\
+"    Intersector is using binary BVH with two bounding boxes per node. \n"\
+"    Traversal uses a bit trail and perfect hashing for backtracking and is based on the following paper: \n"\
+" \n"\
+"    \"Efficient stackless hierarchy traversal on GPUs with backtracking in constant time\" \n"\
+"    Nikolaus Binder, Alexander Keller \n"\
+"    http://dl.acm.org/citation.cfm?id=2977343 \n"\
+" \n"\
+"    Traversal pseudocode: \n"\
+" \n"\
+"        index <- 1 \n"\
+"        trail <- 0 \n"\
+"        while(addr is valid) \n"\
+"        { \n"\
+"            node <- fetch next node at addr \n"\
+"            if (node is leaf) \n"\
+"                intersect leaf \n"\
+"            else \n"\
+"            { \n"\
+"                intersect ray vs left child \n"\
+"                intersect ray vs right child \n"\
+"                if (intersect any of children) \n"\
+"                { \n"\
+"                    index <- index << 1 \n"\
+"                    trail <- trail << 1 \n"\
+"                    determine closer child \n"\
+"                    if intersect both \n"\
+"                    { \n"\
+"                        trail <- trail ^ 1 \n"\
+"                        addr = closer child \n"\
+"                    } \n"\
+"                    else \n"\
+"                    { \n"\
+"                        addr = intersected child \n"\
+"                    } \n"\
+"                    if addr is right \n"\
+"                        index <- index ^ 1 \n"\
+"                    continue \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            if (trail == 0) \n"\
+"            { \n"\
+"                break \n"\
+"            } \n"\
+" \n"\
+"            num_levels = count trailing zeroes in trail \n"\
+"            trail <- (trail >> num_levels) ^ 1 \n"\
+"            index <- (index >> num_levels) ^ 1 \n"\
+" \n"\
+"            addr = hash[index] \n"\
+"        } \n"\
+" \n"\
+"    Pros: \n"\
+"        -Very fast traversal. \n"\
+"        -Benefits from BVH quality optimization. \n"\
+"        -Low VGPR pressure \n"\
+"    Cons: \n"\
+"        -Depth is limited. \n"\
+"        -Generates global memory traffic. \n"\
+" */ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"INCLUDES \n"\
+"**************************************************************************/ \n"\
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define PI 3.14159265358979323846f \n"\
+"#define KERNEL __kernel \n"\
+"#define GLOBAL __global \n"\
+"#define INLINE __attribute__((always_inline)) \n"\
+"#define HIT_MARKER 1 \n"\
+"#define MISS_MARKER -1 \n"\
+"#define INVALID_IDX -1 \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 pmin; \n"\
+"    float4 pmax; \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 o; \n"\
+"    float4 d; \n"\
+"    int2 extra; \n"\
+"    int doBackfaceCulling; \n"\
+"    int padding; \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    int shape_id; \n"\
+"    int prim_id; \n"\
+"    int2 padding; \n"\
+" \n"\
+"    float4 uvwt; \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"INLINE \n"\
+"int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.x; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.y; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    return r->o.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    return r->d.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    return r->doBackfaceCulling; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"#ifndef APPLE \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    float4 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    res.w = w; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    float3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    float2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    int2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    int3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"INLINE float min3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_min3(a, b, c); \n"\
+"#else \n"\
+"    return min(min(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"INLINE float max3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_max3(a, b, c); \n"\
+"#else \n"\
+"    return max(max(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Intersect ray against a triangle and return the hit distance if it lies in \n"\
+"// [0, t_max]; return t_max otherwise. \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(e1, e2), r.d.xyz) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const s1 = cross(r.d.xyz, e2); \n"\
+" \n"\
+"    float denom = dot(s1, e1); \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invd = 1.f / denom; \n"\
+"#else \n"\
+"    float const invd = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float3 const d = r.o.xyz - v1; \n"\
+"    float const b1 = dot(d, s1) * invd; \n"\
+"    float3 const s2 = cross(d, e1); \n"\
+"    float const b2 = dot(r.d.xyz, s2) * invd; \n"\
+"    float const temp = dot(e2, s2) * invd; \n"\
+" \n"\
+"    if (b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || temp < 0.f || temp > t_max) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"    else \n"\
+"    { \n"\
+"        return temp; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const dirx = r.d.x; \n"\
+"    float const diry = r.d.y; \n"\
+"    float const dirz = r.d.z; \n"\
+"    float const ooeps = 1e-8; \n"\
+"    float3 invdir; \n"\
+"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
+"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
+"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
+"    return invdir; \n"\
+"} \n"\
+" \n"\
+"// Intersect rays vs bbox and return intersection span.  \n"\
+"// Intersection criteria is ret.x <= ret.y \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    float3 const f = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const n = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+"    float3 const tmax = max(f, n); \n"\
+"    float3 const tmin = min(f, n); \n"\
+"    float const t1 = min(min3(tmax.x, tmax.y, tmax.z), t_max); \n"\
+"    float const t0 = max(max3(tmin.x, tmin.y, tmin.z), 0.f); \n"\
+"    return make_float2(t0, t1); \n"\
+"} \n"\
+" \n"\
+"// Given a point in triangle plane, calculate its barycentrics \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+"    float3 const e = p - v1; \n"\
+"    float const d00 = dot(e1, e1); \n"\
+"    float const d01 = dot(e1, e2); \n"\
+"    float const d11 = dot(e2, e2); \n"\
+"    float const d20 = dot(e, e1); \n"\
+"    float const d21 = dot(e, e2); \n"\
+" \n"\
+"    float denom = (d00 * d11 - d01 * d01); \n"\
+"     \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return make_float2(0.f, 0.f); \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invdenom = 1.f / denom; \n"\
+"#else \n"\
+"    float const invdenom = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float const b1 = (d11 * d20 - d01 * d21) * invdenom; \n"\
+"    float const b2 = (d00 * d21 - d01 * d20) * invdenom; \n"\
+"    return make_float2(b1, b2); \n"\
+"} \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPE DEFINITIONS \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"#define LEAFNODE(x) (((x).child0) == -1) \n"\
+" \n"\
+"// BVH node \n"\
+"typedef struct \n"\
+"{ \n"\
+"    union  \n"\
+"    { \n"\
+"        struct \n"\
+"        { \n"\
+"            // Child bounds \n"\
+"            bbox bounds[2]; \n"\
+"        }; \n"\
+" \n"\
+"        struct \n"\
+"        { \n"\
+"            // If node is a leaf we keep vertex indices here \n"\
+"            int i0, i1, i2; \n"\
+"            // Address of a left child \n"\
+"            int child0; \n"\
+"            // Shape ID \n"\
+"            int shape_id; \n"\
+"            // Primitive ID \n"\
+"            int prim_id; \n"\
+"            // Address of a right child \n"\
+"            int child1; \n"\
+"        }; \n"\
+"    }; \n"\
+" \n"\
+"} bvh_node; \n"\
+" \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void \n"\
+"occluded_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL bvh_node const * restrict nodes, \n"\
+"    // Triangles vertices \n"\
+"    GLOBAL float3 const * restrict vertices, \n"\
+"    // Rays \n"\
+"    GLOBAL ray const * restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL int const * restrict num_rays, \n"\
+"    // Displacement table for perfect hashing \n"\
+"    GLOBAL int const * restrict displacement_table, \n"\
+"    // Hash table for perfect hashing \n"\
+"    GLOBAL int const * restrict hash_table, \n"\
+"    // Displacement table size \n"\
+"    int const displacement_table_size, \n"\
+"    // Hit results: 1 for hit and -1 for miss \n"\
+"    GLOBAL int* hits \n"\
+"    ) \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+"    int local_id = get_local_id(0); \n"\
+"    int group_id = get_group_id(0); \n"\
+" \n"\
+"    // Handle only working set \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        ray const r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Precompute inverse direction and -origin * invdir for bbox testing \n"\
+"            float3 const invdir = safe_invdir(r); \n"\
+"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
+"            // Intersection parametric distance \n"\
+"            float const t_max = r.o.w; \n"\
+" \n"\
+"            // Bit trail to track traversal \n"\
+"            int bit_trail = 0; \n"\
+"            // Current node index (complete tree enumeration) \n"\
+"            int node_idx = 1; \n"\
+"            // Current node address \n"\
+"            int addr = 0; \n"\
+" \n"\
+"            // Start from 0 node (root) \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node const node = nodes[addr]; \n"\
+" \n"\
+"                // Check if it is a leaf \n"\
+"                if (LEAFNODE(node)) \n"\
+"                { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    if (ray_get_mask(&r) != node.shape_id) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        // Leafs directly store vertex indices \n"\
+"                        // so we load vertices directly \n"\
+"                        float3 const v1 = vertices[node.i0]; \n"\
+"                        float3 const v2 = vertices[node.i1]; \n"\
+"                        float3 const v3 = vertices[node.i2]; \n"\
+"                        // Intersect triangle \n"\
+"                        float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                        // If hit store the result and bail out \n"\
+"                        if (f < t_max) \n"\
+"                        { \n"\
+"                            hits[global_id] = HIT_MARKER; \n"\
+"                            return; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // It is internal node, so intersect vs both children bounds \n"\
+"                    float2 const s0 = fast_intersect_bbox1(node.bounds[0], invdir, oxinvdir, t_max); \n"\
+"                    float2 const s1 = fast_intersect_bbox1(node.bounds[1], invdir, oxinvdir, t_max); \n"\
+" \n"\
+"                    // Determine which one to traverse \n"\
+"                    bool const traverse_c0 = (s0.x <= s0.y); \n"\
+"                    bool const traverse_c1 = (s1.x <= s1.y); \n"\
+"                    bool const c1first = traverse_c1 && (s0.x > s1.x); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1) \n"\
+"                    { \n"\
+"                        // Go one level down => shift bit trail \n"\
+"                        bit_trail = bit_trail << 1; \n"\
+"                        // idx = idx * 2 (this is for left child) \n"\
+"                        node_idx = node_idx << 1; \n"\
+" \n"\
+"                        // If we postpone one node here we  \n"\
+"                        // set last bit in bit trail \n"\
+"                        if (traverse_c0 && traverse_c1) \n"\
+"                        { \n"\
+"                            bit_trail = bit_trail ^ 0x1; \n"\
+"                        } \n"\
+" \n"\
+"                        // Determine which one to traverse first \n"\
+"                        if (c1first || !traverse_c0) \n"\
+"                        { \n"\
+"                            // Right one is closer or left one not travesed \n"\
+"                            addr = node.child1; \n"\
+"                            // Fix index \n"\
+"                            // idx = 2 * idx + 1 for right one \n"\
+"                            node_idx = node_idx ^ 0x1; \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // Traverse left node otherwise \n"\
+"                            addr = node.child0; \n"\
+"                        } \n"\
+" \n"\
+"                        // Continue traversal \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+" \n"\
+"                // Here we need to either backtrack or \n"\
+"                // stop traversal. \n"\
+"                // If bit trail is zero, there is nothing  \n"\
+"                // to traverse. \n"\
+"                if (bit_trail == 0) \n"\
+"                { \n"\
+"                    addr = INVALID_IDX; \n"\
+"                    continue; \n"\
+"                } \n"\
+"                 \n"\
+"                // Backtrack \n"\
+"                // Calculate where we postponed the last node. \n"\
+"                // = number of trailing zeroes in bit_trail \n"\
+"                int const num_levels = 31 - clz(bit_trail & -bit_trail); \n"\
+"                // Update bit trail (shift and unset last bit) \n"\
+"                bit_trail = (bit_trail >> num_levels) ^ 0x1; \n"\
+"                // Calculate postponed index \n"\
+"                node_idx = (node_idx >> num_levels) ^ 0x1; \n"\
+" \n"\
+"                // Calculate node address using perfect hasing of node indices \n"\
+"                int const displacement = displacement_table[node_idx / displacement_table_size]; \n"\
+"                addr = hash_table[displacement + (node_idx & (displacement_table_size - 1))]; \n"\
+"            } \n"\
+" \n"\
+"            // Finished traversal, but no intersection found \n"\
+"            hits[global_id] = MISS_MARKER; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Closest-hit kernel, one work-item per ray. The BVH is walked without a \n"\
+"// stack: a bit trail records the levels where a sibling was postponed, and \n"\
+"// on backtracking the sibling's node index is mapped back to an address \n"\
+"// through displacement_table / hash_table. \n"\
+"// Work-items with global id >= *num_rays and inactive rays write nothing. \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void intersect_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL bvh_node const * restrict nodes, \n"\
+"    // Triangles vertices \n"\
+"    GLOBAL float3 const * restrict vertices, \n"\
+"    // Rays \n"\
+"    GLOBAL ray const * restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL int const * restrict num_rays, \n"\
+"    // Displacement table for perfect hashing \n"\
+"    GLOBAL int const * restrict displacement_table, \n"\
+"    // Hash table for perfect hashing \n"\
+"    GLOBAL int const * restrict hash_table, \n"\
+"    // Displacement table size \n"\
+"    int const displacement_table_size, \n"\
+"    // Closest hit per ray: shape_id, prim_id and uvwt = (u, v, 0, t); \n"\
+"    // shape_id and prim_id are set to MISS_MARKER when nothing was hit \n"\
+"    GLOBAL Intersection* hits) \n"\
+"{ \n"\
+"    // The global id is both the ray index and the output slot \n"\
+"    int const global_id = get_global_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        ray const r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Precompute inverse direction and origin / dir for bbox testing \n"\
+"            float3 const invdir = safe_invdir(r); \n"\
+"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
+"            // Intersection parametric distance; shrinks on every closer hit, \n"\
+"            // which also clips the subsequent bbox tests \n"\
+"            float t_max = r.o.w; \n"\
+" \n"\
+"            // Bit trail to track traversal: one bit per level, set where \n"\
+"            // a sibling was postponed \n"\
+"            int bit_trail = 0; \n"\
+"            // Current node index (complete tree enumeration) \n"\
+"            int node_idx = 1; \n"\
+"            // Current node address \n"\
+"            int addr = 0; \n"\
+"            // Current closest intersection leaf index \n"\
+"            int isect_idx = INVALID_IDX; \n"\
+" \n"\
+"            // Start from 0 node (root) \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node const node = nodes[addr]; \n"\
+" \n"\
+"                // Check if it is a leaf \n"\
+"                if (LEAFNODE(node)) \n"\
+"                { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    // Skip triangles whose shape id equals the ray mask \n"\
+"                    if (ray_get_mask(&r) != node.shape_id) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        // Leafs directly store vertex indices \n"\
+"                        // so we load vertices directly \n"\
+"                        float3 const v1 = vertices[node.i0]; \n"\
+"                        float3 const v2 = vertices[node.i1]; \n"\
+"                        float3 const v3 = vertices[node.i2]; \n"\
+"                        // Intersect triangle \n"\
+"                        float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                        // If hit update closest hit distance and index \n"\
+"                        if (f < t_max) \n"\
+"                        { \n"\
+"                            t_max = f; \n"\
+"                            isect_idx = addr; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // It is internal node, so intersect vs both children bounds \n"\
+"                    float2 const s0 = fast_intersect_bbox1(node.bounds[0], invdir, oxinvdir, t_max); \n"\
+"                    float2 const s1 = fast_intersect_bbox1(node.bounds[1], invdir, oxinvdir, t_max); \n"\
+" \n"\
+"                    // Determine which one to traverse \n"\
+"                    bool const traverse_c0 = (s0.x <= s0.y); \n"\
+"                    bool const traverse_c1 = (s1.x <= s1.y); \n"\
+"                    bool const c1first = traverse_c1 && (s0.x > s1.x); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1) \n"\
+"                    { \n"\
+"                        // Go one level down => shift bit trail \n"\
+"                        bit_trail = bit_trail << 1; \n"\
+"                        // idx = idx * 2 (this is for left child) \n"\
+"                        node_idx = node_idx << 1; \n"\
+" \n"\
+"                        // If we postpone one node here we  \n"\
+"                        // set last bit in bit trail \n"\
+"                        if (traverse_c0 && traverse_c1) \n"\
+"                        { \n"\
+"                            bit_trail = bit_trail ^ 0x1; \n"\
+"                        } \n"\
+" \n"\
+"                        // Determine which one to traverse first \n"\
+"                        if (c1first || !traverse_c0) \n"\
+"                        { \n"\
+"                            // Right one is closer or left one not traversed \n"\
+"                            addr = node.child1; \n"\
+"                            // Fix index \n"\
+"                            // idx = 2 * idx + 1 for right one \n"\
+"                            node_idx = node_idx ^ 0x1; \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // Traverse left node otherwise \n"\
+"                            addr = node.child0; \n"\
+"                        } \n"\
+" \n"\
+"                        // Continue traversal \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+" \n"\
+"                // Here we need to either backtrack or \n"\
+"                // stop traversal. \n"\
+"                // If bit trail is zero, there is nothing  \n"\
+"                // to traverse. \n"\
+"                if (bit_trail == 0) \n"\
+"                { \n"\
+"                    addr = INVALID_IDX; \n"\
+"                    continue; \n"\
+"                } \n"\
+" \n"\
+"                // Backtrack \n"\
+"                // Calculate where we postponed the last node. \n"\
+"                // = number of trailing zeroes in bit_trail \n"\
+"                int num_levels = 31 - clz(bit_trail & -bit_trail); \n"\
+"                // Update bit trail (shift and unset last bit) \n"\
+"                bit_trail = (bit_trail >> num_levels) ^ 0x1; \n"\
+"                // Calculate postponed index (flip last bit => sibling) \n"\
+"                node_idx = (node_idx >> num_levels) ^ 0x1; \n"\
+" \n"\
+"                // Calculate node address using perfect hashing of node indices \n"\
+"                // NOTE(review): the mask below matches a modulo only if \n"\
+"                // displacement_table_size is a power of two - confirm on the host side \n"\
+"                int displacement = displacement_table[node_idx / displacement_table_size]; \n"\
+"                addr = hash_table[displacement + (node_idx & (displacement_table_size - 1))]; \n"\
+"            } \n"\
+" \n"\
+"            // Check if we have found an intersection \n"\
+"            if (isect_idx != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch the node & vertices \n"\
+"                bvh_node const node = nodes[isect_idx]; \n"\
+"                float3 const v1 = vertices[node.i0]; \n"\
+"                float3 const v2 = vertices[node.i1]; \n"\
+"                float3 const v3 = vertices[node.i2]; \n"\
+"                // Calculate hit position \n"\
+"                float3 const p = r.o.xyz + r.d.xyz * t_max; \n"\
+"                // Calculate barycentric coordinates \n"\
+"                float2 const uv = triangle_calculate_barycentrics(p, v1, v2, v3); \n"\
+"                // Update hit information \n"\
+"                hits[global_id].shape_id = node.shape_id; \n"\
+"                hits[global_id].prim_id = node.prim_id; \n"\
+"                hits[global_id].uvwt = make_float4(uv.x, uv.y, 0.f, t_max); \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                // Miss here (uvwt is left untouched) \n"\
+"                hits[global_id].shape_id = MISS_MARKER; \n"\
+"                hits[global_id].prim_id = MISS_MARKER; \n"\
+"            } \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+;
+static const char g_intersect_bvh2_lds_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"INCLUDES \n"\
+"**************************************************************************/ \n"\
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define PI 3.14159265358979323846f \n"\
+"// Shorthands for the OpenCL kernel and global address space qualifiers \n"\
+"#define KERNEL __kernel \n"\
+"#define GLOBAL __global \n"\
+"// Put in front of helper functions to force inlining \n"\
+"#define INLINE __attribute__((always_inline)) \n"\
+"// Result markers: MISS_MARKER is stored in shape_id / prim_id on a miss; \n"\
+"// presumably HIT_MARKER is what the occlusion kernel stores on a hit - confirm \n"\
+"#define HIT_MARKER 1 \n"\
+"#define MISS_MARKER -1 \n"\
+"// Presumably the sentinel for an invalid index - confirm against users \n"\
+"#define INVALID_IDX -1 \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box \n"\
+"// fast_intersect_bbox1 reads only the xyz components of both corners \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 pmin; // xyz: minimum corner \n"\
+"    float4 pmax; // xyz: maximum corner \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"// Use the ray_get_* / ray_is_active helpers to read the packed fields \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 o; // xyz: origin, w: maximum parametric distance (ray_get_maxt) \n"\
+"    float4 d; // xyz: direction, w: time (ray_get_time) \n"\
+"    int2 extra; // x: mask (ray_get_mask), y: active flag (ray_is_active) \n"\
+"    int doBackfaceCulling; // read by ray_get_doBackfaceCull \n"\
+"    int padding; // not read by the code in this file section \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition \n"\
+"// Written by intersect_main, one record per ray \n"\
+"typedef struct \n"\
+"{ \n"\
+"    int shape_id; // mesh id of the hit leaf, MISS_MARKER on a miss \n"\
+"    int prim_id; // primitive id of the hit leaf, MISS_MARKER on a miss \n"\
+"    int2 padding; \n"\
+" \n"\
+"    float4 uvwt; // (u, v, 0, t): barycentrics and hit distance; not written on a miss \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"// Mask of the ray, stored in the x component of ray::extra \n"\
+"INLINE \n"\
+"int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    int2 const packed = r->extra; \n"\
+"    return packed.x; \n"\
+"} \n"\
+" \n"\
+"// Active flag of the ray (y component of ray::extra); callers treat \n"\
+"// any non-zero value as active \n"\
+"INLINE \n"\
+"int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    int2 const packed = r->extra; \n"\
+"    return packed.y; \n"\
+"} \n"\
+" \n"\
+"// Maximum parametric distance of the ray, stored in the w component of the origin \n"\
+"INLINE \n"\
+"float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    float4 const origin = r->o; \n"\
+"    return origin.w; \n"\
+"} \n"\
+" \n"\
+"// Time value of the ray, stored in the w component of the direction \n"\
+"INLINE \n"\
+"float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    float4 const direction = r->d; \n"\
+"    return direction.w; \n"\
+"} \n"\
+" \n"\
+"// Backface culling flag of the ray; non-zero enables the culling test \n"\
+"// in fast_intersect_triangle when RR_BACKFACE_CULL is defined \n"\
+"INLINE \n"\
+"int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    int const cull_flag = r->doBackfaceCulling; \n"\
+"    return cull_flag; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"#ifndef APPLE \n"\
+"// Builds a float4 from its four components \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    return (float4)(x, y, z, w); \n"\
+"} \n"\
+"// Builds a float3 from its three components \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    return (float3)(x, y, z); \n"\
+"} \n"\
+"// Builds a float2 from its two components \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    return (float2)(x, y); \n"\
+"} \n"\
+"// Builds an int2 from its two components \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    return (int2)(x, y); \n"\
+"} \n"\
+"// Builds an int3 from its three components \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    return (int3)(x, y, z); \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"// Minimum of three floats; uses amd_min3 when AMD_MEDIA_OPS is defined \n"\
+"INLINE float min3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_min3(a, b, c); \n"\
+"#else \n"\
+"    float const min_ab = min(a, b); \n"\
+"    return min(min_ab, c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"// Maximum of three floats; uses amd_max3 when AMD_MEDIA_OPS is defined \n"\
+"INLINE float max3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_max3(a, b, c); \n"\
+"#else \n"\
+"    float const max_ab = max(a, b); \n"\
+"    return max(max_ab, c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Ray vs triangle (v1, v2, v3) test. \n"\
+"// Returns the hit distance when it lies in [0, t_max] and the barycentrics \n"\
+"// are inside the triangle; returns t_max otherwise, so callers detect \n"\
+"// a hit with (result < t_max). \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    float3 const edge1 = v2 - v1; \n"\
+"    float3 const edge2 = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    // Reject triangles whose geometric normal points along the ray \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(edge1, edge2), r.d.xyz) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const pvec = cross(r.d.xyz, edge2); \n"\
+" \n"\
+"    // Zero determinant: ray is parallel to the triangle plane \n"\
+"    float det = dot(pvec, edge1); \n"\
+"    if (det == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+" \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const inv_det = 1.f / det; \n"\
+"#else \n"\
+"    float const inv_det = native_recip(det); \n"\
+"#endif \n"\
+" \n"\
+"    float3 const tvec = r.o.xyz - v1; \n"\
+"    float const b1 = dot(tvec, pvec) * inv_det; \n"\
+"    float3 const qvec = cross(tvec, edge1); \n"\
+"    float const b2 = dot(r.d.xyz, qvec) * inv_det; \n"\
+"    float const t = dot(edge2, qvec) * inv_det; \n"\
+" \n"\
+"    // Outside the triangle or outside the allowed distance range \n"\
+"    bool const rejected = \n"\
+"        (b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || t < 0.f || t > t_max); \n"\
+" \n"\
+"    return rejected ? t_max : t; \n"\
+"} \n"\
+" \n"\
+"// Per-component reciprocal of the ray direction, used by the bbox tests. \n"\
+"// A component whose magnitude is not above the epsilon is replaced by the \n"\
+"// epsilon carrying the sign of that component, so no division by zero occurs. \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const dirx = r.d.x; \n"\
+"    float const diry = r.d.y; \n"\
+"    float const dirz = r.d.z; \n"\
+"    // Single-precision literal keeps double constants out of the kernel \n"\
+"    float const ooeps = 1e-8f; \n"\
+"    float3 invdir; \n"\
+"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
+"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
+"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
+"    return invdir; \n"\
+"} \n"\
+" \n"\
+"// Slab test of a ray against a bbox. \n"\
+"// Returns (t_enter, t_exit) with t_enter clamped from below by 0 and \n"\
+"// t_exit clamped from above by t_max; the box is hit when ret.x <= ret.y. \n"\
+"// invdir is the reciprocal direction and oxinvdir is -origin * invdir. \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    // Distances to the slab planes per axis: corner * invdir + oxinvdir \n"\
+"    float3 const t_at_pmax = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const t_at_pmin = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+"    float3 const t_far = max(t_at_pmax, t_at_pmin); \n"\
+"    float3 const t_near = min(t_at_pmax, t_at_pmin); \n"\
+"    float const t_exit = min(min3(t_far.x, t_far.y, t_far.z), t_max); \n"\
+"    float const t_enter = max(max3(t_near.x, t_near.y, t_near.z), 0.f); \n"\
+"    return make_float2(t_enter, t_exit); \n"\
+"} \n"\
+" \n"\
+"// Barycentrics of a point p lying in the plane of triangle (v1, v2, v3). \n"\
+"// Returns (b1, b2) such that p - v1 = b1 * (v2 - v1) + b2 * (v3 - v1). \n"\
+"// A zero determinant (degenerate triangle) yields (0, 0). \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    float3 const edge1 = v2 - v1; \n"\
+"    float3 const edge2 = v3 - v1; \n"\
+"    float3 const rel = p - v1; \n"\
+"    // Gram matrix entries of the two edges \n"\
+"    float const g11 = dot(edge1, edge1); \n"\
+"    float const g12 = dot(edge1, edge2); \n"\
+"    float const g22 = dot(edge2, edge2); \n"\
+"    // Projections of the relative position onto the edges \n"\
+"    float const r1 = dot(rel, edge1); \n"\
+"    float const r2 = dot(rel, edge2); \n"\
+" \n"\
+"    float det = (g11 * g22 - g12 * g12); \n"\
+" \n"\
+"    if (det == 0.f) \n"\
+"    { \n"\
+"        return make_float2(0.f, 0.f); \n"\
+"    } \n"\
+" \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const inv_det = 1.f / det; \n"\
+"#else \n"\
+"    float const inv_det = native_recip(det); \n"\
+"#endif \n"\
+" \n"\
+"    float const b1 = (g22 * r1 - g12 * r2) * inv_det; \n"\
+"    float const b2 = (g11 * r2 - g12 * r1) * inv_det; \n"\
+"    return make_float2(b1, b2); \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPE DEFINITIONS \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Sentinel address: ends the traversal loop, marks the bottom of the \n"\
+"// per-work-item LDS stack and means no hit was recorded \n"\
+"#define INVALID_ADDR 0xffffffffu \n"\
+"// A node is internal when its left child address is not the sentinel \n"\
+"#define INTERNAL_NODE(node) (GetAddrLeft(node) != INVALID_ADDR) \n"\
+" \n"\
+"// Number of per-work-item slices in the LDS stack arrays; equals the \n"\
+"// work group size the kernels require via reqd_work_group_size(64, 1, 1) \n"\
+"#define GROUP_SIZE 64 \n"\
+"// Global stack entries reserved per ray (stack_bottom = STACK_SIZE * index) \n"\
+"#define STACK_SIZE 32 \n"\
+"// LDS stack slots per work-item; slot 0 holds the INVALID_ADDR sentinel \n"\
+"#define LDS_STACK_SIZE 16 \n"\
+" \n"\
+"// BVH node \n"\
+"// One layout serves both node kinds. Internal node: xyz of the four members \n"\
+"// hold the bounds of the left and right child, w of members 1 and 3 hold the \n"\
+"// child addresses. Leaf: xyz of members 1..3 hold the triangle vertices, \n"\
+"// w of member 1 is INVALID_ADDR (see INTERNAL_NODE), w of members 2 and 4 \n"\
+"// hold the mesh id and the primitive id. \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 aabb_left_min_or_v0_and_addr_left; \n"\
+"    float4 aabb_left_max_or_v1_and_mesh_id; \n"\
+"    float4 aabb_right_min_or_v2_and_addr_right; \n"\
+"    float4 aabb_right_max_and_prim_id; \n"\
+" \n"\
+"} bvh_node; \n"\
+" \n"\
+"// Accessors for the integers packed into the w components: the float bits \n"\
+"// are reinterpreted as uint (as_uint), not converted \n"\
+"#define GetAddrLeft(node)   as_uint((node).aabb_left_min_or_v0_and_addr_left.w) \n"\
+"#define GetAddrRight(node)  as_uint((node).aabb_right_min_or_v2_and_addr_right.w) \n"\
+"#define GetMeshId(node)     as_uint((node).aabb_left_max_or_v1_and_mesh_id.w) \n"\
+"#define GetPrimId(node)     as_uint((node).aabb_right_max_and_prim_id.w) \n"\
+" \n"\
+"// Slab test of a ray against the box (pmin, pmax). \n"\
+"// Returns (t_enter, t_exit) with t_enter clamped from below by 0 and \n"\
+"// t_exit clamped from above by t_max; the box is hit when ret.x <= ret.y. \n"\
+"INLINE float2 fast_intersect_bbox2(float3 pmin, float3 pmax, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    // Distances to the slab planes per axis: corner * invdir + oxinvdir \n"\
+"    const float3 t_at_pmax = mad(pmax, invdir, oxinvdir); \n"\
+"    const float3 t_at_pmin = mad(pmin, invdir, oxinvdir); \n"\
+"    const float3 t_far = max(t_at_pmax, t_at_pmin); \n"\
+"    const float3 t_near = min(t_at_pmax, t_at_pmin); \n"\
+"    const float t_exit = min(min3(t_far.x, t_far.y, t_far.z), t_max); \n"\
+"    const float t_enter = max(max3(t_near.x, t_near.y, t_near.z), 0.f); \n"\
+"    return (float2)(t_enter, t_exit); \n"\
+"} \n"\
+" \n"\
+"// Closest-hit kernel, one work-item per ray. Traversal uses a short stack \n"\
+"// in local memory (LDS_STACK_SIZE slots per work-item) that spills to and \n"\
+"// refills from the global stack buffer when it overflows or runs empty. \n"\
+"// Work-items with index >= *num_rays and inactive rays write nothing. \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void intersect_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL const bvh_node *restrict nodes, \n"\
+"    // Rays \n"\
+"    GLOBAL const ray *restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL const int *restrict num_rays, \n"\
+"    // Stack memory: each ray uses the region starting at STACK_SIZE * index \n"\
+"    GLOBAL uint *stack, \n"\
+"    // Hit data: prim_id, shape_id and uvwt = (u, v, 0, t) per ray; \n"\
+"    // prim_id and shape_id are MISS_MARKER when nothing was hit \n"\
+"    GLOBAL Intersection *hits) \n"\
+"{ \n"\
+"    // Local memory stack shared by the group; each work-item owns the \n"\
+"    // LDS_STACK_SIZE slots starting at local_index * LDS_STACK_SIZE \n"\
+"    __local uint lds_stack[GROUP_SIZE * LDS_STACK_SIZE]; \n"\
+" \n"\
+"    uint index = get_global_id(0); \n"\
+"    uint local_index = get_local_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (index < *num_rays) \n"\
+"    { \n"\
+"        const ray my_ray = rays[index]; \n"\
+" \n"\
+"        if (ray_is_active(&my_ray)) \n"\
+"        { \n"\
+"            // Precompute inverse direction and -origin * invDir for bbox testing \n"\
+"            const float3 invDir = safe_invdir(my_ray); \n"\
+"            const float3 oxInvDir = -my_ray.o.xyz * invDir; \n"\
+" \n"\
+"            // Intersection parametric distance \n"\
+"            float closest_t = my_ray.o.w; \n"\
+" \n"\
+"            // Current node address \n"\
+"            uint addr = 0; \n"\
+"            // Current closest address \n"\
+"            uint closest_addr = INVALID_ADDR; \n"\
+" \n"\
+"            // sptr: top of this ray's region in the global stack \n"\
+"            // lds_sptr: next free slot of this work-item's LDS slice \n"\
+"            uint stack_bottom = STACK_SIZE * index; \n"\
+"            uint sptr = stack_bottom; \n"\
+"            uint lds_stack_bottom = local_index * LDS_STACK_SIZE; \n"\
+"            uint lds_sptr = lds_stack_bottom; \n"\
+" \n"\
+"            // Slot 0 holds the sentinel, so popping an empty stack yields INVALID_ADDR \n"\
+"            lds_stack[lds_sptr++] = INVALID_ADDR; \n"\
+" \n"\
+"            while (addr != INVALID_ADDR) \n"\
+"            { \n"\
+"                const bvh_node node = nodes[addr]; \n"\
+" \n"\
+"                if (INTERNAL_NODE(node)) \n"\
+"                { \n"\
+"                    // Intersect both children bounds, clipped to the closest hit so far \n"\
+"                    float2 s0 = fast_intersect_bbox2( \n"\
+"                        node.aabb_left_min_or_v0_and_addr_left.xyz, \n"\
+"                        node.aabb_left_max_or_v1_and_mesh_id.xyz, \n"\
+"                        invDir, oxInvDir, closest_t); \n"\
+"                    float2 s1 = fast_intersect_bbox2( \n"\
+"                        node.aabb_right_min_or_v2_and_addr_right.xyz, \n"\
+"                        node.aabb_right_max_and_prim_id.xyz, \n"\
+"                        invDir, oxInvDir, closest_t); \n"\
+" \n"\
+"                    bool traverse_c0 = (s0.x <= s0.y); \n"\
+"                    bool traverse_c1 = (s1.x <= s1.y); \n"\
+"                    // Right child goes first when it is hit and its entry is nearer \n"\
+"                    bool c1first = traverse_c1 && (s0.x > s1.x); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1) \n"\
+"                    { \n"\
+"                        uint deferred = INVALID_ADDR; \n"\
+" \n"\
+"                        if (c1first || !traverse_c0)  \n"\
+"                        { \n"\
+"                            addr = GetAddrRight(node); \n"\
+"                            deferred = GetAddrLeft(node); \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            addr = GetAddrLeft(node); \n"\
+"                            deferred = GetAddrRight(node); \n"\
+"                        } \n"\
+" \n"\
+"                        // The other child is pushed only when both children are hit \n"\
+"                        if (traverse_c0 && traverse_c1) \n"\
+"                        { \n"\
+"                            // LDS slice is full: spill slots 1..LDS_STACK_SIZE-1 \n"\
+"                            // to the global stack and restart just above the sentinel. \n"\
+"                            // NOTE(review): nothing here checks that sptr stays within \n"\
+"                            // the STACK_SIZE entries of this ray - confirm the depth bound \n"\
+"                            if (lds_sptr - lds_stack_bottom >= LDS_STACK_SIZE) \n"\
+"                            { \n"\
+"                                for (int i = 1; i < LDS_STACK_SIZE; ++i) \n"\
+"                                { \n"\
+"                                    stack[sptr + i] = lds_stack[lds_stack_bottom + i]; \n"\
+"                                } \n"\
+" \n"\
+"                                sptr += LDS_STACK_SIZE; \n"\
+"                                lds_sptr = lds_stack_bottom + 1; \n"\
+"                            } \n"\
+" \n"\
+"                            lds_stack[lds_sptr++] = deferred; \n"\
+"                        } \n"\
+" \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    // Skip triangles whose mesh id equals the ray mask \n"\
+"                    if (ray_get_mask(&my_ray) != convert_int(GetMeshId(node))) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        // Leaf stores the triangle vertices in place of child bounds \n"\
+"                        float t = fast_intersect_triangle( \n"\
+"                            my_ray, \n"\
+"                            node.aabb_left_min_or_v0_and_addr_left.xyz, \n"\
+"                            node.aabb_left_max_or_v1_and_mesh_id.xyz, \n"\
+"                            node.aabb_right_min_or_v2_and_addr_right.xyz, \n"\
+"                            closest_t); \n"\
+" \n"\
+"                        if (t < closest_t) \n"\
+"                        { \n"\
+"                            closest_t = t; \n"\
+"                            closest_addr = addr; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+" \n"\
+"                // Pop the next node (leaf processed or no child was hit) \n"\
+"                addr = lds_stack[--lds_sptr]; \n"\
+" \n"\
+"                // Hit the sentinel while spilled entries exist: reload the last \n"\
+"                // spilled batch into slots 1..LDS_STACK_SIZE-1 and pop its top \n"\
+"                if (addr == INVALID_ADDR && sptr > stack_bottom) \n"\
+"                { \n"\
+"                    sptr -= LDS_STACK_SIZE; \n"\
+"                    for (int i = 1; i < LDS_STACK_SIZE; ++i) \n"\
+"                    { \n"\
+"                        lds_stack[lds_stack_bottom + i] = stack[sptr + i]; \n"\
+"                    } \n"\
+" \n"\
+"                    lds_sptr = lds_stack_bottom + LDS_STACK_SIZE - 1; \n"\
+"                    addr = lds_stack[lds_sptr]; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Check if we have found an intersection \n"\
+"            if (closest_addr != INVALID_ADDR) \n"\
+"            { \n"\
+"                // Calculate hit position \n"\
+"                const bvh_node node = nodes[closest_addr]; \n"\
+"                const float3 p = my_ray.o.xyz + closest_t * my_ray.d.xyz; \n"\
+" \n"\
+"                // Calculate barycentric coordinates \n"\
+"                const float2 uv = triangle_calculate_barycentrics( \n"\
+"                    p, \n"\
+"                    node.aabb_left_min_or_v0_and_addr_left.xyz, \n"\
+"                    node.aabb_left_max_or_v1_and_mesh_id.xyz, \n"\
+"                    node.aabb_right_min_or_v2_and_addr_right.xyz); \n"\
+" \n"\
+"                // Update hit information \n"\
+"                hits[index].prim_id = GetPrimId(node); \n"\
+"                hits[index].shape_id = GetMeshId(node); \n"\
+"                hits[index].uvwt = (float4)(uv.x, uv.y, 0.0f, closest_t); \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                // Miss here (uvwt is left untouched) \n"\
+"                hits[index].prim_id = MISS_MARKER; \n"\
+"                hits[index].shape_id = MISS_MARKER; \n"\
+"            } \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void occluded_main( \n"\
+"    // BVH nodes; traversal starts at node 0 \n"\
+"    GLOBAL const bvh_node *restrict nodes, \n"\
+"    // Rays to test, one per work item \n"\
+"    GLOBAL const ray *restrict rays, \n"\
+"    // Number of valid rays in the rays buffer \n"\
+"    GLOBAL const int *restrict num_rays, \n"\
+"    // Global spill area for traversal stacks (STACK_SIZE entries per ray) \n"\
+"    GLOBAL uint *stack, \n"\
+"    // Per-ray result: HIT_MARKER on any hit, MISS_MARKER otherwise \n"\
+"    GLOBAL int *hits) \n"\
+"{ \n"\
+"    __local uint lds_stack[GROUP_SIZE * LDS_STACK_SIZE]; \n"\
+" \n"\
+"    const uint ray_idx = get_global_id(0); \n"\
+"    const uint lane = get_local_id(0); \n"\
+" \n"\
+"    // Work items beyond the ray count have nothing to do \n"\
+"    if (!(ray_idx < *num_rays)) \n"\
+"    { \n"\
+"        return; \n"\
+"    } \n"\
+" \n"\
+"    const ray my_ray = rays[ray_idx]; \n"\
+" \n"\
+"    // Inactive rays are skipped; their hits entry is left untouched \n"\
+"    if (!ray_is_active(&my_ray)) \n"\
+"    { \n"\
+"        return; \n"\
+"    } \n"\
+" \n"\
+"    const float3 inv_dir = safe_invdir(my_ray); \n"\
+"    const float3 org_inv_dir = -my_ray.o.xyz * inv_dir; \n"\
+"    // Maximum parametric distance accepted for a hit \n"\
+"    const float max_t = my_ray.o.w; \n"\
+" \n"\
+"    // This work item's slices of the global and local stacks \n"\
+"    const uint stack_bottom = STACK_SIZE * ray_idx; \n"\
+"    const uint lds_stack_bottom = lane * LDS_STACK_SIZE; \n"\
+"    uint sptr = stack_bottom; \n"\
+"    uint lds_sptr = lds_stack_bottom; \n"\
+" \n"\
+"    // Sentinel entry: popping it terminates the traversal \n"\
+"    lds_stack[lds_sptr++] = INVALID_ADDR; \n"\
+" \n"\
+"    // Address of the node currently being visited \n"\
+"    uint addr = 0; \n"\
+"    while (addr != INVALID_ADDR) \n"\
+"    { \n"\
+"        const bvh_node node = nodes[addr]; \n"\
+" \n"\
+"        if (INTERNAL_NODE(node)) \n"\
+"        { \n"\
+"            const float2 left_span = fast_intersect_bbox2( \n"\
+"                node.aabb_left_min_or_v0_and_addr_left.xyz, \n"\
+"                node.aabb_left_max_or_v1_and_mesh_id.xyz, \n"\
+"                inv_dir, org_inv_dir, max_t); \n"\
+"            const float2 right_span = fast_intersect_bbox2( \n"\
+"                node.aabb_right_min_or_v2_and_addr_right.xyz, \n"\
+"                node.aabb_right_max_and_prim_id.xyz, \n"\
+"                inv_dir, org_inv_dir, max_t); \n"\
+" \n"\
+"            const bool hit_left = (left_span.x <= left_span.y); \n"\
+"            const bool hit_right = (right_span.x <= right_span.y); \n"\
+" \n"\
+"            if (hit_left || hit_right) \n"\
+"            { \n"\
+"                // Visit the right child first when only it was hit, or when \n"\
+"                // both were hit and its entry distance is smaller \n"\
+"                const bool right_first = !hit_left || (hit_right && (left_span.x > right_span.x)); \n"\
+"                const uint left_addr = GetAddrLeft(node); \n"\
+"                const uint right_addr = GetAddrRight(node); \n"\
+"                const uint postponed = right_first ? left_addr : right_addr; \n"\
+"                addr = right_first ? right_addr : left_addr; \n"\
+" \n"\
+"                if (hit_left && hit_right) \n"\
+"                { \n"\
+"                    // Local stack slice is full: flush it to global memory \n"\
+"                    if (lds_sptr - lds_stack_bottom >= LDS_STACK_SIZE) \n"\
+"                    { \n"\
+"                        for (int slot = 1; slot < LDS_STACK_SIZE; ++slot) \n"\
+"                        { \n"\
+"                            stack[sptr + slot] = lds_stack[lds_stack_bottom + slot]; \n"\
+"                        } \n"\
+"                        sptr += LDS_STACK_SIZE; \n"\
+"                        lds_sptr = lds_stack_bottom + 1; \n"\
+"                    } \n"\
+" \n"\
+"                    lds_stack[lds_sptr++] = postponed; \n"\
+"                } \n"\
+" \n"\
+"                continue; \n"\
+"            } \n"\
+"        } \n"\
+"        else \n"\
+"        { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"            if (ray_get_mask(&my_ray) != convert_int(GetMeshId(node))) \n"\
+"            { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                const float t = fast_intersect_triangle( \n"\
+"                    my_ray, \n"\
+"                    node.aabb_left_min_or_v0_and_addr_left.xyz, \n"\
+"                    node.aabb_left_max_or_v1_and_mesh_id.xyz, \n"\
+"                    node.aabb_right_min_or_v2_and_addr_right.xyz, \n"\
+"                    max_t); \n"\
+" \n"\
+"                if (t < max_t) \n"\
+"                { \n"\
+"                    hits[ray_idx] = HIT_MARKER; \n"\
+"                    return; \n"\
+"                } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"            } \n"\
+"#endif // RR_RAY_MASK \n"\
+"        } \n"\
+" \n"\
+"        addr = lds_stack[--lds_sptr]; \n"\
+" \n"\
+"        // Local slice exhausted but entries were spilled: reload one batch \n"\
+"        if (addr == INVALID_ADDR && sptr > stack_bottom) \n"\
+"        { \n"\
+"            sptr -= LDS_STACK_SIZE; \n"\
+"            for (int slot = 1; slot < LDS_STACK_SIZE; ++slot) \n"\
+"            { \n"\
+"                lds_stack[lds_stack_bottom + slot] = stack[sptr + slot]; \n"\
+"            } \n"\
+"            lds_sptr = lds_stack_bottom + LDS_STACK_SIZE - 1; \n"\
+"            addr = lds_stack[lds_sptr]; \n"\
+"        } \n"\
+"    } \n"\
+" \n"\
+"    // Traversal finished without finding an occluder \n"\
+"    hits[ray_idx] = MISS_MARKER; \n"\
+"} \n"\
+;
+static const char g_intersect_bvh2_lds_fp16_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"#pragma OPENCL EXTENSION cl_khr_fp16 : enable \n"\
+" \n"\
+"/************************************************************************* \n"\
+"INCLUDES \n"\
+"**************************************************************************/ \n"\
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define PI 3.14159265358979323846f // Pi as a single-precision literal \n"\
+"#define KERNEL __kernel // Shorthand for the kernel function qualifier \n"\
+"#define GLOBAL __global // Shorthand for the global address space qualifier \n"\
+"#define INLINE __attribute__((always_inline)) // Applied to every helper function below \n"\
+"#define HIT_MARKER 1 // Written to the occlusion result of a blocked ray \n"\
+"#define MISS_MARKER -1 // Written to hit results when nothing was hit \n"\
+"#define INVALID_IDX -1 // NOTE(review): presumably an invalid-index sentinel; not referenced in the visible code \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 pmin; // Minimum corner; fast_intersect_bbox1 reads only .xyz \n"\
+"    float4 pmax; // Maximum corner; fast_intersect_bbox1 reads only .xyz \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 o; // Origin in .xyz; .w is the maximum hit distance (see ray_get_maxt) \n"\
+"    float4 d; // Direction in .xyz; .w is the ray time (see ray_get_time) \n"\
+"    int2 extra; // .x is the mask (ray_get_mask), .y the active flag (ray_is_active) \n"\
+"    int doBackfaceCulling; // Non-zero requests culling; only honoured when RR_BACKFACE_CULL is defined \n"\
+"    int padding; \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    int shape_id; // Mesh id stored in the hit leaf, or MISS_MARKER on a miss \n"\
+"    int prim_id; // Primitive id stored in the hit leaf, or MISS_MARKER on a miss \n"\
+"    int2 padding; \n"\
+" \n"\
+"    float4 uvwt; // intersect_main writes (u, v, 0, t): barycentrics and hit distance \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"// Mask value of the ray, stored in extra.x \n"\
+"INLINE int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    return (*r).extra.x; \n"\
+"} \n"\
+" \n"\
+"// Active flag of the ray, stored in extra.y; callers treat non-zero as active \n"\
+"INLINE int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    return (*r).extra.y; \n"\
+"} \n"\
+" \n"\
+"// Maximum parametric distance of the ray, stored in o.w \n"\
+"INLINE float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    return (*r).o.w; \n"\
+"} \n"\
+" \n"\
+"// Time value of the ray, stored in d.w \n"\
+"INLINE float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    return (*r).d.w; \n"\
+"} \n"\
+" \n"\
+"// Backface culling request of the ray (the doBackfaceCulling field) \n"\
+"INLINE int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    return (*r).doBackfaceCulling; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"#ifndef APPLE \n"\
+"// Scalar-to-vector constructors. The whole group is compiled out when \n"\
+"// APPLE is defined. \n"\
+"// NOTE(review): presumably that platform supplies its own versions; \n"\
+"// confirm against the host-side build options. \n"\
+" \n"\
+"// Pack four floats into a float4 \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    float4 const v = (float4)(x, y, z, w); \n"\
+"    return v; \n"\
+"} \n"\
+" \n"\
+"// Pack three floats into a float3 \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    float3 const v = (float3)(x, y, z); \n"\
+"    return v; \n"\
+"} \n"\
+" \n"\
+"// Pack two floats into a float2 \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    float2 const v = (float2)(x, y); \n"\
+"    return v; \n"\
+"} \n"\
+" \n"\
+"// Pack two ints into an int2 \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    int2 const v = (int2)(x, y); \n"\
+"    return v; \n"\
+"} \n"\
+" \n"\
+"// Pack three ints into an int3 \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    int3 const v = (int3)(x, y, z); \n"\
+"    return v; \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"INLINE float min3(float a, float b, float c) // Smallest of three floats \n"\
+"{ \n"\
+"#ifndef AMD_MEDIA_OPS \n"\
+"    return min(min(a, b), c); \n"\
+"#else \n"\
+"    return amd_min3(a, b, c); // Built-in enabled with cl_amd_media_ops2 \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"INLINE float max3(float a, float b, float c) // Largest of three floats \n"\
+"{ \n"\
+"#ifndef AMD_MEDIA_OPS \n"\
+"    return max(max(a, b), c); \n"\
+"#else \n"\
+"    return amd_max3(a, b, c); // Built-in enabled with cl_amd_media_ops2 \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Ray/triangle test. Returns the hit distance when the ray crosses the \n"\
+"// triangle at a distance within [0, t_max]; returns t_max otherwise. \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    float3 const dir = r.d.xyz; \n"\
+"    float3 const edge_a = v2 - v1; \n"\
+"    float3 const edge_b = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    // When the ray asks for it, reject triangles whose edge cross product points along the ray \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(edge_a, edge_b), dir) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const p_vec = cross(dir, edge_b); \n"\
+"    float const det = dot(p_vec, edge_a); \n"\
+" \n"\
+"    // A zero determinant leaves nothing to solve for: report a miss \n"\
+"    if (det == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+" \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const inv_det = 1.f / det; \n"\
+"#else \n"\
+"    float const inv_det = native_recip(det); \n"\
+"#endif \n"\
+" \n"\
+"    float3 const t_vec = r.o.xyz - v1; \n"\
+"    float3 const q_vec = cross(t_vec, edge_a); \n"\
+"    float const b1 = dot(t_vec, p_vec) * inv_det; \n"\
+"    float const b2 = dot(dir, q_vec) * inv_det; \n"\
+"    float const dist = dot(edge_b, q_vec) * inv_det; \n"\
+" \n"\
+"    // Outside the triangle, or outside the accepted distance range \n"\
+"    bool const rejected = \n"\
+"        b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || dist < 0.f || dist > t_max; \n"\
+" \n"\
+"    return rejected ? t_max : dist; \n"\
+"} \n"\
+" \n"\
+"// Component-wise reciprocal of the ray direction. A component whose magnitude \n"\
+"// is not greater than 1e-8 is first replaced by 1e-8 with the same sign. \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const eps = 1e-8; \n"\
+"    float3 dir = r.d.xyz; \n"\
+"    if (!(fabs(dir.x) > eps)) dir.x = copysign(eps, dir.x); \n"\
+"    if (!(fabs(dir.y) > eps)) dir.y = copysign(eps, dir.y); \n"\
+"    if (!(fabs(dir.z) > eps)) dir.z = copysign(eps, dir.z); \n"\
+" \n"\
+"    return (float3)(1.0f / dir.x, 1.0f / dir.y, 1.0f / dir.z); \n"\
+"} \n"\
+" \n"\
+"// Slab test of a ray against one box. Returns (t0, t1): the entry distance clamped \n"\
+"// below at 0 and the exit distance clamped above at t_max. Hit when t0 <= t1. \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    float3 const at_max = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const at_min = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+"    float3 const t_exit = max(at_max, at_min); \n"\
+"    float3 const t_enter = min(at_max, at_min); \n"\
+"    return make_float2( \n"\
+"        max(max3(t_enter.x, t_enter.y, t_enter.z), 0.f), \n"\
+"        min(min3(t_exit.x, t_exit.y, t_exit.z), t_max)); \n"\
+"} \n"\
+" \n"\
+"// Barycentric coordinates (b1, b2) of a point p lying in the plane of \n"\
+"// triangle (v1, v2, v3); (0, 0) is returned when the system is singular. \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    float3 const edge_a = v2 - v1; \n"\
+"    float3 const edge_b = v3 - v1; \n"\
+"    float3 const rel = p - v1; \n"\
+" \n"\
+"    // Dot products of the 2x2 system  rel = b1 * edge_a + b2 * edge_b \n"\
+"    float const aa = dot(edge_a, edge_a); \n"\
+"    float const ab = dot(edge_a, edge_b); \n"\
+"    float const bb = dot(edge_b, edge_b); \n"\
+"    float const ra = dot(rel, edge_a); \n"\
+"    float const rb = dot(rel, edge_b); \n"\
+" \n"\
+"    float const det = (aa * bb - ab * ab); \n"\
+"    if (det == 0.f) \n"\
+"    { \n"\
+"        return make_float2(0.f, 0.f); \n"\
+"    } \n"\
+" \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const inv_det = 1.f / det; \n"\
+"#else \n"\
+"    float const inv_det = native_recip(det); \n"\
+"#endif \n"\
+" \n"\
+"    return make_float2((bb * ra - ab * rb) * inv_det, (aa * rb - ab * ra) * inv_det); \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPE DEFINITIONS \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"#define INVALID_ADDR 0xffffffffu // Sentinel address: absent child, leaf marker and traversal stack terminator \n"\
+"#define INTERNAL_NODE(node) ((node).aabb01_min_or_v0_and_addr0.w != INVALID_ADDR) // True when the first child address is valid, i.e. the node is not a leaf \n"\
+" \n"\
+"#define GROUP_SIZE 64 // Work items per group; the kernels require a work-group size of (64, 1, 1) \n"\
+"#define STACK_SIZE 32 // Global stack entries reserved per ray \n"\
+"#define LDS_STACK_SIZE 16 // Local stack entries per work item \n"\
+" \n"\
+"// BVH node with up to four children. In internal nodes each .xyz word packs two half floats (one per child box); in leaves .xyz holds float bits of a triangle vertex. \n"\
+"typedef struct \n"\
+"{ \n"\
+"    uint4 aabb01_min_or_v0_and_addr0; // Internal: min corners of boxes 0/1, .w = child 0 address. Leaf: vertex 0, .w = INVALID_ADDR \n"\
+"    uint4 aabb01_max_or_v1_and_addr1_or_mesh_id; // Internal: max corners of boxes 0/1, .w = child 1 address or INVALID_ADDR. Leaf: vertex 1, .w = mesh id \n"\
+"    uint4 aabb23_min_or_v2_and_addr2_or_prim_id; // Internal: min corners of boxes 2/3, .w = child 2 address. Leaf: vertex 2, .w = primitive id \n"\
+"    uint4 aabb23_max_and_addr3; // Internal: max corners of boxes 2/3, .w = child 3 address or INVALID_ADDR \n"\
+" \n"\
+"} bvh_node; \n"\
+" \n"\
+"#define mymin3(a, b, c) min(min((a), (b)), (c)) // Smallest of three values; fast_intersect_bbox2 applies it to half2 operands \n"\
+"#define mymax3(a, b, c) max(max((a), (b)), (c)) // Largest of three values; fast_intersect_bbox2 applies it to half2 operands \n"\
+" \n"\
+"INLINE half2 unpackFloat2x16(uint v) \n"\
+"{ \n"\
+"    // Bits 0-15 of v are reinterpreted as .x, bits 16-31 as .y \n"\
+"    const ushort lo = convert_ushort(v & 0xffffu), hi = convert_ushort(v >> 16); \n"\
+"    return (half2)(as_half(lo), as_half(hi)); \n"\
+"} \n"\
+" \n"\
+"// Slab test of one ray against two boxes at once. Every uint of pmin/pmax \n"\
+"// packs the same coordinate of both boxes as two half floats. Returns \n"\
+"// (t0_a, t0_b, t1_a, t1_b): box a is hit when .x <= .z, box b when .y <= .w. \n"\
+"INLINE half4 fast_intersect_bbox2(uint3 pmin, uint3 pmax, half3 invdir, half3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    // Ray parameters at the max planes of each axis \n"\
+"    half2 const at_max_x = fma(unpackFloat2x16(pmax.x), invdir.xx, oxinvdir.xx); \n"\
+"    half2 const at_max_y = fma(unpackFloat2x16(pmax.y), invdir.yy, oxinvdir.yy); \n"\
+"    half2 const at_max_z = fma(unpackFloat2x16(pmax.z), invdir.zz, oxinvdir.zz); \n"\
+" \n"\
+"    // Ray parameters at the min planes of each axis \n"\
+"    half2 const at_min_x = fma(unpackFloat2x16(pmin.x), invdir.xx, oxinvdir.xx); \n"\
+"    half2 const at_min_y = fma(unpackFloat2x16(pmin.y), invdir.yy, oxinvdir.yy); \n"\
+"    half2 const at_min_z = fma(unpackFloat2x16(pmin.z), invdir.zz, oxinvdir.zz); \n"\
+" \n"\
+"    // Per-axis exit and entry distances \n"\
+"    half2 const exit_x = max(at_max_x, at_min_x); \n"\
+"    half2 const exit_y = max(at_max_y, at_min_y); \n"\
+"    half2 const exit_z = max(at_max_z, at_min_z); \n"\
+"    half2 const entry_x = min(at_max_x, at_min_x); \n"\
+"    half2 const entry_y = min(at_max_y, at_min_y); \n"\
+"    half2 const entry_z = min(at_max_z, at_min_z); \n"\
+" \n"\
+"    // Combine the axes and clamp the interval to [0, t_max] \n"\
+"    half2 const lower = (half2)(0.0f, 0.0f); \n"\
+"    half2 const upper = (half2)(t_max, t_max); \n"\
+"    half2 const t_exit = min(mymin3(exit_x, exit_y, exit_z), upper); \n"\
+"    half2 const t_entry = max(mymax3(entry_x, entry_y, entry_z), lower); \n"\
+" \n"\
+"    // Entry distances first, exit distances second \n"\
+"    return (half4)(t_entry, t_exit); \n"\
+"} \n"\
+" \n"\
+"// Component-wise reciprocal of the ray direction. A component whose magnitude \n"\
+"// is not greater than 1e-5 is first replaced by 1e-5 with the same sign. \n"\
+"INLINE float3 safe_invdir2(ray r) \n"\
+"{ \n"\
+"    float const eps = 1e-5; \n"\
+"    float3 dir = r.d.xyz; \n"\
+"    if (!(fabs(dir.x) > eps)) dir.x = copysign(eps, dir.x); \n"\
+"    if (!(fabs(dir.y) > eps)) dir.y = copysign(eps, dir.y); \n"\
+"    if (!(fabs(dir.z) > eps)) dir.z = copysign(eps, dir.z); \n"\
+" \n"\
+"    return (float3)(1.0f / dir.x, 1.0f / dir.y, 1.0f / dir.z); \n"\
+"} \n"\
+" \n"\
+"// Push idx on a work item's traversal stack: local slice first, spilling to global memory when full \n"\
+"INLINE void stack_push( \n"\
+"    __local uint *lds_stack, \n"\
+"    __private uint *lds_sptr, \n"\
+"    uint lds_stack_bottom, \n"\
+"    __global uint *stack, \n"\
+"    __private uint *sptr, \n"\
+"    uint idx) \n"\
+"{ \n"\
+"    uint top = *lds_sptr; \n"\
+"    // Local slice is full: copy slots 1..LDS_STACK_SIZE-1 out and restart above slot 0 \n"\
+"    if (top - lds_stack_bottom >= LDS_STACK_SIZE) \n"\
+"    { \n"\
+"        const uint spill_base = *sptr; \n"\
+"        for (int slot = 1; slot < LDS_STACK_SIZE; ++slot) \n"\
+"            stack[spill_base + slot] = lds_stack[lds_stack_bottom + slot]; \n"\
+"        *sptr = spill_base + LDS_STACK_SIZE; \n"\
+"        top = lds_stack_bottom + 1; \n"\
+"    } \n"\
+"    lds_stack[top] = idx; \n"\
+"    *lds_sptr = top + 1; \n"\
+"} \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void intersect_main( \n"\
+"    // BVH nodes; traversal starts at node 0 \n"\
+"    GLOBAL const bvh_node *restrict nodes, \n"\
+"    // Rays, one per work item \n"\
+"    GLOBAL const ray *restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL const int *restrict num_rays, \n"\
+"    // Global spill area for traversal stacks (STACK_SIZE entries per ray) \n"\
+"    GLOBAL uint *stack, \n"\
+"    // Closest-hit results; only entries of active rays are written \n"\
+"    GLOBAL Intersection *hits) \n"\
+"{ \n"\
+"    __local uint lds_stack[GROUP_SIZE * LDS_STACK_SIZE]; \n"\
+" \n"\
+"    uint index = get_global_id(0); \n"\
+"    uint local_index = get_local_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (index < *num_rays) \n"\
+"    { \n"\
+"        const ray my_ray = rays[index]; \n"\
+" \n"\
+"        if (ray_is_active(&my_ray)) \n"\
+"        { \n"\
+"            // Precompute inverse direction and -origin * invDir, converted to half for bbox testing \n"\
+"            const float3 invDir32 = safe_invdir2(my_ray); \n"\
+"            const half3 invDir = convert_half3(invDir32); \n"\
+"            const half3 oxInvDir = convert_half3(-my_ray.o.xyz * invDir32); \n"\
+" \n"\
+"            // Closest hit distance so far; starts at the ray's maximum distance \n"\
+"            float closest_t = my_ray.o.w; \n"\
+" \n"\
+"            // Current node address \n"\
+"            uint addr = 0; \n"\
+"            // Address of the closest leaf hit so far \n"\
+"            uint closest_addr = INVALID_ADDR; \n"\
+" \n"\
+"            uint stack_bottom = STACK_SIZE * index; \n"\
+"            uint sptr = stack_bottom; \n"\
+"            uint lds_stack_bottom = local_index * LDS_STACK_SIZE; \n"\
+"            uint lds_sptr = lds_stack_bottom; \n"\
+"            // Sentinel entry: popping it terminates the traversal \n"\
+"            lds_stack[lds_sptr++] = INVALID_ADDR; \n"\
+" \n"\
+"            while (addr != INVALID_ADDR) \n"\
+"            { \n"\
+"                const bvh_node node = nodes[addr]; \n"\
+" \n"\
+"                if (INTERNAL_NODE(node)) \n"\
+"                { \n"\
+"                    half4 s01 = fast_intersect_bbox2( \n"\
+"                        node.aabb01_min_or_v0_and_addr0.xyz, \n"\
+"                        node.aabb01_max_or_v1_and_addr1_or_mesh_id.xyz, \n"\
+"                        invDir, oxInvDir, closest_t); \n"\
+"                    half4 s23 = fast_intersect_bbox2( \n"\
+"                        node.aabb23_min_or_v2_and_addr2_or_prim_id.xyz, \n"\
+"                        node.aabb23_max_and_addr3.xyz, \n"\
+"                        invDir, oxInvDir, closest_t); \n"\
+"                    // Children 1 and 3 are optional: their address is INVALID_ADDR when absent \n"\
+"                    bool traverse_c0 = (s01.x <= s01.z); \n"\
+"                    bool traverse_c1 = (s01.y <= s01.w) && (node.aabb01_max_or_v1_and_addr1_or_mesh_id.w != INVALID_ADDR); \n"\
+"                    bool traverse_c2 = (s23.x <= s23.z); \n"\
+"                    bool traverse_c3 = (s23.y <= s23.w) && (node.aabb23_max_and_addr3.w != INVALID_ADDR); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1 || traverse_c2 || traverse_c3) \n"\
+"                    { \n"\
+"                        uint a = INVALID_ADDR; // Child to visit next: the nearest one hit so far \n"\
+"                        half d = INFINITY; // Entry distance of a; kept in sync with a below \n"\
+"                        // Every other hit child is pushed on the stack for a later visit \n"\
+"                        if (traverse_c0) \n"\
+"                        { \n"\
+"                            a = node.aabb01_min_or_v0_and_addr0.w; \n"\
+"                            d = s01.x; \n"\
+"                        } \n"\
+" \n"\
+"                        if (traverse_c1) \n"\
+"                        { \n"\
+"                            const uint child1 = node.aabb01_max_or_v1_and_addr1_or_mesh_id.w; \n"\
+"                            if (a == INVALID_ADDR || s01.y < d) \n"\
+"                            { \n"\
+"                                // Child 1 is the nearest so far: defer the previous pick, if any \n"\
+"                                if (a != INVALID_ADDR) stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, a); \n"\
+"                                a = child1; \n"\
+"                                d = s01.y; \n"\
+"                            } \n"\
+"                            else stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, child1); \n"\
+"                        } \n"\
+" \n"\
+"                        if (traverse_c2) \n"\
+"                        { \n"\
+"                            const uint child2 = node.aabb23_min_or_v2_and_addr2_or_prim_id.w; \n"\
+"                            if (a == INVALID_ADDR || s23.x < d) \n"\
+"                            { \n"\
+"                                // Child 2 is the nearest so far: defer the previous pick, if any \n"\
+"                                if (a != INVALID_ADDR) stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, a); \n"\
+"                                a = child2; \n"\
+"                                d = s23.x; \n"\
+"                            } \n"\
+"                            else stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, child2); \n"\
+"                        } \n"\
+" \n"\
+"                        if (traverse_c3) \n"\
+"                        { \n"\
+"                            const uint child3 = node.aabb23_max_and_addr3.w; \n"\
+"                            if (a == INVALID_ADDR || s23.y < d) \n"\
+"                            { \n"\
+"                                // Child 3 is the nearest so far: defer the previous pick, if any \n"\
+"                                if (a != INVALID_ADDR) stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, a); \n"\
+"                                a = child3; \n"\
+"                                d = s23.y; \n"\
+"                            } \n"\
+"                            else stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, child3); \n"\
+"                        } \n"\
+" \n"\
+"                        addr = a; \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    if (ray_get_mask(&my_ray) != as_int(node.aabb01_max_or_v1_and_addr1_or_mesh_id.w)) \n"\
+"                    { // Leaves whose mesh id equals the ray mask are skipped \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        float t = fast_intersect_triangle( \n"\
+"                            my_ray, \n"\
+"                            as_float3(node.aabb01_min_or_v0_and_addr0.xyz), \n"\
+"                            as_float3(node.aabb01_max_or_v1_and_addr1_or_mesh_id.xyz), \n"\
+"                            as_float3(node.aabb23_min_or_v2_and_addr2_or_prim_id.xyz), \n"\
+"                            closest_t); \n"\
+"                        // Remember the nearest hit; it also shrinks the range of later tests \n"\
+"                        if (t < closest_t) \n"\
+"                        { \n"\
+"                            closest_t = t; \n"\
+"                            closest_addr = addr; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+" \n"\
+"                addr = lds_stack[--lds_sptr]; \n"\
+"                // Local slice exhausted but entries were spilled: reload one batch from global memory \n"\
+"                if (addr == INVALID_ADDR && sptr > stack_bottom) \n"\
+"                { \n"\
+"                    sptr -= LDS_STACK_SIZE; \n"\
+"                    for (int i = 1; i < LDS_STACK_SIZE; ++i) \n"\
+"                    { \n"\
+"                        lds_stack[lds_stack_bottom + i] = stack[sptr + i]; \n"\
+"                    } \n"\
+" \n"\
+"                    lds_sptr = lds_stack_bottom + LDS_STACK_SIZE - 1; \n"\
+"                    addr = lds_stack[lds_sptr]; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Check if we have found an intersection \n"\
+"            if (closest_addr != INVALID_ADDR) \n"\
+"            { \n"\
+"                // Reload the hit leaf and calculate the hit position \n"\
+"                const bvh_node node = nodes[closest_addr]; \n"\
+"                const float3 p = my_ray.o.xyz + closest_t * my_ray.d.xyz; \n"\
+" \n"\
+"                // Calculate barycentric coordinates from the full-precision leaf vertices \n"\
+"                const float2 uv = triangle_calculate_barycentrics( \n"\
+"                    p, \n"\
+"                    as_float3(node.aabb01_min_or_v0_and_addr0.xyz), \n"\
+"                    as_float3(node.aabb01_max_or_v1_and_addr1_or_mesh_id.xyz), \n"\
+"                    as_float3(node.aabb23_min_or_v2_and_addr2_or_prim_id.xyz)); \n"\
+" \n"\
+"                // Update hit information: ids come from the leaf's .w slots \n"\
+"                hits[index].prim_id = node.aabb23_min_or_v2_and_addr2_or_prim_id.w; \n"\
+"                hits[index].shape_id = node.aabb01_max_or_v1_and_addr1_or_mesh_id.w; \n"\
+"                hits[index].uvwt = (float4)(uv.x, uv.y, 0.0f, closest_t); \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                // Miss here: uvwt is left untouched \n"\
+"                hits[index].prim_id = MISS_MARKER; \n"\
+"                hits[index].shape_id = MISS_MARKER; \n"\
+"            } \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void occluded_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL const bvh_node *restrict nodes, \n"\
+"    // Rays \n"\
+"    GLOBAL const ray *restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL const int *restrict num_rays, \n"\
+"    // Stack memory \n"\
+"    GLOBAL uint *stack, \n"\
+"    // Hit results: 1 for hit and -1 for miss \n"\
+"    GLOBAL int *hits) \n"\
+"{ \n"\
+"    __local uint lds_stack[GROUP_SIZE * LDS_STACK_SIZE]; \n"\
+" \n"\
+"    uint index = get_global_id(0); \n"\
+"    uint local_index = get_local_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (index < *num_rays) \n"\
+"    { \n"\
+"        const ray my_ray = rays[index]; \n"\
+" \n"\
+"        if (ray_is_active(&my_ray)) \n"\
+"        { \n"\
+"            // Precompute inverse direction and origin / dir for bbox testing \n"\
+"            const float3 invDir32 = safe_invdir2(my_ray); \n"\
+"            const half3 invDir = convert_half3(invDir32); \n"\
+"            const half3 oxInvDir = convert_half3(-my_ray.o.xyz * invDir32); \n"\
+" \n"\
+"            // Intersection parametric distance \n"\
+"            float closest_t = my_ray.o.w; \n"\
+" \n"\
+"            // Current node address \n"\
+"            uint addr = 0; \n"\
+"            // Current closest address \n"\
+"            uint closest_addr = INVALID_ADDR; \n"\
+" \n"\
+"            uint stack_bottom = STACK_SIZE * index; \n"\
+"            uint sptr = stack_bottom; \n"\
+"            uint lds_stack_bottom = local_index * LDS_STACK_SIZE; \n"\
+"            uint lds_sptr = lds_stack_bottom; \n"\
+" \n"\
+"            lds_stack[lds_sptr++] = INVALID_ADDR; \n"\
+" \n"\
+"            while (addr != INVALID_ADDR) \n"\
+"            { \n"\
+"                const bvh_node node = nodes[addr]; \n"\
+" \n"\
+"                if (INTERNAL_NODE(node)) \n"\
+"                { \n"\
+"                    half4 s01 = fast_intersect_bbox2( \n"\
+"                        node.aabb01_min_or_v0_and_addr0.xyz, \n"\
+"                        node.aabb01_max_or_v1_and_addr1_or_mesh_id.xyz, \n"\
+"                        invDir, oxInvDir, closest_t); \n"\
+"                    half4 s23 = fast_intersect_bbox2( \n"\
+"                        node.aabb23_min_or_v2_and_addr2_or_prim_id.xyz, \n"\
+"                        node.aabb23_max_and_addr3.xyz, \n"\
+"                        invDir, oxInvDir, closest_t); \n"\
+" \n"\
+"                    bool traverse_c0 = (s01.x <= s01.z); \n"\
+"                    bool traverse_c1 = (s01.y <= s01.w) && (node.aabb01_max_or_v1_and_addr1_or_mesh_id.w != INVALID_ADDR); \n"\
+"                    bool traverse_c2 = (s23.x <= s23.z); \n"\
+"                    bool traverse_c3 = (s23.y <= s23.w) && (node.aabb23_max_and_addr3.w != INVALID_ADDR); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1 || traverse_c2 || traverse_c3) \n"\
+"                    { \n"\
+"                        uint a = INVALID_ADDR; \n"\
+"                        half d = 100000000.0f; \n"\
+" \n"\
+"                        if (traverse_c0) \n"\
+"                        { \n"\
+"                            a = node.aabb01_min_or_v0_and_addr0.w; \n"\
+"                            d = s01.x; \n"\
+"                        } \n"\
+" \n"\
+"                        if (traverse_c1) \n"\
+"                        { \n"\
+"                            if (a == INVALID_ADDR) \n"\
+"                                a = node.aabb01_max_or_v1_and_addr1_or_mesh_id.w; \n"\
+"                            else \n"\
+"                            { \n"\
+"                                uint topush = s01.y < d ? a : node.aabb01_max_or_v1_and_addr1_or_mesh_id.w; \n"\
+"                                d = min(s01.y, d); \n"\
+"                                a = topush == a ? node.aabb01_max_or_v1_and_addr1_or_mesh_id.w : a; \n"\
+"                                stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, topush); \n"\
+"                            } \n"\
+"                        } \n"\
+" \n"\
+"                        if (traverse_c2) \n"\
+"                        { \n"\
+"                            if (a == INVALID_ADDR) \n"\
+"                                a = node.aabb23_min_or_v2_and_addr2_or_prim_id.w; \n"\
+"                            else \n"\
+"                            { \n"\
+"                                uint topush = s23.x < d ? a : node.aabb23_min_or_v2_and_addr2_or_prim_id.w; \n"\
+"                                d = min(s23.x, d); \n"\
+"                                a = topush == a ? node.aabb23_min_or_v2_and_addr2_or_prim_id.w : a; \n"\
+"                                stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, topush); \n"\
+"                            } \n"\
+"                        } \n"\
+" \n"\
+"                        if (traverse_c3) \n"\
+"                        { \n"\
+"                            if (a == INVALID_ADDR) \n"\
+"                                a = node.aabb23_max_and_addr3.w; \n"\
+"                            else \n"\
+"                            { \n"\
+"                                uint topush = s23.y < d ? a : node.aabb23_max_and_addr3.w; \n"\
+"                                d = min(s23.y, d); \n"\
+"                                a = topush == a ? node.aabb23_max_and_addr3.w : a; \n"\
+"                                stack_push(lds_stack, &lds_sptr, lds_stack_bottom, stack, &sptr, topush); \n"\
+"                            } \n"\
+"                        } \n"\
+" \n"\
+"                        addr = a; \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    if (ray_get_mask(&my_ray) != convert_int(GetMeshId(node))) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        float t = fast_intersect_triangle( \n"\
+"                            my_ray, \n"\
+"                            as_float3(node.aabb01_min_or_v0_and_addr0.xyz), \n"\
+"                            as_float3(node.aabb01_max_or_v1_and_addr1_or_mesh_id.xyz), \n"\
+"                            as_float3(node.aabb23_min_or_v2_and_addr2_or_prim_id.xyz), \n"\
+"                            closest_t); \n"\
+" \n"\
+"                        if (t < closest_t) \n"\
+"                        { \n"\
+"                            hits[index] = HIT_MARKER; \n"\
+"                            return; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+" \n"\
+"                addr = lds_stack[--lds_sptr]; \n"\
+" \n"\
+"                if (addr == INVALID_ADDR && sptr > stack_bottom) \n"\
+"                { \n"\
+"                    sptr -= LDS_STACK_SIZE; \n"\
+"                    for (int i = 1; i < LDS_STACK_SIZE; ++i) \n"\
+"                    { \n"\
+"                        lds_stack[lds_stack_bottom + i] = stack[sptr + i]; \n"\
+"                    } \n"\
+" \n"\
+"                    lds_sptr = lds_stack_bottom + LDS_STACK_SIZE - 1; \n"\
+"                    addr = lds_stack[lds_sptr]; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Finished traversal, but no intersection found \n"\
+"            hits[index] = MISS_MARKER; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+;
+static const char g_intersect_bvh2_short_stack_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+"/** \n"\
+"    \\file intersect_bvh2_short_stack.cl \n"\
+"    \\author Dmitry Kozlov \n"\
+"    \\version 1.0 \n"\
+"    \\brief Intersector implementation based on BVH stacked traversal. \n"\
+" \n"\
+"    Intersector is using binary BVH with two bounding boxes per node. \n"\
+"    Traversal is using a stack which is split into two parts: \n"\
+"        -Top part in fast LDS memory \n"\
+"        -Bottom part in slow global memory. \n"\
+"    Push operations first check for top part overflow and offload top \n"\
+"    part into slow global memory if necessary. \n"\
+"    Pop operations first check for top part emptiness and try to offload \n"\
+"    from bottom part if necessary.  \n"\
+" \n"\
+"    Traversal pseudocode: \n"\
+" \n"\
+"        while(addr is valid) \n"\
+"        { \n"\
+"            node <- fetch next node at addr \n"\
+" \n"\
+"            if (node is leaf) \n"\
+"                intersect leaf \n"\
+"            else \n"\
+"            { \n"\
+"                intersect ray vs left child \n"\
+"                intersect ray vs right child \n"\
+"                if (intersect any of children) \n"\
+"                { \n"\
+"                    determine closer child \n"\
+"                    if intersect both \n"\
+"                    { \n"\
+"                        addr = closer child \n"\
+"                        check top stack and offload if necessary \n"\
+"                        push farther child into the stack \n"\
+"                    } \n"\
+"                    else \n"\
+"                    { \n"\
+"                        addr = intersected child \n"\
+"                    } \n"\
+"                    continue \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            addr <- pop from top stack \n"\
+"            if (addr is not valid) \n"\
+"            { \n"\
+"                try loading data from bottom stack to top stack \n"\
+"                addr <- pop from top stack \n"\
+"            } \n"\
+"        } \n"\
+" \n"\
+"    Pros: \n"\
+"        -Very fast traversal. \n"\
+"        -Benefits from BVH quality optimization. \n"\
+"    Cons: \n"\
+"        -Depth is limited. \n"\
+"        -Generates LDS traffic. \n"\
+" */ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"INCLUDES \n"\
+"**************************************************************************/ \n"\
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"// Pi as a single-precision constant \n"\
+"#define PI 3.14159265358979323846f \n"\
+"// Shorthands for OpenCL qualifiers and attributes \n"\
+"#define KERNEL __kernel \n"\
+"#define GLOBAL __global \n"\
+"#define INLINE __attribute__((always_inline)) \n"\
+"// Result codes written to the hit buffers (1 for hit, -1 for miss) \n"\
+"#define HIT_MARKER 1 \n"\
+"#define MISS_MARKER -1 \n"\
+"// Sentinel for 'no node / no leaf'; also used as the bottom-of-stack marker \n"\
+"#define INVALID_IDX -1 \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box given by two opposite corners. \n"\
+"// Only the .xyz components are read by fast_intersect_bbox1. \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 pmin; \n"\
+"    float4 pmax; \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // xyz: origin, w: maximum distance (see ray_get_maxt) \n"\
+"    float4 o; \n"\
+"    // xyz: direction, w: time (see ray_get_time) \n"\
+"    float4 d; \n"\
+"    // x: mask (see ray_get_mask), y: active flag (see ray_is_active) \n"\
+"    int2 extra; \n"\
+"    // Non-zero requests backface culling (used only under RR_BACKFACE_CULL) \n"\
+"    int doBackfaceCulling; \n"\
+"    // Not read by the helpers in this file \n"\
+"    int padding; \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition. \n"\
+"// intersect_main sets shape_id and prim_id to MISS_MARKER when nothing is hit. \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // Shape ID of the hit leaf \n"\
+"    int shape_id; \n"\
+"    // Primitive ID of the hit leaf \n"\
+"    int prim_id; \n"\
+"    // Not written by the kernels in this file \n"\
+"    int2 padding; \n"\
+" \n"\
+"    // x, y: barycentric coordinates, z: 0, w: hit distance \n"\
+"    float4 uvwt; \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"// Returns the ray mask, which is kept in extra.x. \n"\
+"INLINE \n"\
+"int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    int2 const extra = r->extra; \n"\
+"    return extra.x; \n"\
+"} \n"\
+" \n"\
+"// Returns the ray's active flag, which is kept in extra.y. \n"\
+"INLINE \n"\
+"int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    int2 const extra = r->extra; \n"\
+"    return extra.y; \n"\
+"} \n"\
+" \n"\
+"// Returns the ray's maximum distance, which is kept in o.w. \n"\
+"INLINE \n"\
+"float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    float4 const origin = r->o; \n"\
+"    return origin.w; \n"\
+"} \n"\
+" \n"\
+"// Returns the ray's time value, which is kept in d.w. \n"\
+"INLINE \n"\
+"float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    float4 const direction = r->d; \n"\
+"    return direction.w; \n"\
+"} \n"\
+" \n"\
+"// Returns the ray's backface culling flag. \n"\
+"INLINE \n"\
+"int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    int const cull = r->doBackfaceCulling; \n"\
+"    return cull; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"#ifndef APPLE \n"\
+"// Builds a float4 from four scalars. \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    return (float4)(x, y, z, w); \n"\
+"} \n"\
+"// Builds a float3 from three scalars. \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    return (float3)(x, y, z); \n"\
+"} \n"\
+"// Builds a float2 from two scalars. \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    return (float2)(x, y); \n"\
+"} \n"\
+"// Builds an int2 from two scalars. \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    return (int2)(x, y); \n"\
+"} \n"\
+"// Builds an int3 from three scalars. \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    return (int3)(x, y, z); \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"// Smallest of three floats; uses amd_min3 when AMD_MEDIA_OPS is defined. \n"\
+"INLINE float min3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_min3(a, b, c); \n"\
+"#else \n"\
+"    float const ab = min(a, b); \n"\
+"    return min(ab, c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"// Largest of three floats; uses amd_max3 when AMD_MEDIA_OPS is defined. \n"\
+"INLINE float max3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_max3(a, b, c); \n"\
+"#else \n"\
+"    float const ab = max(a, b); \n"\
+"    return max(ab, c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Intersect a ray against the triangle (v1, v2, v3). \n"\
+"// Returns the hit distance when the hit lies inside the triangle and within \n"\
+"// [0, t_max]; returns t_max in every other case (miss, zero determinant, culled). \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    float3 const edge1 = v2 - v1; \n"\
+"    float3 const edge2 = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    // Optional culling, enabled per ray through its doBackfaceCulling flag \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(edge1, edge2), r.d.xyz) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const pvec = cross(r.d.xyz, edge2); \n"\
+"    float const det = dot(pvec, edge1); \n"\
+" \n"\
+"    // Zero determinant: there is no unique solution, report a miss \n"\
+"    if (det == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+" \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const inv_det = 1.f / det; \n"\
+"#else \n"\
+"    float const inv_det = native_recip(det); \n"\
+"#endif \n"\
+" \n"\
+"    float3 const tvec = r.o.xyz - v1; \n"\
+"    float3 const qvec = cross(tvec, edge1); \n"\
+"    float const u = dot(tvec, pvec) * inv_det; \n"\
+"    float const v = dot(r.d.xyz, qvec) * inv_det; \n"\
+"    float const t = dot(edge2, qvec) * inv_det; \n"\
+" \n"\
+"    // Written as a chain of ORs: a NaN operand fails every test and is not rejected \n"\
+"    bool const rejected = \n"\
+"        u < 0.f || u > 1.f || v < 0.f || u + v > 1.f || t < 0.f || t > t_max; \n"\
+" \n"\
+"    return rejected ? t_max : t; \n"\
+"} \n"\
+" \n"\
+"// Component-wise reciprocal of the ray direction. \n"\
+"// A component whose magnitude is not above 1e-8 is replaced by 1e-8 carrying \n"\
+"// the component's sign before dividing, so no division by zero takes place. \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const dirx = r.d.x; \n"\
+"    float const diry = r.d.y; \n"\
+"    float const dirz = r.d.z; \n"\
+"    // Single-precision literal: an unsuffixed 1e-8 is a double constant \n"\
+"    float const ooeps = 1e-8f; \n"\
+"    float3 invdir; \n"\
+"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
+"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
+"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
+"    return invdir; \n"\
+"} \n"\
+" \n"\
+"// Slab test of a ray against a box. \n"\
+"// Callers pass invdir = reciprocal ray direction and oxinvdir = -origin * invdir, \n"\
+"// so mad(corner, invdir, oxinvdir) is the ray parameter at each box plane. \n"\
+"// Returns (entry, exit) with entry floored at 0 and exit capped at t_max; \n"\
+"// the box is hit when ret.x <= ret.y. \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    float3 const t_hi = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const t_lo = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+" \n"\
+"    float3 const far3 = max(t_hi, t_lo); \n"\
+"    float3 const near3 = min(t_hi, t_lo); \n"\
+" \n"\
+"    float const t_enter = max(max3(near3.x, near3.y, near3.z), 0.f); \n"\
+"    float const t_exit = min(min3(far3.x, far3.y, far3.z), t_max); \n"\
+" \n"\
+"    return make_float2(t_enter, t_exit); \n"\
+"} \n"\
+" \n"\
+"// Barycentric coordinates of a point p given in the plane of triangle (v1, v2, v3). \n"\
+"// Returns (b1, b2) such that p = v1 + b1 * (v2 - v1) + b2 * (v3 - v1). \n"\
+"// A zero determinant (degenerate triangle) yields (0, 0). \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    float3 const edge1 = v2 - v1; \n"\
+"    float3 const edge2 = v3 - v1; \n"\
+"    float3 const rel = p - v1; \n"\
+" \n"\
+"    float const e1e1 = dot(edge1, edge1); \n"\
+"    float const e1e2 = dot(edge1, edge2); \n"\
+"    float const e2e2 = dot(edge2, edge2); \n"\
+"    float const pe1 = dot(rel, edge1); \n"\
+"    float const pe2 = dot(rel, edge2); \n"\
+" \n"\
+"    float const det = (e1e1 * e2e2 - e1e2 * e1e2); \n"\
+" \n"\
+"    if (det == 0.f) \n"\
+"    { \n"\
+"        return make_float2(0.f, 0.f); \n"\
+"    } \n"\
+" \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const inv_det = 1.f / det; \n"\
+"#else \n"\
+"    float const inv_det = native_recip(det); \n"\
+"#endif \n"\
+" \n"\
+"    return make_float2((e2e2 * pe1 - e1e2 * pe2) * inv_det, \n"\
+"                       (e1e1 * pe2 - e1e2 * pe1) * inv_det); \n"\
+"} \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPE DEFINITIONS \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// A node is a leaf when its child0 field holds -1 \n"\
+"#define LEAFNODE(x) (((x).child0) == -1) \n"\
+"// Ints of global-memory stack reserved per work item \n"\
+"#define GLOBAL_STACK_SIZE 32 \n"\
+"// LDS stack slots per work item (slot 0 holds the bottom-of-stack sentinel) \n"\
+"#define SHORT_STACK_SIZE 16 \n"\
+"// Work items per work group; also the stride between one work item's LDS slots \n"\
+"#define WAVEFRONT_SIZE 64 \n"\
+" \n"\
+"// BVH node. \n"\
+"// The same storage is viewed either as two child boxes or as integer fields. \n"\
+"// A node is treated as a leaf when child0 == -1 (see LEAFNODE). \n"\
+"typedef struct \n"\
+"{ \n"\
+"    union  \n"\
+"    { \n"\
+"        struct \n"\
+"        { \n"\
+"            // Child bounds: bounds[0] is tested for child0, bounds[1] for child1 \n"\
+"            bbox bounds[2]; \n"\
+"        }; \n"\
+" \n"\
+"        struct \n"\
+"        { \n"\
+"            // If node is a leaf we keep vertex indices here \n"\
+"            // (indices into the vertices buffer) \n"\
+"            int i0, i1, i2; \n"\
+"            // Address of a left child (-1 for a leaf) \n"\
+"            int child0; \n"\
+"            // Shape ID \n"\
+"            int shape_id; \n"\
+"            // Primitive ID \n"\
+"            int prim_id; \n"\
+"            // Address of a right child \n"\
+"            // NOTE(review): as declared here child1 is the seventh int (byte \n"\
+"            // offset 24), which appears to alias bounds[0].pmax.z, a value that \n"\
+"            // internal nodes also read as a box bound - confirm against the \n"\
+"            // host-side node layout \n"\
+"            int child1; \n"\
+"        }; \n"\
+"    }; \n"\
+" \n"\
+"} bvh_node; \n"\
+" \n"\
+" \n"\
+"// Occlusion (any-hit) query. \n"\
+"// One work item handles one ray: it walks the BVH and writes HIT_MARKER to \n"\
+"// hits[global_id] as soon as any triangle is hit closer than the ray's maximum \n"\
+"// distance, or MISS_MARKER once traversal finishes without a hit. Inactive rays \n"\
+"// and work items at or past *num_rays write nothing. \n"\
+"// Deferred children are kept on a per-work-item stack: the top part lives in \n"\
+"// LDS (slots interleaved with stride WAVEFRONT_SIZE) and full LDS chunks are \n"\
+"// spilled to the work item's GLOBAL_STACK_SIZE-int slice of the stack buffer. \n"\
+"// NOTE: spills are not bounds-checked; each spill advances the global stack by \n"\
+"// SHORT_STACK_SIZE ints, so a deep enough traversal runs past that slice. \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void \n"\
+"occluded_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL bvh_node const * restrict nodes, \n"\
+"    // Triangles vertices \n"\
+"    GLOBAL float3 const * restrict vertices, \n"\
+"    // Rays \n"\
+"    GLOBAL ray const * restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL int const * restrict num_rays, \n"\
+"    // Stack memory \n"\
+"    GLOBAL int* stack, \n"\
+"    // Hit results: 1 for hit and -1 for miss \n"\
+"    GLOBAL int* hits \n"\
+"    ) \n"\
+"{ \n"\
+"    // Allocate stack in LDS \n"\
+"    __local int lds[SHORT_STACK_SIZE * WAVEFRONT_SIZE]; \n"\
+" \n"\
+"    int global_id = get_global_id(0); \n"\
+"    int local_id = get_local_id(0); \n"\
+"    int group_id = get_group_id(0); \n"\
+" \n"\
+"    // Handle only working set \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        ray const r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Allocate stack in global memory \n"\
+"            __global int* gm_stack_base = stack + (group_id * WAVEFRONT_SIZE + local_id) * GLOBAL_STACK_SIZE; \n"\
+"            __global int* gm_stack = gm_stack_base; \n"\
+" \n"\
+"            // LDS slots of one work item are WAVEFRONT_SIZE ints apart \n"\
+"            __local int* lm_stack_base = lds + local_id; \n"\
+"            __local int* lm_stack = lm_stack_base; \n"\
+" \n"\
+"            // Precompute inverse direction and origin / dir for bbox testing \n"\
+"            float3 const invdir = safe_invdir(r); \n"\
+"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
+"            // Intersection parametric distance \n"\
+"            float const t_max = r.o.w; \n"\
+" \n"\
+"            // Current node address \n"\
+"            int addr = 0; \n"\
+" \n"\
+"            // Initialize local stack: slot 0 holds the bottom-of-stack sentinel \n"\
+"            *lm_stack = INVALID_IDX; \n"\
+"            lm_stack += WAVEFRONT_SIZE; \n"\
+" \n"\
+"            // Start from 0 node (root) \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node const node = nodes[addr]; \n"\
+" \n"\
+"                // Check if it is a leaf \n"\
+"                if (LEAFNODE(node)) \n"\
+"                { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    if (ray_get_mask(&r) != node.shape_id) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        // Leafs directly store vertex indices \n"\
+"                        // so we load vertices directly \n"\
+"                        float3 const v1 = vertices[node.i0]; \n"\
+"                        float3 const v2 = vertices[node.i1]; \n"\
+"                        float3 const v3 = vertices[node.i2]; \n"\
+"                        // Intersect triangle \n"\
+"                        float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                        // Any hit is enough for an occlusion query: record it and stop \n"\
+"                        if (f < t_max) \n"\
+"                        { \n"\
+"                            hits[global_id] = HIT_MARKER; \n"\
+"                            return; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // It is internal node, so intersect vs both children bounds \n"\
+"                    float2 const s0 = fast_intersect_bbox1(node.bounds[0], invdir, oxinvdir, t_max); \n"\
+"                    float2 const s1 = fast_intersect_bbox1(node.bounds[1], invdir, oxinvdir, t_max); \n"\
+" \n"\
+"                    // Determine which one to traverse \n"\
+"                    bool const traverse_c0 = (s0.x <= s0.y); \n"\
+"                    bool const traverse_c1 = (s1.x <= s1.y); \n"\
+"                    bool const c1first = traverse_c1 && (s0.x > s1.x); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1) \n"\
+"                    { \n"\
+"                        int deferred = -1; \n"\
+" \n"\
+"                        // Determine which one to traverse first \n"\
+"                        if (c1first || !traverse_c0) \n"\
+"                        { \n"\
+"                            // Right one is closer or left one not traversed \n"\
+"                            addr = node.child1; \n"\
+"                            deferred = node.child0; \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // Traverse left node otherwise \n"\
+"                            addr = node.child0; \n"\
+"                            deferred = node.child1; \n"\
+"                        } \n"\
+" \n"\
+"                        // If we traverse both children we need to postpone the node \n"\
+"                        if (traverse_c0 && traverse_c1) \n"\
+"                        { \n"\
+"                            // If short stack is full, we offload it into global memory \n"\
+"                            if (lm_stack - lm_stack_base >= SHORT_STACK_SIZE * WAVEFRONT_SIZE) \n"\
+"                            { \n"\
+"                                // Slot 0 is the sentinel, so only slots 1.. are spilled \n"\
+"                                for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                                { \n"\
+"                                    gm_stack[i] = lm_stack_base[i * WAVEFRONT_SIZE]; \n"\
+"                                } \n"\
+" \n"\
+"                                gm_stack += SHORT_STACK_SIZE; \n"\
+"                                lm_stack = lm_stack_base + WAVEFRONT_SIZE; \n"\
+"                            } \n"\
+" \n"\
+"                            *lm_stack = deferred; \n"\
+"                            lm_stack += WAVEFRONT_SIZE; \n"\
+"                        } \n"\
+" \n"\
+"                        // Continue traversal \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+" \n"\
+"                // Try popping from local stack \n"\
+"                lm_stack -= WAVEFRONT_SIZE; \n"\
+"                addr = *(lm_stack); \n"\
+" \n"\
+"                // If we popped INVALID_IDX then check global stack \n"\
+"                if (addr == INVALID_IDX && gm_stack > gm_stack_base) \n"\
+"                { \n"\
+"                    // Adjust stack pointer \n"\
+"                    gm_stack -= SHORT_STACK_SIZE; \n"\
+"                    // Copy data from global memory to LDS \n"\
+"                    for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                    { \n"\
+"                        lm_stack_base[i * WAVEFRONT_SIZE] = gm_stack[i]; \n"\
+"                    } \n"\
+"                    // Point local stack pointer to the end \n"\
+"                    lm_stack = lm_stack_base + (SHORT_STACK_SIZE - 1) * WAVEFRONT_SIZE; \n"\
+"                    addr = lm_stack_base[WAVEFRONT_SIZE * (SHORT_STACK_SIZE - 1)]; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Finished traversal, but no intersection found \n"\
+"            hits[global_id] = MISS_MARKER; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"// Closest-hit query. \n"\
+"// One work item handles one ray: it walks the BVH, shrinking t_max on every \n"\
+"// accepted triangle hit, and finally writes shape_id, prim_id and \n"\
+"// uvwt = (u, v, 0, t) of the nearest hit to hits[global_id]. On a miss, \n"\
+"// shape_id and prim_id are set to MISS_MARKER and uvwt is left untouched. \n"\
+"// Inactive rays and work items at or past *num_rays write nothing. \n"\
+"// Deferred children are kept on a per-work-item stack whose top part lives in \n"\
+"// LDS and whose overflow is spilled to the stack buffer in global memory. \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void intersect_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL bvh_node const* restrict nodes, \n"\
+"    // Triangles vertices \n"\
+"    GLOBAL float3 const* restrict vertices, \n"\
+"    // Rays \n"\
+"    GLOBAL ray const* restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL int const* restrict num_rays, \n"\
+"    // Stack memory \n"\
+"    GLOBAL int* stack, \n"\
+"    // Hit data \n"\
+"    GLOBAL Intersection* hits) \n"\
+"{ \n"\
+"    // Allocate stack in LDS \n"\
+"    __local int lds[SHORT_STACK_SIZE * WAVEFRONT_SIZE]; \n"\
+" \n"\
+"    int global_id = get_global_id(0); \n"\
+"    int local_id = get_local_id(0); \n"\
+"    int group_id = get_group_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        ray const r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Allocate stack in global memory \n"\
+"            __global int* gm_stack_base = stack + (group_id * WAVEFRONT_SIZE + local_id) * GLOBAL_STACK_SIZE; \n"\
+"            __global int* gm_stack = gm_stack_base; \n"\
+"            // LDS slots of one work item are WAVEFRONT_SIZE ints apart \n"\
+"            __local int* lm_stack_base = lds + local_id; \n"\
+"            __local int* lm_stack = lm_stack_base; \n"\
+" \n"\
+"            // Precompute inverse direction and origin / dir for bbox testing \n"\
+"            float3 const invdir = safe_invdir(r); \n"\
+"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
+"            // Intersection parametric distance \n"\
+"            float t_max = r.o.w; \n"\
+" \n"\
+"            // Current node address \n"\
+"            int addr = 0; \n"\
+"            // Current closest intersection leaf index \n"\
+"            int isect_idx = INVALID_IDX; \n"\
+" \n"\
+"            // Initialize local stack: slot 0 holds the bottom-of-stack sentinel \n"\
+"            *lm_stack = INVALID_IDX; \n"\
+"            lm_stack += WAVEFRONT_SIZE; \n"\
+" \n"\
+"            // Start from 0 node (root) \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node const node = nodes[addr]; \n"\
+" \n"\
+"                // Check if it is a leaf \n"\
+"                if (LEAFNODE(node)) \n"\
+"                { \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    if (ray_get_mask(&r) != node.shape_id) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        // Leafs directly store vertex indices \n"\
+"                        // so we load vertices directly \n"\
+"                        float3 const v1 = vertices[node.i0]; \n"\
+"                        float3 const v2 = vertices[node.i1]; \n"\
+"                        float3 const v3 = vertices[node.i2]; \n"\
+"                        // Intersect triangle \n"\
+"                        float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                        // If hit update closest hit distance and index \n"\
+"                        if (f < t_max) \n"\
+"                        { \n"\
+"                            t_max = f; \n"\
+"                            isect_idx = addr; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // It is internal node, so intersect vs both children bounds \n"\
+"                    float2 const s0 = fast_intersect_bbox1(node.bounds[0], invdir, oxinvdir, t_max); \n"\
+"                    float2 const s1 = fast_intersect_bbox1(node.bounds[1], invdir, oxinvdir, t_max); \n"\
+" \n"\
+"                    // Determine which one to traverse \n"\
+"                    bool const traverse_c0 = (s0.x <= s0.y); \n"\
+"                    bool const traverse_c1 = (s1.x <= s1.y); \n"\
+"                    bool const c1first = traverse_c1 && (s0.x > s1.x); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1) \n"\
+"                    { \n"\
+"                        int deferred = -1; \n"\
+" \n"\
+"                        // Determine which one to traverse first \n"\
+"                        if (c1first || !traverse_c0) \n"\
+"                        { \n"\
+"                            // Right one is closer or left one not traversed \n"\
+"                            addr = node.child1; \n"\
+"                            deferred = node.child0; \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // Traverse left node otherwise \n"\
+"                            addr = node.child0; \n"\
+"                            deferred = node.child1; \n"\
+"                        } \n"\
+" \n"\
+"                        // If we traverse both children we need to postpone the node \n"\
+"                        if (traverse_c0 && traverse_c1) \n"\
+"                        { \n"\
+"                            // If short stack is full, we offload it into global memory \n"\
+"                            if ( lm_stack - lm_stack_base >= SHORT_STACK_SIZE * WAVEFRONT_SIZE) \n"\
+"                            { \n"\
+"                                // Slot 0 is the sentinel, so only slots 1.. are spilled \n"\
+"                                for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                                { \n"\
+"                                    gm_stack[i] = lm_stack_base[i * WAVEFRONT_SIZE]; \n"\
+"                                } \n"\
+" \n"\
+"                                gm_stack += SHORT_STACK_SIZE; \n"\
+"                                lm_stack = lm_stack_base + WAVEFRONT_SIZE; \n"\
+"                            } \n"\
+" \n"\
+"                            *lm_stack = deferred; \n"\
+"                            lm_stack += WAVEFRONT_SIZE; \n"\
+"                        } \n"\
+" \n"\
+"                        // Continue traversal \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+" \n"\
+"                // Try popping from local stack \n"\
+"                lm_stack -= WAVEFRONT_SIZE; \n"\
+"                addr = *(lm_stack); \n"\
+" \n"\
+"                // If we popped INVALID_IDX then check global stack \n"\
+"                if (addr == INVALID_IDX && gm_stack > gm_stack_base) \n"\
+"                { \n"\
+"                    // Adjust stack pointer \n"\
+"                    gm_stack -= SHORT_STACK_SIZE; \n"\
+"                    // Copy data from global memory to LDS \n"\
+"                    for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                    { \n"\
+"                        lm_stack_base[i * WAVEFRONT_SIZE] = gm_stack[i]; \n"\
+"                    } \n"\
+"                    // Point local stack pointer to the end \n"\
+"                    lm_stack = lm_stack_base + (SHORT_STACK_SIZE - 1) * WAVEFRONT_SIZE; \n"\
+"                    addr = lm_stack_base[WAVEFRONT_SIZE * (SHORT_STACK_SIZE - 1)]; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Check if we have found an intersection \n"\
+"            if (isect_idx != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch the node & vertices \n"\
+"                bvh_node const node = nodes[isect_idx]; \n"\
+"                float3 const v1 = vertices[node.i0]; \n"\
+"                float3 const v2 = vertices[node.i1]; \n"\
+"                float3 const v3 = vertices[node.i2]; \n"\
+"                // Calculate hit position \n"\
+"                float3 const p = r.o.xyz + r.d.xyz * t_max; \n"\
+"                // Calculate barycentric coordinates \n"\
+"                float2 const uv = triangle_calculate_barycentrics(p, v1, v2, v3); \n"\
+"                // Update hit information \n"\
+"                hits[global_id].shape_id = node.shape_id; \n"\
+"                hits[global_id].prim_id = node.prim_id; \n"\
+"                hits[global_id].uvwt = make_float4(uv.x, uv.y, 0.f, t_max); \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                // Miss here \n"\
+"                hits[global_id].shape_id = MISS_MARKER; \n"\
+"                hits[global_id].prim_id = MISS_MARKER; \n"\
+"            } \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+;
+static const char g_intersect_bvh2_skiplinks_opencl[]= /* OpenCL C source text of intersect_bvh2_skiplinks.cl held as one string (adjacent per-line literals): skip-link BVH traversal kernels intersect_main (closest hit per ray -> Intersection) and occluded_main (HIT_MARKER / MISS_MARKER per ray); both declare reqd_work_group_size(64,1,1) and leave hits untouched for rays whose extra.y is 0. NOTE(review): looks generated from the .cl file with its include expanded inline - presumably edits belong in the .cl source, confirm. */\
"/********************************************************************** \n"\
"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
" \n"\
"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
"of this software and associated documentation files (the \"Software\"), to deal \n"\
"in the Software without restriction, including without limitation the rights \n"\
"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
"copies of the Software, and to permit persons to whom the Software is \n"\
"furnished to do so, subject to the following conditions: \n"\
" \n"\
"The above copyright notice and this permission notice shall be included in \n"\
"all copies or substantial portions of the Software. \n"\
" \n"\
"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
"THE SOFTWARE. \n"\
"********************************************************************/ \n"\
"/** \n"\
"    \\file intersect_bvh2_skiplinks.cl \n"\
"    \\author Dmitry Kozlov \n"\
"    \\version 1.0 \n"\
"    \\brief Intersector implementation based on BVH with skip links. \n"\
" \n"\
"    IntersectorSkipLinks implementation is based on the following paper: \n"\
"    \"Efficiency Issues for Ray Tracing\" Brian Smits \n"\
"    http://www.cse.chalmers.se/edu/year/2016/course/course/TDA361/EfficiencyIssuesForRayTracing.pdf \n"\
" \n"\
"    Intersector is using binary BVH with a single bounding box per node. BVH layout guarantees \n"\
"    that left child of an internal node lies right next to it in memory. Each BVH node has a  \n"\
"    skip link to the node traversed next. The traversal pseude code is \n"\
" \n"\
"        while(addr is valid) \n"\
"        { \n"\
"            node <- fetch next node at addr \n"\
"            if (rays intersects with node bbox) \n"\
"            { \n"\
"                if (node is leaf) \n"\
"                    intersect leaf \n"\
"                else \n"\
"                { \n"\
"                    addr <- addr + 1 (follow left child) \n"\
"                    continue \n"\
"                } \n"\
"            } \n"\
" \n"\
"            addr <- skiplink at node (follow next) \n"\
"        } \n"\
" \n"\
"    Pros: \n"\
"        -Simple and efficient kernel with low VGPR pressure. \n"\
"        -Can traverse trees of arbitrary depth. \n"\
"    Cons: \n"\
"        -Travesal order is fixed, so poor algorithmic characteristics. \n"\
"        -Does not benefit from BVH quality optimizations. \n"\
" */ \n"\
" \n"\
"/************************************************************************* \n"\
" INCLUDES \n"\
" **************************************************************************/ \n"\
"/********************************************************************** \n"\
"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
" \n"\
"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
"of this software and associated documentation files (the \"Software\"), to deal \n"\
"in the Software without restriction, including without limitation the rights \n"\
"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
"copies of the Software, and to permit persons to whom the Software is \n"\
"furnished to do so, subject to the following conditions: \n"\
" \n"\
"The above copyright notice and this permission notice shall be included in \n"\
"all copies or substantial portions of the Software. \n"\
" \n"\
"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
"THE SOFTWARE. \n"\
"********************************************************************/ \n"\
" \n"\
"/************************************************************************* \n"\
"DEFINES \n"\
"**************************************************************************/ \n"\
"#define PI 3.14159265358979323846f \n"\
"#define KERNEL __kernel \n"\
"#define GLOBAL __global \n"\
"#define INLINE __attribute__((always_inline)) \n"\
"#define HIT_MARKER 1 \n"\
"#define MISS_MARKER -1 \n"\
"#define INVALID_IDX -1 \n"\
" \n"\
"/************************************************************************* \n"\
"EXTENSIONS \n"\
"**************************************************************************/ \n"\
"#ifdef AMD_MEDIA_OPS \n"\
"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
"#endif \n"\
" \n"\
"/************************************************************************* \n"\
"TYPES \n"\
"**************************************************************************/ \n"\
" \n"\
"// Axis aligned bounding box \n"\
"typedef struct \n"\
"{ \n"\
"    float4 pmin; \n"\
"    float4 pmax; \n"\
"} bbox; \n"\
" \n"\
"// Ray definition \n"\
"typedef struct \n"\
"{ \n"\
"    float4 o; \n"\
"    float4 d; \n"\
"    int2 extra; \n"\
"    int doBackfaceCulling; \n"\
"    int padding; \n"\
"} ray; \n"\
" \n"\
"// Intersection definition \n"\
"typedef struct \n"\
"{ \n"\
"    int shape_id; \n"\
"    int prim_id; \n"\
"    int2 padding; \n"\
" \n"\
"    float4 uvwt; \n"\
"} Intersection; \n"\
" \n"\
" \n"\
"/************************************************************************* \n"\
"HELPER FUNCTIONS \n"\
"**************************************************************************/ \n"\
"INLINE \n"\
"int ray_get_mask(ray const* r) \n"\
"{ \n"\
"    return r->extra.x; \n"\
"} \n"\
" \n"\
"INLINE \n"\
"int ray_is_active(ray const* r) \n"\
"{ \n"\
"    return r->extra.y; \n"\
"} \n"\
" \n"\
"INLINE \n"\
"float ray_get_maxt(ray const* r) \n"\
"{ \n"\
"    return r->o.w; \n"\
"} \n"\
" \n"\
"INLINE \n"\
"float ray_get_time(ray const* r) \n"\
"{ \n"\
"    return r->d.w; \n"\
"} \n"\
" \n"\
"INLINE \n"\
"int ray_get_doBackfaceCull(ray const* r) \n"\
"{ \n"\
"    return r->doBackfaceCulling; \n"\
"} \n"\
" \n"\
"/************************************************************************* \n"\
"FUNCTIONS \n"\
"**************************************************************************/ \n"\
"#ifndef APPLE \n"\
"INLINE \n"\
"float4 make_float4(float x, float y, float z, float w) \n"\
"{ \n"\
"    float4 res; \n"\
"    res.x = x; \n"\
"    res.y = y; \n"\
"    res.z = z; \n"\
"    res.w = w; \n"\
"    return res; \n"\
"} \n"\
"INLINE \n"\
"float3 make_float3(float x, float y, float z) \n"\
"{ \n"\
"    float3 res; \n"\
"    res.x = x; \n"\
"    res.y = y; \n"\
"    res.z = z; \n"\
"    return res; \n"\
"} \n"\
"INLINE \n"\
"float2 make_float2(float x, float y) \n"\
"{ \n"\
"    float2 res; \n"\
"    res.x = x; \n"\
"    res.y = y; \n"\
"    return res; \n"\
"} \n"\
"INLINE \n"\
"int2 make_int2(int x, int y) \n"\
"{ \n"\
"    int2 res; \n"\
"    res.x = x; \n"\
"    res.y = y; \n"\
"    return res; \n"\
"} \n"\
"INLINE \n"\
"int3 make_int3(int x, int y, int z) \n"\
"{ \n"\
"    int3 res; \n"\
"    res.x = x; \n"\
"    res.y = y; \n"\
"    res.z = z; \n"\
"    return res; \n"\
"} \n"\
"#endif \n"\
" \n"\
"INLINE float min3(float a, float b, float c) \n"\
"{ \n"\
"#ifdef AMD_MEDIA_OPS \n"\
"    return amd_min3(a, b, c); \n"\
"#else \n"\
"    return min(min(a,b), c); \n"\
"#endif \n"\
"} \n"\
" \n"\
"INLINE float max3(float a, float b, float c) \n"\
"{ \n"\
"#ifdef AMD_MEDIA_OPS \n"\
"    return amd_max3(a, b, c); \n"\
"#else \n"\
"    return max(max(a,b), c); \n"\
"#endif \n"\
"} \n"\
" \n"\
" \n"\
"// Intersect ray against a triangle and return intersection interval value if it is in \n"\
"// (0, t_max], return t_max otherwise. \n"\
"INLINE \n"\
"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
"{ \n"\
"    float3 const e1 = v2 - v1; \n"\
"    float3 const e2 = v3 - v1; \n"\
" \n"\
"#ifdef RR_BACKFACE_CULL \n"\
"    if (ray_get_doBackfaceCull(&r) && dot(cross(e1, e2), r.d.xyz) > 0.f) \n"\
"    { \n"\
"        return t_max; \n"\
"    } \n"\
"#endif // RR_BACKFACE_CULL \n"\
" \n"\
"    float3 const s1 = cross(r.d.xyz, e2); \n"\
" \n"\
"    float denom = dot(s1, e1); \n"\
"    if (denom == 0.f) \n"\
"    { \n"\
"        return t_max; \n"\
"    } \n"\
"     \n"\
"#ifdef USE_SAFE_MATH \n"\
"    float const invd = 1.f / denom; \n"\
"#else \n"\
"    float const invd = native_recip(denom); \n"\
"#endif \n"\
" \n"\
"    float3 const d = r.o.xyz - v1; \n"\
"    float const b1 = dot(d, s1) * invd; \n"\
"    float3 const s2 = cross(d, e1); \n"\
"    float const b2 = dot(r.d.xyz, s2) * invd; \n"\
"    float const temp = dot(e2, s2) * invd; \n"\
" \n"\
"    if (b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || temp < 0.f || temp > t_max) \n"\
"    { \n"\
"        return t_max; \n"\
"    } \n"\
"    else \n"\
"    { \n"\
"        return temp; \n"\
"    } \n"\
"} \n"\
" \n"\
"INLINE \n"\
"float3 safe_invdir(ray r) \n"\
"{ \n"\
"    float const dirx = r.d.x; \n"\
"    float const diry = r.d.y; \n"\
"    float const dirz = r.d.z; \n"\
"    float const ooeps = 1e-8; \n"\
"    float3 invdir; \n"\
"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
"    return invdir; \n"\
"} \n"\
" \n"\
"// Intersect rays vs bbox and return intersection span.  \n"\
"// Intersection criteria is ret.x <= ret.y \n"\
"INLINE \n"\
"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
"{ \n"\
"    float3 const f = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
"    float3 const n = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
"    float3 const tmax = max(f, n); \n"\
"    float3 const tmin = min(f, n); \n"\
"    float const t1 = min(min3(tmax.x, tmax.y, tmax.z), t_max); \n"\
"    float const t0 = max(max3(tmin.x, tmin.y, tmin.z), 0.f); \n"\
"    return make_float2(t0, t1); \n"\
"} \n"\
" \n"\
"// Given a point in triangle plane, calculate its barycentrics \n"\
"INLINE \n"\
"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
"{ \n"\
"    float3 const e1 = v2 - v1; \n"\
"    float3 const e2 = v3 - v1; \n"\
"    float3 const e = p - v1; \n"\
"    float const d00 = dot(e1, e1); \n"\
"    float const d01 = dot(e1, e2); \n"\
"    float const d11 = dot(e2, e2); \n"\
"    float const d20 = dot(e, e1); \n"\
"    float const d21 = dot(e, e2); \n"\
" \n"\
"    float denom = (d00 * d11 - d01 * d01); \n"\
"     \n"\
"    if (denom == 0.f) \n"\
"    { \n"\
"        return make_float2(0.f, 0.f); \n"\
"    } \n"\
"     \n"\
"#ifdef USE_SAFE_MATH \n"\
"    float const invdenom = 1.f / denom; \n"\
"#else \n"\
"    float const invdenom = native_recip(denom); \n"\
"#endif \n"\
" \n"\
"    float const b1 = (d11 * d20 - d01 * d21) * invdenom; \n"\
"    float const b2 = (d00 * d21 - d01 * d20) * invdenom; \n"\
"    return make_float2(b1, b2); \n"\
"} \n"\
"/************************************************************************* \n"\
"EXTENSIONS \n"\
"**************************************************************************/ \n"\
" \n"\
"/************************************************************************* \n"\
"DEFINES \n"\
"**************************************************************************/ \n"\
"#define PI 3.14159265358979323846f \n"\
"#define STARTIDX(x)     (((int)(x.pmin.w)) >> 4) \n"\
"#define NUMPRIMS(x)     (((int)(x.pmin.w)) & 0xF) \n"\
"#define LEAFNODE(x)     (((x).pmin.w) != -1.f) \n"\
"#define NEXT(x)     ((int)((x).pmax.w)) \n"\
" \n"\
" \n"\
" \n"\
"/************************************************************************* \n"\
" TYPE DEFINITIONS \n"\
" **************************************************************************/ \n"\
"typedef bbox bvh_node; \n"\
" \n"\
"typedef struct \n"\
"{ \n"\
"    // Vertex indices \n"\
"    int idx[3]; \n"\
"    // Shape ID \n"\
"    int shape_id; \n"\
"    // Primitive ID \n"\
"    int prim_id; \n"\
"} Face; \n"\
" \n"\
"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
"KERNEL  \n"\
"void intersect_main( \n"\
"    // BVH nodes \n"\
"    GLOBAL bvh_node const* restrict nodes, \n"\
"    // Triangle vertices \n"\
"    GLOBAL float3 const* restrict vertices, \n"\
"    // Triangle indices \n"\
"    GLOBAL Face const* restrict faces, \n"\
"    // Rays  \n"\
"    GLOBAL ray const* restrict rays, \n"\
"    // Number of rays \n"\
"    GLOBAL int const* restrict num_rays, \n"\
"    // Hit data \n"\
"    GLOBAL Intersection* hits \n"\
") \n"\
"{ \n"\
"    int global_id = get_global_id(0); \n"\
" \n"\
"    if (global_id < *num_rays) \n"\
"    { \n"\
"        // Fetch ray \n"\
"        ray const r = rays[global_id]; \n"\
" \n"\
"        if (ray_is_active(&r)) \n"\
"        { \n"\
"            // Precompute inverse direction and origin / dir for bbox testing \n"\
"            float3 const invdir = safe_invdir(r); \n"\
"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
"            // Intersection parametric distance \n"\
"            float t_max = r.o.w; \n"\
" \n"\
"            // Current node address \n"\
"            int addr = 0; \n"\
"            // Current closest face index \n"\
"            int isect_idx = INVALID_IDX; \n"\
" \n"\
"            while (addr != INVALID_IDX) \n"\
"            { \n"\
"                // Fetch next node \n"\
"                bvh_node node = nodes[addr]; \n"\
"                // Intersect against bbox \n"\
"                float2 s = fast_intersect_bbox1(node, invdir, oxinvdir, t_max); \n"\
" \n"\
"                if (s.x <= s.y) \n"\
"                { \n"\
"                    // Check if the node is a leaf \n"\
"                    if (LEAFNODE(node)) \n"\
"                    { \n"\
"                        int const face_idx = STARTIDX(node); \n"\
"                        Face const face = faces[face_idx]; \n"\
"#ifdef RR_RAY_MASK \n"\
"                        if (ray_get_mask(&r) != face.shape_id) \n"\
"                        { \n"\
"#endif // RR_RAY_MASK \n"\
"                            float3 const v1 = vertices[face.idx[0]]; \n"\
"                            float3 const v2 = vertices[face.idx[1]]; \n"\
"                            float3 const v3 = vertices[face.idx[2]]; \n"\
" \n"\
"                            // Intersect triangle \n"\
"                            float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
"                            // If hit update closest hit distance and index \n"\
"                            if (f < t_max) \n"\
"                            { \n"\
"                                t_max = f; \n"\
"                                isect_idx = face_idx; \n"\
"                            } \n"\
"#ifdef RR_RAY_MASK \n"\
"                        } \n"\
"#endif // RR_RAY_MASK \n"\
"                    } \n"\
"                    else \n"\
"                    { \n"\
"                        // Move to next node otherwise. \n"\
"                        // Left child is always at addr + 1 \n"\
"                        ++addr; \n"\
"                        continue; \n"\
"                    } \n"\
"                } \n"\
" \n"\
"                addr = NEXT(node); \n"\
"            } \n"\
" \n"\
"            // Check if we have found an intersection \n"\
"            if (isect_idx != INVALID_IDX) \n"\
"            { \n"\
"                // Fetch the node & vertices \n"\
"                Face const face = faces[isect_idx]; \n"\
"                float3 const v1 = vertices[face.idx[0]]; \n"\
"                float3 const v2 = vertices[face.idx[1]]; \n"\
"                float3 const v3 = vertices[face.idx[2]]; \n"\
"                // Calculate hit position \n"\
"                float3 const p = r.o.xyz + r.d.xyz * t_max; \n"\
"                // Calculte barycentric coordinates \n"\
"                float2 const uv = triangle_calculate_barycentrics(p, v1, v2, v3); \n"\
"                // Update hit information \n"\
"                hits[global_id].shape_id = face.shape_id; \n"\
"                hits[global_id].prim_id = face.prim_id; \n"\
"                hits[global_id].uvwt = make_float4(uv.x, uv.y, 0.f, t_max); \n"\
"            } \n"\
"            else \n"\
"            { \n"\
"                // Miss here \n"\
"                hits[global_id].shape_id = MISS_MARKER; \n"\
"                hits[global_id].prim_id = MISS_MARKER; \n"\
"            } \n"\
"        } \n"\
"    } \n"\
"} \n"\
" \n"\
"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
"KERNEL  \n"\
"void occluded_main( \n"\
"    // BVH nodes \n"\
"    GLOBAL bvh_node const* restrict nodes, \n"\
"    // Triangle vertices \n"\
"    GLOBAL float3 const* restrict vertices, \n"\
"    // Triangle indices \n"\
"    GLOBAL Face const* restrict faces, \n"\
"    // Rays  \n"\
"    GLOBAL ray const* restrict rays, \n"\
"    // Number of rays \n"\
"    GLOBAL int const* restrict num_rays, \n"\
"    // Hit data \n"\
"    GLOBAL int* hits \n"\
") \n"\
"{ \n"\
"    int global_id = get_global_id(0); \n"\
" \n"\
"    // Handle only working subset \n"\
"    if (global_id < *num_rays) \n"\
"    { \n"\
"        // Fetch ray \n"\
"        ray const r = rays[global_id]; \n"\
" \n"\
"        if (ray_is_active(&r)) \n"\
"        { \n"\
"            // Precompute inverse direction and origin / dir for bbox testing \n"\
"            float3 const invdir = safe_invdir(r); \n"\
"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
"            // Intersection parametric distance \n"\
"            float t_max = r.o.w; \n"\
" \n"\
"            // Current node address \n"\
"            int addr = 0; \n"\
" \n"\
"            while (addr != INVALID_IDX) \n"\
"            { \n"\
"                // Fetch next node \n"\
"                bvh_node node = nodes[addr]; \n"\
"                // Intersect against bbox \n"\
"                float2 s = fast_intersect_bbox1(node, invdir, oxinvdir, t_max); \n"\
" \n"\
"                if (s.x <= s.y) \n"\
"                { \n"\
"                    // Check if the node is a leaf \n"\
"                    if (LEAFNODE(node)) \n"\
"                    { \n"\
"                        int const face_idx = STARTIDX(node); \n"\
"                        Face const face = faces[face_idx]; \n"\
"#ifdef RR_RAY_MASK \n"\
"                        if (ray_get_mask(&r) != face.shape_id) \n"\
"                        { \n"\
"#endif // RR_RAY_MASK \n"\
"                            float3 const v1 = vertices[face.idx[0]]; \n"\
"                            float3 const v2 = vertices[face.idx[1]]; \n"\
"                            float3 const v3 = vertices[face.idx[2]]; \n"\
" \n"\
"                            // Intersect triangle \n"\
"                            float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
"                            // If hit store the result and bail out \n"\
"                            if (f < t_max) \n"\
"                            { \n"\
"                                hits[global_id] = HIT_MARKER; \n"\
"                                return; \n"\
"                            } \n"\
"#ifdef RR_RAY_MASK \n"\
"                        } \n"\
"#endif // RR_RAY_MASK \n"\
"                    } \n"\
"                    else \n"\
"                    { \n"\
"                        // Move to next node otherwise. \n"\
"                        // Left child is always at addr + 1 \n"\
"                        ++addr; \n"\
"                        continue; \n"\
"                    } \n"\
"                } \n"\
" \n"\
"                addr = NEXT(node); \n"\
"            } \n"\
" \n"\
"            // Finished traversal, but no intersection found \n"\
"            hits[global_id] = MISS_MARKER; \n"\
"        } \n"\
"    } \n"\
"} \n"\
;
+static const char g_intersect_hlbvh_stack_opencl[]= \
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+"/** \n"\
+"    \\file intersect_hlbvh_stack.cl \n"\
+"    \\author Dmitry Kozlov \n"\
+"    \\version 1.0 \n"\
+"    \\brief HLBVH build implementation \n"\
+" \n"\
+"    IntersectorHlbvh implementation is based on the following paper: \n"\
+"    \"HLBVH: Hierarchical LBVH Construction for Real-Time Ray Tracing\" \n"\
+"    Jacopo Pantaleoni (NVIDIA), David Luebke (NVIDIA), in High Performance Graphics 2010, June 2010 \n"\
+"    https://research.nvidia.com/sites/default/files/publications/HLBVH-final.pdf \n"\
+" \n"\
+"    Pros: \n"\
+"        -Very fast to build and update. \n"\
+"    Cons: \n"\
+"        -Poor BVH quality, slow traversal. \n"\
+" */ \n"\
+" \n"\
+" /************************************************************************* \n"\
+"  INCLUDES \n"\
+"  **************************************************************************/ \n"\
+"/********************************************************************** \n"\
+"Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved. \n"\
+" \n"\
+"Permission is hereby granted, free of charge, to any person obtaining a copy \n"\
+"of this software and associated documentation files (the \"Software\"), to deal \n"\
+"in the Software without restriction, including without limitation the rights \n"\
+"to use, copy, modify, merge, publish, distribute, sublicense, and/or sell \n"\
+"copies of the Software, and to permit persons to whom the Software is \n"\
+"furnished to do so, subject to the following conditions: \n"\
+" \n"\
+"The above copyright notice and this permission notice shall be included in \n"\
+"all copies or substantial portions of the Software. \n"\
+" \n"\
+"THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR \n"\
+"IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, \n"\
+"FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE \n"\
+"AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER \n"\
+"LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, \n"\
+"OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN \n"\
+"THE SOFTWARE. \n"\
+"********************************************************************/ \n"\
+" \n"\
+"/************************************************************************* \n"\
+"DEFINES \n"\
+"**************************************************************************/ \n"\
+"#define PI 3.14159265358979323846f \n"\
+"#define KERNEL __kernel \n"\
+"#define GLOBAL __global \n"\
+"#define INLINE __attribute__((always_inline)) \n"\
+"#define HIT_MARKER 1 \n"\
+"#define MISS_MARKER -1 \n"\
+"#define INVALID_IDX -1 \n"\
+" \n"\
+"/************************************************************************* \n"\
+"EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable \n"\
+"#endif \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPES \n"\
+"**************************************************************************/ \n"\
+" \n"\
+"// Axis aligned bounding box \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 pmin; \n"\
+"    float4 pmax; \n"\
+"} bbox; \n"\
+" \n"\
+"// Ray definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    float4 o; \n"\
+"    float4 d; \n"\
+"    int2 extra; \n"\
+"    int doBackfaceCulling; \n"\
+"    int padding; \n"\
+"} ray; \n"\
+" \n"\
+"// Intersection definition \n"\
+"typedef struct \n"\
+"{ \n"\
+"    int shape_id; \n"\
+"    int prim_id; \n"\
+"    int2 padding; \n"\
+" \n"\
+"    float4 uvwt; \n"\
+"} Intersection; \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"HELPER FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"INLINE \n"\
+"int ray_get_mask(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.x; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_is_active(ray const* r) \n"\
+"{ \n"\
+"    return r->extra.y; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_maxt(ray const* r) \n"\
+"{ \n"\
+"    return r->o.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float ray_get_time(ray const* r) \n"\
+"{ \n"\
+"    return r->d.w; \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"int ray_get_doBackfaceCull(ray const* r) \n"\
+"{ \n"\
+"    return r->doBackfaceCulling; \n"\
+"} \n"\
+" \n"\
+"/************************************************************************* \n"\
+"FUNCTIONS \n"\
+"**************************************************************************/ \n"\
+"#ifndef APPLE \n"\
+"INLINE \n"\
+"float4 make_float4(float x, float y, float z, float w) \n"\
+"{ \n"\
+"    float4 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    res.w = w; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float3 make_float3(float x, float y, float z) \n"\
+"{ \n"\
+"    float3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"float2 make_float2(float x, float y) \n"\
+"{ \n"\
+"    float2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int2 make_int2(int x, int y) \n"\
+"{ \n"\
+"    int2 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    return res; \n"\
+"} \n"\
+"INLINE \n"\
+"int3 make_int3(int x, int y, int z) \n"\
+"{ \n"\
+"    int3 res; \n"\
+"    res.x = x; \n"\
+"    res.y = y; \n"\
+"    res.z = z; \n"\
+"    return res; \n"\
+"} \n"\
+"#endif \n"\
+" \n"\
+"INLINE float min3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_min3(a, b, c); \n"\
+"#else \n"\
+"    return min(min(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+"INLINE float max3(float a, float b, float c) \n"\
+"{ \n"\
+"#ifdef AMD_MEDIA_OPS \n"\
+"    return amd_max3(a, b, c); \n"\
+"#else \n"\
+"    return max(max(a,b), c); \n"\
+"#endif \n"\
+"} \n"\
+" \n"\
+" \n"\
+"// Intersect ray against a triangle and return intersection interval value if it is in \n"\
+"// (0, t_max], return t_max otherwise. \n"\
+"INLINE \n"\
+"float fast_intersect_triangle(ray r, float3 v1, float3 v2, float3 v3, float t_max) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+" \n"\
+"#ifdef RR_BACKFACE_CULL \n"\
+"    if (ray_get_doBackfaceCull(&r) && dot(cross(e1, e2), r.d.xyz) > 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"#endif // RR_BACKFACE_CULL \n"\
+" \n"\
+"    float3 const s1 = cross(r.d.xyz, e2); \n"\
+" \n"\
+"    float denom = dot(s1, e1); \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invd = 1.f / denom; \n"\
+"#else \n"\
+"    float const invd = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float3 const d = r.o.xyz - v1; \n"\
+"    float const b1 = dot(d, s1) * invd; \n"\
+"    float3 const s2 = cross(d, e1); \n"\
+"    float const b2 = dot(r.d.xyz, s2) * invd; \n"\
+"    float const temp = dot(e2, s2) * invd; \n"\
+" \n"\
+"    if (b1 < 0.f || b1 > 1.f || b2 < 0.f || b1 + b2 > 1.f || temp < 0.f || temp > t_max) \n"\
+"    { \n"\
+"        return t_max; \n"\
+"    } \n"\
+"    else \n"\
+"    { \n"\
+"        return temp; \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"INLINE \n"\
+"float3 safe_invdir(ray r) \n"\
+"{ \n"\
+"    float const dirx = r.d.x; \n"\
+"    float const diry = r.d.y; \n"\
+"    float const dirz = r.d.z; \n"\
+"    float const ooeps = 1e-8; \n"\
+"    float3 invdir; \n"\
+"    invdir.x = 1.0f / (fabs(dirx) > ooeps ? dirx : copysign(ooeps, dirx)); \n"\
+"    invdir.y = 1.0f / (fabs(diry) > ooeps ? diry : copysign(ooeps, diry)); \n"\
+"    invdir.z = 1.0f / (fabs(dirz) > ooeps ? dirz : copysign(ooeps, dirz)); \n"\
+"    return invdir; \n"\
+"} \n"\
+" \n"\
+"// Intersect rays vs bbox and return intersection span.  \n"\
+"// Intersection criteria is ret.x <= ret.y \n"\
+"INLINE \n"\
+"float2 fast_intersect_bbox1(bbox box, float3 invdir, float3 oxinvdir, float t_max) \n"\
+"{ \n"\
+"    float3 const f = mad(box.pmax.xyz, invdir, oxinvdir); \n"\
+"    float3 const n = mad(box.pmin.xyz, invdir, oxinvdir); \n"\
+"    float3 const tmax = max(f, n); \n"\
+"    float3 const tmin = min(f, n); \n"\
+"    float const t1 = min(min3(tmax.x, tmax.y, tmax.z), t_max); \n"\
+"    float const t0 = max(max3(tmin.x, tmin.y, tmin.z), 0.f); \n"\
+"    return make_float2(t0, t1); \n"\
+"} \n"\
+" \n"\
+"// Given a point in triangle plane, calculate its barycentrics \n"\
+"INLINE \n"\
+"float2 triangle_calculate_barycentrics(float3 p, float3 v1, float3 v2, float3 v3) \n"\
+"{ \n"\
+"    float3 const e1 = v2 - v1; \n"\
+"    float3 const e2 = v3 - v1; \n"\
+"    float3 const e = p - v1; \n"\
+"    float const d00 = dot(e1, e1); \n"\
+"    float const d01 = dot(e1, e2); \n"\
+"    float const d11 = dot(e2, e2); \n"\
+"    float const d20 = dot(e, e1); \n"\
+"    float const d21 = dot(e, e2); \n"\
+" \n"\
+"    float denom = (d00 * d11 - d01 * d01); \n"\
+"     \n"\
+"    if (denom == 0.f) \n"\
+"    { \n"\
+"        return make_float2(0.f, 0.f); \n"\
+"    } \n"\
+"     \n"\
+"#ifdef USE_SAFE_MATH \n"\
+"    float const invdenom = 1.f / denom; \n"\
+"#else \n"\
+"    float const invdenom = native_recip(denom); \n"\
+"#endif \n"\
+" \n"\
+"    float const b1 = (d11 * d20 - d01 * d21) * invdenom; \n"\
+"    float const b2 = (d00 * d21 - d01 * d20) * invdenom; \n"\
+"    return make_float2(b1, b2); \n"\
+"} \n"\
+" \n"\
+" /************************************************************************* \n"\
+"   EXTENSIONS \n"\
+"**************************************************************************/ \n"\
+" \n"\
+" \n"\
+" \n"\
+"/************************************************************************* \n"\
+"TYPE DEFINITIONS \n"\
+"**************************************************************************/ \n"\
+"#define STARTIDX(x)     (((int)((x).child0))) \n"\
+"#define LEAFNODE(x)     (((x).child0) == ((x).child1)) \n"\
+"#define GLOBAL_STACK_SIZE 32 \n"\
+"#define SHORT_STACK_SIZE 16 \n"\
+"#define WAVEFRONT_SIZE 64 \n"\
+" \n"\
+"typedef struct \n"\
+"{ \n"\
+"    int parent; \n"\
+"    int child0; \n"\
+"    int child1; \n"\
+"    int next; \n"\
+"} bvh_node; \n"\
+" \n"\
+"typedef struct \n"\
+"{ \n"\
+"    // Vertex indices \n"\
+"    int idx[3]; \n"\
+"    // Shape ID \n"\
+"    int shape_id; \n"\
+"    // Primitive ID \n"\
+"    int prim_id; \n"\
+"} Face; \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void \n"\
+"occluded_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL bvh_node const * restrict nodes, \n"\
+"    // Bounding boxes \n"\
+"    GLOBAL bbox const* restrict bounds, \n"\
+"    // Triangles vertices \n"\
+"    GLOBAL float3 const * restrict vertices, \n"\
+"    // Triangle indices \n"\
+"    GLOBAL Face const* faces, \n"\
+"    // Rays \n"\
+"    GLOBAL ray const * restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL int const * restrict num_rays, \n"\
+"    // Stack memory \n"\
+"    GLOBAL int* stack, \n"\
+"    // Hit results: 1 for hit and -1 for miss \n"\
+"    GLOBAL int* hits \n"\
+"    ) \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+"    int local_id = get_local_id(0); \n"\
+"    int group_id = get_group_id(0); \n"\
+" \n"\
+"    // Handle only working set \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        ray const r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Allocate stack in global memory  \n"\
+"            __global int* gm_stack_base = stack + (group_id * WAVEFRONT_SIZE + local_id) * GLOBAL_STACK_SIZE; \n"\
+"            __global int* gm_stack = gm_stack_base; \n"\
+"            // Allocate stack in LDS \n"\
+"            __local int lds[SHORT_STACK_SIZE * WAVEFRONT_SIZE]; \n"\
+"            __local int* lm_stack_base = lds + local_id; \n"\
+"            __local int* lm_stack = lm_stack_base; \n"\
+" \n"\
+"            // Precompute inverse direction and origin / dir for bbox testing \n"\
+"            float3 const invdir = safe_invdir(r); \n"\
+"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
+"            // Intersection parametric distance \n"\
+"            float const t_max = r.o.w; \n"\
+" \n"\
+"            // Current node address \n"\
+"            int addr = 0; \n"\
+"            // Current closest intersection leaf index \n"\
+"            int isect_idx = INVALID_IDX; \n"\
+" \n"\
+"            //  Initialize local stack \n"\
+"            *lm_stack = INVALID_IDX; \n"\
+"            lm_stack += WAVEFRONT_SIZE; \n"\
+" \n"\
+"            // Start from 0 node (root) \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node const node = nodes[addr]; \n"\
+" \n"\
+"                // Check if it is a leaf \n"\
+"                if (LEAFNODE(node)) \n"\
+"                { \n"\
+"                    Face face = faces[STARTIDX(node)]; \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    if (ray_get_mask(&r) != face.shape_id) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        // Leafs directly store vertex indices \n"\
+"                        // so we load vertices directly \n"\
+"                        float3 const v1 = vertices[face.idx[0]]; \n"\
+"                        float3 const v2 = vertices[face.idx[1]]; \n"\
+"                        float3 const v3 = vertices[face.idx[2]]; \n"\
+"                        // Intersect triangle \n"\
+"                        float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                        // If hit update closest hit distance and index \n"\
+"                        if (f < t_max) \n"\
+"                        { \n"\
+"                            hits[global_id] = HIT_MARKER; \n"\
+"                            return; \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // It is internal node, so intersect vs both children bounds \n"\
+"                    float2 const s0 = fast_intersect_bbox1(bounds[node.child0], invdir, oxinvdir, t_max); \n"\
+"                    float2 const s1 = fast_intersect_bbox1(bounds[node.child1], invdir, oxinvdir, t_max); \n"\
+" \n"\
+"                    // Determine which one to traverse \n"\
+"                    bool const traverse_c0 = (s0.x <= s0.y); \n"\
+"                    bool const traverse_c1 = (s1.x <= s1.y); \n"\
+"                    bool const c1first = traverse_c1 && (s0.x > s1.x); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1) \n"\
+"                    { \n"\
+"                        int deferred = -1; \n"\
+" \n"\
+"                        // Determine which one to traverse first \n"\
+"                        if (c1first || !traverse_c0) \n"\
+"                        { \n"\
+"                            // Right one is closer or left one not traversed \n"\
+"                            addr = node.child1; \n"\
+"                            deferred = node.child0; \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // Traverse left node otherwise \n"\
+"                            addr = node.child0; \n"\
+"                            deferred = node.child1; \n"\
+"                        } \n"\
+" \n"\
+"                        // If we traverse both children we need to postpone the node \n"\
+"                        if (traverse_c0 && traverse_c1) \n"\
+"                        { \n"\
+"                            // If short stack is full, we offload it into global memory \n"\
+"                            if (lm_stack - lm_stack_base >= SHORT_STACK_SIZE * WAVEFRONT_SIZE) \n"\
+"                            { \n"\
+"                                for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                                { \n"\
+"                                    gm_stack[i] = lm_stack_base[i * WAVEFRONT_SIZE]; \n"\
+"                                } \n"\
+" \n"\
+"                                gm_stack += SHORT_STACK_SIZE; \n"\
+"                                lm_stack = lm_stack_base + WAVEFRONT_SIZE; \n"\
+"                            } \n"\
+" \n"\
+"                            *lm_stack = deferred; \n"\
+"                            lm_stack += WAVEFRONT_SIZE; \n"\
+"                        } \n"\
+" \n"\
+"                        // Continue traversal \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+" \n"\
+"                // Try popping from local stack \n"\
+"                lm_stack -= WAVEFRONT_SIZE; \n"\
+"                addr = *(lm_stack); \n"\
+" \n"\
+"                // If we popped INVALID_IDX then check global stack \n"\
+"                if (addr == INVALID_IDX && gm_stack > gm_stack_base) \n"\
+"                { \n"\
+"                    // Adjust stack pointer \n"\
+"                    gm_stack -= SHORT_STACK_SIZE; \n"\
+"                    // Copy data from global memory to LDS \n"\
+"                    for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                    { \n"\
+"                        lm_stack_base[i * WAVEFRONT_SIZE] = gm_stack[i]; \n"\
+"                    } \n"\
+"                    // Point local stack pointer to the end \n"\
+"                    lm_stack = lm_stack_base + (SHORT_STACK_SIZE - 1) * WAVEFRONT_SIZE; \n"\
+"                    addr = lm_stack_base[WAVEFRONT_SIZE * (SHORT_STACK_SIZE - 1)]; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Finished traversal, but no intersection found \n"\
+"            hits[global_id] = MISS_MARKER; \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+"__attribute__((reqd_work_group_size(64, 1, 1))) \n"\
+"KERNEL void intersect_main( \n"\
+"    // Bvh nodes \n"\
+"    GLOBAL bvh_node const* restrict nodes, \n"\
+"    // Bounding boxes \n"\
+"    GLOBAL bbox const* restrict bounds, \n"\
+"    // Triangles vertices \n"\
+"    GLOBAL float3 const* restrict vertices, \n"\
+"    // Faces \n"\
+"    GLOBAL Face const* restrict faces, \n"\
+"    // Rays \n"\
+"    GLOBAL ray const* restrict rays, \n"\
+"    // Number of rays in rays buffer \n"\
+"    GLOBAL int const* restrict num_rays, \n"\
+"    // Stack memory \n"\
+"    GLOBAL int* stack, \n"\
+"    // Hit data \n"\
+"    GLOBAL Intersection* hits) \n"\
+"{ \n"\
+"    int global_id = get_global_id(0); \n"\
+"    int local_id = get_local_id(0); \n"\
+"    int group_id = get_group_id(0); \n"\
+" \n"\
+"    // Handle only working subset \n"\
+"    if (global_id < *num_rays) \n"\
+"    { \n"\
+"        ray const r = rays[global_id]; \n"\
+" \n"\
+"        if (ray_is_active(&r)) \n"\
+"        { \n"\
+"            // Allocate stack in global memory  \n"\
+"            __global int* gm_stack_base = stack + (group_id * WAVEFRONT_SIZE + local_id) * GLOBAL_STACK_SIZE; \n"\
+"            __global int* gm_stack = gm_stack_base; \n"\
+"            // Allocate stack in LDS \n"\
+"            __local int lds[SHORT_STACK_SIZE * WAVEFRONT_SIZE]; \n"\
+"            __local int* lm_stack_base = lds + local_id; \n"\
+"            __local int* lm_stack = lm_stack_base; \n"\
+" \n"\
+"            // Precompute inverse direction and origin / dir for bbox testing \n"\
+"            float3 const invdir = safe_invdir(r); \n"\
+"            float3 const oxinvdir = -r.o.xyz * invdir; \n"\
+"            // Intersection parametric distance \n"\
+"            float t_max = r.o.w; \n"\
+" \n"\
+"            // Current node address \n"\
+"            int addr = 0; \n"\
+"            // Current closest intersection leaf index \n"\
+"            int isect_idx = INVALID_IDX; \n"\
+" \n"\
+"            //  Initialize local stack \n"\
+"            *lm_stack = INVALID_IDX; \n"\
+"            lm_stack += WAVEFRONT_SIZE; \n"\
+" \n"\
+"            // Start from 0 node (root) \n"\
+"            while (addr != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch next node \n"\
+"                bvh_node const node = nodes[addr]; \n"\
+" \n"\
+"                // Check if it is a leaf \n"\
+"                if (LEAFNODE(node)) \n"\
+"                { \n"\
+"                    Face face = faces[STARTIDX(node)]; \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    if (ray_get_mask(&r) != face.shape_id) \n"\
+"                    { \n"\
+"#endif // RR_RAY_MASK \n"\
+"                        // Leafs directly store vertex indices \n"\
+"                        // so we load vertices directly \n"\
+"                        float3 const v1 = vertices[face.idx[0]]; \n"\
+"                        float3 const v2 = vertices[face.idx[1]]; \n"\
+"                        float3 const v3 = vertices[face.idx[2]]; \n"\
+"                        // Intersect triangle \n"\
+"                        float const f = fast_intersect_triangle(r, v1, v2, v3, t_max); \n"\
+"                        // If hit update closest hit distance and index \n"\
+"                        if (f < t_max) \n"\
+"                        { \n"\
+"                            t_max = f; \n"\
+"                            isect_idx = STARTIDX(node); \n"\
+"                        } \n"\
+"#ifdef RR_RAY_MASK \n"\
+"                    } \n"\
+"#endif // RR_RAY_MASK \n"\
+"                } \n"\
+"                else \n"\
+"                { \n"\
+"                    // It is internal node, so intersect vs both children bounds \n"\
+"                    float2 const s0 = fast_intersect_bbox1(bounds[node.child0], invdir, oxinvdir, t_max); \n"\
+"                    float2 const s1 = fast_intersect_bbox1(bounds[node.child1], invdir, oxinvdir, t_max); \n"\
+" \n"\
+"                    // Determine which one to traverse \n"\
+"                    bool const traverse_c0 = (s0.x <= s0.y); \n"\
+"                    bool const traverse_c1 = (s1.x <= s1.y); \n"\
+"                    bool const c1first = traverse_c1 && (s0.x > s1.x); \n"\
+" \n"\
+"                    if (traverse_c0 || traverse_c1) \n"\
+"                    { \n"\
+"                        int deferred = -1; \n"\
+" \n"\
+"                        // Determine which one to traverse first \n"\
+"                        if (c1first || !traverse_c0) \n"\
+"                        { \n"\
+"                            // Right one is closer or left one not traversed \n"\
+"                            addr = node.child1; \n"\
+"                            deferred = node.child0; \n"\
+"                        } \n"\
+"                        else \n"\
+"                        { \n"\
+"                            // Traverse left node otherwise \n"\
+"                            addr = node.child0; \n"\
+"                            deferred = node.child1; \n"\
+"                        } \n"\
+" \n"\
+"                        // If we traverse both children we need to postpone the node \n"\
+"                        if (traverse_c0 && traverse_c1) \n"\
+"                        { \n"\
+"                            // If short stack is full, we offload it into global memory \n"\
+"                            if ( lm_stack - lm_stack_base >= SHORT_STACK_SIZE * WAVEFRONT_SIZE) \n"\
+"                            { \n"\
+"                                for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                                { \n"\
+"                                    gm_stack[i] = lm_stack_base[i * WAVEFRONT_SIZE]; \n"\
+"                                } \n"\
+" \n"\
+"                                gm_stack += SHORT_STACK_SIZE; \n"\
+"                                lm_stack = lm_stack_base + WAVEFRONT_SIZE; \n"\
+"                            } \n"\
+" \n"\
+"                            *lm_stack = deferred; \n"\
+"                            lm_stack += WAVEFRONT_SIZE; \n"\
+"                        } \n"\
+" \n"\
+"                        // Continue traversal \n"\
+"                        continue; \n"\
+"                    } \n"\
+"                } \n"\
+" \n"\
+"                // Try popping from local stack \n"\
+"                lm_stack -= WAVEFRONT_SIZE; \n"\
+"                addr = *(lm_stack); \n"\
+" \n"\
+"                // If we popped INVALID_IDX then check global stack \n"\
+"                if (addr == INVALID_IDX && gm_stack > gm_stack_base) \n"\
+"                { \n"\
+"                    // Adjust stack pointer \n"\
+"                    gm_stack -= SHORT_STACK_SIZE; \n"\
+"                    // Copy data from global memory to LDS \n"\
+"                    for (int i = 1; i < SHORT_STACK_SIZE; ++i) \n"\
+"                    { \n"\
+"                        lm_stack_base[i * WAVEFRONT_SIZE] = gm_stack[i]; \n"\
+"                    } \n"\
+"                    // Point local stack pointer to the end \n"\
+"                    lm_stack = lm_stack_base + (SHORT_STACK_SIZE - 1) * WAVEFRONT_SIZE; \n"\
+"                    addr = lm_stack_base[WAVEFRONT_SIZE * (SHORT_STACK_SIZE - 1)]; \n"\
+"                } \n"\
+"            } \n"\
+" \n"\
+"            // Check if we have found an intersection \n"\
+"            if (isect_idx != INVALID_IDX) \n"\
+"            { \n"\
+"                // Fetch the node & vertices \n"\
+"                Face const face = faces[isect_idx]; \n"\
+"                float3 const v1 = vertices[face.idx[0]]; \n"\
+"                float3 const v2 = vertices[face.idx[1]]; \n"\
+"                float3 const v3 = vertices[face.idx[2]]; \n"\
+"                // Calculate hit position \n"\
+"                float3 const p = r.o.xyz + r.d.xyz * t_max; \n"\
+"                // Calculate barycentric coordinates \n"\
+"                float2 const uv = triangle_calculate_barycentrics(p, v1, v2, v3); \n"\
+"                // Update hit information \n"\
+"                hits[global_id].shape_id = face.shape_id; \n"\
+"                hits[global_id].prim_id = face.prim_id; \n"\
+"                hits[global_id].uvwt = make_float4(uv.x, uv.y, 0.f, t_max); \n"\
+"            } \n"\
+"            else \n"\
+"            { \n"\
+"                // Miss here \n"\
+"                hits[global_id].shape_id = MISS_MARKER; \n"\
+"                hits[global_id].prim_id = MISS_MARKER; \n"\
+"            } \n"\
+"        } \n"\
+"    } \n"\
+"} \n"\
+" \n"\
+" \n"\
+" \n"\
+;
diff --git a/RadeonRays/src/accelerator/bvh2.h b/RadeonRays/src/accelerator/bvh2.h
index a80b580a..8aaae4b1 100644
--- a/RadeonRays/src/accelerator/bvh2.h
+++ b/RadeonRays/src/accelerator/bvh2.h
@@ -115,7 +115,7 @@ namespace RadeonRays
 #endif // WIN32
         }
 
-#if _MSC_VER <= 1900 && defined(_WIN32) && !defined(_WIN64)
+#if _MSC_VER <= 1910 && defined(_WIN32) && !defined(_WIN64)
     #define MSVC_X86_ALIGNMENT_FIX &
 #else
     #define MSVC_X86_ALIGNMENT_FIX
diff --git a/RadeonRays/src/api/radeon_rays_impl.cpp b/RadeonRays/src/api/radeon_rays_impl.cpp
index 10b2e787..98ac16da 100644
--- a/RadeonRays/src/api/radeon_rays_impl.cpp
+++ b/RadeonRays/src/api/radeon_rays_impl.cpp
@@ -87,6 +87,11 @@ namespace RadeonRays
         return instance;
     }
 
+    void IntersectionApiImpl::AllocShapes(size_t const size)
+    {
+        world_.ReserveShapes(size);
+    }
+
     void IntersectionApiImpl::DeleteShape(Shape const* shape)
     {
         delete shape;
@@ -97,6 +102,11 @@ namespace RadeonRays
         world_.AttachShape(shape);
     }
 
+    void IntersectionApiImpl::AttachShapeUnchecked(Shape const* shape)
+    {
+        world_.AttachShapeUnchecked(shape);
+    }
+
     void IntersectionApiImpl::DetachShape(Shape const* shape)
     {
         world_.DetachShape(shape);
diff --git a/RadeonRays/src/api/radeon_rays_impl.h b/RadeonRays/src/api/radeon_rays_impl.h
index bde5fa04..6f965d7e 100644
--- a/RadeonRays/src/api/radeon_rays_impl.h
+++ b/RadeonRays/src/api/radeon_rays_impl.h
@@ -65,10 +65,14 @@ namespace RadeonRays
         // Create an instance of a shape with its own transform (set via Shape interface).
         // The call is blocking, so the returned value is ready upon return.
         Shape* CreateInstance(Shape const* shape) const override;
+        // *EDIT* Preallocate shape container
+        void AllocShapes(size_t const size) override;
         // Delete the shape (to simplify DLL boundary crossing
         void DeleteShape(Shape const* shape) override;
         // Attach shape to participate in intersection process
         void AttachShape(Shape const* shape) override;
+        // *EDIT* Attach shape without error checking
+        void AttachShapeUnchecked(Shape const* shape) override;
         // Detach shape, i.e. it is not going to be considered part of the scene anymore
         void DetachShape(Shape const* shape) override;
         // Detach all objects
diff --git a/RadeonRays/src/device/calc_intersection_device_cl.h b/RadeonRays/src/device/calc_intersection_device_cl.h
index 103d1b64..03f3f238 100644
--- a/RadeonRays/src/device/calc_intersection_device_cl.h
+++ b/RadeonRays/src/device/calc_intersection_device_cl.h
@@ -28,7 +28,7 @@ THE SOFTWARE.
 #include "device_cl.h"
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/RadeonRays/src/intersector/intersector_2level.cpp b/RadeonRays/src/intersector/intersector_2level.cpp
index 1745b8dd..305484fb 100644
--- a/RadeonRays/src/intersector/intersector_2level.cpp
+++ b/RadeonRays/src/intersector/intersector_2level.cpp
@@ -210,9 +210,11 @@ namespace RadeonRays
             // Copy the shapes here to be able to partition them and handle more efficiently
             // #22: we need to be able to handle instances whos base shapes are not present 
             // in the scene, so we have to add them manually here.
-            std::vector<Shape const*> shapes;
-            std::set<Shape const*> shapes_disabled;
+			// *EDIT* Base shapes are always attached to speed up building
+            std::vector<Shape const*> shapes = world.shapes_;
+            //std::set<Shape const*> shapes_disabled;
 
+			/*
             for (auto s : world.shapes_)
             {
                 auto shapeimpl = static_cast<ShapeImpl const*>(s);
@@ -234,6 +236,7 @@ namespace RadeonRays
 
                 shapes.push_back(s);
             }
+			*/
 
             // Now partition the range into meshes and instances
             auto firstinst = std::partition(shapes.begin(), shapes.end(), [&](Shape const* shape)
@@ -455,6 +458,8 @@ namespace RadeonRays
                 // For disabled shapes force mask to zero since these shapes 
                 // present only virtually (they have not been added to the scene)
                 // and we need to skip them while doing traversal.
+				// *EDIT* There are no disabled shapes
+				/*
                 if (shapes_disabled.find(shapeimpl) == shapes_disabled.cend())
                 {
                     m_cpudata->shapedata[i].shapeDisabled = 0;
@@ -463,6 +468,8 @@ namespace RadeonRays
                 {
                     m_cpudata->shapedata[i].shapeDisabled = 1;
                 }
+				*/
+				m_cpudata->shapedata[i].shapeDisabled = 0;
 
                 shapeimpl->GetTransform(m, m_cpudata->shapedata[i].minv);
 
@@ -500,9 +507,11 @@ namespace RadeonRays
             // Copy the shapes here to be able to partition them and handle more efficiently
             // #22: we need to be able to handle instances whos base shapes are not present 
             // in the scene, so we have to add them manually here.
-            std::vector<Shape const*> shapes;
-            std::set<Shape const*> shapes_disabled;
+			// *EDIT* Base shapes are always attached to speed up building
+            std::vector<Shape const*> shapes = world.shapes_;
+            //std::set<Shape const*> shapes_disabled;
 
+			/*
             for (auto s : world.shapes_)
             {
                 auto shapeimpl = static_cast<ShapeImpl const*>(s);
@@ -524,6 +533,7 @@ namespace RadeonRays
 
                 shapes.push_back(s);
             }
+			*/
 
             // Now partition the range into meshes and instances
             auto firstinst = std::partition(shapes.begin(), shapes.end(), [&](Shape const* shape)
@@ -618,6 +628,8 @@ namespace RadeonRays
                 // For disabled shapes force mask to zero since these shapes 
                 // present only virtually (they have not been added to the scene)
                 // and we need to skip them while doing traversal.
+				// *EDIT* There are no disabled shapes
+				/*
                 if (shapes_disabled.find(shapeimpl) == shapes_disabled.cend())
                 {
                     m_cpudata->shapedata[i].shapeDisabled = 0;
@@ -626,6 +638,8 @@ namespace RadeonRays
                 {
                     m_cpudata->shapedata[i].shapeDisabled = 1;
                 }
+				*/
+				m_cpudata->shapedata[i].shapeDisabled = 0;
 
                 shapeimpl->GetTransform(m, m_cpudata->shapedata[i].minv);
 
diff --git a/RadeonRays/src/world/world.cpp b/RadeonRays/src/world/world.cpp
index 004f3811..a29be3e2 100644
--- a/RadeonRays/src/world/world.cpp
+++ b/RadeonRays/src/world/world.cpp
@@ -25,6 +25,11 @@ THE SOFTWARE.
 
 namespace RadeonRays
 {
+    void World::ReserveShapes(size_t const size)
+    {
+        shapes_.reserve(size);
+    }
+
     void World::AttachShape(Shape const* shape)
     {
         if (std::find(shapes_.cbegin(), shapes_.cend(), shape) == shapes_.cend())
@@ -34,6 +39,12 @@ namespace RadeonRays
         }
     }
 
+    void World::AttachShapeUnchecked(Shape const* shape)
+    {
+        shapes_.push_back(shape);
+        has_changed_ = true;
+    }
+
     void World::DetachShape(Shape const* shape)
     {
         auto iter = std::find(shapes_.begin(), shapes_.end(), shape);
diff --git a/RadeonRays/src/world/world.h b/RadeonRays/src/world/world.h
index 716cb692..b010b48e 100644
--- a/RadeonRays/src/world/world.h
+++ b/RadeonRays/src/world/world.h
@@ -42,8 +42,12 @@ namespace RadeonRays
         World() = default;
         //
         virtual ~World() = default;
+        // *EDIT* Preallocate the shapes container
+        void ReserveShapes(size_t const size);
         // Attach the shape updating all the flags
         void AttachShape(Shape const* shape);
+        // *EDIT* Attach the shape updating all the flags, without checking if shape already exists.
+        void AttachShapeUnchecked(Shape const* shape);
         // Detach the shape 
         void DetachShape(Shape const* shape);
         // Detach all
diff --git a/Resources/CornellBox/orig.mtl b/Resources/CornellBox/orig.mtl
deleted file mode 100644
index 3dbac64e..00000000
--- a/Resources/CornellBox/orig.mtl
+++ /dev/null
@@ -1,75 +0,0 @@
-# Blender MTL File: 'None'
-# Material Count: 8
-
-newmtl backWall
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.580000 0.568000 0.544000
-Ks 0.000000 0.000000 0.000000
-Ni 1.000000
-d 1.000000
-illum 2
-
-newmtl ceiling
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.580000 0.568000 0.544000
-Ks 0.000000 0.000000 0.000000
-Ni 1.000000
-d 1.000000
-illum 2
-
-newmtl floor
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.580000 0.568000 0.544000
-Ks 0.000000 0.000000 0.000000
-Ni 1.000000
-d 1.000000
-illum 2
-
-newmtl leftWall
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.504000 0.052000 0.040000
-Ks 0.000000 0.000000 0.000000
-Ni 1.000000
-d 1.000000
-illum 2
-
-newmtl light
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.624000 0.624000 0.624000
-Ks 0.000000 0.000000 0.000000
-Ke 36 33 24
-Ni 1.000000
-d 1.000000
-illum 2
-
-newmtl rightWall
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.112000 0.360000 0.072800
-Ks 0.000000 0.000000 0.000000
-Ni 1.000000
-d 1.000000
-illum 2
-
-newmtl shortBox
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.580000 0.568000 0.544000
-Ks 0.000000 0.000000 0.000000
-Ni 1.000000
-d 1.000000
-illum 2
-
-newmtl tallBox
-Ns 7.843137
-Ka 0.000000 0.000000 0.000000
-Kd 0.580000 0.568000 0.544000
-Ks 0.000000 0.000000 0.000000
-Ni 1.000000
-d 1.000000
-illum 2
diff --git a/Resources/CornellBox/orig.objm b/Resources/CornellBox/orig.objm
deleted file mode 100644
index e0cf6176..00000000
--- a/Resources/CornellBox/orig.objm
+++ /dev/null
@@ -1,132 +0,0 @@
-# Blender v2.69 (sub 0) OBJ File: ''
-# www.blender.org
-mtllib orig.mtl
-
-o light
-v -0.240000 1.980000 0.160000
-v -0.240000 1.980000 -0.220000
-v 0.230000 1.980000 -0.220000
-v 0.230000 1.980000 0.160000
-vn 0.000000 -1.000000 0.000000
-usemtl light
-s off
-f 1//1 2//1 3//1 4//1
-
-o shortBox
-v -1.010000 -0.000000 0.990000
-v -0.990000 0.000000 -1.040000
-v -1.020000 1.990000 -1.040000
-v -1.020000 1.990000 0.990000
-v 0.530000 0.600000 0.750000
-v 0.700000 0.600000 0.170000
-v 0.130000 0.600000 0.000000
-v -0.050000 0.600000 0.570000
-v -0.050000 -0.000000 0.570000
-v -0.050000 0.600000 0.570000
-v 0.130000 0.600000 0.000000
-v 0.130000 0.000000 0.000000
-v 0.530000 -0.000000 0.750000
-v 0.530000 0.600000 0.750000
-v -0.050000 0.600000 0.570000
-v -0.050000 -0.000000 0.570000
-v 0.700000 -0.000000 0.170000
-v 0.700000 0.600000 0.170000
-v 0.530000 0.600000 0.750000
-v 0.530000 -0.000000 0.750000
-v 0.130000 0.000000 0.000000
-v 0.130000 0.600000 0.000000
-v 0.700000 0.600000 0.170000
-v 0.700000 -0.000000 0.170000
-vn -0.000000 1.000000 0.000000
-vn -0.953583 0.000000 -0.301131
-vn -0.296399 -0.000000 0.955064
-vn 0.285805 0.000000 -0.958288
-vn 0.959629 -0.000000 0.281270
-vn 0.999937 0.010050 0.004926
-
-usemtl shortBox
-s off
-f 9//2 10//2 11//2 12//2
-f 13//3 14//3 15//3 16//3
-f 17//4 18//4 19//4 20//4
-f 25//5 26//5 27//5 28//5
-f 21//6 22//6 23//6 24//6
-
-o leftWall
-usemtl leftWall
-f 5//7 6//7 7//7 8//7
-
-o backWall
-v -0.990000 0.000000 -1.040000
-v 1.000000 0.000000 -1.040000
-v 1.000000 1.990000 -1.040000
-v -1.020000 1.990000 -1.040000
-vn 0.000000 -0.000000 1.000000
-usemtl backWall
-s off
-f 29//8 30//8 31//8 32//8
-
-o rightWall
-v 1.000000 0.000000 -1.040000
-v 1.000000 -0.000000 0.990000
-v 1.000000 1.990000 0.990000
-v 1.000000 1.990000 -1.040000
-vn -1.000000 0.000000 0.000000
-usemtl rightWall
-s off
-f 33//9 34//9 35//9 36//9
-
-o ceiling
-v -1.020000 1.990000 0.990000
-v -1.020000 1.990000 -1.040000
-v 1.000000 1.990000 -1.040000
-v 1.000000 1.990000 0.990000
-vn 0.000000 -1.000000 -0.000000
-usemtl ceiling
-s off
-f 37//10 38//10 39//10 40//10
-
-o tallBox
-v -0.530000 1.200000 0.090000
-v 0.040000 1.200000 -0.090000
-v -0.140000 1.200000 -0.670000
-v -0.710000 1.200000 -0.490000
-v -0.530000 -0.000000 0.090000
-v -0.530000 1.200000 0.090000
-v -0.710000 1.200000 -0.490000
-v -0.710000 0.000000 -0.490000
-v -0.710000 0.000000 -0.490000
-v -0.710000 1.200000 -0.490000
-v -0.140000 1.200000 -0.670000
-v -0.140000 0.000000 -0.670000
-v -0.140000 0.000000 -0.670000
-v -0.140000 1.200000 -0.670000
-v 0.040000 1.200000 -0.090000
-v 0.040000 0.000000 -0.090000
-v 0.040000 0.000000 -0.090000
-v 0.040000 1.200000 -0.090000
-v -0.530000 1.200000 0.090000
-v -0.530000 -0.000000 0.090000
-vn 0.000000 1.000000 0.000000
-vn -0.955064 0.000000 0.296399
-vn -0.301131 0.000000 -0.953583
-vn 0.955064 0.000000 -0.296399
-vn 0.301131 -0.000000 0.953583
-usemtl tallBox
-s off
-f 41//11 42//11 43//11 44//11
-f 45//12 46//12 47//12 48//12
-f 49//13 50//13 51//13 52//13
-f 53//14 54//14 55//14 56//14
-f 57//15 58//15 59//15 60//15
-
-
-o floor
-v -1.010000 -0.000000 0.990000
-v 1.000000 -0.000000 0.990000
-v 1.000000 0.000000 -1.040000
-v -0.990000 0.000000 -1.040000
-vn 0.000000 1.000000 0.000000
-usemtl floor
-s off
-f 61//16 62//16 63//16 64//16
diff --git a/Tutorials/CMakeLists.txt b/Tutorials/CMakeLists.txt
deleted file mode 100644
index 9b227f83..00000000
--- a/Tutorials/CMakeLists.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-#add_subdirectory(Tools)
-#add_subdirectory(CornellBox)
-#add_subdirectory(CornellBoxShadow)
-#add_subdirectory(Triangle)
-#add_subdirectory(TriangleLight)
diff --git a/Tutorials/CornellBox/CMakeLists.txt b/Tutorials/CornellBox/CMakeLists.txt
deleted file mode 100644
index a6e2ec7d..00000000
--- a/Tutorials/CornellBox/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-project (TutorialCornellBox CXX)
-
-set(SOURCES main.cpp
-    $<TARGET_OBJECTS:TutorialTools>)
-    
-#message(FATAL_ERROR $<TARGET_OBJECTS:TutorialTools>)
-
-add_executable(TutorialCornellBox ${SOURCES})
-target_compile_features(TutorialCornellBox PRIVATE cxx_std_11)
-target_link_libraries(TutorialCornellBox PRIVATE RadeonRays)
-if (APPLE OR UNIX)
-    target_link_libraries(TutorialCornellBox PRIVATE OpenGL::GL GLUT::GLUT GLEW::GLEW ${OIIO_LIBS})
-elseif (WIN32)
-    target_include_directories(TutorialCornellBox 
-        PRIVATE ${GLUT_INCLUDES} 
-        PRIVATE ${GLEW_INCLUDES} 
-        PRIVATE ${OIIO_INCLUDES})
-    target_link_libraries(TutorialCornellBox PRIVATE OpenGL::GL ${GLUT_LIBS} ${GLEW_LIBS} ${OIIO_LIBS})
-endif (APPLE OR UNIX)
diff --git a/Tutorials/CornellBox/main.cpp b/Tutorials/CornellBox/main.cpp
deleted file mode 100644
index dec88f6d..00000000
--- a/Tutorials/CornellBox/main.cpp
+++ /dev/null
@@ -1,308 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-#include "radeon_rays.h"
-#include <GL/glew.h>
-#include <GL/glut.h>
-#include <cassert>
-#include <iostream>
-#include <memory>
-#include "../Tools/shader_manager.h"
-#include "../Tools/tiny_obj_loader.h"
-
-using namespace RadeonRays;
-using namespace tinyobj;
-
-namespace {
-    std::vector<shape_t> g_objshapes;
-    std::vector<material_t> g_objmaterials;
-    
-    GLuint g_vertex_buffer, g_index_buffer;
-    GLuint g_texture;
-    int g_window_width = 640;
-    int g_window_height = 480;
-    std::unique_ptr<ShaderManager> g_shader_manager;
-}
-
-void InitData()
-{
-    std::string basepath = "../../Resources/CornellBox/"; 
-    std::string filename = basepath + "orig.objm";
-    std::string res = LoadObj(g_objshapes, g_objmaterials, filename.c_str(), basepath.c_str());
-    if (res != "")
-    {
-        throw std::runtime_error(res);
-    }
-}
-
-float3 ConvertFromBarycentric(const float* vec, const int* ind, int prim_id, const float4& uvwt)
-{
-    float3 a = { vec[ind[prim_id * 3] * 3],
-                vec[ind[prim_id * 3] * 3 + 1],
-                vec[ind[prim_id * 3] * 3 + 2], };
-
-    float3 b = { vec[ind[prim_id * 3 + 1] * 3],
-                vec[ind[prim_id * 3 + 1] * 3 + 1],
-                vec[ind[prim_id * 3 + 1] * 3 + 2], };
-
-    float3 c = { vec[ind[prim_id * 3 + 2] * 3],
-                vec[ind[prim_id * 3 + 2] * 3 + 1],
-                vec[ind[prim_id * 3 + 2] * 3 + 2], };
-    return a * (1 - uvwt.x - uvwt.y) + b * uvwt.x + c * uvwt.y;
-}
-
-void InitGl()
-{
-    g_shader_manager.reset(new ShaderManager());
-
-    glClearColor(0.0, 0.0, 0.0, 0.0);
-    glCullFace(GL_NONE);
-    glDisable(GL_DEPTH_TEST);
-    glEnable(GL_TEXTURE_2D);
-
-    glGenBuffers(1, &g_vertex_buffer);
-    glGenBuffers(1, &g_index_buffer);
-
-    // create Vertex buffer
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-
-    float quad_vdata[] =
-    {
-        -1, -1, 0.5, 0, 0,
-        1, -1, 0.5, 1, 0,
-        1, 1, 0.5, 1, 1,
-        -1, 1, 0.5, 0, 1
-    };
-
-    GLshort quad_idata[] =
-    {
-        0, 1, 3,
-        3, 1, 2
-    };
-
-    // fill data
-    glBufferData(GL_ARRAY_BUFFER, sizeof(quad_vdata), quad_vdata, GL_STATIC_DRAW);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(quad_idata), quad_idata, GL_STATIC_DRAW);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-
-    // texture
-    glGenTextures(1, &g_texture);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, g_window_width, g_window_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-}
-
-void DrawScene()
-{
-    glDisable(GL_DEPTH_TEST);
-    glViewport(0, 0, g_window_width, g_window_height);
-
-    glClear(GL_COLOR_BUFFER_BIT);
-
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-
-    // shader data
-    GLuint program = g_shader_manager->GetProgram("simple");
-    glUseProgram(program);
-    GLuint texloc = glGetUniformLocation(program, "g_Texture");
-    assert(texloc >= 0);
-
-    glUniform1i(texloc, 0);
-
-    glActiveTexture(GL_TEXTURE0);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-
-    GLuint position_attr = glGetAttribLocation(program, "inPosition");
-    GLuint texcoord_attr = glGetAttribLocation(program, "inTexcoord");
-    glVertexAttribPointer(position_attr, 3, GL_FLOAT, GL_FALSE, sizeof(float) * 5, 0);
-    glVertexAttribPointer(texcoord_attr, 2, GL_FLOAT, GL_FALSE, sizeof(float) * 5, (void*)(sizeof(float) * 3));
-    glEnableVertexAttribArray(position_attr);
-    glEnableVertexAttribArray(texcoord_attr);
-
-    // draw rectanle
-    glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, nullptr);
-
-    glDisableVertexAttribArray(texcoord_attr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-    glUseProgram(0);
-
-    glFinish();
-    glutSwapBuffers();
-}
-
-int main(int argc, char* argv[])
-{
-    // GLUT Window Initialization:
-    glutInit(&argc, (char**)argv);
-    glutInitWindowSize(640, 480);
-    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);
-    glutCreateWindow("Triangle");
-#ifndef __APPLE__
-    GLenum err = glewInit();
-    if (err != GLEW_OK)
-    {
-        std::cout << "GLEW initialization failed\n";
-        return -1;
-    }
-#endif
-    // Prepare rectangle for drawing texture
-    // rendered using intersection results
-    InitGl();
-
-    // Load CornellBox model
-    InitData();
-
-    // Choose device
-    int nativeidx = -1;
-    // Always use OpenCL
-    IntersectionApi::SetPlatform(DeviceInfo::kOpenCL);
-    for (auto idx = 0U; idx < IntersectionApi::GetDeviceCount(); ++idx)
-    {
-        DeviceInfo devinfo;
-        IntersectionApi::GetDeviceInfo(idx, devinfo);
-
-        if (devinfo.type == DeviceInfo::kGpu && nativeidx == -1)
-        {
-            nativeidx = idx;
-        }
-    }
-    assert(nativeidx != -1);
-    IntersectionApi* api = IntersectionApi::Create(nativeidx);
-    
-    // Adding meshes to tracing scene
-    for (int id = 0; id < g_objshapes.size(); ++id)
-    {
-        shape_t& objshape = g_objshapes[id];
-        float* vertdata = objshape.mesh.positions.data();
-        int nvert = objshape.mesh.positions.size();
-        int* indices = objshape.mesh.indices.data();
-        int nfaces = objshape.mesh.indices.size() / 3;
-        Shape* shape = api->CreateMesh(vertdata, nvert, 3 * sizeof(float), indices, 0, nullptr, nfaces);
-
-        assert(shape != nullptr);
-        api->AttachShape(shape);
-        shape->SetId(id);
-    }
-    // Commit scene changes
-    api->Commit();
-
-    const int k_raypack_size = g_window_height * g_window_width;
-    
-    // Prepare rays. One for each texture pixel.
-    std::vector<ray> rays(k_raypack_size);
-    float4 camera_pos = { 0.f, 1.f, 3.f, 1000.f };
-    for (int i = 0; i < g_window_height; ++i)
-        for (int j = 0; j < g_window_width; ++j)
-        {
-            const float xstep = 2.f / (float)g_window_width;
-            const float ystep = 2.f / (float)g_window_height;
-            float x = -1.f + xstep * (float)j;
-            float y = ystep * (float)i;
-            float z = 1.f;
-            // Perspective view
-            rays[i * g_window_width + j].o = camera_pos;
-            rays[i * g_window_width + j].d = float3(x - camera_pos.x, y - camera_pos.y, z - camera_pos.z);
-        }
-    Buffer* ray_buffer = api->CreateBuffer(rays.size() * sizeof(ray), rays.data());
-
-    // Intersection data
-    std::vector<Intersection> isect(k_raypack_size);
-    Buffer* isect_buffer = api->CreateBuffer(isect.size() * sizeof(Intersection), nullptr);
-    
-    // Intersection
-    api->QueryIntersection(ray_buffer, k_raypack_size, isect_buffer, nullptr, nullptr);
-
-    // Get results
-    Event* e = nullptr;
-    Intersection* tmp = nullptr;
-    api->MapBuffer(isect_buffer, kMapRead, 0, isect.size() * sizeof(Intersection), (void**)&tmp, &e);
-    // RadeonRays calls are asynchronous, so need to wait for calculation to complete.
-    e->Wait();
-    api->DeleteEvent(e);
-    e = nullptr;
-    
-    // Copy results
-    for (int i = 0; i < k_raypack_size; ++i)
-    {
-        isect[i] = tmp[i];
-    }
-
-    // Point light position
-    float3 light = { -0.01f, 1.9f, 0.1f };
-
-    // Draw
-    std::vector<unsigned char> tex_data(k_raypack_size * 4);
-    for (int i = 0; i < k_raypack_size ; ++i)
-    {
-        int shape_id = isect[i].shapeid;
-        int prim_id = isect[i].primid;
-
-        if (shape_id != kNullId && prim_id != kNullId)
-        {
-            mesh_t& mesh = g_objshapes[shape_id].mesh;
-            int mat_id = mesh.material_ids[prim_id];
-            material_t& mat = g_objmaterials[mat_id];
-
-            float3 diff_col = { mat.diffuse[0],
-                                mat.diffuse[1],
-                                mat.diffuse[2] };
-
-            // Calculate position and normal of the intersection point
-            float3 pos = ConvertFromBarycentric(mesh.positions.data(), mesh.indices.data(), prim_id, isect[i].uvwt);
-            float3 norm = ConvertFromBarycentric(mesh.normals.data(), mesh.indices.data(), prim_id, isect[i].uvwt);
-            norm.normalize();
-            
-            // Calculate lighting
-            float3 col = { 0.f, 0.f, 0.f };
-            float3 light_dir = light - pos;
-            light_dir.normalize();
-            float dot_prod = dot(norm, light_dir);
-            if (dot_prod > 0)
-                col += dot_prod * diff_col;
-
-            tex_data[i * 4] = col[0] * 255;
-            tex_data[i * 4 + 1] = col[1] * 255;
-            tex_data[i * 4 + 2] = col[2] * 255;
-            tex_data[i * 4 + 3] = 255;
-        }
-    }
-
-    // Update texture data
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, g_window_width, g_window_height, GL_RGBA, GL_UNSIGNED_BYTE, tex_data.data());
-    glBindTexture(GL_TEXTURE_2D, NULL);
-
-    // Start the main loop
-    glutDisplayFunc(DrawScene);
-    glutMainLoop(); 
-
-    // Cleanup
-    IntersectionApi::Delete(api);
-
-    return 0;
-}
diff --git a/Tutorials/CornellBox/simple.fsh b/Tutorials/CornellBox/simple.fsh
deleted file mode 100644
index e569481f..00000000
--- a/Tutorials/CornellBox/simple.fsh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-uniform sampler2D g_Texture;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    //gl_FragColor = vec4(1f, 0f, 0f, 1f);
-    gl_FragColor = texture2D(g_Texture, Texcoord);
-}
\ No newline at end of file
diff --git a/Tutorials/CornellBox/simple.vsh b/Tutorials/CornellBox/simple.vsh
deleted file mode 100644
index 002269d4..00000000
--- a/Tutorials/CornellBox/simple.vsh
+++ /dev/null
@@ -1,33 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-attribute vec3 inPosition;
-attribute vec2 inTexcoord;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    Texcoord = inTexcoord;
-    gl_Position = vec4(inPosition, 1.0);
-}
-
diff --git a/Tutorials/CornellBoxShadow/kernel.cl b/Tutorials/CornellBoxShadow/kernel.cl
deleted file mode 100644
index 7e7a09f6..00000000
--- a/Tutorials/CornellBoxShadow/kernel.cl
+++ /dev/null
@@ -1,219 +0,0 @@
-/**********************************************************************
- Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
- 
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- 
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- 
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- THE SOFTWARE.
- ********************************************************************/
-#ifndef KERNEL_CL
-#define KERNEL_CL
-
-#define EPSILON 0.001f
-
-typedef struct _Ray
-{
-    /// xyz - origin, w - max range
-    float4 o;
-    /// xyz - direction, w - time
-    float4 d;
-    /// x - ray mask, y - activity flag
-    int2 extra;
-    /// Padding
-    float2 padding;
-} Ray;
-
-typedef struct _Camera
-    {
-        // Camera coordinate frame
-        float3 forward;
-        float3 up;
-        float3 p;
-        
-        // Near and far Z
-        float2 zcap;
-    } Camera;
-
-typedef struct _Intersection
-{
-    // id of a shape
-    int shapeid;
-    // Primitive index
-    int primid;
-    // Padding elements
-    int padding0;
-    int padding1;
-        
-    // uv - hit barycentrics, w - ray distance
-    float4 uvwt;
-} Intersection;
-
-float4 ConvertFromBarycentric(__global const float* vec, 
-                            __global const int* ind, 
-                            int prim_id, 
-                            __global const float4* uvwt)
-{
-    float4 a = (float4)(vec[ind[prim_id * 3] * 3],
-                        vec[ind[prim_id * 3] * 3 + 1],
-                        vec[ind[prim_id * 3] * 3 + 2], 0.f);
-
-    float4 b = (float4)(vec[ind[prim_id * 3 + 1] * 3],
-                        vec[ind[prim_id * 3 + 1] * 3 + 1],
-                        vec[ind[prim_id * 3 + 1] * 3 + 2], 0.f);
-
-    float4 c = (float4)(vec[ind[prim_id * 3 + 2] * 3],
-                        vec[ind[prim_id * 3 + 2] * 3 + 1],
-                        vec[ind[prim_id * 3 + 2] * 3 + 2], 0.f);
-    return a * (1 - uvwt->x - uvwt->y) + b * uvwt->x + c * uvwt->y;
-}
-
-__kernel void GeneratePerspectiveRays(__global Ray* rays,
-                                    __global const Camera* cam,
-                                    int width,
-                                    int height)
-{
-    int2 globalid;
-    globalid.x  = get_global_id(0);
-    globalid.y  = get_global_id(1);
-
-    // Check borders
-    if (globalid.x < width && globalid.y < height)
-    {
-        const float xstep = 2.f / (float)width;
-        const float ystep = 2.f / (float)height;
-        float x = -1.f + xstep * (float)globalid.x;
-        float y = ystep * (float)globalid.y;
-        float z = cam->zcap.x;
-        // Perspective view
-        int k = globalid.y * width + globalid.x;
-        rays[k].o.xyz = cam->p;
-        rays[k].d.x = x - cam->p.x;
-        rays[k].d.y = y - cam->p.y;
-        rays[k].d.z = z - cam->p.z;
-        rays[k].o.w = cam->zcap.y;
-
-        rays[k].extra.x = 0xFFFFFFFF;
-        rays[k].extra.y = 0xFFFFFFFF;
-    }
-}
-
-__kernel void GenerateShadowRays(__global Ray* rays,
-                            //scene
-                            __global float* positions,
-                            __global float* normals,
-                            __global int* ids,
-                            __global float* colors,
-                            __global int* indents,
-                            //intersection
-                            __global Intersection* isect,
-                            //light pos
-                            float4 light,
-                            //window size
-                            int width,
-                            int height)
-{
-    int2 globalid;
-    globalid.x  = get_global_id(0);
-    globalid.y  = get_global_id(1);
-
-    // Check borders
-    if (globalid.x < width && globalid.y < height)
-    {
-        int k = globalid.y * width + globalid.x;
-        int shape_id = isect[k].shapeid;
-        int prim_id = isect[k].primid;
-
-        // Need shadow rays only for intersections
-        if (shape_id == -1 || prim_id == -1)
-        {
-           return;
-        }
-        
-        // Calculate position and normal of the intersection point
-        int ind = indents[shape_id];
-        float4 pos = ConvertFromBarycentric(positions + ind*3, ids + ind, prim_id, &isect[k].uvwt);
-        float4 norm = ConvertFromBarycentric(normals + ind*3, ids + ind, prim_id, &isect[k].uvwt);
-        norm = normalize(norm);
-
-        float4 dir = light - pos;
-        rays[k].d = normalize(dir);
-        rays[k].o = pos + norm * EPSILON;
-        rays[k].o.w = length(dir);
-
-        rays[k].extra.x = 0xFFFFFFFF;
-        rays[k].extra.y = 0xFFFFFFFF;
-   }
-}
-
-
-__kernel void Shading(//scene
-                __global float* positions,
-                __global float* normals,
-                __global int* ids,
-                __global float* colors,
-                __global int* indents,
-                //intersection
-                __global Intersection* isect,
-                __global const int* occl,
-                //light pos
-                float4 light,
-                int width,
-                int height,
-                __global unsigned char* out)
-{
-    int2 globalid;
-    globalid.x  = get_global_id(0);
-    globalid.y  = get_global_id(1);
-
-    // Check borders
-    if (globalid.x < width && globalid.y < height)
-    {
-        int k = globalid.y * width + globalid.x;
-        int shape_id = isect[k].shapeid;
-        int prim_id = isect[k].primid;
-
-        if (shape_id != -1 && prim_id != -1 && occl[k] == -1)
-        {
-            // Calculate position and normal of the intersection point
-            int ind = indents[shape_id];
-
-            float4 pos = ConvertFromBarycentric(positions + ind*3, ids + ind, prim_id, &isect[k].uvwt);
-            float4 norm = ConvertFromBarycentric(normals + ind*3, ids + ind, prim_id, &isect[k].uvwt);
-            norm = normalize(norm);
-
-            //triangle diffuse color
-            int color_id = ind + prim_id*3;
-            float4 diff_col = (float4)( colors[color_id],
-                                        colors[color_id + 1],
-                                        colors[color_id + 2], 1.f);
-
-            // Calculate lighting
-            float4 col = (float4)( 0.f, 0.f, 0.f, 0.f );
-            float4 light_dir = normalize(light - pos);
-            float dot_prod = dot(norm, light_dir);
-            if (dot_prod > 0)
-                col += dot_prod * diff_col;
-
-            out[k * 4] = col.x * 255;
-            out[k * 4 + 1] = col.y * 255;
-            out[k * 4 + 2] = col.z * 255;
-            out[k * 4 + 3] = 255;
-        }
-    }
-}
-
-
-#endif //KERNEL_CL
diff --git a/Tutorials/CornellBoxShadow/main.cpp b/Tutorials/CornellBoxShadow/main.cpp
deleted file mode 100644
index deaec9e4..00000000
--- a/Tutorials/CornellBoxShadow/main.cpp
+++ /dev/null
@@ -1,424 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-#include "radeon_rays.h"
-#include "radeon_rays_cl.h"
-#include "CLW.h"
-
-#include <GL/glew.h>
-#include <GLUT/GLUT.h>
-#include <cassert>
-#include <iostream>
-#include <memory>
-#include "../Tools/shader_manager.h"
-#include "../Tools/tiny_obj_loader.h"
-
-using namespace RadeonRays;
-using namespace tinyobj;
-
-namespace {
-    std::vector<shape_t> g_objshapes;
-    std::vector<material_t> g_objmaterials;
-    GLuint g_vertex_buffer, g_index_buffer;
-    GLuint g_texture;
-    int g_window_width = 640;
-    int g_window_height = 480;
-    std::unique_ptr<ShaderManager> g_shader_manager;
-    
-    IntersectionApi* g_api;
-
-    //CL data
-    CLWContext g_context;
-    CLWProgram g_program;
-    CLWBuffer<float> g_positions;
-    CLWBuffer<float> g_normals;
-    CLWBuffer<int> g_indices;
-    CLWBuffer<float> g_colors;
-    CLWBuffer<int> g_indent;
-
-
-    struct Camera
-    {
-        // Camera coordinate frame
-        float3 forward;
-        float3 up;
-        float3 p;
-
-        // Near and far Z
-        float2 zcap;
-    };
-}
-
-void InitData()
-{
-    //Load
-    std::string basepath = "../../Resources/CornellBox/"; 
-    std::string filename = basepath + "orig.objm";
-    std::string res = LoadObj(g_objshapes, g_objmaterials, filename.c_str(), basepath.c_str());
-    if (res != "")
-    {
-        throw std::runtime_error(res);
-    }
-
-    // Load data to CL
-    std::vector<float> verts;
-    std::vector<float> normals;
-    std::vector<int> inds;
-    std::vector<float> colors;
-    std::vector<int> indents;
-    int indent = 0;
-
-    for (int id = 0; id < g_objshapes.size(); ++id)
-    {
-        const mesh_t& mesh = g_objshapes[id].mesh;
-        verts.insert(verts.end(), mesh.positions.begin(), mesh.positions.end());
-        normals.insert(normals.end(), mesh.normals.begin(), mesh.normals.end());
-        inds.insert(inds.end(), mesh.indices.begin(), mesh.indices.end());
-        for (int mat_id : mesh.material_ids)
-        {
-            const material_t& mat = g_objmaterials[mat_id];
-            colors.push_back(mat.diffuse[0]);
-            colors.push_back(mat.diffuse[1]);
-            colors.push_back(mat.diffuse[2]);
-        }
-        
-        // add additional emty data to simplify indentation in arrays
-        if (mesh.positions.size() / 3 < mesh.indices.size())
-        {
-            int count = mesh.indices.size() - mesh.positions.size() / 3;
-            for (int i = 0; i < count; ++i)
-            {
-                verts.push_back(0.f); normals.push_back(0.f);
-                verts.push_back(0.f); normals.push_back(0.f);
-                verts.push_back(0.f); normals.push_back(0.f);
-            }
-        }
-
-        indents.push_back(indent);
-        indent += mesh.indices.size();
-    }
-    g_positions = CLWBuffer<float>::Create(g_context, CL_MEM_READ_ONLY, verts.size(), verts.data());
-    g_normals = CLWBuffer<float>::Create(g_context, CL_MEM_READ_ONLY, normals.size(), normals.data());
-    g_indices = CLWBuffer<int>::Create(g_context, CL_MEM_READ_ONLY, inds.size(), inds.data());
-    g_colors = CLWBuffer<float>::Create(g_context, CL_MEM_READ_ONLY, colors.size(), colors.data());
-    g_indent = CLWBuffer<int>::Create(g_context, CL_MEM_READ_ONLY, indents.size(), indents.data());
-}
-
-Buffer* GeneratePrimaryRays()
-{
-    //prepare camera buf
-    Camera cam;
-    cam.forward = {0.f, 0.f, 1.f };
-    cam.up = { 0.f, 1.f, 0.f };
-    cam.p = { 0.f, 1.f, 3.f };
-    cam.zcap = { 1.f, 1000.f };
-    CLWBuffer<Camera> camera_buf = CLWBuffer<Camera>::Create(g_context, CL_MEM_READ_ONLY, 1, &cam);
-
-    //run kernel
-    CLWBuffer<ray> ray_buf = CLWBuffer<ray>::Create(g_context, CL_MEM_READ_WRITE, g_window_width*g_window_height);
-    CLWKernel kernel = g_program.GetKernel("GeneratePerspectiveRays");
-    kernel.SetArg(0, ray_buf);
-    kernel.SetArg(1, camera_buf);
-    kernel.SetArg(2, g_window_width);
-    kernel.SetArg(3, g_window_height);
-
-    // Run generation kernel
-    size_t gs[] = { static_cast<size_t>((g_window_width + 7) / 8 * 8), static_cast<size_t>((g_window_height + 7) / 8 * 8) };
-    size_t ls[] = { 8, 8 };
-    g_context.Launch2D(0, gs, ls, kernel);
-    g_context.Flush(0);
-
-    return CreateFromOpenClBuffer(g_api, ray_buf);
-}
-
-Buffer* GenerateShadowRays(CLWBuffer<Intersection> & isect, const float3& light)
-{
-    //prepare buffers
-    CLWBuffer<ray> ray_buf = CLWBuffer<ray>::Create(g_context, CL_MEM_READ_WRITE, g_window_width*g_window_height);
-    cl_float4 light_cl = { light.x,
-                            light.y,
-                            light.z,
-                            light.w };
-    
-    //run kernel
-    CLWKernel kernel = g_program.GetKernel("GenerateShadowRays");
-    kernel.SetArg(0, ray_buf);
-    kernel.SetArg(1, g_positions);
-    kernel.SetArg(2, g_normals);
-    kernel.SetArg(3, g_indices);
-    kernel.SetArg(4, g_colors);
-    kernel.SetArg(5, g_indent);
-    kernel.SetArg(6, isect);
-    kernel.SetArg(7, light_cl);
-    kernel.SetArg(8, g_window_width);
-    kernel.SetArg(9, g_window_height);
-
-    // Run generation kernel
-    size_t gs[] = { static_cast<size_t>((g_window_width + 7) / 8 * 8), static_cast<size_t>((g_window_height + 7) / 8 * 8) };
-    size_t ls[] = { 8, 8 };
-    g_context.Launch2D(0, gs, ls, kernel);
-    g_context.Flush(0);
-
-    return CreateFromOpenClBuffer(g_api, ray_buf);
-}
-
-Buffer* Shading(const CLWBuffer<Intersection> &isect, const CLWBuffer<int> &occluds, const float3& light)
-{
-    //pass data to buffers
-    CLWBuffer<unsigned char> out_buff = CLWBuffer<unsigned char>::Create(g_context, CL_MEM_READ_ONLY, 4*g_window_width*g_window_height);
-    cl_float4 light_cl = { light.x,
-                            light.y,
-                            light.z,
-                            light.w };
-    //run kernel
-    CLWBuffer<ray> ray_buf = CLWBuffer<ray>::Create(g_context, CL_MEM_READ_WRITE, g_window_width*g_window_height);
-    CLWKernel kernel = g_program.GetKernel("Shading");
-    kernel.SetArg(0, g_positions);
-    kernel.SetArg(1, g_normals);
-    kernel.SetArg(2, g_indices);
-    kernel.SetArg(3, g_colors);
-    kernel.SetArg(4, g_indent);
-    kernel.SetArg(5, isect);
-    kernel.SetArg(6, occluds);
-    kernel.SetArg(7, light_cl);
-    kernel.SetArg(8, g_window_width);
-    kernel.SetArg(9, g_window_height);
-    kernel.SetArg(10, out_buff);
-
-    // Run generation kernel
-    size_t gs[] = { static_cast<size_t>((g_window_width + 7) / 8 * 8), static_cast<size_t>((g_window_height + 7) / 8 * 8) };
-    size_t ls[] = { 8, 8 };
-    g_context.Launch2D(0, gs, ls, kernel);
-    g_context.Flush(0);
-
-    return CreateFromOpenClBuffer(g_api, out_buff);
-}
-
-void DrawScene()
-{
-
-    glDisable(GL_DEPTH_TEST);
-    glViewport(0, 0, g_window_width, g_window_height);
-
-    glClear(GL_COLOR_BUFFER_BIT);
-
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-
-    // shader data
-    GLuint program = g_shader_manager->GetProgram("simple");
-    glUseProgram(program);
-    GLuint texloc = glGetUniformLocation(program, "g_Texture");
-    assert(texloc >= 0);
-
-    glUniform1i(texloc, 0);
-
-    glActiveTexture(GL_TEXTURE0);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-
-    GLuint position_attr = glGetAttribLocation(program, "inPosition");
-    GLuint texcoord_attr = glGetAttribLocation(program, "inTexcoord");
-    glVertexAttribPointer(position_attr, 3, GL_FLOAT, GL_FALSE, sizeof(float) * 5, 0);
-    glVertexAttribPointer(texcoord_attr, 2, GL_FLOAT, GL_FALSE, sizeof(float) * 5, (void*)(sizeof(float) * 3));
-    glEnableVertexAttribArray(position_attr);
-    glEnableVertexAttribArray(texcoord_attr);
-
-    // draw rectanle
-    glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, nullptr);
-
-    glDisableVertexAttribArray(texcoord_attr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-    glUseProgram(0);
-
-    glFinish();
-    glutSwapBuffers();
-}
-
-void InitGl()
-{
-    g_shader_manager.reset(new ShaderManager());
-
-    glClearColor(0.0, 0.0, 0.0, 0.0);
-    glCullFace(GL_NONE);
-    glDisable(GL_DEPTH_TEST);
-    glEnable(GL_TEXTURE_2D);
-
-    glGenBuffers(1, &g_vertex_buffer);
-    glGenBuffers(1, &g_index_buffer);
-
-    // create Vertex buffer
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-
-    float quad_vdata[] =
-    {
-        -1, -1, 0.5, 0, 0,
-        1, -1, 0.5, 1, 0,
-        1, 1, 0.5, 1, 1,
-        -1, 1, 0.5, 0, 1
-    };
-
-    GLshort quad_idata[] =
-    {
-        0, 1, 3,
-        3, 1, 2
-    };
-
-    // fill data
-    glBufferData(GL_ARRAY_BUFFER, sizeof(quad_vdata), quad_vdata, GL_STATIC_DRAW);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(quad_idata), quad_idata, GL_STATIC_DRAW);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-
-    // texture
-    glGenTextures(1, &g_texture);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, g_window_width, g_window_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-}
-
-void InitCl()
-{
-    std::vector<CLWPlatform> platforms;
-    CLWPlatform::CreateAllPlatforms(platforms);
-
-    if (platforms.size() == 0)
-    {
-        throw std::runtime_error("No OpenCL platforms installed.");
-    }
-
-    for (int i = 0; i < platforms.size(); ++i)
-    {
-        for (int d = 0; d < (int)platforms[i].GetDeviceCount(); ++d)
-        {
-            if (platforms[i].GetDevice(d).GetType() != CL_DEVICE_TYPE_GPU)
-                continue;
-            g_context = CLWContext::Create(platforms[i].GetDevice(d));
-            break;
-        }
-
-        if (g_context)
-            break;
-    }
-    const char* kBuildopts(" -cl-mad-enable -cl-fast-relaxed-math -cl-std=CL1.2 -I . ");
-
-    g_program = CLWProgram::CreateFromFile("kernel.cl", kBuildopts, g_context);
-}
-
-int main(int argc, char* argv[])
-{
-    // GLUT Window Initialization:
-    glutInit(&argc, (char**)argv);
-    glutInitWindowSize(g_window_width, g_window_height);
-    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);
-    glutCreateWindow("TutorialCornellBoxShadow");
-#ifndef __APPLE__
-    GLenum err = glewInit();
-    if (err != GLEW_OK)
-    {
-        std::cout << "GLEW initialization failed\n";
-        return -1;
-    }
-#endif
-    // Prepare rectangle for drawing texture
-    // rendered using intersection results
-    InitGl();
-
-    InitCl();
-
-    // Load CornellBox model
-    InitData();
-
-    // Create api using already exist opencl context
-    cl_device_id id = g_context.GetDevice(0).GetID();
-    cl_command_queue queue = g_context.GetCommandQueue(0);
-
-    // Create intersection API
-    g_api = RadeonRays::CreateFromOpenClContext(g_context, id, queue);
-    
-    // Adding meshes to tracing scene
-    for (int id = 0; id < g_objshapes.size(); ++id)
-    {
-        shape_t& objshape = g_objshapes[id];
-        float* vertdata = objshape.mesh.positions.data();
-        int nvert = objshape.mesh.positions.size();
-        int* indices = objshape.mesh.indices.data();
-        int nfaces = objshape.mesh.indices.size() / 3;
-        Shape* shape = g_api->CreateMesh(vertdata, nvert, 3 * sizeof(float), indices, 0, nullptr, nfaces);
-
-        assert(shape != nullptr);
-        g_api->AttachShape(shape);
-        shape->SetId(id);
-    }
-    // Commit scene changes
-    g_api->Commit();
-
-    const int k_raypack_size = g_window_height * g_window_width;
-    
-    // Prepare rays. One for each texture pixel.
-    Buffer* ray_buffer = GeneratePrimaryRays();
-    // Intersection data
-    CLWBuffer<Intersection> isect_buffer_cl = CLWBuffer<Intersection>::Create(g_context, CL_MEM_READ_WRITE, g_window_width*g_window_height);
-    Buffer* isect_buffer = CreateFromOpenClBuffer(g_api, isect_buffer_cl);
-    
-    // Intersection
-    g_api->QueryIntersection(ray_buffer, k_raypack_size, isect_buffer, nullptr, nullptr);
-
-    // Point light position
-    float3 light = { -0.01f, 1.85f, 0.1f };
-    
-    // Shadow rays
-    Buffer* shadow_rays_buffer = GenerateShadowRays(isect_buffer_cl, light);
-    CLWBuffer<int> occl_buffer_cl = CLWBuffer<int>::Create(g_context, CL_MEM_READ_WRITE, g_window_width*g_window_height);
-    Buffer* occl_buffer = CreateFromOpenClBuffer(g_api, occl_buffer_cl);
-
-    // Occlusion
-    g_api->QueryOcclusion(shadow_rays_buffer, k_raypack_size, occl_buffer, nullptr, nullptr);
-    
-    // Shading
-    Buffer* tex_buf = Shading(isect_buffer_cl, occl_buffer_cl, light);
-    
-    // Get image data
-    std::vector<unsigned char> tex_data(k_raypack_size * 4);
-    unsigned char* pixels = nullptr;
-    Event* e = nullptr;
-    g_api->MapBuffer(tex_buf, kMapRead, 0, 4 * k_raypack_size * sizeof(unsigned char), (void**)&pixels, &e);
-    e->Wait();
-    memcpy(tex_data.data(), pixels, 4 * k_raypack_size * sizeof(unsigned char));
-
-    // Update texture data
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, g_window_width, g_window_height, GL_RGBA, GL_UNSIGNED_BYTE, tex_data.data());
-    glBindTexture(GL_TEXTURE_2D, NULL);
-
-    // Start the main loop
-    glutDisplayFunc(DrawScene);
-    glutMainLoop();
-
-    // Cleanup
-    IntersectionApi::Delete(g_api); g_api = nullptr;
-
-    return 0;
-}
diff --git a/Tutorials/CornellBoxShadow/simple.fsh b/Tutorials/CornellBoxShadow/simple.fsh
deleted file mode 100644
index e569481f..00000000
--- a/Tutorials/CornellBoxShadow/simple.fsh
+++ /dev/null
@@ -1,31 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-uniform sampler2D g_Texture;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    //gl_FragColor = vec4(1f, 0f, 0f, 1f);
-    gl_FragColor = texture2D(g_Texture, Texcoord);
-}
\ No newline at end of file
diff --git a/Tutorials/CornellBoxShadow/simple.vsh b/Tutorials/CornellBoxShadow/simple.vsh
deleted file mode 100644
index 002269d4..00000000
--- a/Tutorials/CornellBoxShadow/simple.vsh
+++ /dev/null
@@ -1,33 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-attribute vec3 inPosition;
-attribute vec2 inTexcoord;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    Texcoord = inTexcoord;
-    gl_Position = vec4(inPosition, 1.0);
-}
-
diff --git a/Tutorials/Tools/CMakeLists.txt b/Tutorials/Tools/CMakeLists.txt
deleted file mode 100644
index 46f5f90d..00000000
--- a/Tutorials/Tools/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-project(Tools CXX)
-
-set(SOURCES 
-    shader_manager.cpp
-    shader_manager.h
-    tiny_obj_loader.cpp
-    tiny_obj_loader.h)
-
-add_library(TutorialTools OBJECT ${SOURCES})
-
-target_compile_features(TutorialTools PRIVATE cxx_std_11)
-if (WIN32)
-    target_include_directories(TutorialTools 
-        PRIVATE ${GLUT_INCLUDES} 
-        PRIVATE ${GLEW_INCLUDES} 
-        PRIVATE ${OIIO_INCLUDES})
-endif (WIN32)
diff --git a/Tutorials/Tools/shader_manager.cpp b/Tutorials/Tools/shader_manager.cpp
deleted file mode 100644
index 3ab6091a..00000000
--- a/Tutorials/Tools/shader_manager.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-#include "shader_manager.h"
-
-#include <vector>
-#include <string>
-#include <stdexcept>
-#include <fstream>
-
-static void LoadFileContents(std::string const& name, std::vector<char>& contents, bool binary = false)
-{
-    std::ifstream in(name, std::ios::in | (std::ios_base::openmode)(binary?std::ios::binary : 0));
-
-    if (in)
-    {
-        contents.clear();
-
-        std::streamoff beg = in.tellg();
-
-        in.seekg(0, std::ios::end);
-
-        std::streamoff fileSize = in.tellg() - beg;
-
-        in.seekg(0, std::ios::beg);
-
-        contents.resize(static_cast<unsigned>(fileSize));
-
-        in.read(&contents[0], fileSize);
-    }
-    else
-    {
-        throw std::runtime_error("Cannot read the contents of a file");
-    }
-}
-
-static GLuint CompileShader(std::vector<GLchar> const& shader_source, GLenum type)
-{
-    GLuint shader = glCreateShader(type);
-    
-    GLint len = static_cast<GLint>(shader_source.size());
-    GLchar const* source_array = &shader_source[0];
-    
-    glShaderSource(shader, 1, &source_array, &len);
-    glCompileShader(shader);
-    
-    GLint result = GL_TRUE;
-    glGetShaderiv(shader, GL_COMPILE_STATUS, &result);
-    
-    if(result == GL_FALSE)
-    {
-        std::vector<char> log;
-        
-        glGetShaderiv(shader, GL_INFO_LOG_LENGTH, &len);
-        
-        log.resize(len);
-        
-        glGetShaderInfoLog(shader, len, &result, &log[0]);
-        
-        glDeleteShader(shader);
-        
-        throw std::runtime_error(std::string(log.begin(), log.end()));
-        
-        return 0;
-    }
-    
-    return shader;
-}
-
-
-ShaderManager::ShaderManager()
-{
-}
-
-
-ShaderManager::~ShaderManager()
-{
-    for (auto citer = shadercache_.cbegin(); citer != shadercache_.cend(); ++citer)
-    {
-        glDeleteProgram(citer->second);
-    }
-}
-
-GLuint ShaderManager::CompileProgram(std::string const& name)
-{
-    std::string vsname = name + ".vsh";
-    std::string fsname = name + ".fsh";
-    
-    // Need to wrap the shader program here to be exception-safe
-    std::vector<GLchar> sourcecode;
-    
-    LoadFileContents(vsname, sourcecode);
-    GLuint vertex_shader = CompileShader(sourcecode, GL_VERTEX_SHADER);
-    
-    /// This part is not exception safe:
-    /// if the VS compilation succeeded
-    /// and PS compilation fails then VS object will leak
-    /// fix this by wrapping the shaders into a class
-    LoadFileContents(fsname, sourcecode);
-    GLuint frag_shader = CompileShader(sourcecode, GL_FRAGMENT_SHADER);
-    
-    GLuint program = glCreateProgram();
-    
-    glAttachShader(program, vertex_shader);
-    glAttachShader(program, frag_shader);
-    
-    glDeleteShader(vertex_shader);
-    glDeleteShader(frag_shader);
-    
-    glLinkProgram(program);
-    
-    GLint result = GL_TRUE;
-    glGetProgramiv(program, GL_LINK_STATUS, &result);
-    
-    if(result == GL_FALSE)
-    {
-        GLint length = 0;
-        std::vector<char> log;
-        
-        glGetProgramiv(program, GL_INFO_LOG_LENGTH, &length);
-        
-        log.resize(length);
-        
-        glGetProgramInfoLog(program, length, &result, &log[0]);
-        
-        glDeleteProgram(program);
-        
-        throw std::runtime_error(std::string(log.begin(), log.end()));
-    }
-    
-    return program;
-}
-
-GLuint ShaderManager::GetProgram(std::string const& name)
-{
-    auto iter = shadercache_.find(name);
-    
-    if (iter != shadercache_.end())
-    {
-        return iter->second;
-    }
-    else
-    {
-        GLuint program = CompileProgram(name);
-        shadercache_[name] = program;
-        return program;
-    }
-}
\ No newline at end of file
diff --git a/Tutorials/Tools/shader_manager.h b/Tutorials/Tools/shader_manager.h
deleted file mode 100644
index 5da6bd78..00000000
--- a/Tutorials/Tools/shader_manager.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-#ifndef SHADER_MANAGER_H
-#define SHADER_MANAGER_H
-
-#ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
-#include <OpenGL/OpenGL.h>
-#include <GLUT/GLUT.h>
-#elif WIN32
-#define NOMINMAX
-#include <Windows.h>
-#include "GL/glew.h"
-#include "GL/glut.h"
-#else
-#include <CL/cl.h>
-#include <GL/glew.h>
-#include <GL/glut.h>
-#endif
-
-#include <string>
-#include <map>
-
-class ShaderManager
-{
-public:
-    ShaderManager();
-    ~ShaderManager();
-    
-    GLuint GetProgram(std::string const& name);
-    
-private:
-    GLuint CompileProgram(std::string const& name);
-    
-    ShaderManager(ShaderManager const&);
-    ShaderManager& operator = (ShaderManager const&);
-    
-    std::map<std::string, GLuint> shadercache_;
-};
-
-#endif
-
diff --git a/Tutorials/Tools/tiny_obj_loader.cpp b/Tutorials/Tools/tiny_obj_loader.cpp
deleted file mode 100644
index a61ff2c4..00000000
--- a/Tutorials/Tools/tiny_obj_loader.cpp
+++ /dev/null
@@ -1,725 +0,0 @@
-//
-// Copyright 2012-2013, Syoyo Fujita.
-// 
-// Licensed under 2-clause BSD liecense.
-//
-
-//
-// version 0.9.7: Support multi-materials(per-face material ID) per object/group.
-// version 0.9.6: Support Ni(index of refraction) mtl parameter.
-//                Parse transmittance material parameter correctly.
-// version 0.9.5: Parse multiple group name.
-//                Add support of specifying the base path to load material file.
-// version 0.9.4: Initial suupport of group tag(g)
-// version 0.9.3: Fix parsing triple 'x/y/z'
-// version 0.9.2: Add more .mtl load support
-// version 0.9.1: Add initial .mtl load support
-// version 0.9.0: Initial
-//
-
-
-#include <cstdlib>
-#include <cstring>
-#include <cassert>
-
-#include <string>
-#include <vector>
-#include <map>
-#include <fstream>
-#include <sstream>
-
-#include "tiny_obj_loader.h"
-
-namespace tinyobj {
-
-struct vertex_index {
-  int v_idx, vt_idx, vn_idx;
-  vertex_index() {};
-  vertex_index(int idx) : v_idx(idx), vt_idx(idx), vn_idx(idx) {};
-  vertex_index(int vidx, int vtidx, int vnidx) : v_idx(vidx), vt_idx(vtidx), vn_idx(vnidx) {};
-
-};
-// for std::map
-static inline bool operator<(const vertex_index& a, const vertex_index& b)
-{
-  if (a.v_idx != b.v_idx) return (a.v_idx < b.v_idx);
-  if (a.vn_idx != b.vn_idx) return (a.vn_idx < b.vn_idx);
-  if (a.vt_idx != b.vt_idx) return (a.vt_idx < b.vt_idx);
-
-  return false;
-}
-
-struct obj_shape {
-  std::vector<float> v;
-  std::vector<float> vn;
-  std::vector<float> vt;
-};
-
-static inline bool isSpace(const char c) {
-  return (c == ' ') || (c == '\t');
-}
-
-static inline bool isNewLine(const char c) {
-  return (c == '\r') || (c == '\n') || (c == '\0');
-}
-
-// Make index zero-base, and also support relative index. 
-static inline int fixIndex(int idx, int n)
-{
-  int i;
-
-  if (idx > 0) {
-    i = idx - 1;
-  } else if (idx == 0) {
-    i = 0;
-  } else { // negative value = relative
-    i = n + idx;
-  }
-  return i;
-}
-
-static inline std::string parseString(const char*& token)
-{
-  std::string s;
-  int b = (int)strspn(token, " \t");
-  int e = (int)strcspn(token, " \t\r");
-  s = std::string(&token[b], &token[e]);
-
-  token += (e - b);
-  return s;
-}
-
-static inline int parseInt(const char*& token)
-{
-  token += strspn(token, " \t");
-  int i = atoi(token);
-  token += strcspn(token, " \t\r");
-  return i;
-}
-
-static inline float parseFloat(const char*& token)
-{
-  token += strspn(token, " \t");
-  float f = (float)atof(token);
-  token += strcspn(token, " \t\r");
-  return f;
-}
-
-static inline void parseFloat2(
-  float& x, float& y,
-  const char*& token)
-{
-  x = parseFloat(token);
-  y = parseFloat(token);
-}
-
-static inline void parseFloat3(
-  float& x, float& y, float& z,
-  const char*& token)
-{
-  x = parseFloat(token);
-  y = parseFloat(token);
-  z = parseFloat(token);
-}
-
-
-// Parse triples: i, i/j/k, i//k, i/j
-static vertex_index parseTriple(
-  const char* &token,
-  int vsize,
-  int vnsize,
-  int vtsize)
-{
-    vertex_index vi(-1);
-
-    vi.v_idx = fixIndex(atoi(token), vsize);
-    token += strcspn(token, "/ \t\r");
-    if (token[0] != '/') {
-      return vi;
-    }
-    token++;
-
-    // i//k
-    if (token[0] == '/') {
-      token++;
-      vi.vn_idx = fixIndex(atoi(token), vnsize);
-      token += strcspn(token, "/ \t\r");
-      return vi;
-    }
-    
-    // i/j/k or i/j
-    vi.vt_idx = fixIndex(atoi(token), vtsize);
-    token += strcspn(token, "/ \t\r");
-    if (token[0] != '/') {
-      return vi;
-    }
-
-    // i/j/k
-    token++;  // skip '/'
-    vi.vn_idx = fixIndex(atoi(token), vnsize);
-    token += strcspn(token, "/ \t\r");
-    return vi; 
-}
-
-static unsigned int
-updateVertex(
-  std::map<vertex_index, unsigned int>& vertexCache,
-  std::vector<float>& positions,
-  std::vector<float>& normals,
-  std::vector<float>& texcoords,
-  const std::vector<float>& in_positions,
-  const std::vector<float>& in_normals,
-  const std::vector<float>& in_texcoords,
-  const vertex_index& i)
-{
-  const std::map<vertex_index, unsigned int>::iterator it = vertexCache.find(i);
-
-  if (it != vertexCache.end()) {
-    // found cache
-    return it->second;
-  }
-
-  assert(in_positions.size() > (unsigned int) (3*i.v_idx+2));
-
-  positions.push_back(in_positions[3*i.v_idx+0]);
-  positions.push_back(in_positions[3*i.v_idx+1]);
-  positions.push_back(in_positions[3*i.v_idx+2]);
-
-  if (i.vn_idx >= 0) {
-    normals.push_back(in_normals[3*i.vn_idx+0]);
-    normals.push_back(in_normals[3*i.vn_idx+1]);
-    normals.push_back(in_normals[3*i.vn_idx+2]);
-  }
-
-  if (i.vt_idx >= 0) {
-    texcoords.push_back(in_texcoords[2*i.vt_idx+0]);
-    texcoords.push_back(in_texcoords[2*i.vt_idx+1]);
-  }
-
-  unsigned int idx = (unsigned)positions.size() / 3 - 1;
-  vertexCache[i] = idx;
-
-  return idx;
-}
-
-void InitMaterial(material_t& material) {
-  material.name = "";
-  material.ambient_texname = "";
-  material.diffuse_texname = "";
-  material.specular_texname = "";
-  material.normal_texname = "";
-  for (int i = 0; i < 3; i ++) {
-    material.ambient[i] = 0.f;
-    material.diffuse[i] = 0.f;
-    material.specular[i] = 0.f;
-    material.transmittance[i] = 0.f;
-    material.emission[i] = 0.f;
-  }
-  material.illum = 0;
-  material.dissolve = 1.f;
-  material.shininess = 1.f;
-  material.ior = 1.f;
-  material.unknown_parameter.clear();
-}
-
-static bool
-exportFaceGroupToShape(
-  shape_t& shape,
-  std::map<vertex_index, unsigned int> vertexCache,
-  const std::vector<float> &in_positions,
-  const std::vector<float> &in_normals,
-  const std::vector<float> &in_texcoords,
-  const std::vector<std::vector<vertex_index> >& faceGroup,
-  const int material_id,
-  const std::string &name,
-  bool clearCache)
-{
-  if (faceGroup.empty()) {
-    return false;
-  }
-
-  size_t offset;
-
-  offset = shape.mesh.indices.size();
-
-  // Flatten vertices and indices
-  for (size_t i = 0; i < faceGroup.size(); i++) {
-    const std::vector<vertex_index>& face = faceGroup[i];
-
-    vertex_index i0 = face[0];
-    vertex_index i1(-1);
-    vertex_index i2 = face[1];
-
-    size_t npolys = face.size();
-
-    // Polygon -> triangle fan conversion
-    for (size_t k = 2; k < npolys; k++) {
-      i1 = i2;
-      i2 = face[k];
-
-      unsigned int v0 = updateVertex(vertexCache, shape.mesh.positions, shape.mesh.normals, shape.mesh.texcoords, in_positions, in_normals, in_texcoords, i0);
-      unsigned int v1 = updateVertex(vertexCache, shape.mesh.positions, shape.mesh.normals, shape.mesh.texcoords, in_positions, in_normals, in_texcoords, i1);
-      unsigned int v2 = updateVertex(vertexCache, shape.mesh.positions, shape.mesh.normals, shape.mesh.texcoords, in_positions, in_normals, in_texcoords, i2);
-
-      shape.mesh.indices.push_back(v0);
-      shape.mesh.indices.push_back(v1);
-      shape.mesh.indices.push_back(v2);
-
-      shape.mesh.material_ids.push_back(material_id);
-    }
-
-  }
-
-  shape.name = name;
-
-  if (clearCache)
-      vertexCache.clear();
-
-  return true;
-
-}
-
-std::string LoadMtl (
-  std::map<std::string, int>& material_map,
-  std::vector<material_t>& materials,
-  std::istream& inStream)
-{
-  material_map.clear();
-  std::stringstream err;
-
-  material_t material;
-  
-  int maxchars = 8192;  // Alloc enough size.
-  std::vector<char> buf(maxchars);  // Alloc enough size.
-  while (inStream.peek() != -1) {
-    inStream.getline(&buf[0], maxchars);
-
-    std::string linebuf(&buf[0]);
-
-    // Trim newline '\r\n' or '\n'
-    if (linebuf.size() > 0) {
-      if (linebuf[linebuf.size()-1] == '\n') linebuf.erase(linebuf.size()-1);
-    }
-    if (linebuf.size() > 0) {
-      if (linebuf[linebuf.size()-1] == '\r') linebuf.erase(linebuf.size()-1);
-    }
-
-    // Skip if empty line.
-    if (linebuf.empty()) {
-      continue;
-    }
-
-    // Skip leading space.
-    const char* token = linebuf.c_str();
-    token += strspn(token, " \t");
-
-    assert(token);
-    if (token[0] == '\0') continue; // empty line
-    
-    if (token[0] == '#') continue;  // comment line
-    
-    // new mtl
-    if ((0 == strncmp(token, "newmtl", 6)) && isSpace((token[6]))) {
-      // flush previous material.
-      if (!material.name.empty())
-      {
-          material_map.insert(std::pair<std::string, int>(material.name, (int)materials.size()));
-          materials.push_back(material);
-      }
-
-      // initial temporary material
-      InitMaterial(material);
-
-      // set new mtl name
-      char namebuf[4096];
-      token += 7;
-      sscanf(token, "%s", namebuf);
-      material.name = namebuf;
-      continue;
-    }
-    
-    // ambient
-    if (token[0] == 'K' && token[1] == 'a' && isSpace((token[2]))) {
-      token += 2;
-      float r, g, b;
-      parseFloat3(r, g, b, token);
-      material.ambient[0] = r;
-      material.ambient[1] = g;
-      material.ambient[2] = b;
-      continue;
-    }
-    
-    // diffuse
-    if (token[0] == 'K' && token[1] == 'd' && isSpace((token[2]))) {
-      token += 2;
-      float r, g, b;
-      parseFloat3(r, g, b, token);
-      material.diffuse[0] = r;
-      material.diffuse[1] = g;
-      material.diffuse[2] = b;
-      continue;
-    }
-    
-    // specular
-    if (token[0] == 'K' && token[1] == 's' && isSpace((token[2]))) {
-      token += 2;
-      float r, g, b;
-      parseFloat3(r, g, b, token);
-      material.specular[0] = r;
-      material.specular[1] = g;
-      material.specular[2] = b;
-      continue;
-    }
-    
-    // transmittance
-    if (token[0] == 'K' && token[1] == 't' && isSpace((token[2]))) {
-      token += 2;
-      float r, g, b;
-      parseFloat3(r, g, b, token);
-      material.transmittance[0] = r;
-      material.transmittance[1] = g;
-      material.transmittance[2] = b;
-      continue;
-    }
-
-    // ior(index of refraction)
-    if (token[0] == 'N' && token[1] == 'i' && isSpace((token[2]))) {
-      token += 2;
-      material.ior = parseFloat(token);
-      continue;
-    }
-
-    // emission
-    if(token[0] == 'K' && token[1] == 'e' && isSpace(token[2])) {
-      token += 2;
-      float r, g, b;
-      parseFloat3(r, g, b, token);
-      material.emission[0] = r;
-      material.emission[1] = g;
-      material.emission[2] = b;
-      continue;
-    }
-
-    // shininess
-    if(token[0] == 'N' && token[1] == 's' && isSpace(token[2])) {
-      token += 2;
-      material.shininess = parseFloat(token);
-      continue;
-    }
-
-    // illum model
-    if (0 == strncmp(token, "illum", 5) && isSpace(token[5])) {
-      token += 6;
-      material.illum = parseInt(token);
-      continue;
-    }
-
-    // dissolve
-    if ((token[0] == 'd' && isSpace(token[1]))) {
-      token += 1;
-      material.dissolve = parseFloat(token);
-      continue;
-    }
-    if (token[0] == 'T' && token[1] == 'r' && isSpace(token[2])) {
-      token += 2;
-      material.dissolve = parseFloat(token);
-      continue;
-    }
-
-    // ambient texture
-    if ((0 == strncmp(token, "map_Ka", 6)) && isSpace(token[6])) {
-      token += 7;
-      material.ambient_texname = token;
-      continue;
-    }
-
-    // diffuse texture
-    if ((0 == strncmp(token, "map_Kd", 6)) && isSpace(token[6])) {
-      token += 7;
-      material.diffuse_texname = token;
-      continue;
-    }
-
-    // specular texture
-    if ((0 == strncmp(token, "map_Ks", 6)) && isSpace(token[6])) {
-      token += 7;
-      material.specular_texname = token;
-      continue;
-    }
-
-    // normal texture
-    if ((0 == strncmp(token, "map_Ns", 6)) && isSpace(token[6])) {
-      token += 7;
-      material.normal_texname = token;
-      continue;
-    }
-
-    // unknown parameter
-    const char* _space = strchr(token, ' ');
-    if(!_space) {
-      _space = strchr(token, '\t');
-    }
-    if(_space) {
-      int len = (int)(_space - token);
-      std::string key(token, len);
-      std::string value = _space + 1;
-      material.unknown_parameter.insert(std::pair<std::string, std::string>(key, value));
-    }
-  }
-  // flush last material.
-  material_map.insert(std::pair<std::string, int>(material.name, (int)materials.size()));
-  materials.push_back(material);
-
-  return err.str();
-}
-
-std::string MaterialFileReader::operator() (
-    const std::string& matId,
-    std::vector<material_t>& materials,
-    std::map<std::string, int>& matMap)
-{
-  std::string filepath;
-
-  if (!m_mtlBasePath.empty()) {
-    filepath = std::string(m_mtlBasePath) + "/" + matId;
-  } else {
-    filepath = matId;
-  }
-
-  std::ifstream matIStream(filepath.c_str());
-  return LoadMtl(matMap, materials, matIStream);
-}
-
-std::string
-LoadObj(
-  std::vector<shape_t>& shapes,
-  std::vector<material_t>& materials,   // [output]
-  const char* filename,
-  const char* mtl_basepath)
-{
-
-  shapes.clear();
-
-  std::stringstream err;
-
-  std::ifstream ifs(filename);
-  if (!ifs) {
-    err << "Cannot open file [" << filename << "]" << std::endl;
-    return err.str();
-  }
-
-  std::string basePath;
-  if (mtl_basepath) {
-    basePath = mtl_basepath;
-  }
-  MaterialFileReader matFileReader( basePath );
-  
-  return LoadObj(shapes, materials, ifs, matFileReader);
-}
-
-std::string LoadObj(
-  std::vector<shape_t>& shapes,
-  std::vector<material_t>& materials,   // [output]
-  std::istream& inStream,
-  MaterialReader& readMatFn)
-{
-  std::stringstream err;
-
-  std::vector<float> v;
-  std::vector<float> vn;
-  std::vector<float> vt;
-  std::vector<std::vector<vertex_index> > faceGroup;
-  std::string name;
-
-  // material
-  std::map<std::string, int> material_map;
-  std::map<vertex_index, unsigned int> vertexCache;
-  int  material = -1;
-
-  shape_t shape;
-
-  int maxchars = 8192;  // Alloc enough size.
-  std::vector<char> buf(maxchars);  // Alloc enough size.
-  while (inStream.peek() != -1) {
-    inStream.getline(&buf[0], maxchars);
-
-    std::string linebuf(&buf[0]);
-
-    // Trim newline '\r\n' or '\n'
-    if (linebuf.size() > 0) {
-      if (linebuf[linebuf.size()-1] == '\n') linebuf.erase(linebuf.size()-1);
-    }
-    if (linebuf.size() > 0) {
-      if (linebuf[linebuf.size()-1] == '\r') linebuf.erase(linebuf.size()-1);
-    }
-
-    // Skip if empty line.
-    if (linebuf.empty()) {
-      continue;
-    }
-
-    // Skip leading space.
-    const char* token = linebuf.c_str();
-    token += strspn(token, " \t");
-
-    assert(token);
-    if (token[0] == '\0') continue; // empty line
-    
-    if (token[0] == '#') continue;  // comment line
-
-    // vertex
-    if (token[0] == 'v' && isSpace((token[1]))) {
-      token += 2;
-      float x, y, z;
-      parseFloat3(x, y, z, token);
-      v.push_back(x);
-      v.push_back(y);
-      v.push_back(z);
-      continue;
-    }
-
-    // normal
-    if (token[0] == 'v' && token[1] == 'n' && isSpace((token[2]))) {
-      token += 3;
-      float x, y, z;
-      parseFloat3(x, y, z, token);
-      vn.push_back(x);
-      vn.push_back(y);
-      vn.push_back(z);
-      continue;
-    }
-
-    // texcoord
-    if (token[0] == 'v' && token[1] == 't' && isSpace((token[2]))) {
-      token += 3;
-      float x, y;
-      parseFloat2(x, y, token);
-      vt.push_back(x);
-      vt.push_back(y);
-      continue;
-    }
-
-    // face
-    if (token[0] == 'f' && isSpace((token[1]))) {
-      token += 2;
-      token += strspn(token, " \t");
-
-      std::vector<vertex_index> face;
-      while (!isNewLine(token[0])) {
-        vertex_index vi = parseTriple(token, (int)v.size() / 3, (int)vn.size() / 3, (int)vt.size() / 2);
-        face.push_back(vi);
-        int n = (int)strspn(token, " \t\r");
-        token += n;
-      }
-
-      faceGroup.push_back(face);
-      
-      continue;
-    }
-
-    // use mtl
-    if ((0 == strncmp(token, "usemtl", 6)) && isSpace((token[6]))) {
-
-      char namebuf[4096];
-      token += 7;
-      sscanf(token, "%s", namebuf);
-
-      exportFaceGroupToShape(shape, vertexCache, v, vn, vt, faceGroup, material, name, false);
-      faceGroup.clear();
-
-      if (material_map.find(namebuf) != material_map.end()) {
-        material = material_map[namebuf];
-      } else {
-        // { error!! material not found }
-        material = -1;
-      }
-
-      continue;
-
-    }
-
-    // load mtl
-    if ((0 == strncmp(token, "mtllib", 6)) && isSpace((token[6]))) {
-      char namebuf[4096];
-      token += 7;
-      sscanf(token, "%s", namebuf);
-        
-      std::string err_mtl = readMatFn(namebuf, materials, material_map);
-      if (!err_mtl.empty()) {
-        faceGroup.clear();  // for safety
-        return err_mtl;
-      }
-      
-      continue;
-    }
-
-    // group name
-    if (token[0] == 'g' && isSpace((token[1]))) {
-
-      // flush previous face group.
-      bool ret = exportFaceGroupToShape(shape, vertexCache, v, vn, vt, faceGroup, material, name, true);
-      if (ret) {
-        shapes.push_back(shape);
-      }
-
-      shape = shape_t();
-
-      //material = -1;
-      faceGroup.clear();
-
-      std::vector<std::string> names;
-      while (!isNewLine(token[0])) {
-        std::string str = parseString(token);
-        names.push_back(str);
-        token += strspn(token, " \t\r"); // skip tag
-      }
-
-      assert(names.size() > 0);
-
-      // names[0] must be 'g', so skipt 0th element.
-      if (names.size() > 1) {
-        name = names[1];
-      } else {
-        name = "";
-      }
-
-      continue;
-    }
-
-    // object name
-    if (token[0] == 'o' && isSpace((token[1]))) {
-
-      // flush previous face group.
-      bool ret = exportFaceGroupToShape(shape, vertexCache, v, vn, vt, faceGroup, material, name, true);
-      if (ret) {
-        shapes.push_back(shape);
-      }
-
-      //material = -1;
-      faceGroup.clear();
-      shape = shape_t();
-
-      // @todo { multiple object name? }
-      char namebuf[4096];
-      token += 2;
-      sscanf(token, "%s", namebuf);
-      name = std::string(namebuf);
-
-
-      continue;
-    }
-
-    // Ignore unknown command.
-  }
-
-  bool ret = exportFaceGroupToShape(shape, vertexCache, v, vn, vt, faceGroup, material, name, true);
-  if (ret) {
-    shapes.push_back(shape);
-  }
-  faceGroup.clear();  // for safety
-
-  return err.str();
-}
-
-
-}
diff --git a/Tutorials/Tools/tiny_obj_loader.h b/Tutorials/Tools/tiny_obj_loader.h
deleted file mode 100644
index 7cc6fa6a..00000000
--- a/Tutorials/Tools/tiny_obj_loader.h
+++ /dev/null
@@ -1,107 +0,0 @@
-//
-// Copyright 2012-2013, Syoyo Fujita.
-//
-// Licensed under 2-clause BSD liecense.
-//
-#ifndef _TINY_OBJ_LOADER_H
-#define _TINY_OBJ_LOADER_H
-
-#include <string>
-#include <vector>
-#include <map>
-
-namespace tinyobj {
-
-typedef struct
-{
-    std::string name;
-
-    float ambient[3];
-    float diffuse[3];
-    float specular[3];
-    float transmittance[3];
-    float emission[3];
-    float shininess;
-    float ior;                // index of refraction
-    float dissolve;           // 1 == opaque; 0 == fully transparent
-    // illumination model (see http://www.fileformat.info/format/material/)
-    int illum;
-
-    std::string ambient_texname;
-    std::string diffuse_texname;
-    std::string specular_texname;
-    std::string normal_texname;
-    std::map<std::string, std::string> unknown_parameter;
-} material_t;
-
-typedef struct
-{
-    std::vector<float>          positions;
-    std::vector<float>          normals;
-    std::vector<float>          texcoords;
-    std::vector<int>            indices;
-    std::vector<int>            material_ids; // per-mesh material ID
-} mesh_t;
-
-typedef struct
-{
-    std::string  name;
-    mesh_t       mesh;
-} shape_t;
-
-class MaterialReader
-{
-public:
-    MaterialReader(){}
-    virtual ~MaterialReader(){}
-
-    virtual std::string operator() (
-        const std::string& matId,
-        std::vector<material_t>& materials,
-        std::map<std::string, int>& matMap) = 0;
-};
-
-class MaterialFileReader:
-  public MaterialReader
-{
-    public:
-        MaterialFileReader(const std::string& mtl_basepath): m_mtlBasePath(mtl_basepath) {}
-        virtual ~MaterialFileReader() {}
-        virtual std::string operator() (
-          const std::string& matId,
-          std::vector<material_t>& materials,
-          std::map<std::string, int>& matMap);
-
-    private:
-        std::string m_mtlBasePath;
-};
-
-/// Loads .obj from a file.
-/// 'shapes' will be filled with parsed shape data
-/// The function returns error string.
-/// Returns empty string when loading .obj success.
-/// 'mtl_basepath' is optional, and used for base path for .mtl file.
-std::string LoadObj(
-    std::vector<shape_t>& shapes,   // [output]
-    std::vector<material_t>& materials,   // [output]
-    const char* filename,
-    const char* mtl_basepath = NULL);
-
-/// Loads object from a std::istream, uses GetMtlIStreamFn to retrieve
-/// std::istream for materials.
-/// Returns empty string when loading .obj success.
-std::string LoadObj(
-    std::vector<shape_t>& shapes,   // [output]
-    std::vector<material_t>& materials,   // [output]
-    std::istream& inStream,
-    MaterialReader& readMatFn);
-
-/// Loads materials into std::map
-/// Returns an empty string if successful
-std::string LoadMtl (
-  std::map<std::string, int>& material_map,
-  std::vector<material_t>& materials,
-  std::istream& inStream);
-}
-
-#endif  // _TINY_OBJ_LOADER_H
diff --git a/Tutorials/Triangle/main.cpp b/Tutorials/Triangle/main.cpp
deleted file mode 100644
index ca3c3152..00000000
--- a/Tutorials/Triangle/main.cpp
+++ /dev/null
@@ -1,220 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-#include "radeon_rays.h"
-#include <GL/glew.h>
-#include <GLUT/GLUT.h>
-#include <cassert>
-#include <iostream>
-#include <memory>
-#include "../Tools/shader_manager.h"
-
-using namespace RadeonRays;
-
-namespace {
-    float const g_vertices[] = {
-        -1.f,-1.f,0.f,
-        1.f,-1.f,0.f,
-        0.f,1.f,0.f,
-    };
-    int const g_indices[] = { 0, 1, 2 };
-    const int g_numfaceverts[] = { 3 };
-    
-    GLuint g_vertex_buffer, g_index_buffer;
-    GLuint g_texture;
-    int g_window_width = 640;
-    int g_window_height = 480;
-    std::unique_ptr<ShaderManager> g_shader_manager;
-}
-
-void InitGl()
-{
-    g_shader_manager.reset(new ShaderManager());
-    glClearColor(0.0, 0.0, 0.0, 0.0);
-
-    glCullFace(GL_FRONT);
-    glDisable(GL_DEPTH_TEST);
-    glEnable(GL_TEXTURE_2D);
-    glGenBuffers(1, &g_vertex_buffer);
-    glGenBuffers(1, &g_index_buffer);
-    // create Vertex buffer
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-    // fill data
-    glBufferData(GL_ARRAY_BUFFER, sizeof(g_vertices), g_vertices, GL_STATIC_DRAW);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(g_indices), g_indices, GL_STATIC_DRAW);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-    //texture
-    glGenTextures(1, &g_texture);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, g_window_width, g_window_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-}
-
-void DrawScene()
-{
-    glDisable(GL_DEPTH_TEST);
-    glViewport(0, 0, g_window_width, g_window_height);
-
-    glClear(GL_COLOR_BUFFER_BIT);
-    GLuint program = g_shader_manager->GetProgram("simple");
-    glUseProgram(program);
-
-    GLuint texloc = glGetUniformLocation(program, "g_Texture");
-    assert(texloc >= 0);
-    glUniform1i(texloc, 0);
-
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-    glActiveTexture(GL_TEXTURE0);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    GLuint position_attr = glGetAttribLocation(program, "inPosition");
-
-    glVertexAttribPointer(position_attr, 3, GL_FLOAT, GL_FALSE, 0, 0);
-
-    glEnableVertexAttribArray(position_attr);
-
-    glDrawElements(GL_TRIANGLES, 3, GL_UNSIGNED_INT, nullptr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-    glUseProgram(0);
-
-    glFinish();
-    glutSwapBuffers();
-}
-
-int main(int argc, char* argv[])
-{
-    // GLUT Window Initialization:
-    glutInit(&argc, (char**)argv);
-    glutInitWindowSize(640, 480);
-    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);
-    glutCreateWindow("Triangle");
-#ifndef __APPLE__
-    GLenum err = glewInit();
-    if (err != GLEW_OK)
-    {
-        std::cout << "GLEW initialization failed\n";
-        return -1;
-    }
-#endif
-
-    //prepare triangle for drawing
-    InitGl();
-
-    //choose device
-    int nativeidx = -1;
-    // Always use OpenCL
-    IntersectionApi::SetPlatform(DeviceInfo::kOpenCL);
-
-    for (auto idx = 0U; idx < IntersectionApi::GetDeviceCount(); ++idx)
-    {
-        DeviceInfo devinfo;
-        IntersectionApi::GetDeviceInfo(idx, devinfo);
-
-        if (devinfo.type == DeviceInfo::kGpu && nativeidx == -1)
-        {
-            nativeidx = idx;
-        }
-    }
-    assert(nativeidx != -1);
-    IntersectionApi* api = IntersectionApi::Create(nativeidx);
-
-    //adding triangle to tracing scene
-    Shape* shape = api->CreateMesh(g_vertices, 3, 3 * sizeof(float), g_indices, 0, g_numfaceverts, 1);
-    assert(shape != nullptr);
-    api->AttachShape(shape);
-    //commit scene changes
-    api->Commit();
-
-    // prepare rays for intersection
-    ray rays[3];
-    rays[0].o = float4(0.f, 0.f, -1.f, 1000.f);
-    rays[0].d = float3(0.f, 0.f, 10.f);
-    rays[1].o = float4(0.f, 0.5f, -10.f, 1000.f);
-    rays[1].d = float3(0.f, 0.f, 1.f);
-    rays[2].o = float4(0.4f, 0.f, -10.f, 1000.f);
-    rays[2].d = float3(0.f, 0.f, 1.f);
-    auto ray_buffer = api->CreateBuffer(3 * sizeof(ray), rays);
-
-    // prepare intersection data
-    Intersection isect[3];
-    auto isect_buffer = api->CreateBuffer(3 * sizeof(Intersection), nullptr);
-    
-    //intersection
-    api->QueryIntersection(ray_buffer, 3, isect_buffer, nullptr, nullptr);
-    
-    //get results
-    Event* e = nullptr;
-    Intersection* tmp = nullptr;
-    api->MapBuffer(isect_buffer, kMapRead, 0, 3 * sizeof(Intersection), (void**)&tmp, &e);
-    //RadeonRays calls are asynchronous, so need to wait for calculation to complete.
-    e->Wait();
-    api->DeleteEvent(e);
-    e = nullptr;
-    
-    isect[0] = tmp[0];
-    isect[1] = tmp[1];
-    isect[2] = tmp[2];
-
-    //preparing triangle texture
-    std::vector<unsigned char> tex_data(g_window_width * g_window_height * 4);
-    for (int i = 0; i < g_window_width * g_window_height; ++i)
-    {
-        tex_data[4 * i] = 255;
-        tex_data[4 * i + 1] = 255;
-        tex_data[4 * i + 2] = 255;
-        tex_data[4 * i + 3] = 255;
-    }
-
-    //marking ray hits on triangle texture as red dots
-    for (int i = 0; i < 3; ++i)
-    {
-        if (isect[i].shapeid == kNullId)
-            continue;
-
-        float x = g_vertices[3] * isect[i].uvwt.x + g_vertices[6] * isect[i].uvwt.y + g_vertices[0] * (1 - isect[i].uvwt.x - isect[i].uvwt.y);
-        float y = g_vertices[4] * isect[i].uvwt.x + g_vertices[7] * isect[i].uvwt.y + g_vertices[1] * (1 - isect[i].uvwt.x - isect[i].uvwt.y);
-
-        int k = g_window_height * g_window_width * (y + 1) / 2 + g_window_width * (x + 1) / 2;
-        tex_data[k*4] = 255;
-        tex_data[k*4 + 1] = 0;
-        tex_data[k*4 + 2] = 0;
-    }
-
-    //update texture
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, g_window_width, g_window_height, GL_RGBA, GL_UNSIGNED_BYTE, tex_data.data());
-    glBindTexture(GL_TEXTURE_2D, NULL);
-
-    //draw scene and start main loop.
-    glutDisplayFunc(DrawScene);
-    glutMainLoop();
-
-    //cleanup
-    IntersectionApi::Delete(api);
-
-    return 0;
-}
diff --git a/Tutorials/Triangle/simple.fsh b/Tutorials/Triangle/simple.fsh
deleted file mode 100644
index 34bc176b..00000000
--- a/Tutorials/Triangle/simple.fsh
+++ /dev/null
@@ -1,30 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-uniform sampler2D g_Texture;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    gl_FragColor = texture2D(g_Texture, Texcoord);
-}
\ No newline at end of file
diff --git a/Tutorials/Triangle/simple.vsh b/Tutorials/Triangle/simple.vsh
deleted file mode 100644
index b60a420c..00000000
--- a/Tutorials/Triangle/simple.vsh
+++ /dev/null
@@ -1,32 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-attribute vec3 inPosition;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    Texcoord = 0.5* (inPosition.xy + 1);
-    gl_Position = vec4(inPosition, 1.0);
-}
-
diff --git a/Tutorials/TriangleLight/main.cpp b/Tutorials/TriangleLight/main.cpp
deleted file mode 100644
index a1d647a8..00000000
--- a/Tutorials/TriangleLight/main.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-#include "radeon_rays.h"
-#include <GL/glew.h>
-#include <GLUT/GLUT.h>
-#include <cassert>
-#include <iostream>
-#include <memory>
-#include "../Tools/shader_manager.h"
-
-using namespace RadeonRays;
-
-namespace {
-    float const g_vertices[] = {
-        -1.f,-1.f,0.f,
-        1.f,-1.f,0.f,
-        0.f,1.f,0.f,
-    };
-    float const g_normals[] = {
-        0.f,0.f,1.f,
-        0.f,0.f,1.f,
-        0.f,0.f,1.f,
-    };
-    int const g_indices[] = { 0, 1, 2 };
-    const int g_numfaceverts[] = { 3 };
-    const float3 g_color = { 1.f, 0.f, 0.f, 1.f };
-    GLuint g_vertex_buffer, g_index_buffer;
-    GLuint g_texture;
-    int g_window_width = 640;
-    int g_window_height = 480;
-    std::unique_ptr<ShaderManager> g_shader_manager;
-}
-
-float3 ConvertFromBarycentric(const float* vec, const int* ind, int prim_id, const float4& uvwt)
-{
-    float3 a = { vec[ind[prim_id * 3] * 3],
-        vec[ind[prim_id * 3] * 3 + 1],
-        vec[ind[prim_id * 3] * 3 + 2], };
-
-    float3 b = { vec[ind[prim_id * 3 + 1] * 3],
-        vec[ind[prim_id * 3 + 1] * 3 + 1],
-        vec[ind[prim_id * 3 + 1] * 3 + 2], };
-
-    float3 c = { vec[ind[prim_id * 3 + 2] * 3],
-        vec[ind[prim_id * 3 + 2] * 3 + 1],
-        vec[ind[prim_id * 3 + 2] * 3 + 2], };
-    return a * (1 - uvwt.x - uvwt.y) + b * uvwt.x + c * uvwt.y;
-}
-
-void InitGl()
-{
-    g_shader_manager.reset(new ShaderManager());
-
-    glClearColor(0.0, 0.0, 0.0, 0.0);
-    glCullFace(GL_NONE);
-    glDisable(GL_DEPTH_TEST);
-    glEnable(GL_TEXTURE_2D);
-
-    glGenBuffers(1, &g_vertex_buffer);
-    glGenBuffers(1, &g_index_buffer);
-
-    // create Vertex buffer
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-
-    float quad_vdata[] =
-    {
-        -1, -1, 0.5, 0, 0,
-        1, -1, 0.5, 1, 0,
-        1, 1, 0.5, 1, 1,
-        -1, 1, 0.5, 0, 1
-    };
-
-    GLshort quad_idata[] =
-    {
-        0, 1, 3,
-        3, 1, 2
-    };
-
-    // fill data
-    glBufferData(GL_ARRAY_BUFFER, sizeof(quad_vdata), quad_vdata, GL_STATIC_DRAW);
-    glBufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(quad_idata), quad_idata, GL_STATIC_DRAW);
-
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-
-    //texture
-    glGenTextures(1, &g_texture);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
-    glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
-    glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA, g_window_width, g_window_height, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-}
-
-void DrawScene()
-{
-    glDisable(GL_DEPTH_TEST);
-    glViewport(0, 0, g_window_width, g_window_height);
-    glClear(GL_COLOR_BUFFER_BIT);
-
-    glBindBuffer(GL_ARRAY_BUFFER, g_vertex_buffer);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, g_index_buffer);
-
-    //shader data
-    GLuint program = g_shader_manager->GetProgram("simple");
-    glUseProgram(program);
-    GLuint texloc = glGetUniformLocation(program, "g_Texture");
-    assert(texloc >= 0);
-
-    glUniform1i(texloc, 0);
-
-    glActiveTexture(GL_TEXTURE0);
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-
-    GLuint position_attr = glGetAttribLocation(program, "inPosition");
-    GLuint texcoord_attr = glGetAttribLocation(program, "inTexcoord");
-    glVertexAttribPointer(position_attr, 3, GL_FLOAT, GL_FALSE, sizeof(float) * 5, 0);
-    glVertexAttribPointer(texcoord_attr, 2, GL_FLOAT, GL_FALSE, sizeof(float) * 5, (void*)(sizeof(float) * 3));
-    glEnableVertexAttribArray(position_attr);
-    glEnableVertexAttribArray(texcoord_attr);
-
-    //draw rectanle
-    glDrawElements(GL_TRIANGLES, 6, GL_UNSIGNED_SHORT, nullptr);
-
-    glDisableVertexAttribArray(texcoord_attr);
-    glBindTexture(GL_TEXTURE_2D, 0);
-    glBindBuffer(GL_ARRAY_BUFFER, 0);
-    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
-    glUseProgram(0);
-
-    glFinish();
-    glutSwapBuffers();
-}
-
-int main(int argc, char* argv[])
-{
-    // GLUT Window Initialization:
-    glutInit(&argc, (char**)argv);
-    glutInitWindowSize(640, 480);
-    glutInitDisplayMode(GLUT_RGBA | GLUT_DOUBLE | GLUT_DEPTH);
-    glutCreateWindow("Triangle");
-#ifndef __APPLE__
-    GLenum err = glewInit();
-    if (err != GLEW_OK)
-    {
-        std::cout << "GLEW initialization failed\n";
-        return -1;
-    }
-#endif
-    // Prepare rectangle for drawing texture
-    // rendered using intersection results
-    InitGl();
-
-    // Choose device
-    int nativeidx = -1;
-    // Always use OpenCL
-    IntersectionApi::SetPlatform(DeviceInfo::kOpenCL);
-
-    for (auto idx = 0U; idx < IntersectionApi::GetDeviceCount(); ++idx)
-    {
-        DeviceInfo devinfo;
-        IntersectionApi::GetDeviceInfo(idx, devinfo);
-
-        if (devinfo.type == DeviceInfo::kGpu && nativeidx == -1)
-        {
-            nativeidx = idx;
-        }
-    }
-    assert(nativeidx != -1);
-    IntersectionApi* api = IntersectionApi::Create(nativeidx);
-
-    // Adding triangle to tracing scene
-    Shape* shape = api->CreateMesh(g_vertices, 3, 3 * sizeof(float), g_indices, 0, g_numfaceverts, 1);
-    assert(shape != nullptr);
-    api->AttachShape(shape);
-    // Commit scene changes
-    api->Commit();
-
-    const int k_raypack_size = g_window_height * g_window_width;
-
-    // Prepare rays. One for each texture pixel.
-    std::vector<ray> rays(k_raypack_size);
-    for (int i = 0; i < g_window_height; ++i)
-        for (int j = 0; j < g_window_width; ++j)
-        {
-            const float xstep = 2.f / (float)g_window_width;
-            const float ystep = 2.f / (float)g_window_height;
-            float x = -1.f + xstep * (float)j;
-            float y = -1.f + ystep * (float)i;
-            float z = 1.f;
-            rays[i * g_window_width + j].o = float3(x, y, z, 1000.f);
-            rays[i * g_window_width + j].d = float3{0.f, 0.f, -1.f};
-        }
-    Buffer* ray_buffer = api->CreateBuffer(rays.size() * sizeof(ray), rays.data());
-
-    // Intersection data
-    std::vector<Intersection> isect(k_raypack_size);
-    Buffer* isect_buffer = api->CreateBuffer(isect.size() * sizeof(Intersection), nullptr);
-
-    // Intersection
-    api->QueryIntersection(ray_buffer, k_raypack_size, isect_buffer, nullptr, nullptr);
-    
-    // Get results
-    Event* e = nullptr;
-    Intersection* tmp = nullptr;
-    api->MapBuffer(isect_buffer, kMapRead, 0, isect.size() * sizeof(Intersection), (void**)&tmp, &e);
-    // RadeonRays calls are asynchronous, so need to wait for calculation to complete.
-    e->Wait();
-    api->DeleteEvent(e);
-    e = nullptr;
-
-    // Copy results
-    for (int i = 0; i < k_raypack_size; ++i)
-    {
-        isect[i] = tmp[i];
-    }
-
-    // Point light position
-    float3 light = { 0.f, 0.f, 0.25f };
-
-    // Render triangle and lightning
-    std::vector<unsigned char> tex_data(k_raypack_size * 4);
-    for (int i = 0; i < k_raypack_size; ++i)
-    {
-        int shape_id = isect[i].shapeid;
-        int prim_id = isect[i].primid;
-
-        if (shape_id != kNullId)
-        {
-            // Calculate position and normal of the intersection point
-            float3 pos = ConvertFromBarycentric(g_vertices, g_indices, prim_id, isect[i].uvwt);
-            float3 norm = ConvertFromBarycentric(g_normals, g_indices, prim_id, isect[i].uvwt);
-            norm.normalize();
-
-            // Calculate lighting
-            float3 col = { 0.f, 0.f, 0.f };
-            float3 light_dir = light - pos;
-            light_dir.normalize();
-            float dot_prod = dot(norm, light_dir);
-            if (dot_prod > 0)
-                col += dot_prod * g_color;
-
-            tex_data[i * 4] = col[0] * 255;
-            tex_data[i * 4 + 1] = col[1] * 255;
-            tex_data[i * 4 + 2] = col[2] * 255;
-            tex_data[i * 4 + 3] = 255;
-        }
-        else
-        {
-            // Draw white pixels for misses
-            tex_data[i * 4] = 255;
-            tex_data[i * 4 + 1] = 255;
-            tex_data[i * 4 + 2] = 255;
-            tex_data[i * 4 + 3] = 255;
-        }
-    }
-
-    // Update texture data
-    glBindTexture(GL_TEXTURE_2D, g_texture);
-    glTexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, g_window_width, g_window_height, GL_RGBA, GL_UNSIGNED_BYTE, tex_data.data());
-    glBindTexture(GL_TEXTURE_2D, NULL);
-
-    // Start main loop.
-    glutDisplayFunc(DrawScene);
-    glutMainLoop(); 
-
-    // Cleanup
-    IntersectionApi::Delete(api);
-
-    return 0;
-}
diff --git a/Tutorials/TriangleLight/simple.fsh b/Tutorials/TriangleLight/simple.fsh
deleted file mode 100644
index 34bc176b..00000000
--- a/Tutorials/TriangleLight/simple.fsh
+++ /dev/null
@@ -1,30 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-uniform sampler2D g_Texture;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    gl_FragColor = texture2D(g_Texture, Texcoord);
-}
\ No newline at end of file
diff --git a/Tutorials/TriangleLight/simple.vsh b/Tutorials/TriangleLight/simple.vsh
deleted file mode 100644
index 002269d4..00000000
--- a/Tutorials/TriangleLight/simple.vsh
+++ /dev/null
@@ -1,33 +0,0 @@
-/**********************************************************************
-Copyright (c) 2016 Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-********************************************************************/
-
-attribute vec3 inPosition;
-attribute vec2 inTexcoord;
-
-varying vec2 Texcoord;
-
-void main()
-{
-    Texcoord = inTexcoord;
-    gl_Position = vec4(inPosition, 1.0);
-}
-
diff --git a/UnitTest/clw_test_cl.h b/UnitTest/clw_test_cl.h
index b95a4947..734ed8eb 100644
--- a/UnitTest/clw_test_cl.h
+++ b/UnitTest/clw_test_cl.h
@@ -37,7 +37,7 @@ THE SOFTWARE.
 #include "CLW.h"
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/UnitTest/radeon_rays_performance_test_cl.h b/UnitTest/radeon_rays_performance_test_cl.h
index eb6a3e5a..3bbd6e98 100644
--- a/UnitTest/radeon_rays_performance_test_cl.h
+++ b/UnitTest/radeon_rays_performance_test_cl.h
@@ -38,7 +38,7 @@ using namespace tinyobj;
 #include <chrono>
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif
diff --git a/UnitTest/radeon_rays_test_cl.h b/UnitTest/radeon_rays_test_cl.h
index e527470e..b614363e 100644
--- a/UnitTest/radeon_rays_test_cl.h
+++ b/UnitTest/radeon_rays_test_cl.h
@@ -39,7 +39,7 @@ THE SOFTWARE.
 using namespace RadeonRays;
 
 #ifdef __APPLE__
-#include <OpenCL/OpenCL.h>
+#include <OpenCL/opencl.h>
 #else
 #include <CL/cl.h>
 #endif