
oneAPI backend update: kernel and layer optimizations #1246


Draft · wants to merge 25 commits into base: main
Changes from 19 commits (25 commits total)
70323c9
Init: add examples
haoyanwa Feb 20, 2025
4162599
Input and output DMA.
haoyanwa Feb 21, 2025
34f0d82
Added streaming beat control signal.
haoyanwa Feb 21, 2025
951a1f6
Restartable kernel for io_parallel.
haoyanwa Feb 21, 2025
8445de7
Updated oneAPI backend testbench.
haoyanwa Feb 24, 2025
0d21e99
Updated oneAPI template: io_stream kernel template.
haoyanwa Feb 24, 2025
257385a
Remove temp files.
haoyanwa Feb 24, 2025
0b8ef13
Refactoring oneAPI backend myproject_test.
haoyanwa Feb 24, 2025
cf98216
Merge branch 'fastmachinelearning:main' into oneapi_backend/experiment
haoyanwa Feb 24, 2025
70054aa
Cosmetic change.
haoyanwa Feb 24, 2025
c307715
oneAPI backend simulation support.
haoyanwa Feb 25, 2025
454d556
Merge branch 'main' into oneapi_backend/experiment
jmitrevs Mar 6, 2025
7e028e6
pre-commit fixes
jmitrevs Mar 26, 2025
97c187d
Merge branch 'main' into oneapi_backend/experiment
jmitrevs Mar 26, 2025
00f82a3
oneAPI BSP support.
haoyanwa Apr 1, 2025
496846d
User API and documentation.
haoyanwa Apr 1, 2025
84ad787
Merge pull request #1254 from haoyanwa/oneapi_backend/experiment
jmitrevs Apr 2, 2025
120c2e4
pre-commit fixes
jmitrevs Apr 2, 2025
e2cec76
Merge branch 'main' into oneapi_backend/experiment
jmitrevs Apr 16, 2025
d869a5c
update convert_data and convert_data_back to use packets
jmitrevs Apr 17, 2025
7e2e747
consolidate convert_data and DMA_convert_data in nnet_data_movement.h
jmitrevs May 1, 2025
0b3dbeb
update all the activations
jmitrevs May 2, 2025
36881e0
migrate batchnorm to restartable
jmitrevs May 2, 2025
60c0f42
Merge remote-tracking branch 'upstream/main' into jm_oneAPI_experiment
jmitrevs Jun 11, 2025
44ee08f
pre-commit fix
jmitrevs Jun 12, 2025
18 changes: 18 additions & 0 deletions docs/api/configuration.rst
@@ -101,6 +101,20 @@ Finally, one then uses the configuration to create an hls model:
backend='Vitis'
)

To target a oneAPI Board Support Package (BSP) enabled FPGA for offload acceleration, set the ``part`` argument to the path to your BSP followed by the BSP variant, and set ``use_oneapi_bsp=True``.

.. code-block:: python

hls_model = hls4ml.converters.convert_from_keras_model(
model,
hls_config=config,
output_dir="my_project_dir",
io_type="io_parallel",
backend="oneAPI",
part="/path/to/my/bsp:bsp_variant",
use_oneapi_bsp=True
)

See :py:class:`~hls4ml.converters.convert_from_keras_model` for more information on the various options. Similar functions exist for ONNX and PyTorch.

----
@@ -132,6 +146,9 @@ It looks like this:
ClockPeriod: 5
IOType: io_parallel # options: io_parallel/io_stream

# oneAPI Offload Acceleration flag.
UseOneAPIBSP: True

HLSConfig:
Model:
Precision: fixed<16,6>
@@ -156,6 +173,7 @@ The backend-specific section of the configuration depends on the backend. You ca
For Vivado backend the options are:

* **Part**\ : the particular FPGA part number that you are considering, here it's a Xilinx Virtex UltraScale+ VU13P FPGA
* **UseOneAPIBSP**\ : set to ``True`` to enable offload acceleration with an Altera FPGA through a oneAPI Board Support Package; the BSP path and variant are supplied via **Part**. This is only needed if you are using the oneAPI backend.
* **ClockPeriod**\ : the clock period, in ns, at which your algorithm runs
Then you have some optimization parameters for how your algorithm runs:
* **IOType**\ : your options are ``io_parallel`` or ``io_stream`` which defines the type of data structure used for inputs, intermediate activations between layers, and outputs. For ``io_parallel``, arrays are used that, in principle, can be fully unrolled and are typically implemented in RAMs. For ``io_stream``, HLS streams are used, which are a more efficient/scalable mechanism to represent data that are produced and consumed in a sequential manner. Typically, HLS streams are implemented with FIFOs instead of RAMs. For more information see `here <https://docs.xilinx.com/r/en-US/ug1399-vitis-hls/pragma-HLS-stream>`__.
6 changes: 4 additions & 2 deletions hls4ml/backends/oneapi/oneapi_backend.py
@@ -130,7 +130,9 @@ def get_default_flow(self):
def get_writer_flow(self):
return self._writer_flow

def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_parallel', write_tar=False, **_):
def create_initial_config(
self, part='Arria10', clock_period=5, io_type='io_parallel', write_tar=False, use_oneapi_bsp=False, **_
):
"""Create initial configuration of the oneAPI backend.

Args:
@@ -154,7 +156,7 @@ def create_initial_config(self, part='Arria10', clock_period=5, io_type='io_para
# TODO: add namespace
'WriteTar': write_tar,
}

config['UseOneAPIBSP'] = use_oneapi_bsp
return config

def compile(self, model):
28 changes: 19 additions & 9 deletions hls4ml/backends/oneapi/oneapi_types.py
@@ -170,11 +170,18 @@ def definition_cpp(self, name_suffix='', as_reference=False):
else:
return f'{self.type.name} {self.name}{name_suffix}'

def declare_cpp(self, pipe_min_size=0, indent=''):
lines = indent + f'class {self.pipe_id};\n'
lines += indent + (
f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
+ f'{self.type.name}, {pipe_min_size}, PipeProps>;\n'
# Default pipe min size raised to 32 for simulation.
def declare_cpp(self, pipe_min_size=32, indent=''):
# Updated to use streaming beat for restartable streaming kernel.
# Streaming beat is a wrapper type of the actual type with sideband control signals.
# Syntax: using BeatT = sycl::ext::intel::experimental::StreamingBeat<DataT, eop, empty>;
streaming_beat_t = f"{self.pipe_name}BeatT"
lines = (
f"{indent}class {self.pipe_id};\n"
f"{indent}using {streaming_beat_t} = "
f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
f"{indent}using {self.pipe_name} = sycl::ext::intel::experimental::pipe<"
f"{self.pipe_id}, {streaming_beat_t}, {pipe_min_size}, HostPipePropertiesT>;\n"
)
return lines

@@ -193,10 +200,13 @@ def definition_cpp(self, name_suffix='', as_reference=True):
return f'{self.name}{name_suffix}'

def declare_cpp(self, indent=''):
lines = indent + f'class {self.pipe_id};\n'
lines += indent + (
f'using {self.pipe_name} = sycl::ext::intel::experimental::pipe<{self.pipe_id}, '
+ f'{self.type.name}, {self.pragma[-1]}>;\n'
streaming_beat_t = f"{self.pipe_name}BeatT"
lines = (
f"{indent}class {self.pipe_id};\n"
f"{indent}using {streaming_beat_t} = "
f"sycl::ext::intel::experimental::StreamingBeat<{self.type.name}, true, true>;\n"
f"{indent}using {self.pipe_name} = "
f"sycl::ext::intel::experimental::pipe<{self.pipe_id}, {streaming_beat_t}, {self.pragma[-1]}>;\n"
)
return lines

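For a result pipe carrying ``std::array<float, 4>``, the aliases emitted by the updated ``declare_cpp`` have the following shape. The SYCL extension templates are replaced here by minimal mock stand-ins (an assumption, purely for illustration) so the sketch compiles standalone; the real types come from ``sycl/ext/intel/prototype/pipes_ext.hpp``, and the ``Layer1OutPipe`` names are hypothetical:

```cpp
#include <array>
#include <cstddef>
#include <type_traits>

// Mock stand-ins (hypothetical) for the SYCL extension templates.
template <typename DataT, bool EnableSOP, bool EnableEmpty>
struct StreamingBeat {
    DataT data;
    bool sop = false;
    bool eop = false;
};
struct HostPipePropertiesT {};
template <typename Id, typename BeatT, std::size_t MinCapacity, typename Props>
struct pipe {
    using value_type = BeatT;
    static constexpr std::size_t min_capacity = MinCapacity;
};

// Shape of the generated declarations (pipe_min_size defaults to 32):
class Layer1OutPipeID;
using Layer1OutPipeBeatT = StreamingBeat<std::array<float, 4>, true, true>;
using Layer1OutPipe = pipe<Layer1OutPipeID, Layer1OutPipeBeatT, 32, HostPipePropertiesT>;

static_assert(std::is_same<Layer1OutPipe::value_type, Layer1OutPipeBeatT>::value,
              "the pipe now carries StreamingBeat packets, not raw arrays");
```

The key change from the old template: the pipe's payload is the beat wrapper (data plus sop/eop sideband), not the bare array type.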
11 changes: 8 additions & 3 deletions hls4ml/templates/oneapi/CMakeLists.txt
@@ -38,15 +38,20 @@ set(LIBRARY_NAME myproject-${LIB_STAMP})
# You can also specify a device family (E.g. "Arria10" or "Stratix10") or a
# specific part number (E.g. "10AS066N3F40E2SG") to generate a standalone IP.
if(NOT DEFINED FPGA_DEVICE)
set(FPGA_DEVICE "Arria10")
set(FPGA_DEVICE "Agilex7")
set(BSP_FLAG "")
endif()

# Set the target to a BSP if we target an actual accelerator board.
# hls-fpga-machine-learning insert oneapi_bsp_cmake_flag

# Use cmake -DUSER_FPGA_FLAGS=<flags> to set extra flags for FPGA backend
# compilation.
set(USER_FPGA_FLAGS -Wno-unused-label ${USER_FPGA_FLAGS})
# -Xsoptimize=latency Turns off the hyper-optimized handshake
set(USER_FPGA_FLAGS -Wno-unused-label;${USER_FPGA_FLAGS};-Xsoptimize=latency)

# Use cmake -DUSER_FLAGS=<flags> to set extra flags for general compilation.
set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS})
set(USER_FLAGS -Wno-unused-label -fconstexpr-steps=134217728 ${USER_FLAGS} ${BSP_FLAG})

# Use cmake -DUSER_INCLUDE_PATHS=<paths> to set extra paths for general
# compilation.
116 changes: 115 additions & 1 deletion hls4ml/templates/oneapi/firmware/myproject.h
@@ -8,6 +8,117 @@
// currently this is fixed
using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext::intel::experimental::ready_latency<0>));

// Pipe properties for host pipes. Host pipes connect to the data source DMA and sink DMA.
// They are connected to the first and the last layer to stream data into and out from the kernel.
using HostPipePropertiesT = decltype(sycl::ext::oneapi::experimental::properties(
sycl::ext::intel::experimental::ready_latency<0>, sycl::ext::intel::experimental::bits_per_symbol<16>,
sycl::ext::intel::experimental::uses_valid<true>, sycl::ext::intel::experimental::first_symbol_in_high_order_bits<true>,
sycl::ext::intel::experimental::protocol_avalon_streaming_uses_ready));

namespace nnet {

#if !defined(IS_BSP)
// Definition for buffer locations for Avalon MM host.
inline constexpr unsigned kInputBufferLocation = 0;
inline constexpr unsigned kOutputBufferLocation = 1;
#endif

// Implementation of a direct memory access kernel. Move data from source, convert,
// and send to the sink. Adaptive to SYCL HLS and hardware acceleration flow.
template <class src_T, class dest_pipe> struct DMA_convert_data {
#if !defined(IS_BSP)
// When targeting a device family, we instantiate an Avalon Memory Mapped Host for
[Review comment from @jmitrevs (Contributor Author), Apr 17, 2025]
I wonder if all the DMA_convert_data things should be moved to a different file. In the SYCL HLS style they are effectively part of the testbench, so I think they should be in a different file. In the accelerator flow, they still are different kernels, utility kernels in a way, so I think they should be separate.
// data transaction between host and the DMA kernel during emulation and simulation.
sycl::ext::oneapi::experimental::annotated_arg<
src_T *,
decltype(sycl::ext::oneapi::experimental::properties{
sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>,
sycl::ext::intel::experimental::buffer_location<kInputBufferLocation>,
sycl::ext::intel::experimental::read_write_mode_read, sycl::ext::intel::experimental::wait_request_requested})>
#else
// When targeting oneAPI BSP, we can use USM pointer to access host memory.
src_T *const
#endif
src;
size_t num_iteration;

[[intel::kernel_args_restrict]] void operator()() const {

#if defined(IS_BSP)
// Access data using host pointer.
sycl::ext::intel::host_ptr<src_T> src_ptr(src);
#else
// Host allocation is not supported when targeting an FPGA family or part number.
src_T *src_ptr(src);
#endif
// First, extract the PipeDataT from the pipe
using PipeDataType = typename nnet::ExtractPipeType<dest_pipe>::value_type;
// Then, extract the DataT from StreamingBeat
using DstDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
constexpr auto dstTypeSize = std::tuple_size<DstDataType>{};

[[intel::fpga_register]] typename nnet::ExtractPipeType<dest_pipe>::value_type packet;

// Keep sending data to the input layer and keep the kernels running.
for (size_t i = 0; i < num_iteration; i++) {
#pragma unroll
for (size_t j = 0; j < dstTypeSize; j++) {
packet.data[j] = src_ptr[i * dstTypeSize + j];
}
packet.sop = (i == 0);
// Assert the end-of-packet signal on the last iteration.
// Downstream kernels stop once they see eop.
packet.eop = (i == (num_iteration - 1));
dest_pipe::write(packet);
}
}
};
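The packing loop above can be modeled on the host. ``Beat``, ``kPacketWidth``, and the queue-backed pipe below are mock stand-ins (assumptions, not the SYCL API), but the sop/eop bookkeeping mirrors the kernel's loop:

```cpp
#include <cassert>
#include <cstddef>
#include <queue>

// kPacketWidth plays the role of dstTypeSize (the std::array width per beat).
constexpr std::size_t kPacketWidth = 4;

struct Beat {
    float data[kPacketWidth] = {};
    bool sop = false;
    bool eop = false;
};

// Pack num_iteration beats from src: sop on the first beat, eop on the last,
// mirroring the bookkeeping in DMA_convert_data::operator().
inline void dma_convert_data(const float *src, std::size_t num_iteration,
                             std::queue<Beat> &dest_pipe) {
    for (std::size_t i = 0; i < num_iteration; i++) {
        Beat packet;
        for (std::size_t j = 0; j < kPacketWidth; j++)
            packet.data[j] = src[i * kPacketWidth + j];
        packet.sop = (i == 0);
        packet.eop = (i == (num_iteration - 1));
        dest_pipe.push(packet);
    }
}
```

Because only the final beat carries eop, every downstream restartable kernel keeps running until the full inference batch has flowed through.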

// Symmetrical to the DMA_convert_data above, this DMA drains the output pipe and
// writes result to memory.
template <class src_pipe, class dst_T> struct DMA_convert_data_back {
#if !defined(IS_BSP)
// Without BSP, instantiate an Avalon Memory Mapped Host to write to host.
sycl::ext::oneapi::experimental::annotated_arg<
dst_T *,
decltype(sycl::ext::oneapi::experimental::properties{
sycl::ext::intel::experimental::latency<0>, sycl::ext::intel::experimental::dwidth<16>,
sycl::ext::intel::experimental::buffer_location<kOutputBufferLocation>,
sycl::ext::intel::experimental::read_write_mode_write, sycl::ext::intel::experimental::wait_request_requested})>
#else
// USM pointer, otherwise.
dst_T *const
#endif
dst;
size_t num_iteration;

[[intel::kernel_args_restrict]] void operator()() const {
#if defined(IS_BSP)
sycl::ext::intel::host_ptr<dst_T> dst_ptr(dst);
#else
dst_T *dst_ptr(dst);
#endif
// First, extract the PipeDataT from the pipe
using PipeDataType = typename nnet::ExtractPipeType<src_pipe>::value_type;
// Then, extract the DataT from StreamingBeat
using SrcDataType = typename nnet::ExtractDataType<PipeDataType>::value_type;
constexpr auto srcTypeSize = std::tuple_size<SrcDataType>{};

[[intel::fpga_register]] typename nnet::ExtractPipeType<src_pipe>::value_type packet;

// Drain the output pipe and write result to memory.
for (size_t i = 0; i < num_iteration; i++) {
packet = src_pipe::read();
#pragma unroll
for (size_t j = 0; j < srcTypeSize; j++) {
dst_ptr[i * srcTypeSize + j] = static_cast<dst_T>(packet.data[j].to_double());
}
}
}
};

} // namespace nnet

// Need to declare the input and output pipes

// hls-fpga-machine-learning insert inputs
@@ -16,12 +127,15 @@ using PipeProps = decltype(sycl::ext::oneapi::experimental::properties(sycl::ext
class MyProjectID;

struct MyProject {

#ifndef IS_BSP
// kernel property method to config invocation interface
auto get(sycl::ext::oneapi::experimental::properties_tag) {
return sycl::ext::oneapi::experimental::properties{sycl::ext::intel::experimental::streaming_interface<>,
sycl::ext::intel::experimental::pipelined<>};
}
#else
// kernel properties and pipelining are not supported in the BSP flow.
#endif

SYCL_EXTERNAL void operator()() const;
};
@@ -29,23 +29,31 @@ template <class data_pipe, class res_pipe, typename CONFIG_T> void linear_stream
// *************************************************
// ReLU Activation
// *************************************************
template <class data_pipe, class res_pipe, typename CONFIG_T> void relu_stream() {
template <class data_pipe, class res_pipe, typename CONFIG_T> [[intel::use_stall_enable_clusters]] void relu_stream() {
using namespace nnet;
using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;
[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type out_data;

bool keep_going = true;
ReLUActLoop:
[[intel::initiation_interval(
1)]] for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; i++) {
auto in_data = data_pipe::read();
typename ExtractPipeType<res_pipe>::value_type out_data;
[[intel::initiation_interval(1)]] while (keep_going) {
for (int i = 0; i < CONFIG_T::n_in / std::tuple_size<ResT>{}; i++) {
[[intel::fpga_register]] auto in_data = data_pipe::read();
ReLUPackLoop:
#pragma unroll
for (int j = 0; j < std::tuple_size<ResT>{}; j++) {
if (in_data.data[j] > 0)
out_data.data[j] = in_data.data[j];
else
out_data.data[j] = 0;
}

ReLUPackLoop:
#pragma unroll
for (int j = 0; j < std::tuple_size<typename ExtractPipeType<res_pipe>::value_type>{}; j++) {
if (in_data[j] > 0)
out_data[j] = in_data[j];
else
out_data[j] = 0;
}
out_data.sop = in_data.sop;
out_data.eop = in_data.eop;
res_pipe::write(out_data);

res_pipe::write(out_data);
keep_going = !in_data.eop;
}
}
}

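A host-side sketch of the restartable activation pattern, with a mock ``Beat`` and ``std::queue`` standing in for the SYCL beat and pipe types (assumptions for illustration); the ``keep_going = !in.eop`` exit condition mirrors the kernel:

```cpp
#include <cassert>
#include <queue>

constexpr int kWidth = 4; // stand-in for the pipe's array width

struct Beat {
    float data[kWidth] = {};
    bool sop = false;
    bool eop = false;
};

// Drain beats, apply ReLU elementwise, forward sop/eop unchanged, and exit
// once the end-of-packet beat has been processed.
inline void relu_stream(std::queue<Beat> &data_pipe, std::queue<Beat> &res_pipe) {
    bool keep_going = true;
    while (keep_going && !data_pipe.empty()) {
        Beat in = data_pipe.front();
        data_pipe.pop();
        Beat out;
        for (int j = 0; j < kWidth; j++)
            out.data[j] = in.data[j] > 0 ? in.data[j] : 0;
        out.sop = in.sop; // sideband signals pass through untouched
        out.eop = in.eop;
        res_pipe.push(out);
        keep_going = !in.eop; // last beat of the packet ends the loop
    }
}
```

On hardware the outer while-1 loop never observes an empty pipe (reads block); the ``!data_pipe.empty()`` guard is only needed so this host model terminates.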
33 changes: 26 additions & 7 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_dense_stream.h
@@ -7,15 +7,34 @@

namespace nnet {

// Note: DataPack logic removed, at least in the initial version
// Restartable streaming kernel implementation.
// Computation is carried out in a while-1 loop as long as there is valid input.
// The loop breaks when the end-of-packet signal is asserted by upstream task.
template <class data_pipe, class res_pipe, typename CONFIG_T>
void dense_resource_stream(typename CONFIG_T::weight_t weights, typename CONFIG_T::bias_t biases) {
[[intel::use_stall_enable_clusters]] void dense_resource_stream(const typename CONFIG_T::weight_t weights,
const typename CONFIG_T::bias_t biases) {
using namespace nnet;
using DataT = typename ExtractDataType<typename ExtractPipeType<data_pipe>::value_type>::value_type;
using ResT = typename ExtractDataType<typename ExtractPipeType<res_pipe>::value_type>::value_type;

[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type res;
[[intel::fpga_register]] auto data = data_pipe::read();
dense_resource<typename ExtractPipeType<data_pipe>::value_type, typename ExtractPipeType<res_pipe>::value_type,
CONFIG_T>(data, res, weights, biases);
res_pipe::write(res);
[[intel::fpga_register]] typename ExtractPipeType<res_pipe>::value_type resbeat;

bool keep_going = true;
bool did_read_input;
[[intel::initiation_interval(1)]] while (keep_going) {
did_read_input = false;
[[intel::fpga_register]] auto databeat = data_pipe::read(did_read_input);

if (did_read_input) {
dense_resource<DataT, ResT, CONFIG_T>(databeat.data, resbeat.data, weights, biases);

resbeat.sop = databeat.sop;
resbeat.eop = databeat.eop;

res_pipe::write(resbeat);
keep_going = !databeat.eop;
}
}
}

} // namespace nnet
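The non-blocking read pattern in ``dense_resource_stream`` can be sketched with a mock pipe whose ``read(bool&)`` mirrors the SYCL pipe's non-blocking overload. All names below are illustrative stand-ins, and the multiply stands in (hypothetically) for ``dense_resource``:

```cpp
#include <cassert>
#include <queue>

struct Beat {
    float data = 0.0f;
    bool eop = false;
};

struct MockPipe {
    std::queue<Beat> q;
    // Non-blocking read: success reports whether a beat was available.
    Beat read(bool &success) {
        success = !q.empty();
        Beat b;
        if (success) {
            b = q.front();
            q.pop();
        }
        return b;
    }
};

// Returns the number of empty-pipe spins; max_spins bounds the host model.
inline int run_kernel(MockPipe &in, MockPipe &out, float weight, int max_spins) {
    bool keep_going = true;
    int spins = 0;
    while (keep_going && spins < max_spins) {
        bool did_read_input = false;
        Beat databeat = in.read(did_read_input);
        if (did_read_input) {
            Beat resbeat;
            resbeat.data = databeat.data * weight; // stand-in compute
            resbeat.eop = databeat.eop;            // forward the sideband signal
            out.q.push(resbeat);
            keep_going = !databeat.eop; // stop after end-of-packet
        } else {
            spins++; // on hardware the loop simply iterates again
        }
    }
    return spins;
}
```

Guarding the compute on ``did_read_input`` keeps the II=1 loop spinning harmlessly when no beat is available, instead of blocking and stalling the restartable kernel.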
11 changes: 11 additions & 0 deletions hls4ml/templates/oneapi/firmware/nnet_utils/nnet_types.h
@@ -8,6 +8,8 @@
#include <tuple>
#include <utility>

#include <sycl/ext/intel/prototype/pipes_ext.hpp> // Streaming Beat and pipe properties.

namespace nnet {

// Define the pipe type that we use
@@ -34,6 +36,15 @@ struct ExtractPipeType<PipeClass<PipeName, PipeDataT, kPipeMinCapacity, PipeProp
typedef PipeDataT value_type;
};

// Helper template for extracting datatype from oneAPI StreamingBeat type.
template <typename T> struct ExtractDataType { typedef T value_type; };

// Specialization on oneAPI StreamingBeat type.
template <typename DataT, bool EnableSOP, bool EnableEmpty>
struct ExtractDataType<sycl::ext::intel::experimental::StreamingBeat<DataT, EnableSOP, EnableEmpty>> {
typedef DataT value_type;
};
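The two-level extraction, pipe ``value_type`` first and then the beat payload, can be exercised in plain C++. ``StreamingBeat`` and ``MockPipe`` below are stand-ins for the SYCL extension types (assumptions), but the trait pattern is the same as ``nnet::ExtractDataType``:

```cpp
#include <array>
#include <type_traits>

// Stand-ins (hypothetical) for the SYCL extension types.
template <typename DataT, bool EnableSOP, bool EnableEmpty>
struct StreamingBeat {
    DataT data;
    bool sop = false;
    bool eop = false;
};

template <typename Id, typename PipeDataT>
struct MockPipe {
    using value_type = PipeDataT;
};

// Identity by default, with a specialization that peels the payload type
// out of a StreamingBeat.
template <typename T> struct ExtractDataType {
    typedef T value_type;
};
template <typename DataT, bool EnableSOP, bool EnableEmpty>
struct ExtractDataType<StreamingBeat<DataT, EnableSOP, EnableEmpty>> {
    typedef DataT value_type;
};

using PayloadT = std::array<float, 4>;
using BeatT = StreamingBeat<PayloadT, true, true>;
using PipeT = MockPipe<class InputPipeId, BeatT>;

// First extract the beat type from the pipe, then the payload from the beat.
using ExtractedBeatT = PipeT::value_type;
using ExtractedDataT = ExtractDataType<ExtractedBeatT>::value_type;

static_assert(std::is_same<ExtractedDataT, PayloadT>::value, "payload recovered");
// A non-beat type falls through the identity case.
static_assert(std::is_same<ExtractDataType<int>::value_type, int>::value, "identity");
```

The identity primary template is what lets layer code apply ``ExtractDataType`` uniformly, whether or not the pipe carries beats.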

/*
* HLS Shift Register Implementation
* To verify a shift register is used in hardware, go to report.html > Area Analysis of System