Xilinx
diff --git a/blas/Jenkinsfile b/blas/Jenkinsfile
Lines changed: 4 additions & 0 deletions
diff --git a/blas/L1/include/hw/xf_blas/gemm.hpp b/blas/L1/include/hw/xf_blas/gemm.hpp
Lines changed: 108 additions & 0 deletions
diff --git a/blas/L1/include/hw/xf_blas/helpers/dataMover/transpMatB2.hpp b/blas/L1/include/hw/xf_blas/helpers/dataMover/transpMatB2.hpp
Lines changed: 40 additions & 16 deletions
diff --git a/blas/L1/include/hw/xf_blas/helpers/dataMover/vecMoverB1.hpp b/blas/L1/include/hw/xf_blas/helpers/dataMover/vecMoverB1.hpp
Lines changed: 0 additions & 1 deletion
diff --git a/blas/L1/include/hw/xf_blas/helpers/funcs/sum.hpp b/blas/L1/include/hw/xf_blas/helpers/funcs/sum.hpp
Lines changed: 0 additions & 2 deletions
@@ -0,0 +1,4 @@
+@Library('pipeline-library')_  // load the Jenkins shared library named 'pipeline-library'
+
+VitisLibPipeline (branch: 'master', libname: 'xf_blas', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vitis_sw_emu:vitis_hw_emu',  // colon-separated target list
+                  email: 'lingl@xilinx.com', devtest: 'RunDeploy.sh', TOOLVERSION: '2019.2_released')  // VitisLibPipeline is presumably defined by the shared library - confirm
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2019 Xilinx, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef XF_BLAS_GEMM_HPP
+#define XF_BLAS_GEMM_HPP
+
+#ifndef __cplusplus
+#error "BLAS Library only works with C++."
+#endif
+
+#include "ap_int.h"
+#include "hls_stream.h"
+#include "xf_blas/helpers.hpp"
+#include "scal.hpp"
+#include "axpy.hpp"
+
+namespace xf {
+
+namespace blas {
+
+template <typename t_DataType, unsigned int t_M, unsigned int t_N = t_M, typename t_MacDataType = t_DataType>
+class SystolicArray {  // t_M x t_N grid of multiply-accumulate cells fed by two streams of wide words
+   public:
+    static void process_dsp(unsigned int p_k,  // p_k: iterations accumulated into each result before it is latched
+                            hls::stream<WideType<t_DataType, t_M> >& p_As,  // one t_M-wide word read per input iteration
+                            hls::stream<WideType<t_DataType, t_N> >& p_Bs,  // one t_N-wide word read per input iteration
+                            hls::stream<WideType<t_MacDataType, t_N> >& p_sum,  // receives latched result rows, t_N values each
+                            unsigned int p_multi = 1) {  // p_multi: number of back-to-back passes of p_k iterations
+#ifndef __SYNTHESIS__
+        assert(p_k >= t_M + t_N);  // NOTE(review): no <cassert> include here; presumably via helpers.hpp - confirm
+#endif
+
+        WideType<t_DataType, t_M + t_N> l_winA[t_M];  // one (t_M + t_N)-entry window per element of the A word
+#pragma HLS ARRAY_PARTITION variable = l_winA dim = 0 complete
+        WideType<t_DataType, t_M + t_N> l_winB[t_N];  // one (t_M + t_N)-entry window per element of the B word
+#pragma HLS ARRAY_PARTITION variable = l_winB dim = 0 complete
+
+        WideType<t_MacDataType, t_N> l_C[t_M];  // running accumulators, one per cell (m, n)
+#pragma HLS ARRAY_PARTITION variable = l_C dim = 0 complete
+        WideType<t_MacDataType, t_N> l_Co[t_M];  // latched copies of l_C that get streamed out
+#pragma HLS ARRAY_PARTITION variable = l_Co dim = 0 complete
+
+        for (int k = 0, l = 0; l < p_multi * p_k + t_M + t_N; l++, k++) {  // l: global step; k: l wrapped to [0, p_k)
+#pragma HLS PIPELINE
+            if (k == p_k) {
+                k = 0;  // start of the next pass
+            }
+
+            if (l > p_k && k >= t_N && k < t_M + t_N) {  // from the second pass on, during steps t_N .. t_M + t_N - 1
+                p_sum.write(l_Co[k - t_N]);  // emit one latched row per step
+            }
+
+            WideType<t_DataType, t_M> l_A = 0;  // keeps its `= 0` value on the trailing steps that read no input
+            WideType<t_DataType, t_N> l_B = 0;
+
+            if (l < p_multi * p_k) {  // inputs are consumed only for the first p_multi * p_k steps
+                l_A = p_As.read();
+                l_B = p_Bs.read();
+            }
+
+            for (int j = 0; j < t_M; j++) l_winA[j].shift(l_A[j]);  // feed l_A[j] to window j (presumably a shift register - see WideType::shift)
+            for (int j = 0; j < t_N; j++) l_winB[j].shift(l_B[j]);
+            for (int m = 0; m < t_M; m++) {
+                for (int n = 0; n < t_N; n++) {
+                    int l_id = m + n;  // step and window slot associated with cell (m, n)
+                    if (l_id == k) {  // once per pass: latch the sum for output, then restart the accumulator
+                        l_Co[m][n] = l_C[m][n];
+                        l_C[m][n] = 0;
+                    }
+                    l_C[m][n] += l_winA[m][l_id] * l_winB[n][l_id];  // multiply-accumulate for cell (m, n)
+                }
+            }
+        }
+    }
+};
+
+template <typename t_DataType,
+          unsigned int t_M,
+          unsigned int t_N = t_M,
+          typename t_IndexType = unsigned int,  // not referenced in the body below
+          typename t_MacDataType = t_DataType>
+void gemm(const unsigned int p_k,  // thin wrapper: forwards every argument to SystolicArray::process_dsp
+          hls::stream<WideType<t_DataType, t_M> >& p_A,
+          hls::stream<WideType<t_DataType, t_N> >& p_B,
+          hls::stream<WideType<t_MacDataType, t_N> >& p_C,
+          const unsigned int p_r = 1) {  // p_r is handed to process_dsp as its p_multi argument
+#pragma HLS DATAFLOW
+    SystolicArray<t_DataType, t_M, t_N, t_MacDataType>::process_dsp(p_k, p_A, p_B, p_C, p_r);
+}
+
+} // end namespace blas
+
+} // end namespace xf
+
+#endif
@@ -92,30 +92,54 @@ template <typename t_DataType, unsigned int t_ParEntries>
 void transpMatBlocks(unsigned int p_blocks,
                      hls::stream<WideType<t_DataType, t_ParEntries> >& p_in,
                      hls::stream<WideType<t_DataType, t_ParEntries> >& p_out) {
-    t_DataType l_buf[t_ParEntries][t_ParEntries];
+    t_DataType l_buf[2][t_ParEntries][t_ParEntries];  // two block buffers; block b is stored in l_buf[b % 2]
 #pragma HLS ARRAY_PARTITION variable = l_buf complete dim = 0
-    for (unsigned int l_block = 0; l_block < p_blocks; ++l_block) {
-        // shuffle and store
-        for (unsigned int i = 0; i < t_ParEntries; ++i) {
+    if (p_blocks == 0) return;  // nothing to transpose: leave both streams untouched
+    for (int i = 0; i < t_ParEntries; ++i) {  // prologue: blocking load of block 0, one row per iteration
 #pragma HLS PIPELINE
-            WideType<t_DataType, t_ParEntries> l_val;
+        WideType<t_DataType, t_ParEntries> l_val;
 #pragma HLS ARRAY_PARTITION variable = l_val complete
-            l_val = p_in.read();
-            for (unsigned int j = 0; j < t_ParEntries; ++j) {
-                l_buf[i][j] = l_val[j];
-            }
+        l_val = p_in.read();
+        for (int j = 0; j < t_ParEntries; ++j) {
+            l_buf[0][i][j] = l_val[j];
         }
+    }
 
-        for (unsigned int i = 0; i < t_ParEntries; ++i) {
+    for (unsigned int l_block = 1; l_block < p_blocks; ++l_block) {  // load block l_block while emitting block l_block - 1
+        int jIn = 0, jOut = 0;  // rows stored so far / columns emitted so far for this step
+        do {
 #pragma HLS PIPELINE
-            WideType<t_DataType, t_ParEntries> l_val;
-#pragma HLS ARRAY_PARTITION variable = l_val complete
-            for (unsigned int j = 0; j < t_ParEntries; ++j) {
-                l_val[j] = l_buf[j][i];
+            WideType<t_DataType, t_ParEntries> l_valIn;
+#pragma HLS ARRAY_PARTITION variable = l_valIn complete
+            WideType<t_DataType, t_ParEntries> l_valOut;
+#pragma HLS ARRAY_PARTITION variable = l_valOut complete
+            if (p_in.read_nb(l_valIn)) {  // non-blocking: a row is stored only when input is available
+                for (int k = 0; k < t_ParEntries; ++k) {
+                    l_buf[l_block % 2][jIn][k] = l_valIn[k];
+                }
+                jIn++;
             }
-            p_out.write(l_val);
-        }
+            if (jOut < t_ParEntries) {  // gather inside the guard so jOut never indexes past l_buf's last dimension
+                for (int k = 0; k < t_ParEntries; ++k) {
+                    l_valOut[k] = l_buf[(l_block - 1) % 2][k][jOut];  // column jOut of the previous block
+                }
+                p_out.write(l_valOut);
+                jOut++;
+            }
+        } while ((jIn < t_ParEntries) || (jOut < t_ParEntries));  // until a full block is both stored and emitted
     }
+
+    int i = 0;  // epilogue: emit the transpose of the last block, column by column
+    do {
+#pragma HLS PIPELINE
+        WideType<t_DataType, t_ParEntries> l_valOut;
+#pragma HLS ARRAY_PARTITION variable = l_valOut complete
+        for (int j = 0; j < t_ParEntries; ++j) {
+            l_valOut[j] = l_buf[(p_blocks - 1) % 2][j][i];
+        }
+        p_out.write(l_valOut);
+        i++;
+    } while (i < t_ParEntries);
 }
 
 template <typename t_DataType, unsigned int t_ParEntries>
 
@@ -110,7 +110,6 @@ void readVec2Stream(t_DataType* p_in, unsigned int p_n, hls::stream<WideType<t_D
 #pragma HLS PIPELINE
         BitConv<t_DataType> l_bitConv;
         WideType<t_DataType, t_ParEntries> l_val;
-#pragma HLS ARRAY_PARTITION variable = l_val complete
         for (unsigned int j = 0; j < t_ParEntries; ++j) {
             l_val[j] = p_in[i * t_ParEntries + j];
         }
 
@@ -51,7 +51,6 @@ void preProcess(unsigned int p_numElems,
         for (t_IndexType i = 0; i < p_numElems; i++) {
 #pragma HLS PIPELINE
             WideType<t_DataType, 1 << t_LogParEntries> l_x = p_x.read();
-#pragma HLS ARRAY_PARTITION variable = l_x complete dim = 1
             t_SumDataType l_sum;
             l_sum = BinarySum<t_DataType, l_ParEntries, t_SumDataType>::sum(l_x.getValAddr());
             p_data.write(l_sum);
@@ -72,7 +71,6 @@ void postProcess(unsigned int p_numElems,
         for (t_IndexType i = 0; i < l_numIter; i++) {
 #pragma HLS PIPELINE II = l_Delays
             WideType<t_DataType, l_Delays> l_input;
-#pragma HLS ARRAY_PARTITION variable = l_input complete dim = 1
             for (t_IndexType j = 0; j < l_Delays; j++) {
 #pragma HLS UNROLL
                 l_input.shift(p_pad.read());
Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,6 @@ void readVec2Stream(t_DataType* p_in, unsigned int p_n, hls::stream<WideType<t_D`
`110`	`110`	`#pragma HLS PIPELINE`
`111`	`111`	`BitConv<t_DataType> l_bitConv;`
`112`	`112`	`WideType<t_DataType, t_ParEntries> l_val;`
`113`		`-#pragma HLS ARRAY_PARTITION variable = l_val complete`
`114`	`113`	`for (unsigned int j = 0; j < t_ParEntries; ++j) {`
`115`	`114`	`l_val[j] = p_in[i * t_ParEntries + j];`
`116`	`115`	`}`