Skip to content

Commit f04e384

Browse files
Chuck Song authored and GitHub Enterprise committed
Merge pull request #17 from RepoOps/blas-to-top
update benchmark with U250 results and add multi-device Python APIs for Keras MLP
2 parents 9d29411 + 33b200b commit f04e384

File tree

746 files changed

+62274
-2775
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

746 files changed

+62274
-2775
lines changed

blas/Jenkinsfile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// Pull in the Jenkins shared library registered under the name 'pipeline-library'.
// NOTE(review): VitisLibPipeline is presumably a step defined by that library - confirm.
@Library('pipeline-library')_
2+
3+
// Run the pipeline for the 'xf_blas' library on branch 'master'. TARGETS is a colon-separated
// list of five flow names; how each argument is interpreted is decided by VitisLibPipeline itself,
// which is not visible here.
VitisLibPipeline (branch: 'master', libname: 'xf_blas', TARGETS: 'hls_csim:hls_csynth:hls_cosim:vitis_sw_emu:vitis_hw_emu',
4+
email: 'lingl@xilinx.com', devtest: 'RunDeploy.sh', TOOLVERSION: '2019.2_released')
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright 2019 Xilinx, Inc.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#ifndef XF_BLAS_GEMM_HPP
18+
#define XF_BLAS_GEMM_HPP
19+
20+
#ifndef __cplusplus
21+
#error "BLAS Library only works with C++."
22+
#endif
23+
24+
#include "ap_int.h"
25+
#include "hls_stream.h"
26+
#include "xf_blas/helpers.hpp"
27+
#include "scal.hpp"
28+
#include "axpy.hpp"
29+
30+
namespace xf {
31+
32+
namespace blas {
33+
34+
/**
 * @brief Multiply-accumulate array of t_M x t_N accumulators fed by two wide streams.
 *
 * @tparam t_DataType    element type of the two input streams
 * @tparam t_M           number of entries in each word of p_As (rows of the accumulator array)
 * @tparam t_N           number of entries in each word of p_Bs (columns of the accumulator array), defaults to t_M
 * @tparam t_MacDataType element type of the accumulators and of the output stream, defaults to t_DataType
 */
template <typename t_DataType, unsigned int t_M, unsigned int t_N = t_M, typename t_MacDataType = t_DataType>
35+
class SystolicArray {
36+
public:
37+
/**
 * @brief Consumes p_multi * p_k words from each input stream and emits accumulated rows on p_sum.
 *
 * One word is read from p_As and one from p_Bs per loop iteration for the first p_multi * p_k
 * iterations; the loop then runs t_M + t_N more iterations with zero inputs. Accumulator (m, n)
 * is latched into the output copy and cleared whenever the period counter k equals m + n, and
 * row r of the output copy is written when k equals t_N + r. No row is written during the first
 * period, so provided the assertion below holds, p_multi * t_M words are written in total.
 *
 * @param p_k     period length in words after which the accumulators are latched and restarted;
 *                asserted to be at least t_M + t_N outside synthesis
 * @param p_As    input stream of t_M-wide words
 * @param p_Bs    input stream of t_N-wide words
 * @param p_sum   output stream of t_N-wide accumulator rows
 * @param p_multi number of consecutive periods of p_k words to process, defaults to 1
 */
static void process_dsp(unsigned int p_k,
38+
hls::stream<WideType<t_DataType, t_M> >& p_As,
39+
hls::stream<WideType<t_DataType, t_N> >& p_Bs,
40+
hls::stream<WideType<t_MacDataType, t_N> >& p_sum,
41+
unsigned int p_multi = 1) {
42+
// The check is compiled only for non-synthesis builds. With p_k < t_M + t_N the counter k would
// wrap before reaching the values at which the last rows are latched and written.
#ifndef __SYNTHESIS__
43+
assert(p_k >= t_M + t_N);
44+
#endif
45+
46+
// One window of t_M + t_N entries per input lane. Each iteration shifts one new entry in.
// NOTE(review): the direction of WideType::shift is defined elsewhere; the indexing below
// presumably relies on entry [i] being the value shifted in i iterations ago - confirm.
WideType<t_DataType, t_M + t_N> l_winA[t_M];
47+
#pragma HLS ARRAY_PARTITION variable = l_winA dim = 0 complete
48+
WideType<t_DataType, t_M + t_N> l_winB[t_N];
49+
#pragma HLS ARRAY_PARTITION variable = l_winB dim = 0 complete
50+
51+
// l_C holds the running accumulators and l_Co the latched copies that are streamed out.
// Neither is explicitly initialized in this function; values latched during the first period are
// never written because writes require l > p_k, and each entry is re-latched before its row is
// written in later periods.
WideType<t_MacDataType, t_N> l_C[t_M];
52+
#pragma HLS ARRAY_PARTITION variable = l_C dim = 0 complete
53+
WideType<t_MacDataType, t_N> l_Co[t_M];
54+
#pragma HLS ARRAY_PARTITION variable = l_Co dim = 0 complete
55+
56+
// l counts all iterations; k is the position inside the current period of p_k iterations.
for (int k = 0, l = 0; l < p_multi * p_k + t_M + t_N; l++, k++) {
57+
#pragma HLS PIPELINE
58+
if (k == p_k) {
59+
k = 0;
60+
}
61+
62+
// Emit latched row (k - t_N) once the first period has passed. This happens before the latch
// step of the same iteration, so the row written is the one completed in earlier iterations.
if (l > p_k && k >= t_N && k < t_M + t_N) {
63+
p_sum.write(l_Co[k - t_N]);
64+
}
65+
66+
// Inputs default to zero so the trailing t_M + t_N iterations feed zeros into the windows.
WideType<t_DataType, t_M> l_A = 0;
67+
WideType<t_DataType, t_N> l_B = 0;
68+
69+
if (l < p_multi * p_k) {
70+
l_A = p_As.read();
71+
l_B = p_Bs.read();
72+
}
73+
74+
for (int j = 0; j < t_M; j++) l_winA[j].shift(l_A[j]);
75+
for (int j = 0; j < t_N; j++) l_winB[j].shift(l_B[j]);
76+
for (int m = 0; m < t_M; m++) {
77+
for (int n = 0; n < t_N; n++) {
78+
// Cell (m, n) reads window entry m + n from both operands and restarts when k == m + n.
int l_id = m + n;
79+
if (l_id == k) {
80+
l_Co[m][n] = l_C[m][n];
81+
l_C[m][n] = 0;
82+
}
83+
l_C[m][n] += l_winA[m][l_id] * l_winB[n][l_id];
84+
}
85+
}
86+
}
87+
}
88+
};
89+
90+
/**
 * @brief Stream-based entry point that forwards directly to SystolicArray::process_dsp.
 *
 * @tparam t_DataType    element type of the input streams
 * @tparam t_M           number of entries in each word of p_A
 * @tparam t_N           number of entries in each word of p_B and p_C, defaults to t_M
 * @tparam t_IndexType   not referenced in this function body
 * @tparam t_MacDataType element type of the output stream, defaults to t_DataType
 *
 * @param p_k passed through as process_dsp's p_k (that function asserts p_k >= t_M + t_N outside synthesis)
 * @param p_A input stream of t_M-wide words
 * @param p_B input stream of t_N-wide words
 * @param p_C output stream of t_N-wide words
 * @param p_r passed through as process_dsp's p_multi argument, defaults to 1
 */
template <typename t_DataType,
91+
unsigned int t_M,
92+
unsigned int t_N = t_M,
93+
typename t_IndexType = unsigned int,
94+
typename t_MacDataType = t_DataType>
95+
void gemm(const unsigned int p_k,
96+
hls::stream<WideType<t_DataType, t_M> >& p_A,
97+
hls::stream<WideType<t_DataType, t_N> >& p_B,
98+
hls::stream<WideType<t_MacDataType, t_N> >& p_C,
99+
const unsigned int p_r = 1) {
100+
#pragma HLS DATAFLOW
101+
// All work is done by the single call below; the arguments are forwarded unchanged.
SystolicArray<t_DataType, t_M, t_N, t_MacDataType>::process_dsp(p_k, p_A, p_B, p_C, p_r);
102+
}
103+
104+
} // end namespace blas
105+
106+
} // end namespace xf
107+
108+
#endif

blas/L1/include/hw/xf_blas/helpers/dataMover/transpMatB2.hpp

Lines changed: 40 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -92,30 +92,54 @@ template <typename t_DataType, unsigned int t_ParEntries>
9292
void transpMatBlocks(unsigned int p_blocks,
9393
hls::stream<WideType<t_DataType, t_ParEntries> >& p_in,
9494
hls::stream<WideType<t_DataType, t_ParEntries> >& p_out) {
95-
t_DataType l_buf[t_ParEntries][t_ParEntries];
95+
t_DataType l_buf[2][t_ParEntries][t_ParEntries];
9696
#pragma HLS ARRAY_PARTITION variable = l_buf complete dim = 0
97-
for (unsigned int l_block = 0; l_block < p_blocks; ++l_block) {
98-
// shuffle and store
99-
for (unsigned int i = 0; i < t_ParEntries; ++i) {
97+
98+
for (int i = 0; i < t_ParEntries; ++i) {
10099
#pragma HLS PIPELINE
101-
WideType<t_DataType, t_ParEntries> l_val;
100+
WideType<t_DataType, t_ParEntries> l_val;
102101
#pragma HLS ARRAY_PARTITION variable = l_val complete
103-
l_val = p_in.read();
104-
for (unsigned int j = 0; j < t_ParEntries; ++j) {
105-
l_buf[i][j] = l_val[j];
106-
}
102+
l_val = p_in.read();
103+
for (int j = 0; j < t_ParEntries; ++j) {
104+
l_buf[0][i][j] = l_val[j];
107105
}
106+
}
108107

109-
for (unsigned int i = 0; i < t_ParEntries; ++i) {
108+
for (unsigned int l_block = 1; l_block < p_blocks; ++l_block) {
109+
int jIn = 0, jOut = 0;
110+
do {
110111
#pragma HLS PIPELINE
111-
WideType<t_DataType, t_ParEntries> l_val;
112-
#pragma HLS ARRAY_PARTITION variable = l_val complete
113-
for (unsigned int j = 0; j < t_ParEntries; ++j) {
114-
l_val[j] = l_buf[j][i];
112+
WideType<t_DataType, t_ParEntries> l_valIn;
113+
#pragma HLS ARRAY_PARTITION variable = l_valIn complete
114+
WideType<t_DataType, t_ParEntries> l_valOut;
115+
#pragma HLS ARRAY_PARTITION variable = l_valOut complete
116+
if (p_in.read_nb(l_valIn)) {
117+
for (int k = 0; k < t_ParEntries; ++k) {
118+
l_buf[l_block % 2][jIn][k] = l_valIn[k];
119+
}
120+
jIn++;
115121
}
116-
p_out.write(l_val);
117-
}
122+
for (int k = 0; k < t_ParEntries; ++k) {
123+
l_valOut[k] = l_buf[(l_block - 1) % 2][k][jOut];
124+
}
125+
if (jOut < t_ParEntries) {
126+
p_out.write(l_valOut);
127+
jOut++;
128+
}
129+
} while ((jIn < t_ParEntries) || (jOut < t_ParEntries));
118130
}
131+
132+
int i = 0;
133+
do {
134+
#pragma HLS PIPELINE
135+
WideType<t_DataType, t_ParEntries> l_valOut;
136+
#pragma HLS ARRAY_PARTITION variable = l_valOut complete
137+
for (int j = 0; j < t_ParEntries; ++j) {
138+
l_valOut[j] = l_buf[(p_blocks - 1) % 2][j][i];
139+
}
140+
p_out.write(l_valOut);
141+
i++;
142+
} while (i < t_ParEntries);
119143
}
120144

121145
template <typename t_DataType, unsigned int t_ParEntries>

blas/L1/include/hw/xf_blas/helpers/dataMover/vecMoverB1.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,6 @@ void readVec2Stream(t_DataType* p_in, unsigned int p_n, hls::stream<WideType<t_D
110110
#pragma HLS PIPELINE
111111
BitConv<t_DataType> l_bitConv;
112112
WideType<t_DataType, t_ParEntries> l_val;
113-
#pragma HLS ARRAY_PARTITION variable = l_val complete
114113
for (unsigned int j = 0; j < t_ParEntries; ++j) {
115114
l_val[j] = p_in[i * t_ParEntries + j];
116115
}

blas/L1/include/hw/xf_blas/helpers/funcs/sum.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@ void preProcess(unsigned int p_numElems,
5151
for (t_IndexType i = 0; i < p_numElems; i++) {
5252
#pragma HLS PIPELINE
5353
WideType<t_DataType, 1 << t_LogParEntries> l_x = p_x.read();
54-
#pragma HLS ARRAY_PARTITION variable = l_x complete dim = 1
5554
t_SumDataType l_sum;
5655
l_sum = BinarySum<t_DataType, l_ParEntries, t_SumDataType>::sum(l_x.getValAddr());
5756
p_data.write(l_sum);
@@ -72,7 +71,6 @@ void postProcess(unsigned int p_numElems,
7271
for (t_IndexType i = 0; i < l_numIter; i++) {
7372
#pragma HLS PIPELINE II = l_Delays
7473
WideType<t_DataType, l_Delays> l_input;
75-
#pragma HLS ARRAY_PARTITION variable = l_input complete dim = 1
7674
for (t_IndexType j = 0; j < l_Delays; j++) {
7775
#pragma HLS UNROLL
7876
l_input.shift(p_pad.read());

0 commit comments

Comments
 (0)