Skip to content

Commit 22401ba

Browse files
author
Quentin Berthet
committed
VitisAccelerator Host code refactor:
- Multiple devices support - Selection of device by BDF - OpenCL error checking - Automatic memory bank association - Inferences validation - Improved command line parameters - Improved debug output - Dummy buffer copy to avoid benchmarking buffer allocation time - Removal of mutexes preventing buffer copies overlap with kernel executions on the same CU with multiple workers - Documentation
1 parent a8e8466 commit 22401ba

File tree

9 files changed

+416
-288
lines changed

9 files changed

+416
-288
lines changed

docs/backend/accelerator.rst

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -132,11 +132,34 @@ Once the project is generated, it possible to run manually the build steps by us
132132

133133
It is also possible to run the full build process by calling ``make`` without any target. Modifications to the ``accelerator_card.cfg`` file can be done manually before running the build process (e.g., to change the clock period, or add additional ``.xo`` kernels to the build).
134134

135-
The generated host code application and the xclbin file can be executed as such:
135+
Host code
136+
=========
137+
138+
Once built, the host program can be run to load the board and perform inferences:
139+
140+
.. code-block:: Bash
141+
142+
./host
143+
144+
By default, all Computing Units (CUs) on all compatible devices will be used, with 3 worker threads per CU.
145+
146+
The generated host code application supports the following options to tweak the execution:
147+
148+
* ``-d``: device BDF to use (can be specified multiple times)
149+
* ``-x``: XCLBIN path
150+
* ``-i``: input feature file
151+
* ``-o``: output feature file
152+
* ``-c``: maximum computing units count to use
153+
* ``-n``: number of worker threads to use
154+
* ``-r``: number of repetitions of the input feature file (for artificially increasing the data size for benchmarking purposes)
155+
* ``-v``: enable verbose output
156+
* ``-h``: print help
157+
158+
The following example shows how to limit execution to only one device, one CU, and one worker thread:
136159

137160
.. code-block:: Bash
138161
139-
./host <build_directory>/<myproject>.xclbin
162+
./host -d 0000:c1:00.1 -c 1 -n 1
140163
141164
Example
142165
=======

hls4ml/backends/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from hls4ml.backends.vivado_accelerator.vivado_accelerator_backend import VivadoAcceleratorBackend
99
from hls4ml.backends.vivado_accelerator.vivado_accelerator_config import VivadoAcceleratorConfig # noqa: F401
1010

11+
from hls4ml.backends.catapult.catapult_backend import CatapultBackend # isort: skip
12+
1113
from hls4ml.backends.vitis.vitis_backend import VitisBackend # isort: skip
1214
from hls4ml.backends.vitis_accelerator.vitis_accelerator_backend import VitisAcceleratorBackend # isort: skip
1315

hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp

Lines changed: 89 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
#include <fstream>
66
#include <iostream>
77
#include <list>
8-
#include <stdexcept>
98
#include <sstream>
9+
#include <stdexcept>
1010
#include <string>
1111
#include <vector>
1212

@@ -25,26 +25,25 @@ template <class T, class U> class DataBatcher {
2525
* \param profilingDataRepeat Only used if profiling is set to True. Additional number of
2626
* times the given data is iterated over.
2727
*/
28-
DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers,
29-
bool profiling, int profilingDataRepeat)
28+
DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers, int profilingDataRepeat)
3029
: _batchsize(batchsize), _sampleInputSize(sampleInputSize), _sampleOutputSize(sampleOutputSize),
31-
_numWorkers(numWorkers), _profiling(profiling), _profilingDataRepeat(profilingDataRepeat) {}
30+
_numWorkers(numWorkers), _profilingDataRepeat(profilingDataRepeat) {}
3231

3332
/**
3433
* \brief Read in data to a buffer. Allocate space for results.
3534
* \param filename Filename.
3635
* \param s Type of input, currently supports text files used by VitisAccelerator backend, and
3736
* binary files produced by NumPy's toFile() function
3837
*/
39-
void read(const std::string& filename) {
40-
std::cout << "\nReading data from text file " << filename << std::endl;
38+
void read(const std::string &filename) {
4139

42-
// Read in text file
4340
std::ifstream fin(filename);
4441
if (!fin.is_open()) {
4542
throw std::runtime_error("Error opening file " + filename);
4643
}
4744

45+
std::cout << "Reading data from: " << filename << std::endl;
46+
4847
std::string line;
4948
while (std::getline(fin, line)) {
5049
originalSampleCount++;
@@ -57,13 +56,70 @@ template <class T, class U> class DataBatcher {
5756
throw std::runtime_error("Failed to parse value on line " + std::to_string(originalSampleCount));
5857
}
5958
}
60-
std::cout << "Read in " << originalSampleCount << " lines" << std::endl;
59+
60+
std::cout << "Read in " << originalSampleCount << " samples (" << inputData.size() << " elements)" << std::endl;
6161
fin.close();
6262

6363
// Zero-pad
6464
numBatches = std::ceil(static_cast<double>(originalSampleCount) / _batchsize);
65-
if (numBatches * _batchsize > originalSampleCount) {
66-
inputData.resize(numBatches * _batchsize * _sampleInputSize, (T)0);
65+
size_t finalSampleCount = numBatches * _batchsize;
66+
if (finalSampleCount > originalSampleCount) {
67+
std::cout << "Padding with " << (finalSampleCount - originalSampleCount) << " empty samples for a total of "
68+
<< numBatches << " batches of " << _batchsize << " samples" << std::endl;
69+
inputData.resize(finalSampleCount * _sampleInputSize, (T)0);
70+
}
71+
}
72+
73+
bool readReference(const std::string &filename) {
74+
75+
std::ifstream fref(filename);
76+
if (!fref.is_open()) {
77+
return false;
78+
}
79+
80+
std::cout << "Reading data from: " << filename << std::endl;
81+
size_t refSampleCount = 0;
82+
std::string line;
83+
while (std::getline(fref, line)) {
84+
refSampleCount++;
85+
std::istringstream parser(line);
86+
T val;
87+
while (parser >> val) {
88+
refData.push_back(val);
89+
}
90+
if (!parser.eof()) {
91+
throw std::runtime_error("Failed to parse value on line " + std::to_string(refSampleCount));
92+
}
93+
}
94+
95+
std::cout << "Read in " << refSampleCount << " reference samples (" << refData.size() << " elements)" << std::endl;
96+
fref.close();
97+
return true;
98+
}
99+
100+
void checkResults() {
101+
if (storedEvalResults.size() == 0 || refData.size() == 0) {
102+
throw std::runtime_error("No data to check");
103+
}
104+
105+
if (storedEvalResults.size() != refData.size()) {
106+
throw std::runtime_error("Stored results and reference data are not the same size");
107+
}
108+
size_t error_count = 0;
109+
for (uint64_t i = 0; i < storedEvalResults.size(); i++) {
110+
if (storedEvalResults[i] != refData[i]) {
111+
error_count++;
112+
std::cout << "Mismatch at index " + std::to_string(i) + ": " + std::to_string((float)storedEvalResults[i]) +
113+
" != " + std::to_string((float)refData[i])
114+
<< ", error = " << ((float)storedEvalResults[i] - (float)refData[i]) << std::endl;
115+
}
116+
}
117+
118+
if (error_count > 0) {
119+
std::cout << "Mismatch count: " << error_count << std::endl;
120+
throw std::runtime_error("Results do not match reference data");
121+
} else {
122+
std::cout << "Results match reference data" << std::endl;
67123
}
68124
}
69125

@@ -74,7 +130,7 @@ template <class T, class U> class DataBatcher {
74130
storedEvalResults.resize(numBatches * _batchsize * _sampleOutputSize, (U)0);
75131

76132
// Allocate space to dump the extra arbitrary data used during profiling
77-
if (_profiling) {
133+
if (isProfilingMode()) {
78134
profilingResultsDump.resize(_numWorkers * _batchsize * _sampleOutputSize, (U)0);
79135
}
80136
}
@@ -84,43 +140,47 @@ template <class T, class U> class DataBatcher {
84140
* \param batchedData A vector of containers for each Worker's batches/workload.
85141
* Size must be equal to _numWorkers.
86142
*/
87-
void batch(std::vector<std::list<Batch<T, U>>>& batchedData) {
143+
void batch(std::vector<std::list<Batch<T, U>>> &batchedData) {
88144
if (inputData.size() == 0 || originalSampleCount == 0) {
89145
throw std::runtime_error("No data to batch");
90146
}
147+
std::cout << "Original sample count: " << originalSampleCount << std::endl;
148+
std::cout << "Input sample element count: " << _sampleInputSize << std::endl;
149+
std::cout << "Output sample element count: " << _sampleOutputSize << std::endl;
91150
if (storedEvalResults.size() == 0) {
92151
throw std::runtime_error("Create result buffers first");
93152
}
94153

95-
batchedData.reserve(_numWorkers);
96-
for (int i = 0; i < _numWorkers; i++) {
97-
batchedData.emplace_back();
98-
}
154+
batchedData.resize(_numWorkers);
99155

100156
uint64_t batchIndex = 0;
101157
while (batchIndex < numBatches) {
102158
int worker = batchIndex % _numWorkers;
103159
uint64_t inputLocation = batchIndex * _batchsize * _sampleInputSize;
104160
uint64_t outputLocation = batchIndex * _batchsize * _sampleOutputSize;
105161

106-
const T* in = &inputData[inputLocation];
107-
U* out = &storedEvalResults[outputLocation];
162+
const T *in = &inputData[inputLocation];
163+
U *out = &storedEvalResults[outputLocation];
108164
Batch<T, U> newBatch = {in, out};
109165

110166
batchedData[worker].push_back(newBatch);
111167
batchIndex++;
112168
}
113169

114-
if (_profiling) {
170+
if (isProfilingMode()) {
115171
std::cout << "Creating profiling batches" << std::endl;
116172
profilingBatchCount = numBatches * (_profilingDataRepeat + 1);
173+
std::cout << "Batches: " << numBatches << std::endl;
174+
std::cout << "Profiling batch count: " << profilingBatchCount << std::endl;
175+
std::cout << "Profiling data repeat: " << _profilingDataRepeat << std::endl;
176+
std::cout << "Profiling total data count: " << profilingBatchCount * _batchsize << std::endl;
117177
while (batchIndex < profilingBatchCount) {
118178
int worker = batchIndex % _numWorkers;
119179
uint64_t inputLocation = (batchIndex % numBatches) * _batchsize * _sampleInputSize;
120180
uint64_t outputLocation = worker * _batchsize * _sampleOutputSize;
121181

122-
const T* in = &inputData[inputLocation];
123-
U* out = &profilingResultsDump[outputLocation];
182+
const T *in = &inputData[inputLocation];
183+
U *out = &profilingResultsDump[outputLocation];
124184
Batch<T, U> newBatch = {in, out};
125185

126186
batchedData[worker].push_back(newBatch);
@@ -141,8 +201,8 @@ template <class T, class U> class DataBatcher {
141201
profilingBatchCount = 0;
142202
}
143203

144-
void write(const std::string& filename) {
145-
std::cout << "\nWriting HW results to file " << filename << std::endl;
204+
void write(const std::string &filename) {
205+
std::cout << "Writing HW results to: " << filename << std::endl;
146206
std::ofstream fout;
147207
fout.open(filename, std::ios::trunc);
148208

@@ -163,28 +223,19 @@ template <class T, class U> class DataBatcher {
163223
profilingResultsDump.clear();
164224
}
165225

166-
uint64_t getSampleCount() {
167-
return originalSampleCount;
168-
}
226+
uint64_t getSampleCount() { return originalSampleCount; }
169227

170-
uint64_t getPaddedSampleCount() {
171-
return numBatches * _batchsize;
172-
}
228+
uint64_t getPaddedSampleCount() { return numBatches * _batchsize; }
173229

174-
uint64_t getProfilingSampleCount() {
175-
return profilingBatchCount * _batchsize;
176-
}
230+
uint64_t getProfilingSampleCount() { return profilingBatchCount * _batchsize; }
177231

178-
bool isProfilingMode() {
179-
return _profiling;
180-
}
232+
bool isProfilingMode() { return _profilingDataRepeat > 0; }
181233

182234
private:
183235
int _batchsize;
184236
int _sampleInputSize;
185237
int _sampleOutputSize;
186238
int _numWorkers;
187-
bool _profiling;
188239
int _profilingDataRepeat;
189240

190241
/// @brief Number of floats read in. (Not including padding).
@@ -195,6 +246,8 @@ template <class T, class U> class DataBatcher {
195246
uint64_t profilingBatchCount = 0;
196247
/// @brief Vector with values.
197248
std::vector<T> inputData;
249+
/// @brief Vector with reference values.
250+
std::vector<T> refData;
198251
/// @brief Vector to store evaluation results.
199252
std::vector<U> storedEvalResults;
200253
/// @brief Vector for dumping results from extra arbitrary data used during profiling.

0 commit comments

Comments
 (0)