From 5cf197e401fb7cf912693566787221eb112a5b32 Mon Sep 17 00:00:00 2001 From: Sioni Summers Date: Fri, 29 Jul 2022 17:08:14 +0200 Subject: [PATCH 1/3] Add support for Pynq ZU --- .../vivado_accelerator/supported_boards.json | 6 ++ .../python_drivers/axi_stream_driver.py | 75 +++++++++++++++++++ .../pynq-zu/tcl_scripts/axi_stream_design.tcl | 58 ++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py create mode 100644 hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl diff --git a/hls4ml/backends/vivado_accelerator/supported_boards.json b/hls4ml/backends/vivado_accelerator/supported_boards.json index 1279ec22d0..1572b808d8 100644 --- a/hls4ml/backends/vivado_accelerator/supported_boards.json +++ b/hls4ml/backends/vivado_accelerator/supported_boards.json @@ -5,6 +5,12 @@ "python_drivers": {"axi_stream": "axi_stream_driver.py"}, "c_drivers": {} }, + "pynq-zu": { + "part": "xczu5eg-sfvc784-1-e", + "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, + "python_drivers": {"axi_stream": "axi_stream_driver.py"}, + "c_drivers": {} + }, "zcu102": { "part": "xczu9eg-ffvb1156-2-e", "tcl_scripts": { "axi_stream": "axi_stream_design.tcl"}, diff --git a/hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py new file mode 100644 index 0000000000..4adb187ab4 --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py @@ -0,0 +1,75 @@ +from pynq import DefaultHierarchy, DefaultIP, allocate +from pynq import Overlay +from datetime import datetime +import pynq.lib.dma +import numpy as np + + +class NeuralNetworkOverlay(Overlay): + def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, + device=None): + super().__init__(bitfile_name, dtbo=None, download=True, 
ignore_version=False, device=None) + self.sendchannel = self.hier_0.axi_dma_0.sendchannel + self.recvchannel = self.hier_0.axi_dma_0.recvchannel + self.input_buffer = allocate(shape=x_shape, dtype=dtype) + self.output_buffer = allocate(shape=y_shape, dtype=dtype) + + def _print_dt(self, timea, timeb, N): + dt = (timeb - timea) + dts = dt.seconds + dt.microseconds * 10 ** -6 + rate = N / dts + print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate)) + return dts, rate + + def predict(self, X, debug=False, profile=False, encode=None, decode=None): + """ + Obtain the predictions of the NN implemented in the FPGA. + Parameters: + - X : the input vector. Should be numpy ndarray. + - dtype : the data type of the elements of the input/output vectors. + Note: it should be set depending on the interface of the accelerator; if it uses 'float' + types for the 'data' AXI-Stream field, 'np.float32' dtype is the correct one to use. + Instead if it uses 'ap_fixed', 'np.intA' is the correct one to use (note that A cannot be + any integer value, but it can assume {..., 8, 16, 32, ...} values. Check `numpy` + doc for more info). + In this case the encoding/decoding has to be computed by the PS. For example for + 'ap_fixed<16,6>' type the following 2 functions are the correct ones to use for encode/decode + 'float' -> 'ap_fixed<16,6>': + ``` + def encode(xi): + return np.int16(round(xi * 2**10)) # note 2**10 = 2**(A-B) + def decode(yi): + return yi * 2**-10 + encode_v = np.vectorize(encode) # to apply them element-wise + decode_v = np.vectorize(decode) + ``` + - profile : boolean. Set it to `True` to print the performance of the algorithm in terms of `inference/s`. + - encode/decode: function pointers. See `dtype` section for more information. + - return: an output array based on `np.ndarray` with a shape equal to `y_shape` and a `dtype` equal to + the namesake parameter.
+ """ + if profile: + timea = datetime.now() + if encode is not None: + X = encode(X) + self.input_buffer[:] = X + self.sendchannel.transfer(self.input_buffer) + self.recvchannel.transfer(self.output_buffer) + if debug: + print("Transfer OK") + self.sendchannel.wait() + if debug: + print("Send OK") + self.recvchannel.wait() + if debug: + print("Receive OK") + # result = self.output_buffer.copy() + if decode is not None: + self.output_buffer = decode(self.output_buffer) + + if profile: + timeb = datetime.now() + dts, rate = self._print_dt(timea, timeb, len(X)) + return self.output_buffer, dts, rate + else: + return self.output_buffer \ No newline at end of file diff --git a/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl new file mode 100644 index 0000000000..7ef8b2649c --- /dev/null +++ b/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl @@ -0,0 +1,58 @@ +#@todo: try to remove startgroup and endgroup and see if it work +set tcldir [file dirname [info script]] +source [file join $tcldir project.tcl] + +create_project project_1 ${myproject}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force + +set_property board_part tul.com.tw:pynqzu:part0:1.1 [current_project] +set_property ip_repo_paths ${myproject}_prj [current_project] +update_ip_catalog + +create_bd_design "design_1" +set_property ip_repo_paths ${myproject}_prj/solution1/impl/ip [current_project] +update_ip_catalog + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.3 zynq_ultra_ps_e_0 +endgroup + +apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ultra_ps_e_0] + +set_property -dict [list CONFIG.PSU__USE__S_AXI_GP0 {1} CONFIG.PSU__SAXIGP0__DATA_WIDTH {32}] [get_bd_cells zynq_ultra_ps_e_0] + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:ip:axi_dma:7.1 axi_dma_0 +endgroup 
+set_property -dict [list CONFIG.c_m_axi_s2mm_data_width.VALUE_SRC USER CONFIG.c_s_axis_s2mm_tdata_width.VALUE_SRC USER] [get_bd_cells axi_dma_0] +set_property -dict [list CONFIG.c_include_sg {0} CONFIG.c_sg_length_width {26} CONFIG.c_sg_include_stscntrl_strm {0} CONFIG.c_m_axi_mm2s_data_width ${bit_width_hls_input} CONFIG.c_m_axis_mm2s_tdata_width ${bit_width_hls_input} CONFIG.c_mm2s_burst_size {256} CONFIG.c_m_axi_s2mm_data_width ${bit_width_hls_output} CONFIG.c_s_axis_s2mm_tdata_width ${bit_width_hls_output} CONFIG.c_s2mm_burst_size {256}] [get_bd_cells axi_dma_0] + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/zynq_ultra_ps_e_0/M_AXI_HPM0_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {New AXI Interconnect} master_apm {0}} [get_bd_intf_pins axi_dma_0/S_AXI_LITE] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {Auto} Clk_xbar {Auto} Master {/axi_dma_0/M_AXI_MM2S} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {New AXI SmartConnect} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/S_AXI_HPC0_FPD] +endgroup + +startgroup +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/axi_dma_0/M_AXI_S2MM} Slave {/zynq_ultra_ps_e_0/S_AXI_HPC0_FPD} ddr_seg {Auto} intc_ip {/axi_smc} master_apm {0}} [get_bd_intf_pins axi_dma_0/M_AXI_S2MM] +apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Clk_slave {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Clk_xbar {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Master {/zynq_ultra_ps_e_0/M_AXI_HPM1_FPD} Slave {/axi_dma_0/S_AXI_LITE} ddr_seg {Auto} intc_ip {/ps8_0_axi_periph} master_apm {0}} [get_bd_intf_pins zynq_ultra_ps_e_0/M_AXI_HPM1_FPD] +endgroup + +startgroup +create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 
${myproject}_axi_0 +endgroup +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${myproject}_axi_0/out_r] + +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${myproject}_axi_0/ap_clk] +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0] + +make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top + +add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v + +reset_run impl_1 +reset_run synth_1 +launch_runs impl_1 -to_step write_bitstream -jobs 6 +wait_on_run -timeout 360 impl_1 + +open_run impl_1 +report_utilization -file util.rpt -hierarchical -hierarchical_percentages From 3d335fd5c93b1bbd13c40164d2db11246abacdcf Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 9 Apr 2023 20:45:48 -0700 Subject: [PATCH 2/3] Update axi_stream_design.tcl --- .../pynq-zu/tcl_scripts/axi_stream_design.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl index 7ef8b2649c..f907ba6913 100644 --- a/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl @@ -2,7 +2,7 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -create_project project_1 ${myproject}_vivado_accelerator -part xczu9eg-ffvb1156-2-e -force +create_project project_1 ${myproject}_vivado_accelerator -part xczu5eg-sfvc784-1-e -force set_property board_part tul.com.tw:pynqzu:part0:1.1 [current_project] set_property 
ip_repo_paths ${myproject}_prj [current_project] From e64619d34c60f56b178c2be3b8d9f9ab20f7bf8f Mon Sep 17 00:00:00 2001 From: Javier Duarte Date: Sun, 9 Apr 2023 21:09:16 -0700 Subject: [PATCH 3/3] pre-comit --- .../python_drivers/axi_stream_driver.py | 18 ++++++++--------- .../pynq-zu/tcl_scripts/axi_stream_design.tcl | 20 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py b/hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py index 4adb187ab4..1aac79f2d3 100644 --- a/hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py +++ b/hls4ml/templates/vivado_accelerator/pynq-zu/python_drivers/axi_stream_driver.py @@ -1,13 +1,13 @@ -from pynq import DefaultHierarchy, DefaultIP, allocate -from pynq import Overlay from datetime import datetime -import pynq.lib.dma + import numpy as np +from pynq import Overlay, allocate class NeuralNetworkOverlay(Overlay): - def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, - device=None): + def __init__( + self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, download=True, ignore_version=False, device=None + ): super().__init__(bitfile_name, dtbo=None, download=True, ignore_version=False, device=None) self.sendchannel = self.hier_0.axi_dma_0.sendchannel self.recvchannel = self.hier_0.axi_dma_0.recvchannel @@ -15,10 +15,10 @@ def __init__(self, bitfile_name, x_shape, y_shape, dtype=np.float32, dtbo=None, self.output_buffer = allocate(shape=y_shape, dtype=dtype) def _print_dt(self, timea, timeb, N): - dt = (timeb - timea) - dts = dt.seconds + dt.microseconds * 10 ** -6 + dt = timeb - timea + dts = dt.seconds + dt.microseconds * 10**-6 rate = N / dts - print("Classified {} samples in {} seconds ({} inferences / s)".format(N, dts, rate)) + print(f"Classified {N} samples in {dts} seconds ({rate} 
inferences / s)") return dts, rate def predict(self, X, debug=False, profile=False, encode=None, decode=None): @@ -72,4 +72,4 @@ def decode(yi): dts, rate = self._print_dt(timea, timeb, len(X)) return self.output_buffer, dts, rate else: - return self.output_buffer \ No newline at end of file + return self.output_buffer diff --git a/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl b/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl index f907ba6913..9b4676860c 100644 --- a/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl +++ b/hls4ml/templates/vivado_accelerator/pynq-zu/tcl_scripts/axi_stream_design.tcl @@ -2,14 +2,14 @@ set tcldir [file dirname [info script]] source [file join $tcldir project.tcl] -create_project project_1 ${myproject}_vivado_accelerator -part xczu5eg-sfvc784-1-e -force +create_project project_1 ${project_name}_vivado_accelerator -part xczu5eg-sfvc784-1-e -force set_property board_part tul.com.tw:pynqzu:part0:1.1 [current_project] -set_property ip_repo_paths ${myproject}_prj [current_project] +set_property ip_repo_paths ${project_name}_prj [current_project] update_ip_catalog create_bd_design "design_1" -set_property ip_repo_paths ${myproject}_prj/solution1/impl/ip [current_project] +set_property ip_repo_paths ${project_name}_prj/solution1/impl/ip [current_project] update_ip_catalog startgroup @@ -37,17 +37,17 @@ apply_bd_automation -rule xilinx.com:bd_rule:axi4 -config { Clk_master {Auto} Cl endgroup startgroup -create_bd_cell -type ip -vlnv xilinx.com:hls:${myproject}_axi:1.0 ${myproject}_axi_0 +create_bd_cell -type ip -vlnv xilinx.com:hls:${project_name}_axi:1.0 ${project_name}_axi_0 endgroup -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] [get_bd_intf_pins ${myproject}_axi_0/in_r] -connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${myproject}_axi_0/out_r] +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/M_AXIS_MM2S] 
[get_bd_intf_pins ${project_name}_axi_0/in_r] +connect_bd_intf_net [get_bd_intf_pins axi_dma_0/S_AXIS_S2MM] [get_bd_intf_pins ${project_name}_axi_0/out_r] -apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${myproject}_axi_0/ap_clk] -group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${myproject}_axi_0] +apply_bd_automation -rule xilinx.com:bd_rule:clkrst -config { Clk {/zynq_ultra_ps_e_0/pl_clk0 (99 MHz)} Freq {100} Ref_Clk0 {} Ref_Clk1 {} Ref_Clk2 {}} [get_bd_pins ${project_name}_axi_0/ap_clk] +group_bd_cells hier_0 [get_bd_cells axi_dma_0] [get_bd_cells ${project_name}_axi_0] -make_wrapper -files [get_files ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top +make_wrapper -files [get_files ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/design_1.bd] -top -add_files -norecurse ./${myproject}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v +add_files -norecurse ./${project_name}_vivado_accelerator/project_1.srcs/sources_1/bd/design_1/hdl/design_1_wrapper.v reset_run impl_1 reset_run synth_1