Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding initial PLIO support to the platform #13

Merged
merged 9 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 46 additions & 1 deletion firmware/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "hsa_csr.h"
#include "hsa_ext_air.h"
#include "memory.h"
#include "soft-dma.h"

extern "C" {

Expand Down Expand Up @@ -174,6 +175,8 @@ uint8_t col_dma_cols[NUM_COL_DMAS] = {7, 8, 9, 10};
#define REG_AIE_CORE_CTL_RESET (1U << 1)
#define REG_AIE_CORE_CTL_ENABLE (1U << 0)

#define SOFT_DMA_CTRL_BASE_ADDR 0x20180000000

inline uint64_t mymod(uint64_t a) {
uint64_t result = a;
while (result >= AQL_QUEUE_NUM_ENTRIES) {
Expand Down Expand Up @@ -1731,6 +1734,41 @@ int do_packet_memcpy(uint32_t slot) {
}
}

/// Programs one simple-mode (single-buffer) transfer on the soft AXI DMA.
///
/// The selected channel is soft-reset, checked for an in-flight transfer,
/// given the buffer address, started, and finally handed the length. Failures
/// are only reported through air_printf; nothing is returned to the caller.
///
/// \param paddr     Buffer address; written as two 32-bit halves to the
///                  channel's address registers.
/// \param length    Value written to the buffer-length register. That write
///                  is what starts the transfer.
/// \param direction SHIM_DMA_S2MM selects the RX channel register bank; any
///                  other value selects the bank at offset 0.
///
/// NOTE(review): the reset bit is written and the control register is
/// rewritten immediately, without polling for reset completion -- confirm
/// this is safe on the target hardware.
/// NOTE(review): on the AXI DMA IP a soft reset of one channel may reset the
/// whole engine, which would abort a transfer running in the other direction
/// -- confirm against the IP documentation.
void program_soft_pl(uint64_t paddr, uint32_t length, uint32_t direction) {

  const uint32_t offset = direction == SHIM_DMA_S2MM ? XAXIDMA_RX_OFFSET : 0;
  volatile uint32_t *soft_dma_base_addr =
      reinterpret_cast<volatile uint32_t *>(SOFT_DMA_CTRL_BASE_ADDR + offset);

  // Snapshot the control register so its configuration bits survive the
  // reset. RUNSTOP and RESET are stripped from the snapshot: the channel must
  // stay halted until it is explicitly started below. Otherwise a channel
  // left running by a previous call would be re-armed right after the reset
  // and, with no completed transfer yet, could look busy to the check that
  // follows.
  const uint32_t soft_dma_control_vals =
      soft_dma_base_addr[(XAXIDMA_CR_OFFSET) / 4] &
      ~static_cast<uint32_t>(XAXIDMA_CR_RUNSTOP_MASK | XAXIDMA_CR_RESET_MASK);

  // We are just using a single descriptor so the channel is reset before each
  // use. This is maybe a bit overkill, but we are not using it for anything
  // fancy so it should suffice.
  soft_dma_base_addr[(XAXIDMA_CR_OFFSET) / 4] =
      soft_dma_control_vals | XAXIDMA_CR_RESET_MASK;
  soft_dma_base_addr[(XAXIDMA_CR_OFFSET) / 4] = soft_dma_control_vals;

  // The channel is busy only when it is running (not halted) and has not
  // gone idle. A halted channel can always accept a new transfer.
  const uint32_t status = soft_dma_base_addr[(XAXIDMA_SR_OFFSET) / 4];
  const bool halted = (status & XAXIDMA_HALTED_MASK) != 0;
  const bool idle = (status & XAXIDMA_IDLE_MASK) != 0;
  if (!halted && !idle) {
    air_printf("[ERROR] Soft dma is busy\r\n");
    return;
  }

  // Setting the address (low word, then high word)
  soft_dma_base_addr[(XAXIDMA_SRCADDR_OFFSET) / 4] =
      static_cast<uint32_t>(paddr & 0xFFFFFFFF);
  soft_dma_base_addr[(XAXIDMA_SRCADDR_MSB_OFFSET) / 4] =
      static_cast<uint32_t>((paddr >> 32) & 0xFFFFFFFF);

  // Setting the run bit; this must happen before the length is written
  soft_dma_base_addr[(XAXIDMA_CR_OFFSET) / 4] =
      soft_dma_control_vals | XAXIDMA_CR_RUNSTOP_MASK;

  // Writing to the BTT register which starts the transfer
  soft_dma_base_addr[(XAXIDMA_BUFFLEN_OFFSET) / 4] = length;
}

int stage_packet_nd_memcpy(hsa_agent_dispatch_packet_t *pkt, uint32_t slot,
uint32_t memory_space) {
air_printf("stage_packet_nd_memcpy %d\n\r", slot);
Expand All @@ -1747,7 +1785,14 @@ int stage_packet_nd_memcpy(hsa_agent_dispatch_packet_t *pkt, uint32_t slot,
nd_dma_put_checkpoint(&pkt, slot, 0, 0, 0, paddr, paddr, paddr);
staged_nd_slot[slot].valid = 1;
return 0;
} else {
}
else if(memory_space == 1) {
uint32_t length_1d = (pkt->arg[2] >> 0) & 0xffffffff;
uint16_t direction = (pkt->arg[0] >> 60) & 0x000f;
program_soft_pl(paddr, length_1d, direction);
return 1;
}
else {
air_printf("NOT SUPPORTED: Cannot program memory space %d DMAs\n\r",
memory_space);
return 1;
Expand Down
53 changes: 53 additions & 0 deletions firmware/soft-dma.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
//===- soft-dma.h ---------------------------------------------------*- C++ -*-===//
//
// Copyright (C) 2024, Advanced Micro Devices, Inc.
// SPDX-License-Identifier: MIT
//
//===----------------------------------------------------------------------===//

/*
Note: The contents of this file have been copied from
https://github.com/Xilinx/embeddedsw with the same license.
*/

#ifndef SOFT_DMA_H_
#define SOFT_DMA_H_

/* Register offsets. XAXIDMA_RX_OFFSET is the offset of the RX channel
 * register bank; the remaining offsets are register positions within a
 * channel's bank. */
#define XAXIDMA_RX_OFFSET 0x00000030 /**< RX channel registers base
 * offset */
#define XAXIDMA_CR_OFFSET 0x00000000 /**< Channel control */
#define XAXIDMA_SR_OFFSET 0x00000004 /**< Status */
#define XAXIDMA_CDESC_OFFSET 0x00000008 /**< Current descriptor pointer */
#define XAXIDMA_CDESC_MSB_OFFSET 0x0000000C /**< Current descriptor pointer (MSB) */
#define XAXIDMA_TDESC_OFFSET 0x00000010 /**< Tail descriptor pointer */
#define XAXIDMA_TDESC_MSB_OFFSET 0x00000014 /**< Tail descriptor pointer (MSB) */
#define XAXIDMA_SRCADDR_OFFSET 0x00000018 /**< Simple mode source address
pointer */
#define XAXIDMA_SRCADDR_MSB_OFFSET 0x0000001C /**< Simple mode source address
pointer (MSB) */
#define XAXIDMA_DESTADDR_OFFSET 0x00000018 /**< Simple mode destination address pointer (same offset as XAXIDMA_SRCADDR_OFFSET) */
#define XAXIDMA_DESTADDR_MSB_OFFSET 0x0000001C /**< Simple mode destination address pointer (MSB; same offset as XAXIDMA_SRCADDR_MSB_OFFSET) */
#define XAXIDMA_BUFFLEN_OFFSET 0x00000028 /**< Buffer length */
#define XAXIDMA_SGCTL_OFFSET 0x0000002c /**< SG Control Register */

/* Status and error bitmasks. */
#define XAXIDMA_HALTED_MASK 0x00000001 /**< DMA channel halted */
#define XAXIDMA_IDLE_MASK 0x00000002 /**< DMA channel idle */
#define XAXIDMA_ERR_INTERNAL_MASK 0x00000010 /**< Datamover internal
 * err */
#define XAXIDMA_ERR_SLAVE_MASK 0x00000020 /**< Datamover slave err */
#define XAXIDMA_ERR_DECODE_MASK 0x00000040 /**< Datamover decode
 * err */
#define XAXIDMA_ERR_SG_INT_MASK 0x00000100 /**< SG internal err */
#define XAXIDMA_ERR_SG_SLV_MASK 0x00000200 /**< SG slave err */
#define XAXIDMA_ERR_SG_DEC_MASK 0x00000400 /**< SG decode err */
#define XAXIDMA_ERR_ALL_MASK 0x00000770 /**< All errors (OR of the six error masks above) */

/** @name Bitmasks of XAXIDMA_CR_OFFSET register
 * @{
 */
#define XAXIDMA_CR_RUNSTOP_MASK 0x00000001 /**< Start/stop DMA channel */
#define XAXIDMA_CR_RESET_MASK 0x00000004 /**< Reset DMA engine */
#define XAXIDMA_CR_KEYHOLE_MASK 0x00000008 /**< Keyhole feature */
#define XAXIDMA_CR_CYCLIC_MASK 0x00000010 /**< Cyclic Mode */
/** @} */

#endif // SOFT_DMA_H_
4 changes: 3 additions & 1 deletion platform/vck5000/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
We present an HSA-compatible AMD AI Engine platform implemented on an [AMD VCK5000 board](https://www.xilinx.com/products/boards-and-kits/vck5000.html) which can be targeted through our experimental [rocm-5.6.x-air branch](https://github.com/RadeonOpenCompute/ROCR-Runtime/tree/experimental/rocm-5.6.x-air). The build flow is compatible with AMD-Xilinx VCK5000 cards with production silicon: "VCK5000-AIE-ADK-G-ED". It is built with 2022.1 Vivado/Vitis tools and consists of a relatively empty Versal design (CIPS, QDMA-PCIe frontend, BRAM 'Queue Memory', NoC configuration, and CDMA 'AIE-Configuration DMA'). The ARM processor acts as an HSA AQL packet processor that manages AIE configurations and affects runtime DMA transfers from external memory into the AIE array. The build process consists of 2 steps: vivado, aie_platform:

1. Vivado IPI design. This step generates a Vitis extensible hardware platform containing the necessary CIPS configuration and hardware IP components. This step generates an .xsa file for the design.
2. Vitis design compiles a 32 GMIO AIE design with simple add functionality to enable all NoC NMU/NSU connections so all shimDMAs (used for GMIO) are enabled. This rebuilds the design targeting the platform generated by step 1.
2. Vitis design. This step compiles an AIE design with 32 GMIO ports and 1 PLIO port, using simple add functionality, to enable all NoC NMU/NSU connections so that all shimDMAs (used for GMIO) are enabled. It rebuilds the design targeting the platform generated by step 1.

Note that all of the columns have PLIO integrated in their shim tiles, but only some of the columns support GMIO. The columns that have GMIO enabled are: 2, 3, 6, 7, 10, 11, 18, 19, 26, 27, 34, 35, 42, 43, 46, 47. This information is needed when targeting this platform with a design from mlir-aie, where the user must provide the coordinates of the tiles they are targeting.

## Using a Prebuilt Platform Design

Expand Down
21 changes: 19 additions & 2 deletions platform/vck5000/aie_platform/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,13 @@ AIE_CMPL_CMD = aiecompiler -platform=${PLATFORM} -include="./aie" -workdir=./Wor
AIE_SIM_CMD = aiesimulator --pkg-dir=./Work --dump-vcd foo
EMU_CMD = ./launch_hw_emu.sh

# Location of the PL source code and build settings for the counter kernel
PL_SRC_REPO = ./pl
COUNTER_KERNEL_TOP = counter_hls
COUNTER_KERNEL_SRC = $(PL_SRC_REPO)/counter/$(COUNTER_KERNEL_TOP).cpp
COUNTER_KERNEL_XO = $(COUNTER_KERNEL_TOP).$(TARGET)
COUNTER_KERNEL_VPP_FLAGS = --hls.clock $(VPP_CLOCK_FREQ):$(COUNTER_KERNEL_TOP)

##########################################################################################################################################################
### DO NOT MODIFY BELOW THIS LINE UNLESS NECESSARY
################################################################################################################################################
Expand Down Expand Up @@ -99,8 +106,18 @@ aiesim: ${LIBADF}

xclbin: guard-PLATFORM_REPO_PATHS ${XCLBIN}

${XCLBIN}: ${LIBADF} ${VPP_SPEC}
${VCC} -g -l --platform ${PLATFORM} ${LIBADF} -t ${TARGET} ${VPP_STRATEGY} ${VPP_FLAGS} -o $@
# Compile the HLS counter kernel into a Vitis kernel object (.xo).
# The kernel source is listed as a prerequisite so the object is rebuilt
# whenever the source changes.
${COUNTER_KERNEL_XO}.xo: $(COUNTER_KERNEL_SRC)
	v++ --target $(TARGET) $(COUNTER_KERNEL_VPP_FLAGS) \
		$(VPP_FLAGS) -c -k $(COUNTER_KERNEL_TOP) \
		--platform ${PLATFORM} \
		$(COUNTER_KERNEL_SRC) -o $@


${XCLBIN}: $(COUNTER_KERNEL_XO).xo ${LIBADF} ${VPP_SPEC}
${VCC} -g -l --platform ${PLATFORM} ${COUNTER_KERNEL_XO}.xo ${LIBADF} -t ${TARGET} ${VPP_STRATEGY} ${VPP_FLAGS} -o $@ --to_step vpl.update_bd
vivado -mode batch -source fixup_pl_design.tcl
${VCC} -g -l --platform ${PLATFORM} ${COUNTER_KERNEL_XO}.xo ${LIBADF} -t ${TARGET} ${VPP_STRATEGY} ${VPP_FLAGS} -o $@ --from_step vpl.generate_target


package_${TARGET}: ${LIBADF} ${XCLBIN}
bootgen -arch versal -image vck5000_bootgen.bif -w on -o final_vck5000.pdi -log error
Expand Down
29 changes: 20 additions & 9 deletions platform/vck5000/aie_platform/aie/graph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@

using namespace adf;


PLIO plioIn0("plioIn0", adf::plio_64_bits);
GMIO gmioIn0("gmioIn0", 64, 1);
GMIO gmioIn1("gmioIn1", 64, 1);
GMIO gmioIn2("gmioIn2", 64, 1);
Expand Down Expand Up @@ -67,6 +69,7 @@ GMIO gmioIn29("gmioIn29", 64, 1);
GMIO gmioIn30("gmioIn30", 64, 1);
GMIO gmioIn31("gmioIn31", 64, 1);

PLIO plioOut0("plioOut0", adf::plio_64_bits);
GMIO gmioOut0("gmioOut0", 64, 1);
GMIO gmioOut1("gmioOut1", 64, 1);
GMIO gmioOut2("gmioOut2", 64, 1);
Expand Down Expand Up @@ -100,7 +103,7 @@ GMIO gmioOut29("gmioOut29", 64, 1);
GMIO gmioOut30("gmioOut30", 64, 1);
GMIO gmioOut31("gmioOut31", 64, 1);

simulation::platform<NUM,NUM> plat(
simulation::platform<NUM_MM + NUM_STREAM,NUM_MM + NUM_STREAM> plat(
&gmioIn0,
&gmioIn1,
&gmioIn2,
Expand Down Expand Up @@ -133,6 +136,7 @@ simulation::platform<NUM,NUM> plat(
&gmioIn29,
&gmioIn30,
&gmioIn31,
&plioIn0,

&gmioOut0,
&gmioOut1,
Expand Down Expand Up @@ -165,17 +169,18 @@ simulation::platform<NUM,NUM> plat(
&gmioOut28,
&gmioOut29,
&gmioOut30,
&gmioOut31
&gmioOut31,
&plioOut0
);

//for indexed access
GMIO* gmioIn[NUM] = {
GMIO* gmioIn[NUM_MM] = {
&gmioIn0,&gmioIn1,&gmioIn2,&gmioIn3,&gmioIn4,&gmioIn5,&gmioIn6,&gmioIn7,&gmioIn8,&gmioIn9,
&gmioIn10,&gmioIn11,&gmioIn12,&gmioIn13,&gmioIn14,&gmioIn15,&gmioIn16,&gmioIn17,&gmioIn18,&gmioIn19,
&gmioIn20,&gmioIn21,&gmioIn22,&gmioIn23,&gmioIn24,&gmioIn25,&gmioIn26,&gmioIn27,&gmioIn28,&gmioIn29,
&gmioIn30,&gmioIn31};

GMIO* gmioOut[NUM] = {
GMIO* gmioOut[NUM_MM] = {
&gmioOut0,&gmioOut1,&gmioOut2,&gmioOut3,&gmioOut4,&gmioOut5,&gmioOut6,&gmioOut7,&gmioOut8,&gmioOut9,
&gmioOut10,&gmioOut11,&gmioOut12,&gmioOut13,&gmioOut14,&gmioOut15,&gmioOut16,&gmioOut17,&gmioOut18,&gmioOut19,
&gmioOut20,&gmioOut21,&gmioOut22,&gmioOut23,&gmioOut24,&gmioOut25,&gmioOut26,&gmioOut27,&gmioOut28,&gmioOut29,
Expand All @@ -190,10 +195,16 @@ class GlobalConnection
public:
GlobalConnection()
{
for (int i=0; i<NUM; i++)
for (int i=0; i<NUM_MM; i++)
{
connect<>(plat.src[i], g.input_mm[i]);
connect<>(g.output_mm[i], plat.sink[i]);
}

for (int i=NUM_MM; i<NUM_STREAM + NUM_MM; i++)
{
connect<>(plat.src[i], g.input[i]);
connect<>(g.output[i], plat.sink[i]);
connect<>(plat.src[i], g.input_stream[i - NUM_MM]);
connect<>(g.output_stream[i - NUM_MM], plat.sink[i]);
}
}
} connection;
Expand Down Expand Up @@ -228,15 +239,15 @@ int main(int argc, char **argv)
g.update(g.k[i].in[1], i+1);
}

int32* inputArray[NUM];
int32* inputArray[NUM_MM];
for (int i=M; i<M+P; i++)
{
inputArray[i] = (int32*)GMIO::malloc(256*sizeof(int32));
for (int j=0; j<256; j++)
inputArray[i][j] = i+1;
}

int32* outputArray[NUM];
int32* outputArray[NUM_MM];
for (int i=M; i<M+P; i++)
{
outputArray[i] = (int32*)GMIO::malloc(256*sizeof(int32));
Expand Down
30 changes: 23 additions & 7 deletions platform/vck5000/aie_platform/aie/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,26 +28,42 @@

using namespace adf;

#define NUM 32
#define NUM_MM 32
#define NUM_STREAM 1

class mygraph : public graph
{
public:
kernel k[NUM];
input_port input[NUM];
input_port output[NUM];
kernel k[NUM_MM + NUM_STREAM];
input_port input_mm[NUM_MM];
input_port output_mm[NUM_MM];

input_port input_stream[NUM_STREAM];
output_port output_stream[NUM_STREAM];

mygraph()
{
for (int i=0; i<NUM; i++)
for (int i=0; i<NUM_MM; i++)
{
k[i] = kernel::create(add);
source(k[i]) = "aie/add.cpp";
runtime<ratio>(k[i]) = 0.9;

connect<window<128>>(input_mm[i], k[i].in[0]);
connect<window<128>>(k[i].out[0], output_mm[i]);
async(k[i].in[1]);
}

for (int i=0; i<NUM_STREAM; i++)
{
k[i] = kernel::create(add);
source(k[i]) = "aie/add.cpp";
runtime<ratio>(k[i]) = 0.9;

connect<window<128>>(input[i], k[i].in[0]);
connect<window<128>>(k[i].out[0], output[i]);
connect<window<128>>(input_stream[i], k[i].in[0]);
connect<window<128>>(k[i].out[0], output_stream[i]);
async(k[i].in[1]);

}

/*location<kernel>(k[0]) = tile(2,0);
Expand Down
Binary file removed platform/vck5000/aie_platform/final_vck5000.pdi
Binary file not shown.
Binary file modified platform/vck5000/aie_platform/firmware/airrt_cpp.elf
Binary file not shown.
Loading