diff --git a/.github/workflows/buildAndTest.yml b/.github/workflows/buildAndTest.yml new file mode 100644 index 00000000000..4840d92f97e --- /dev/null +++ b/.github/workflows/buildAndTest.yml @@ -0,0 +1,77 @@ +# This is a basic workflow to help you get started with Actions +name: Build and Test +# Controls when the action will run. Triggers the workflow on push or pull request +# events but only for the master branch +on: [push, pull_request] +# A workflow run is made up of one or more jobs that can run sequentially or in parallel +jobs: + # Build Phism and run its tests. + build-phism: + name: Build and Test Phism + runs-on: self-hosted + steps: + # - name: Configure Environment + # run: echo "${GITHUB_WORKSPACE}/llvm/install/bin" >> $GITHUB_PATH + # Disabled for self-hosted + # - name: Get dependences + # run: | + # sudo apt-get update -y + # sudo apt-get install -y build-essential libtool autoconf pkg-config flex bison libgmp-dev clang-9 libclang-9-dev texinfo python3 + # - name: Update the LLVM/Clang version to 9 + # run: | + # sudo update-alternatives --install /usr/bin/llvm-config llvm-config /usr/bin/llvm-config-9 100 + # sudo update-alternatives --install /usr/bin/FileCheck FileCheck /usr/bin/FileCheck-9 100 + + + # Clone the Phism repo and its submodules. Do shallow clone to save clone + # time. + - name: Get Phism + uses: actions/checkout@v2 + with: + submodules: "true" + + # -------- + # Restore LLVM from cache and build if it's not in there. + # -------- + # Extract the LLVM submodule hash for use in the cache key. + - name: Get LLVM Hash + id: get-llvm-hash + run: echo "::set-output name=hash::$(git rev-parse @:./llvm)" + shell: bash + # Try to fetch LLVM from the cache. + - name: Cache LLVM + id: cache-llvm + uses: actions/cache@v2 + with: + path: llvm/build + key: ${{ runner.os }}-llvm-${{ steps.get-llvm-hash.outputs.hash }} + # Build LLVM if we didn't hit in the cache. 
Even though we build it in + # the previous job, there is a low chance that it'll have been evicted by + # the time we get here. + # Need to delete the test directory to avoid caching them. + - name: Rebuild and Install LLVM + if: steps.cache-llvm.outputs.cache-hit != 'true' + run: | + ./scripts/build-llvm.sh ci + rm -rf ./llvm/build/test + # -------- + # Build and test Phism in both debug and release mode. + # -------- + - name: Build and Test Phism (Assert) + run: | + ./scripts/build-phism.sh ci + + # Build and test Phism with pb-flow. + - name: Build and Test Phism (pb-flow) + run: | + python3 -m venv env + source env/bin/activate + which python3 + python3 -m pip install -r requirements.txt + python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --skip-vitis + python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --polymer --skip-vitis + python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --polymer --loop-transforms --skip-vitis + python3 ./scripts/pb-flow.py ./example/polybench --dataset SMALL --polymer --loop-transforms --array-partition --skip-vitis + + + diff --git a/include/phism/mlir/Transforms/Utils.h b/include/phism/mlir/Transforms/Utils.h new file mode 100644 index 00000000000..f1d09726276 --- /dev/null +++ b/include/phism/mlir/Transforms/Utils.h @@ -0,0 +1,10 @@ +//===- Utils.h - Utility functions ------------------ C++-===// + +#include "mlir/IR/BuiltinOps.h" + +namespace phism { + +/// Get the top function for the hardware design. 
+mlir::FuncOp getTopFunction(mlir::ModuleOp m); + +} // namespace phism diff --git a/lib/llvm/Transforms/VhlsLLVMRewriter.cc b/lib/llvm/Transforms/VhlsLLVMRewriter.cc index da51844e5cd..3ceea0036a2 100644 --- a/lib/llvm/Transforms/VhlsLLVMRewriter.cc +++ b/lib/llvm/Transforms/VhlsLLVMRewriter.cc @@ -16,17 +16,57 @@ #include "llvm/IR/Value.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/Cloning.h" +#include + using namespace llvm; +#define DEBUG_TYPE "vhls_llvm" + static cl::opt XlnTop("xlntop", cl::desc("Specify the top function for Xilinx HLS."), cl::value_desc("topname")); static cl::opt XlnNames("xlnnames", cl::desc("Specify the top function param names."), cl::value_desc("paramname")); +static cl::opt XlnTBTclNames( + "xlntbtclnames", + cl::desc( + "Specify the file name of the tcl script for test bench generation."), + cl::value_desc("tbname")); +static cl::opt XlnTBSources( + "xlntbfilesettings", + cl::desc( + "Specify the file settings for the test bench, e.g. \"add_files ...\""), + cl::value_desc("tbfiles")); +static cl::opt XlnArrayPartitionEnabled( + "xln-ap-enabled", cl::desc("Whether array partition has been enabled")); + +/// Will abort if the Value is not a ConstantInt. +static int64_t getI64Value(Value *value) { + assert(isa(value)); + + ConstantInt *CI = dyn_cast(value); + assert(CI->getBitWidth() == 64); + + return CI->getSExtValue(); +} + +/// Get the dimensions from the provided array type. +static SmallVector getDimsFromArrayType(ArrayType *type) { + SmallVector dims; + dims.push_back(type->getNumElements()); + + while (type && type->getArrayElementType()->isArrayTy()) { + type = dyn_cast(type->getArrayElementType()); + dims.push_back(type->getNumElements()); + } + + return dims; +} namespace { @@ -109,17 +149,6 @@ class InsExtSequence { return true; } - - /// Will abort if the Value is not a ConstantInt. 
- int64_t getI64Value(Value *value) { - assert(isa(value)); - - ConstantInt *CI = dyn_cast(value); - assert(CI->getBitWidth() == 64); - - return CI->getSExtValue(); - } - /// From the scalar offset to a set of offset for each dim. /// There should be in total dimSize * 2 expressions. /// Dim size will be dims.size() * 2 since the dims in this Seq only has the @@ -129,6 +158,12 @@ class InsExtSequence { if (isa(offset)) return; + LLVM_DEBUG({ + dbgs() << "\n--------------------------------------\n"; + dbgs() << "Processing offset for target pointer: \n"; + ptr->dump(); + }); + SmallVector offsets; SmallVector strides; @@ -137,7 +172,6 @@ class InsExtSequence { unsigned dimSize = dims.size() * 2; for (unsigned i = 0; i < dimSize; ++i) { binOp = cast(curr); - // binOp->dump(); assert(binOp->getOpcode() == BinaryOperator::Add); Value *lhs = binOp->getOperand(0), *rhs = binOp->getOperand(1); @@ -167,11 +201,30 @@ class InsExtSequence { assert(offsets.size() == strides.size()); assert(strides[0] == 1); + LLVM_DEBUG({ + dbgs() << "Offsets: \n"; + for (Value *offset : offsets) + offset->dump(); + dbgs() << "Strides: "; + interleave( + strides, [&](const int64_t &stride) { dbgs() << stride; }, + [&]() { dbgs() << ", "; }); + dbgs() << "\n\n"; + }); + SmallVector partialDims; for (unsigned i = 1; i < strides.size(); ++i) partialDims.push_back(strides[i] / strides[i - 1]); assert(partialDims.size() == dimSize - 1); + LLVM_DEBUG({ + dbgs() << "Partial dims:\n"; + interleave( + partialDims, [&](const int64_t &v) { dbgs() << v; }, + [&]() { dbgs() << ", "; }); + dbgs() << "\n"; + }); + std::reverse(partialDims.begin(), partialDims.end()); std::reverse(offsets.begin(), offsets.end()); @@ -191,11 +244,26 @@ class InsExtSequence { LoadInst *load = new LoadInst( cast(restoredType)->getElementType(), bitCastInst, Twine(""), cast(bitCastInst->getNextNode())); - GetElementPtrInst *gep = GetElementPtrInst::Create( - rankedArrType, load, {offsets[0], offsets[1]}, Twine(""), - 
cast(load->getNextNode())); + + SmallVector gepInds; + for (unsigned i = 0; i < offsets.size() / 2; ++i) + gepInds.push_back(offsets[i]); + GetElementPtrInst *gep = + GetElementPtrInst::Create(rankedArrType, load, gepInds, Twine(""), + cast(load->getNextNode())); ptr = new BitCastInst(gep, ptr->getType(), Twine(""), cast(gep)->getNextNode()); + + LLVM_DEBUG({ + dbgs() << "Created the following instructions:\n"; + bitCastInst->dump(); + load->dump(); + gep->dump(); + ptr->dump(); + + dbgs() << "\nExpected result type:\n"; + gep->getType()->dump(); + }); } /// Append insInst to the insertInsts list, and gather the value to be @@ -629,6 +697,215 @@ static SmallVector TopologicalSort(ArrayRef funcs) { return sorted; } +/// See the doc from rewriteModuloGepIndices. +static Value *rewriteModulo(Value *value) { + SelectInst *selectInst = dyn_cast(value); + if (!selectInst) + return nullptr; + + ICmpInst *icmpInst = dyn_cast(selectInst->getCondition()); + if (!icmpInst) + return nullptr; + + BinaryOperator *addInst = + dyn_cast(selectInst->getTrueValue()); + if (!addInst || addInst->getOpcode() != BinaryOperator::Add) + return nullptr; + + BinaryOperator *sremInst = + dyn_cast(selectInst->getFalseValue()); + if (!sremInst || sremInst->getOpcode() != BinaryOperator::SRem) + return nullptr; + + // Now the pattern has been matched, do the rewrite. + selectInst->replaceAllUsesWith(sremInst); + + // Clean up + selectInst->eraseFromParent(); + addInst->eraseFromParent(); + icmpInst->eraseFromParent(); + + return sremInst; +} + +static bool isValidGepIndex(Value *value) { + return isa(value); +} + +/// We trace the address calculation (mul and add) chain for the GEP index. 
+/// +/// It would look like (from heat-3d) - +/// +/// %val_9 = mul i64 %val_3, 400 +/// %val_10 = add i64 %val_9, 400 <----- Add the offset value of 400 +/// %val_11 = mul i64 %val_5, 20 +/// %val_12 = add i64 %val_10, %val_11 +/// %val_13 = add i64 %val_12, %val_7 +/// +/// Without offset +/// +/// %val_20 = mul i64 %val_3, 400 +/// %val_21 = mul i64 %val_5, 20 +/// %val_22 = add i64 %val_20, %val_21 +/// %val_23 = add i64 %val_22, %val_7 +/// +/// We cannot recover the indices when there is an offset at present. +/// It will return all the found indices. +/// The provided type argument is to verify the extracted information. +static SmallVector getGepIndices(GetElementPtrInst *inst, Type *type) { + LLVM_DEBUG({ + dbgs() << "Recognizing GEP indices from "; + inst->dump(); + dbgs() << "\n"; + dbgs() << "Using type: "; + type->dump(); + dbgs() << "\n\n"; + }); + + if (inst->getNumIndices() != 1) { + LLVM_DEBUG(dbgs() << "Given GEP has 0 or more than 1 indices."); + return {}; + } + + SmallVector operands; + // Will use this to check with the ranked array type. + SmallVector mulDims; + + // First of all, all the adders will be connected by their LHS operand. + // If the input is already an index. 
+ if (isValidGepIndex(*inst->idx_begin())) { + operands.push_back(*inst->idx_begin()); + } else { + SmallVector addInsts; + BinaryOperator *addInst = dyn_cast(*inst->idx_begin()); + while (addInst && addInst->getOpcode() == BinaryOperator::Add) { + addInsts.push_back(addInst); + addInst = dyn_cast(addInst->getOperand(0)); + } + + LLVM_DEBUG({ + dbgs() << "Recognized adders:\n"; + for (BinaryOperator *op : addInsts) + op->dump(); + dbgs() << "\n\n"; + }); + + for (unsigned i = 0; i < addInsts.size(); ++i) { + if (i == addInsts.size() - 1) + operands.push_back(addInsts[i]->getOperand(0)); + operands.push_back(addInsts[i]->getOperand(1)); + } + + LLVM_DEBUG({ + dbgs() << "Adder operands:\n"; + for (Value *operand : operands) + operand->dump(); + dbgs() << "\n\n"; + }); + + // Replace operand with multipliers. + for (unsigned i = 0; i < operands.size(); ++i) { + BinaryOperator *mulInst = dyn_cast(operands[i]); + if (!mulInst || mulInst->getOpcode() != BinaryOperator::Mul) + continue; + if (!isa(mulInst->getOperand(1))) { + LLVM_DEBUG({ + dbgs() << "The RHS of a multiplied index is not a constant integer."; + mulInst->dump(); + }); + return {}; + } + + mulDims.push_back(getI64Value(mulInst->getOperand(1))); + operands[i] = mulInst->getOperand(0); + } + } + + LLVM_DEBUG({ + dbgs() << "Updated operands by mul:\n"; + for (Value *operand : operands) + operand->dump(); + dbgs() << "\n\n"; + }); + + // Check if every operand can be a valid GEP index. + for (Value *operand : operands) { + if (!isValidGepIndex(operand)) { + LLVM_DEBUG({ + dbgs() << "Found an invalid operand:"; + operand->dump(); + }); + return {}; + } + } + + // Finally, check whether the type matches with the parsed results. 
+ ArrayType *arrayType = cast(type->getPointerElementType()); + SmallVector dims = getDimsFromArrayType(arrayType); + if (dims.size() != operands.size()) { + LLVM_DEBUG({ + dbgs() << "Number of dims from the type: " << dims.size() + << " doesn't match the number of operands: " << operands.size() + << "\n"; + }); + return {}; + } + + SmallVector parDims; + for (unsigned i = 1; i < dims.size(); ++i) + parDims.push_back(dims[i] * (parDims.empty() ? 1 : parDims.back())); + + LLVM_DEBUG({ + dbgs() << "Partial dims resolved from type: "; + interleaveComma(parDims, dbgs()); + dbgs() << "\nPartial dims resolved from multipliers: "; + interleaveComma(mulDims, dbgs()); + dbgs() << "\n"; + }); + + if (parDims != mulDims) { + LLVM_DEBUG(dbgs() << "Partial dims don't match.\n"); + return {}; + } + + std::reverse(operands.begin(), operands.end()); + + return operands; +} + +/// Look at the indices passed to the given GEP and see if there is any chance +/// we can make the modulo expressions simpler given that the address of GEP +/// should be positive. +/// +/// For example, transform: +/// %0 = srem i64 %arg, 32 +/// %1 = icmp slt i64 %0, 0 +/// %2 = add i64 %0, 32 +/// %3 = select i1 %1, i64 %2, i64 %0 +/// +/// to: +/// %0 = srem i64 %arg, 32 +/// +static void rewriteModuloGepIndices(SmallVectorImpl &indices) { + for (unsigned i = 0; i < indices.size(); ++i) + if (isa(indices[i])) { + Value *newInd = rewriteModulo(indices[i]); + if (!newInd) { + LLVM_DEBUG({ + dbgs() << "Failed to rewrite index at " << i << " : "; + indices[i]->dump(); + }); + continue; + } + + LLVM_DEBUG({ + dbgs() << "Rewritten index at " << i << " to "; + newInd->dump(); + }); + indices[i] = newInd; + } +} + /// This helper function convert the MemRef value represented by an /// aggregated type to a ranked N-d array. The function interface, as well /// as the internal usage of GEP will be updated. 
@@ -680,12 +957,24 @@ static void convertMemRefToArray(Module &M, bool ranked = false) { // same as the original one, just have additional arguments that are // ranked arrays. for (Function *F : Funcs) { + LLVM_DEBUG({ + dbgs() << "\nTransforming function: \n\n"; + F->dump(); + }); ValueToValueMapTy RankedArrVMap; auto &Seqs = FuncToSeqs[F]; + // ----------------------------------------------------------------- + // Step 1: create a rank-duplicated interface. Function *NewFunc = duplicateFunctionsWithRankedArrays(F, Seqs, RankedArrVMap); + LLVM_DEBUG({ + dbgs() << "\nDuplicated function: \n\n"; + NewFunc->dump(); + }); + // ----------------------------------------------------------------- + // Step 2: update the GEP expressions. SmallVector GEPList; for (BasicBlock &BB : *NewFunc) for (Instruction &I : BB) @@ -695,12 +984,53 @@ static void convertMemRefToArray(Module &M, bool ranked = false) { // Create new GEPs that use the ranked arrays and remove the old ones. unsigned NumNewGEP = 0; for (Instruction *I : GEPList) { - Instruction *NewGEP = - duplicateGEPWithRankedArray(I, RankedArrVMap, NumNewGEP); + // Simplify the address calculation expressions to make Vitis happy. + // It is easier to work on the original GEP. + SmallVector indices = + getGepIndices(cast(I), + RankedArrVMap[I->getOperand(0)]->getType()); + + Instruction *NewGEP; + if (indices.empty()) { + NewGEP = duplicateGEPWithRankedArray(I, RankedArrVMap, NumNewGEP); + } else { + // We will directly use the resolved indices. + // Try to rewrite the modulo expressions. + rewriteModuloGepIndices(indices); + + LLVM_DEBUG({ + dbgs() << "Indices to use: \n"; + for (Value *index : indices) + index->dump(); + }); + + // We can directly use the indices from the rewrite to get the new GEP. + /// TODO: should be more careful. 
+ Value *ptr = RankedArrVMap[I->getOperand(0)]; + assert(ptr); + + indices.push_back(ConstantInt::get(indices.front()->getType(), 0)); + std::reverse(indices.begin(), indices.end()); + + NewGEP = GetElementPtrInst::CreateInBounds(ptr, indices, Twine(""), + I->getNextNode()); + LLVM_DEBUG({ + dbgs() << "Newly generated GEP: "; + NewGEP->dump(); + }); + } + I->replaceAllUsesWith(NewGEP); I->eraseFromParent(); } + LLVM_DEBUG({ + dbgs() << "\nGEP updated function: \n\n"; + NewFunc->dump(); + }); + + // ----------------------------------------------------------------- + // Step 3: update callers within the new function. // If there is any caller. SmallVector Callers; for (BasicBlock &BB : *NewFunc) @@ -728,6 +1058,12 @@ static void convertMemRefToArray(Module &M, bool ranked = false) { if (RankedArrVMap.count(Arg)) Args.push_back(RankedArrVMap[Arg]); else if (isa(Arg)) { + LLVM_DEBUG({ + dbgs() << "Found "; + Arg->dump(); + dbgs() << " as a result from bitcast. Need to transform it into " + "the multi-dimensional type.\n"; + }); // Or it is a result from a bitcast expression chain. // This chain is based on the instructions generated by the // processOffset function. @@ -765,6 +1101,25 @@ static void convertMemRefToArray(Module &M, bool ranked = false) { } } + LLVM_DEBUG({ + dbgs() << "Creating caller for " << FuncToNew[Callee]->getName() + << ", signature: "; + FuncToNew[Callee]->getFunctionType()->dump(); + dbgs() << "-----------------------\n\n"; + dbgs() << "Argument list:\n"; + for (auto arg : enumerate(Args)) { + dbgs() << arg.index() << "\t-> "; + arg.value()->dump(); + } + dbgs() << "\nArgument types:\n"; + for (auto arg : enumerate(Args)) { + dbgs() << arg.index() << "\t-> "; + arg.value()->getType()->dump(); + dbgs() << "\t-> "; + FuncToNew[Callee]->getArg(arg.index())->getType()->dump(); + } + }); + // New caller. CallInst::Create(FuncToNew[Callee], Args, Twine(), Caller); // Erase the original caller. 
@@ -997,8 +1352,9 @@ struct XilinxUnrollPass : public ModulePass { auto DT = llvm::DominatorTree(F); LoopInfo LI(DT); - for (auto &loop : LI) - unrollLoop(loop); + if (!LI.empty()) + for (auto &loop : LI) + unrollLoop(loop); } return false; @@ -1022,7 +1378,8 @@ getPartitionInfo(ArrayType *arrayTy) { } while (arrayTy); // The dimension number of arrays after Polymer should be a even number - assert(d % 2 == 0); + if (d % 2 != 0) + return {}; partitions.resize(d / 2); return partitions; @@ -1039,6 +1396,8 @@ struct XilinxArrayPartitionPass : public ModulePass { XilinxArrayPartitionPass() : ModulePass(ID) {} bool runOnModule(Module &M) override { + if (!XlnArrayPartitionEnabled) + return true; // Declare array partition APIs in Vitis HLS LLVM frontend auto mod = &M; @@ -1080,6 +1439,149 @@ struct XilinxArrayPartitionPass : public ModulePass { } // namespace +namespace { + +/// Generate test bench tcl script for Xilinx Vitis. This pass parses the LLVM +/// IR and generates compatible test bench for the design in LLVM IR. 
+struct XilinxTBTclGenPass : public ModulePass { + static char ID; + XilinxTBTclGenPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + std::error_code ec; + llvm::raw_fd_ostream XlnTBTcl(XlnTBTclNames, ec); + + XlnTBTcl << "open_project -reset tb\n" + << XlnTBSources << "set_top " << XlnTop << "\n" + << "open_solution -reset solution1\n" + << "set_part \"zynq\"\n" + << "create_clock -period \"100MHz\"\n" + << "config_bind -effort high\n"; + + for (auto &F : M) + if (F.getName() == XlnTop) { + for (unsigned i = 0; i < F.arg_size(); i++) { + auto arg = F.getArg(i); + if (arg->getType()->isPointerTy() && + arg->getType()->getPointerElementType()->isArrayTy()) { + auto arrayTy = + dyn_cast(arg->getType()->getPointerElementType()); + if (XlnArrayPartitionEnabled) { + auto partitions = getPartitionInfo(arrayTy); + for (auto partition : partitions) + XlnTBTcl << "set_directive_array_partition -dim " + << partition.first << " -factor " << partition.second + << " -type block \"" << XlnTop << "\" " + << arg->getName() << "\n"; + } + } + } + } + + XlnTBTcl << "csim_design\n" + << "csynth_design\n" + << "cosim_design\n" + << "exit\n"; + return false; + } +}; + +} // namespace + +static void nameLoop(Loop *loop, int &loopCounter) { + SmallVector Args; + + // Reserve operand 0 for loop id self reference. + LLVMContext &Context = loop->getHeader()->getContext(); + auto TempNode = MDNode::getTemporary(Context, None); + Args.push_back(TempNode.get()); + + // Loop name + Metadata *nameVals[] = { + MDString::get(Context, "llvm.loop.name"), + MDString::get(Context, "VITIS_LOOP_" + std::to_string(loopCounter))}; + Args.push_back(MDNode::get(Context, nameVals)); + + // Set the first operand to itself. 
+ MDNode *LoopID = MDNode::get(Context, Args); + LoopID->replaceOperandWith(0, LoopID); + loop->setLoopID(LoopID); + loopCounter++; + + if (!loop->isInnermost()) + for (auto &subloop : loop->getSubLoops()) + nameLoop(subloop, loopCounter); +} + +namespace { + +/// Assign a name to each loop and enable flattening for Xilinx Vitis. +struct XilinxNameLoopPass : public ModulePass { + static char ID; + XilinxNameLoopPass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + + int loopCounter = 0; + for (auto &F : M) + if (F.getName() != XlnTop && !F.empty()) { + auto DT = llvm::DominatorTree(F); + LoopInfo LI(DT); + + if (!LI.empty()) + for (auto &loop : LI) + nameLoop(loop, loopCounter); + } + + return false; + } +}; + +} // namespace + +// ----------------------------------------------------------------------------------- +// Mark no inline for kernels + +/// Check if the input function is a scop.stmt based on the pattern S[0-9]* +static bool isScopStmt(Function &F) { + StringRef name = F.getName(); + if (!name.startswith("S")) + return false; + + StringRef suffix = name.drop_front(); + if (any_of(suffix, [](const char &c) { return !isdigit(c); })) + return false; + + return true; +} + +namespace { + +struct AnnotateNoInlinePass : public ModulePass { + static char ID; // Pass identification, replacement for typeid + AnnotateNoInlinePass() : ModulePass(ID) {} + + bool runOnModule(Module &M) override { + bool modified = false; + for (auto &F : M) { + if (!isScopStmt(F)) { + if (!F.hasFnAttribute(Attribute::NoInline)) { + modified = true; + F.addFnAttr(Attribute::NoInline); + } + } else { + modified = true; + // Should always inline scop.stmt. 
+ F.addFnAttr(Attribute::AlwaysInline); + } + } + + return modified; + } +}; + +} // namespace + char ConvertMemRefToArray::ID = 0; static RegisterPass X1("mem2ptr", @@ -1116,3 +1618,15 @@ char XilinxArrayPartitionPass::ID = 7; static RegisterPass X8( "xlnarraypartition", "Partition arrays in the top-level function arguments for Xilinx Vitis."); + +char XilinxTBTclGenPass::ID = 8; +static RegisterPass + X9("xlntbgen", "Generate test bench tcl script for Xilinx Vitis."); + +char XilinxNameLoopPass::ID = 9; +static RegisterPass X10("xlnloopname", + "Name loops for Xilinx Vitis."); + +char AnnotateNoInlinePass::ID = 10; +static RegisterPass + X11("anno-noinline", "Annotate noinline to the functions."); diff --git a/lib/mlir/Transforms/ArrayPartition.cc b/lib/mlir/Transforms/ArrayPartition.cc index 1596c28516d..c6dbeb625fc 100644 --- a/lib/mlir/Transforms/ArrayPartition.cc +++ b/lib/mlir/Transforms/ArrayPartition.cc @@ -1,6 +1,7 @@ //===- ArrayPartitions.cc - Partitioning arrays ------------------ C++-===// #include "phism/mlir/Transforms/PhismTransforms.h" +#include "phism/mlir/Transforms/Utils.h" #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" @@ -26,6 +27,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" +#include #include #include @@ -35,25 +37,15 @@ using namespace mlir; using namespace llvm; using namespace phism; -static bool hasPeCaller(FuncOp f) { - bool ret = false; - f.walk([&](CallOp caller) { - if (caller->hasAttr("scop.pe")) - ret = true; - }); - return ret; -} - -static FuncOp getTopFunction(ModuleOp m) { - FuncOp top = nullptr; - m.walk([&](FuncOp f) { - if (hasPeCaller(f)) { - assert(!top && "There should be only one top function."); - top = f; - } - }); - return top; -} +namespace { +struct ArrayPartitionPipelineOptions + : public mlir::PassPipelineOptions { + Option dumpFile{ + *this, "dumpFile", + llvm::cl::desc("Enable dumping the tile info into a file."), + llvm::cl::init(false)}; +}; +} // 
namespace /// -------------------------- Dependence analysis --------------------------- @@ -361,7 +353,6 @@ static void arrayPartition(FuncOp f, ModuleOp m, OpBuilder &b) { if (checkAccessOverlap(info, m, partitions)) continue; - memref.dump(); llvm::errs() << "Partitions: \n"; for (auto it : partitions) { for (auto bound : enumerate(it)) { @@ -484,13 +475,17 @@ static MapVector getTilingInfo(ArrayRef memrefs, MapVector tiling; // See if they have simple access patterns that can be directly extracted. for (Value memref : memrefs) { + LLVM_DEBUG({ + dbgs() << "Trying to tile: "; + memref.dump(); + }); // Check if all the users of memref are scop.pe callers. if (any_of(memref.getUsers(), [&](Operation *op) { return !isa(op) || !op->hasAttr("scop.pe"); })) { LLVM_DEBUG({ memref.dump(); - llvm::errs() << " has been skipped since it has non PE caller users.\n"; + dbgs() << " has been skipped since it has non PE caller users.\n"; }); continue; } @@ -520,10 +515,10 @@ static MapVector getTilingInfo(ArrayRef memrefs, // Debug the accesses. LLVM_DEBUG({ - memref.dump(); - for (MemRefAccess &access : accesses) { + dbgs() << "Found the following accesses:\n"; + for (MemRefAccess &access : accesses) access.opInst->dump(); - } + dbgs() << "---------------------------\n"; }); // Check if all accesses are idenity maps. @@ -559,12 +554,23 @@ static MapVector getTilingInfo(ArrayRef memrefs, for (AffineForOp forOp : forOps) { // Filter out the result that are constants. We don't care about them. 
// ()[s0] -> (70, s0 * 32 + 32) will be ()[s0] -> (s0 * 32 + 32) - tmpLbMaps.push_back( - filterExtraConstantResults(forOp.getLowerBoundMap())); - tmpUbMaps.push_back( - filterExtraConstantResults(forOp.getUpperBoundMap())); + AffineMap lbMap = filterExtraConstantResults(forOp.getLowerBoundMap()); + AffineMap ubMap = filterExtraConstantResults(forOp.getUpperBoundMap()); + + if (lbMap.isSingleConstant() && ubMap.isSingleConstant()) { + llvm::errs() << "There appears a pair of constant loop bounds. We " + "cannot deal with this yet.\n"; + isIdentical = false; + break; + } + + tmpLbMaps.push_back(lbMap); + tmpUbMaps.push_back(ubMap); } + if (!isIdentical) + break; + // Simply ignore those with constant lower upper bounds. // They won't cause much trouble (heuristically) if we don't partition // for them. @@ -584,8 +590,10 @@ static MapVector getTilingInfo(ArrayRef memrefs, std::swap(tmpUbMaps, ubMaps); } else { isIdentical = tmpLbMaps == lbMaps && tmpUbMaps == ubMaps; - if (!isIdentical) + if (!isIdentical) { + LLVM_DEBUG(dbgs() << "Found not identical loop bound maps.\n"); break; + } } } @@ -625,11 +633,19 @@ static MapVector getTilingInfo(ArrayRef memrefs, // Abandon further processing if the tile size cannot match memref's type. if ((int64_t)tileSizes.size() != memref.getType().cast().getRank()) { - llvm::errs() << "Tile sizes are not equal to the rank of the memref.\n"; + LLVM_DEBUG( + dbgs() << "Tile sizes are not equal to the rank of the memref.\n"); continue; } // The resolved memref tiling. + LLVM_DEBUG({ + dbgs() << "Memref "; + memref.dump(); + dbgs() << " has been tiled into: "; + interleaveComma(tileSizes, dbgs()); + dbgs() << "\n\n"; + }); tiling[memref] = TileInfo{tileSizes, memref}; } @@ -863,7 +879,8 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef memrefs, Value operand = op->getOperand(i); // The index for a tiled memref will be from an affine.apply op. 
- AffineApplyOp applyOp = operand.getDefiningOp(); + mlir::AffineApplyOp applyOp = + operand.getDefiningOp(); if (!applyOp) continue; assert(applyOp.getNumOperands() == 1); @@ -871,9 +888,12 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef memrefs, Value indvar = applyOp.getOperand(0); mlir::AffineForOp forOp = getForInductionVarOwner(indvar); - // forOp.dump(); - assert(forOp.getLowerBoundOperands().size() == 1 || - forOp.getUpperBoundOperands().size() == 1); + + // At least one bound should have a single operand (for the loop + // indvar). + if (!(forOp.getLowerBoundOperands().size() == 1 || + forOp.getUpperBoundOperands().size() == 1)) + continue; Value source = forOp.getUpperBoundOperands().size() == 1 ? forOp.getUpperBoundOperands()[0] @@ -887,6 +907,13 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef memrefs, if (indices.empty()) std::swap(tmpIndices, indices); else { + LLVM_DEBUG({ + op->dump(); + if (tmpIndices != indices) { + llvm::interleaveComma(tmpIndices, llvm::errs()); + llvm::interleaveComma(indices, llvm::errs()); + } + }); assert(tmpIndices == indices); std::swap(tmpIndices, indices); } @@ -927,7 +954,6 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef memrefs, memref::SubViewOp subView = b.create(caller.getLoc(), newTiledMemRefType, newMemRef, offsets, sizes, strides); - subView.dump(); // Strip the affine map MemRefType castMemRefType = @@ -956,8 +982,6 @@ static FuncOp tileTopFunction(FuncOp top, ArrayRef memrefs, worklist[j] = vmap.lookup(worklist[j]); } - newFunc.dump(); - prevFunc = newFunc; } @@ -1023,11 +1047,23 @@ static void renameTiledFunctions(ModuleOp m, OpBuilder &b) { struct SimpleArrayPartitionPass : public PassWrapper> { + bool dumpFile = false; + + SimpleArrayPartitionPass() = default; + SimpleArrayPartitionPass(const SimpleArrayPartitionPass &pass) {} + SimpleArrayPartitionPass(const ArrayPartitionPipelineOptions &options) + : dumpFile(options.dumpFile) {} + void runOnOperation() override { ModuleOp m = 
getOperation(); OpBuilder b(m.getContext()); FuncOp top = getTopFunction(m); + if (!top) { + m.emitRemark() << "No top function found for array partition. Have you " + "forgot to annotate {scop.pe} to callers?\n"; + return; + } SmallVector callers; top.walk([&](CallOp caller) { @@ -1035,6 +1071,9 @@ struct SimpleArrayPartitionPass callers.push_back(caller); }); + if (callers.empty()) + return; + // Get all the memrefs that can be partitioned. // TODO: consider scratchpad as well? SmallVector memrefs; @@ -1044,6 +1083,18 @@ struct SimpleArrayPartitionPass // Get the tiling info. auto tiling = getTilingInfo(memrefs, m); + for (Value memref : memrefs) + if (!tiling.count(memref)) { + LLVM_DEBUG({ + dbgs() << "There is at least one memref: "; + memref.dump(); + dbgs() << " has not partitioned. We discard the whole case since the " + "performance gain would be minor.\n"; + }); + return; + } + + auto tilingCopy = tiling; // Tile the top function. FuncOp newTop = tileTopFunction(top, memrefs, tiling, m, b); @@ -1053,15 +1104,33 @@ struct SimpleArrayPartitionPass // Reset names. renameTiledFunctions(m, b); + + // If array partition has been successful, dump a file that stores the + // corresponding information. 
+ if (dumpFile) { + std::ofstream infoFile; + infoFile.open("array_partition.txt", std::ios::out); + if (infoFile.is_open()) { + for (auto &it : tilingCopy) { + interleave( + it.second.sizes, + [&](const int64_t &size) { infoFile << std::to_string(size); }, + [&]() { infoFile << ", "; }); + infoFile << '\n'; + } + } + } } }; } // namespace void phism::registerArrayPartitionPasses() { PassRegistration("array-partition", "Partition arrays"); - PassPipelineRegistration<>( - "simple-array-partition", "Partition arrays", [&](OpPassManager &pm) { - pm.addPass(std::make_unique()); + + PassPipelineRegistration( + "simple-array-partition", "Partition arrays", + [&](OpPassManager &pm, const ArrayPartitionPipelineOptions &options) { + pm.addPass(std::make_unique(options)); pm.addPass(createCanonicalizerPass()); }); } diff --git a/lib/mlir/Transforms/CMakeLists.txt b/lib/mlir/Transforms/CMakeLists.txt index 29b1d23d736..b4d22142df2 100644 --- a/lib/mlir/Transforms/CMakeLists.txt +++ b/lib/mlir/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_mlir_library(PhismTransforms PhismTransforms.cc ArrayPartition.cc DependenceAnalysis.cc + Utils.cc ADDITIONAL_HEADER_DIRS "${PHISM_MAIN_INCLUDE_DIR}/phism/mlir/Transforms" diff --git a/lib/mlir/Transforms/LoopTransforms.cc b/lib/mlir/Transforms/LoopTransforms.cc index e3afff3b2cb..8c50d76cea0 100644 --- a/lib/mlir/Transforms/LoopTransforms.cc +++ b/lib/mlir/Transforms/LoopTransforms.cc @@ -1,6 +1,7 @@ //===- LoopTransforms.cc - Loop transforms ----------------------------C++-===// #include "phism/mlir/Transforms/PhismTransforms.h" +#include "phism/mlir/Transforms/Utils.h" #include "mlir/Analysis/AffineAnalysis.h" #include "mlir/Analysis/AffineStructures.h" @@ -9,6 +10,7 @@ #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Affine/IR/AffineValueMap.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" #include 
"mlir/IR/Dominance.h" @@ -28,7 +30,7 @@ #include -#define DEBUG_TYPE "loop-extract" +#define DEBUG_TYPE "loop-transforms" using namespace mlir; using namespace llvm; @@ -46,7 +48,7 @@ struct LoopTransformsPipelineOptions /// -------------------------- Insert Scratchpad --------------------------- -static FuncOp getTopFunction(Operation *op) { +static FuncOp getRootFunction(Operation *op) { while (!op->getParentOfType()) op = op->getParentOp(); return op->getParentOfType(); @@ -62,7 +64,7 @@ struct InsertScratchpadPass DominanceInfo dom(storeOp->getParentOp()); Value mem = storeOp.getMemRef(); // TODO: we should further check the address being accessed. - FuncOp f = getTopFunction(storeOp); + FuncOp f = getRootFunction(storeOp); b.setInsertionPointToStart(&f.getBlocks().front()); // New scratchpad memory @@ -257,7 +259,6 @@ createPointLoopsCallee(mlir::AffineForOp forOp, int id, FuncOp f, mapping.map(arg, entry->addArgument(arg.getType())); callee.setType(b.getFunctionType(entry->getArgumentTypes(), llvm::None)); - callee.setVisibility(SymbolTable::Visibility::Public); b.clone(*forOp.getOperation(), mapping); @@ -478,11 +479,628 @@ struct AnnotatePointLoopsPass }; } // namespace +/// --------------------- Redistribute statements --------------------------- + +static void getAllScopStmts(FuncOp func, SetVector &stmts, ModuleOp m) { + func.walk([&](mlir::CallOp caller) { + FuncOp callee = dyn_cast(m.lookupSymbol(caller.getCallee())); + if (!callee) + return; + if (!callee->hasAttr("scop.stmt")) + return; + + stmts.insert(callee); + }); +} + +static void detectScopPeWithMultipleStmts(ModuleOp m, + SetVector &pes) { + FuncOp top = getTopFunction(m); + if (!top) + return; + + top.walk([&](mlir::CallOp caller) { + if (!caller->hasAttr("scop.pe")) + return; + + FuncOp callee = dyn_cast(m.lookupSymbol(caller.getCallee())); + if (!callee) + return; + + SetVector stmts; + getAllScopStmts(callee, stmts, m); + + if (stmts.size() >= 2) + pes.insert(callee); + }); +} + +static 
bool hasOnlyReadByScopStmts(FuncOp f, ModuleOp m, Value memref) { + SmallVector> funcAndArgIdx; + f.walk([&](mlir::CallOp caller) { + FuncOp callee = dyn_cast(m.lookupSymbol(caller.getCallee())); + if (!callee || !callee->hasAttr("scop.stmt")) + return; + auto it = find(caller.getArgOperands(), memref); + if (it == caller.arg_operand_end()) + return; + + funcAndArgIdx.push_back({callee, it - caller.arg_operand_begin()}); + }); + + // Examine the accesses. + for (auto &it : funcAndArgIdx) { + FuncOp callee; + unsigned argIdx; + std::tie(callee, argIdx) = it; + + assert(callee.getArgument(argIdx).getType().isa()); + + bool hasWriteAccess = false; + callee.walk([&](mlir::AffineStoreOp storeOp) { + if (storeOp.getMemRef() == callee.getArgument(argIdx)) + hasWriteAccess = true; + }); + + if (hasWriteAccess) + return false; + } + + return true; +} + +/// Assuming the memrefs at the top-level are not aliases. +/// Also assuming each scop.stmt will have its accessed memrefs once in its +/// interface. +static bool areScopStmtsSeparable(FuncOp f, ModuleOp m) { + SetVector visited; // memrefs visited. + SetVector conflicted; + SetVector visitedStmts; + f.walk([&](mlir::CallOp caller) { + FuncOp callee = dyn_cast(m.lookupSymbol(caller.getCallee())); + if (!callee || !callee->hasAttr("scop.stmt")) + return; + if (visitedStmts.count(callee)) + return; + visitedStmts.insert(callee); + + for (Value arg : caller.getArgOperands()) + if (arg.getType().isa()) { + if (visited.count(arg)) + conflicted.insert(arg); + visited.insert(arg); + } + }); + + unsigned bad = 0; + for (auto &memref : conflicted) + if (!hasOnlyReadByScopStmts(f, m, memref)) + ++bad; + + if (!bad) + return true; + + LLVM_DEBUG({ + llvm::errs() + << "\nConflicted memrefs that have not only read accesses:\n\n"; + for (Value memref : conflicted) + if (!hasOnlyReadByScopStmts(f, m, memref)) + memref.dump(); + }); + + return false; +} + +/// Erase those affine.for with empty blocks. 
+static void eraseEmptyAffineFor(FuncOp f) { + SmallVector eraseOps; + while (true) { + eraseOps.clear(); + f.walk([&](mlir::AffineForOp forOp) { + if (llvm::hasSingleElement(*forOp.getBody())) // the yield + eraseOps.push_back(forOp.getOperation()); + }); + for (Operation *op : eraseOps) + op->erase(); + + if (eraseOps.empty()) + break; + } +} + +static std::pair> +distributeScopStmt(FuncOp stmt, FuncOp f, ModuleOp m, OpBuilder &b) { + OpBuilder::InsertionGuard g(b); + b.setInsertionPointAfter(f); + FuncOp newFunc = cast(b.clone(*f.getOperation())); + newFunc.setName(std::string(f.getName()) + "__cloned_for__" + + std::string(stmt.getName())); + + SmallVector eraseOps; + newFunc.walk([&](mlir::CallOp caller) { + if (caller.getCallee() != stmt.getName()) + eraseOps.push_back(caller.getOperation()); + }); + + for (Operation *op : eraseOps) + op->erase(); + + eraseEmptyAffineFor(newFunc); + + // Erase not used arguments. + SmallVector usedArgs; + for (unsigned i = 0; i < newFunc.getNumArguments(); ++i) + if (newFunc.getArgument(i).use_empty()) + usedArgs.push_back(i); + newFunc.eraseArguments(usedArgs); + + return {newFunc, usedArgs}; +} + +/// The input function will be altered in-place. +static LogicalResult distributeScopStmts( + FuncOp f, SmallVectorImpl>> &dist, + ModuleOp m, OpBuilder &b) { + SetVector stmts; + getAllScopStmts(f, stmts, m); + + // Need to duplicate the whole function for each statement. And within each + // duplication, remove the callers that don't belong there. 
+ for (FuncOp stmt : stmts) { + auto res = distributeScopStmt(stmt, f, m, b); + if (res.first) + dist.push_back(res); + else { + LLVM_DEBUG(dbgs() << "Cannot distribute for: " << stmt.getName() << '\n'); + return failure(); + } + } + + return success(); +} + +namespace { +struct RedistributeScopStatementsPass + : public mlir::PassWrapper> { + + void runOnOperation() override { + ModuleOp m = getOperation(); + OpBuilder b(m.getContext()); + + // ------------------------------------------------------------------- + // Step 1: detect the scop.pe callee that has more than one scop.stmt. + SetVector pes; + detectScopPeWithMultipleStmts(m, pes); + + if (pes.empty()) + return; + + LLVM_DEBUG({ + llvm::errs() << "-------------------------------------------\n"; + llvm::errs() << "Detected PEs with multiple SCoP statements:\n\n"; + for (FuncOp pe : pes) { + pe.dump(); + llvm::errs() << "\n------------------------\n\n"; + } + }); + + // ------------------------------------------------------------------- + // Step 2: check if the multiple scop.stmt can be fully separated. + // The condition is basically each caller refers to different memref. + /// TODO: carry out alias analysis (not an issue for polybench) + /// TODO: detailed dependence analysis to cover more cases. + SetVector pesToProc; + for (FuncOp pe : pes) { + if (!areScopStmtsSeparable(pe, m)) { + LLVM_DEBUG({ + llvm::errs() << "Discared " << pe.getName() + << "since its scop.stmts are not separable.\n"; + }); + continue; + } + + pesToProc.insert(pe); + } + + // ------------------------------------------------------------------- + // Step 3: Process each PE. 
+ for (FuncOp pe : pesToProc) { + SmallVector>> dists; + if (failed(distributeScopStmts(pe, dists, m, b))) { + LLVM_DEBUG({ + llvm::errs() << "Failed to distribute scop.stmt: " << pe.getName() + << "\n"; + }); + continue; + } + + SmallVector callers; + m.walk([&](mlir::CallOp caller) { + if (caller.getCallee() == pe.getName()) + callers.push_back(caller); + }); + + for (mlir::CallOp caller : callers) { + b.setInsertionPointAfter(caller); + for (auto dist : dists) { + FuncOp callee; + SmallVector erased; + std::tie(callee, erased) = dist; + + SmallVector operands; + for (auto arg : enumerate(caller.getOperands())) + if (find(erased, arg.index()) == erased.end()) + operands.push_back(arg.value()); + + mlir::CallOp newCaller = + b.create(caller.getLoc(), callee, operands); + newCaller->setAttr("scop.pe", b.getUnitAttr()); + } + } + + for (mlir::CallOp caller : callers) + caller.erase(); + pe.erase(); + } + } +}; +} // namespace + +/// --------------------- Loop merge pass --------------------------- + +static LogicalResult loopMergeOnScopStmt(FuncOp f, ModuleOp m, OpBuilder &b) { + SetVector stmts; + getAllScopStmts(f, stmts, m); + + if (!llvm::hasSingleElement(stmts)) { + LLVM_DEBUG( + dbgs() + << "Being conservative not to merge loops with multiple scop.stmts.\n"); + return failure(); + } + + FuncOp targetStmt = *stmts.begin(); + + // Get all the callers for the target scop.stmt + SmallVector callers; + f.walk([&](mlir::CallOp caller) { + if (caller.getCallee() == targetStmt.getName()) + callers.push_back(caller); + }); + + if (hasSingleElement(callers)) { + LLVM_DEBUG(dbgs() << "There is only one caller instance for PE: " + << f.getName() << ".\n"); + return failure(); + } + + // ---------------------------------------------------------------------- + // Step 1: make sure there are no empty sets in loop domains. 
+ SetVector erased; + for (mlir::CallOp caller : callers) { + SmallVector ops; + getEnclosingAffineForAndIfOps(*caller.getOperation(), &ops); + + FlatAffineConstraints cst; + getIndexSet(ops, &cst); + + if (!cst.findIntegerSample().hasValue()) { + LLVM_DEBUG({ + dbgs() << "Found a caller in an empty loop nest.\n"; + caller.dump(); + }); + erased.insert(caller.getOperation()); + }; + } + + callers.erase(remove_if(callers, + [&](mlir::CallOp caller) { + return erased.count(caller.getOperation()); + }), + callers.end()); + for (Operation *op : erased) + op->erase(); + + eraseEmptyAffineFor(f); + + if (hasSingleElement(callers)) { + LLVM_DEBUG(dbgs() << "There is only one caller instance for PE: " + << f.getName() << " after empty loop removal.\n"); + return failure(); + } + + // ---------------------------------------------------------------------- + // Step 2: gather loop structure + // Make sure the callers have the same prefix, only the last forOp different. + SmallVector outerLoops; + SmallVector innermosts; // each corresponds to a caller. 
+ for (mlir::CallOp caller : callers) { + SmallVector ops; + getEnclosingAffineForAndIfOps(*caller.getOperation(), &ops); + + if (ops.empty()) { + LLVM_DEBUG(dbgs() << "Callers should be wrapped within loops.\n"); + return failure(); + } + + if (any_of(ops, [&](Operation *op) { return isa(op); })) { + LLVM_DEBUG(dbgs() << "Cannot deal with affine.if yet.\n"); + return failure(); + } + + // Initialise + if (outerLoops.empty()) { + innermosts.push_back(cast(ops.back())); + ops.pop_back(); + + for (Operation *op : ops) + outerLoops.push_back(cast(op)); + } else { + SmallVector tmpOuters; + mlir::AffineForOp innermost; + + innermost = cast(ops.back()); + ops.pop_back(); + + for (Operation *op : ops) + tmpOuters.push_back(cast(op)); + + if (tmpOuters != outerLoops) { + LLVM_DEBUG(dbgs() << "Outer loops are not the same among statements " + "(given the last being different).\n"); + return failure(); + } + + if (find(innermosts, innermost) != innermosts.end()) { + LLVM_DEBUG(dbgs() << "Weird to find the same loop structures between " + "two caller instances.\n"); + return failure(); + } + + innermosts.push_back(innermost); + } + } + + LLVM_DEBUG({ + dbgs() << "\n-----------------------------------\n"; + dbgs() << "Merging PE: \n"; + f.dump(); + }); + + // ---------------------------------------------------------------------- + // Step 3: Affine analysis + // Check if the innermost loops have no intersection. + SmallVector csts; + transform(innermosts, std::back_inserter(csts), [&](mlir::AffineForOp forOp) { + FlatAffineConstraints cst; + cst.addInductionVarOrTerminalSymbol(forOp.getInductionVar()); + + LLVM_DEBUG(cst.dump()); + + return cst; + }); + + // Make every constraint has the same induction variable. + for (unsigned i = 1; i < csts.size(); ++i) + csts[i].setIdValue(0, csts[0].getIdValue(0)); + + // Check if all the constraints share the same number of columns. 
+ for (unsigned i = 1; i < csts.size(); ++i) { + if (csts[i].getNumCols() != csts[0].getNumCols()) { + LLVM_DEBUG(dbgs() << "Number of columns don't match between two " + "candidate constraints.\n"); + return failure(); + } + } + + // Check if two loops have intersection. + for (unsigned i = 0; i < csts.size(); ++i) + for (unsigned j = i + 1; j < csts.size(); ++j) { + FlatAffineConstraints tmp{csts[i]}; + tmp.append(csts[j]); + + if (tmp.findIntegerSample().hasValue()) { + LLVM_DEBUG(dbgs() << "There is intersection between two innermost " + "loops. Cannot merge them safely.\n"); + return failure(); + } + } + + // Merge: check if one can be merged into another iteratively, until there is + // no chance of merging. + while (true) { + bool merged = false; + + mlir::AffineForOp loopToErase; + + for (unsigned i = 0; i < innermosts.size() && !merged; ++i) + for (unsigned j = 0; j < innermosts.size() && !merged; ++j) { + if (i == j) + continue; + + mlir::AffineForOp loop1 = innermosts[i]; + mlir::AffineForOp loop2 = innermosts[j]; + + AffineMap ubMap = loop1.getUpperBoundMap(); + + // Condition BEGIN - + if (loop2.getLowerBoundMap().isSingleConstant()) { + int64_t constLb = loop2.getLowerBoundMap().getSingleConstantResult(); + for (AffineExpr ub : ubMap.getResults()) { + if (AffineConstantExpr constUbExpr = + ub.dyn_cast()) { + int64_t constUb = constUbExpr.getValue(); + if (constLb == constUb) { + // Condition END - + LLVM_DEBUG(dbgs() + << "Found loop2's single constant lower bound " + << constLb + << " equals to one of the upper bounds of loop1 " + << constUb + << ". 
We can merge them together since loop1 and " + "loop2 don't intersect.\n"); + + merged = true; + + // Set to erase; + loopToErase = loop2; + + // Set the new upper bound; + SetVector results; + for (AffineExpr expr : ubMap.getResults()) + if (expr != ub) + results.insert(expr); + for (AffineExpr expr : loop2.getUpperBoundMap().getResults()) + results.insert(expr); + + AffineMap newUbMap = + AffineMap::get(ubMap.getNumDims(), ubMap.getNumSymbols(), + results.takeVector(), ubMap.getContext()); + LLVM_DEBUG({ + dbgs() << "New upper bound: \n"; + newUbMap.dump(); + }); + loop1.setUpperBoundMap(newUbMap); + + break; + } + } + } + } + } + + if (loopToErase) { + innermosts.erase(find(innermosts, loopToErase)); + loopToErase.erase(); + } + + if (!merged) + break; + } + + return success(); +} + +namespace { + +/// Will only work within scop.pe on scop.stmt to avoid side effects. +struct LoopMergePass + : public mlir::PassWrapper> { + + void runOnOperation() override { + ModuleOp m = getOperation(); + OpBuilder b(m.getContext()); + + SmallVector pes; + FuncOp f = getTopFunction(m); + if (!f) + return; + + f.walk([&](mlir::CallOp caller) { + if (!caller->hasAttr("scop.pe")) + return; + FuncOp pe = dyn_cast(m.lookupSymbol(caller.getCallee())); + if (!pe) + return; + pes.push_back(pe); + }); + + for (FuncOp pe : pes) { + if (failed(loopMergeOnScopStmt(pe, m, b))) { + LLVM_DEBUG(dbgs() << "Failed to merge loops in: " << pe.getName() + << ".\n"); + } + } + } +}; + +} // namespace + +/// -------------------------- Scop stmt inline ------------------------------- + +static LogicalResult inlineScopStmtWithinFunction(FuncOp f, FuncOp stmt, + OpBuilder &b) { + if (f->hasAttr("scop.stmt")) // skipped. + return success(); + + SmallVector callers; + f.walk([&](mlir::CallOp caller) { + if (caller.getCallee() == stmt.getName()) + callers.push_back(caller); + }); + + // Replace each caller with the statement body. 
+ for (mlir::CallOp caller : callers) { + b.setInsertionPointAfter(caller); + + BlockAndValueMapping vmap; + vmap.map(stmt.getArguments(), caller.getArgOperands()); + + // We know that the body of the stmt is simply a list of operations without + // region. + for (Operation &op : stmt.getBlocks().begin()->getOperations()) + if (!isa(op)) + b.clone(op, vmap); + } + + // Erase the callers. + for (mlir::CallOp caller : callers) + caller.erase(); + + return success(); +} + +namespace { + +/// Try to merge all the functions with attribute {scop.stmt}. +struct ScopStmtInlinePass + : public mlir::PassWrapper> { + + void runOnOperation() override { + ModuleOp m = getOperation(); + OpBuilder b(m.getContext()); + + SmallVector stmts; + SmallVector funcs; + + m.walk([&](FuncOp f) { + if (f->hasAttr("scop.stmt")) + stmts.push_back(f); + else + funcs.push_back(f); + }); + + // We know that a scop.stmt won't call another scop.stmt. + for (FuncOp stmt : stmts) { + bool hasCaller = false; + stmt.walk([&](mlir::CallOp caller) { hasCaller = true; }); + + assert(!hasCaller && "A scop.stmt cannot call another function."); + } + + // Iterate every scop.stmt that should be inlined. 
+ for (FuncOp stmt : stmts) { + for (FuncOp func : funcs) + if (failed(inlineScopStmtWithinFunction(func, stmt, b))) + return; + stmt.erase(); + } + } +}; + +} // namespace + void phism::registerLoopTransformPasses() { PassRegistration( "annotate-point-loops", "Annotate loops with point/tile info."); PassRegistration( "extract-point-loops", "Extract point loop bands into functions"); + PassRegistration( + "redis-scop-stmts", + "Redistribute scop statements across extracted point loops."); + PassRegistration("loop-merge", + "Merge loops by affine analysis."); PassPipelineRegistration<>( "improve-pipelining", "Improve the pipelining performance", @@ -497,7 +1115,16 @@ void phism::registerLoopTransformPasses() { pm.addPass(std::make_unique()); pm.addPass(std::make_unique(pipelineOptions)); pm.addPass(createCanonicalizerPass()); - // only those private functions will be inlined. - pm.addPass(createInlinerPass()); + }); + + PassPipelineRegistration<>( + "loop-redis-and-merge", "Redistribute stmts and merge loops.", + [](OpPassManager &pm) { + pm.addPass(std::make_unique()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(std::make_unique()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(std::make_unique()); + pm.addPass(createCanonicalizerPass()); }); } diff --git a/lib/mlir/Transforms/Utils.cc b/lib/mlir/Transforms/Utils.cc new file mode 100644 index 00000000000..2d00c90cb05 --- /dev/null +++ b/lib/mlir/Transforms/Utils.cc @@ -0,0 +1,33 @@ +//===- Utils.cc - Utility functions ------------------ C++-===// + +#include "phism/mlir/Transforms/Utils.h" + +#include "mlir/Dialect/StandardOps/IR/Ops.h" + +using namespace mlir; +using namespace llvm; +using namespace phism; + +static bool hasPeCaller(FuncOp f) { + bool ret = false; + f.walk([&](CallOp caller) { + if (caller->hasAttr("scop.pe")) + ret = true; + }); + return ret; +} + +namespace phism { + +FuncOp getTopFunction(ModuleOp m) { + FuncOp top = nullptr; + m.walk([&](FuncOp f) { + if (hasPeCaller(f)) { + 
assert(!top && "There should be only one top function."); + top = f; + } + }); + return top; +} + +} // namespace phism diff --git a/python/utils/polybench.py b/python/utils/polybench.py index ed559c6ffb2..38f587ba22f 100644 --- a/python/utils/polybench.py +++ b/python/utils/polybench.py @@ -18,8 +18,6 @@ import pandas as pd -logger = logging.getLogger(__name__) - POLYBENCH_DATASETS = ("MINI", "SMALL", "MEDIUM", "LARGE", "EXTRALARGE") POLYBENCH_EXAMPLES = ( "2mm", @@ -92,6 +90,8 @@ class PbFlowOptions: max_span: int = -1 tile_sizes: Optional[List[int]] = None array_partition: bool = False + skip_vitis: bool = False + skip_csim: bool = False # Given cosim = True, you can still turn down csim. # ----------------------- Utility functions ------------------------------------ @@ -375,19 +375,55 @@ def get_module_parameters(file: str, module_name: str) -> List[str]: end_line = next(i for i, l in enumerate(lines) if ");" in l) params = (" ".join(line for line in lines[start_line + 1 : end_line])).split(",") - return [param.strip() for param in params] + return [param.strip() for param in params if param.strip()] + + +def get_autotb_parameters(file: str) -> List[str]: + """Read interface from autotb files.""" + assert os.path.isfile(file) + assert file.endswith(".autotb.v") + + with open(file, "r") as f: + lines = f.readlines() + lines = [line.strip() for line in lines] + + start_line = next( + i for i, l in enumerate(lines) if f"`AUTOTB_DUT `AUTOTB_DUT_INST(" in l + ) + assert start_line >= 0 and start_line < len(lines) + + end_line = next(i for i, l in enumerate(lines) if ");" in l and i > start_line) + assert end_line >= 0 and end_line < len(lines) + + # Deal with things like - + # .ap_clk(ap_clk), + # .ap_rst(ap_rst), + + conns = (" ".join(line for line in lines[start_line + 1 : end_line + 1])).split(",") + conns = [conn.strip() for conn in conns] + + params = [] + for conn in conns: + if conn.endswith(");"): + conn = conn[:-2] + assert conn[0] == "." 
and "(" in conn and conn[-1] == ")" + param = conn.split("(")[0][1:] + assert param == conn.split("(")[1][:-1] + params.append(param) + + return params def get_memory_interfaces(params: List[str]): """Parse memory interfaces from the module params.""" interfaces = OrderedDict() for param in params: - prefix = param.split("_")[0] + prefix = "_".join(param.split("_")[:-1]) if prefix not in interfaces: interfaces[prefix] = [] if param.startswith("ap") or "_" not in param: continue - interfaces[prefix].append(param.split("_")[1]) + interfaces[prefix].append(param.split("_")[-1]) return [ ApMemoryInterface(name, ports) @@ -415,32 +451,26 @@ def is_read_write_conflict( ) -def fix_cosim_kernels(dir: str) -> CosimFixStrategy: - """Fix issues with co-simulation. - Returns directives for (source, destination). - """ - - dir = os.path.abspath(dir) # canonicalize path - kernel_name = f"kernel_{os.path.basename(dir)}" - - src_proj_dir = os.path.join(dir, "proj", "solution1") - assert os.path.isdir(src_proj_dir) - - dst_proj_dir = os.path.join(dir, "tb.backup", "solution1") - assert os.path.isdir(dst_proj_dir) - - src_kernel = os.path.join(src_proj_dir, "syn", "verilog", f"{kernel_name}.v") - assert os.path.isfile(src_kernel) +def is_cosim_interface_matched( + src_mems: List[ApMemoryInterface], dst_mems: List[ApMemoryInterface] +) -> bool: + if len(src_mems) != len(dst_mems): + return False - dst_kernel = os.path.join(dst_proj_dir, "syn", "verilog", f"{kernel_name}.v") - assert os.path.isfile(dst_kernel) + for src, dst in zip(src_mems, dst_mems): + if src.get_num_ports() != dst.get_num_ports(): + return False + if set(src.ports) != set(dst.ports): + return False - src_params = get_module_parameters(src_kernel, kernel_name) - dst_params = get_module_parameters(dst_kernel, kernel_name) + return True - src_mems = get_memory_interfaces(src_params) - dst_mems = get_memory_interfaces(dst_params) +def get_cosim_fix_strategy( + kernel_name: str, + src_mems: List[ApMemoryInterface], + 
dst_mems: List[ApMemoryInterface], +) -> CosimFixStrategy: if len(src_mems) != len(dst_mems): raise RuntimeError("The number of ap_memory interfaces should be the same.") if [mem.name for mem in src_mems] != [mem.name for mem in dst_mems]: @@ -496,6 +526,36 @@ def fix_cosim_kernels(dir: str) -> CosimFixStrategy: return strategy +def fix_cosim_kernels(dir: str) -> CosimFixStrategy: + """Fix issues with co-simulation. + Returns directives for (source, destination). + """ + + dir = os.path.abspath(dir) # canonicalize path + kernel_name = f"kernel_{os.path.basename(dir)}" + + src_proj_dir = os.path.join(dir, "proj", "solution1") + assert os.path.isdir(src_proj_dir) + + dst_proj_dir = os.path.join(dir, "tb.backup", "solution1") + assert os.path.isdir(dst_proj_dir) + + src_kernel = os.path.join(src_proj_dir, "syn", "verilog", f"{kernel_name}.v") + assert os.path.isfile(src_kernel) + + dst_kernel = os.path.join(dst_proj_dir, "syn", "verilog", f"{kernel_name}.v") + assert os.path.isfile(dst_kernel) + + src_params = get_module_parameters(src_kernel, kernel_name) + dst_params = get_module_parameters(dst_kernel, kernel_name) + + return get_cosim_fix_strategy( + kernel_name, + get_memory_interfaces(src_params), + get_memory_interfaces(dst_params), + ) + + # ----------------------- Benchmark runners --------------------------- @@ -609,6 +669,8 @@ def is_func_decl(item, name): exit """ +TBGEN_VITIS_TCL_FILES = 'add_files {{{src_dir}/{src_base}.c}} -cflags "-I {src_dir} -I {work_dir}/utilities -D {pb_dataset}_DATASET" -csimflags "-I {src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET"\\nadd_files -tb {{{src_dir}/{src_base}.c {work_dir}/utilities/polybench.c}} -cflags "-I {src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET" -csimflags "-I {src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET"\\n' + TBGEN_VITIS_TCL = """ open_project -reset tb add_files {{{src_dir}/{src_base}.c}} -cflags "-I {src_dir} -I {work_dir}/utilities -D {pb_dataset}_DATASET" -csimflags "-I 
{src_dir} -I {work_dir}/utilities -D{pb_dataset}_DATASET" @@ -655,11 +717,30 @@ def __init__(self, work_dir: str, options: PbFlowOptions): self.status = 0 self.errmsg = "No Error" + # Logger + self.logger = logging.getLogger("pb-flow") + self.logger.setLevel(logging.DEBUG) + def run(self, src_file): """Run the whole pb-flow on the src_file (*.c).""" self.cur_file = src_file self.c_source = src_file # Will be useful in some later stages + base_dir = os.path.dirname(src_file) + + # Setup logging + log_file = os.path.join(base_dir, f"pb-flow.log") + if os.path.isfile(log_file): + os.remove(log_file) + + formatter = logging.Formatter( + "[%(asctime)s][%(name)s][%(levelname)s] %(message)s" + ) + fh = logging.FileHandler(log_file) + fh.setFormatter(formatter) + fh.setLevel(logging.DEBUG) + self.logger.addHandler(fh) + # The whole flow try: ( @@ -669,12 +750,17 @@ def run(self, src_file): .split_statements() .extract_top_func() .polymer_opt() - .loop_transforms() .constant_args() + .loop_transforms() .array_partition() .lower_llvm() .vitis_opt() - .run_vitis() + .write_tb_tcl_by_llvm() + .run_vitis_on_phism() + .run_tbgen_csim() + .backup_csim_results() + .copy_design_from_phism_to_tb() + .run_cosim() ) except Exception as e: self.status = 1 @@ -685,16 +771,26 @@ def run_command( ): """Single entry for running a command.""" kwargs.update({"cwd": os.path.dirname(self.cur_file)}) + if cmd_list: + cmd_ = " \\\n\t".join(cmd_list) + self.logger.debug(f"{cmd_}") if self.options.dry_run: print(" ".join(cmd_list)) return - return subprocess.run(cmd_list, **kwargs) + proc = subprocess.run(cmd_list, **kwargs) else: + self.logger.debug(f"{cmd}") if self.options.dry_run: print(cmd) return - return subprocess.run(cmd, **kwargs) + proc = subprocess.run(cmd, **kwargs) + + cmd_str = cmd if cmd else " ".join(cmd_list) + if proc.returncode != 0: + raise RuntimeError(f"{cmd_str} failed.") + + return proc def get_program_abspath(self, program: str) -> str: """Get the absolute path of a 
program.""" @@ -708,7 +804,8 @@ def generate_tile_sizes(self): tile_file = os.path.join(base_dir, "tile.sizes") if not self.options.tile_sizes: - shutil.rmtree(tile_file, ignore_errors=True) + if os.path.isfile(tile_file): + os.remove(tile_file) return self with open(tile_file, "w") as f: @@ -865,6 +962,8 @@ def loop_transforms(self): self.get_program_abspath("phism-opt"), src_file, f'-loop-transforms="max-span={self.options.max_span}"', + "-loop-redis-and-merge", + "-debug-only=loop-transforms", ] self.run_command( @@ -887,10 +986,16 @@ def array_partition(self): ) log_file = self.cur_file.replace(".mlir", ".log") + array_partition_file = os.path.join( + os.path.dirname(self.cur_file), "array_partition.txt" + ) + if os.path.isfile(array_partition_file): + os.remove(array_partition_file) + args = [ self.get_program_abspath("phism-opt"), src_file, - "-simple-array-partition", + "-simple-array-partition=dumpFile", "-debug-only=array-partition", ] @@ -959,11 +1064,17 @@ def vitis_opt(self): src_file, self.cur_file = self.cur_file, self.cur_file.replace( ".llvm", ".vitis.llvm" ) + log_file = self.cur_file.replace(".llvm", ".log") xln_names = get_top_func_param_names( self.c_source, self.work_dir, llvm_dir=os.path.join(self.root_dir, "llvm") ) + # Whether array partition has been successful. 
+ xln_ap_enabled = os.path.isfile( + os.path.join(os.path.dirname(self.cur_file), "array_partition.txt") + ) + args = [ os.path.join(self.root_dir, "llvm", "build", "bin", "opt"), src_file, @@ -980,15 +1091,260 @@ def vitis_opt(self): "-xlnanno", '-xlntop="{}"'.format(get_top_func(src_file)), '-xlnnames="{}"'.format(",".join(xln_names)), + "-xlnunroll" if self.options.loop_transforms else "", + "-xlnarraypartition" if self.options.array_partition else "", + "-xln-ap-enabled" if xln_ap_enabled else "", "-strip-attr", - "-xlnunroll", - "-xlnarraypartition", + "-debug", ] self.run_command( cmd=" ".join(args), shell=True, stdout=open(self.cur_file, "w"), + stderr=open(log_file, "w"), + env=self.env, + ) + + return self + + def write_tb_tcl_by_llvm(self): + """Generate the tbgen TCL file from LLVM passes.""" + if self.options.skip_vitis: + return self + + src_file = self.cur_file + base_dir = os.path.dirname(src_file) + top_func = get_top_func(src_file) + + # Whether array partition has been successful. + xln_ap_enabled = os.path.isfile(os.path.join(base_dir, "array_partition.txt")) + + tbgen_vitis_tcl = os.path.join(base_dir, "tbgen.tcl") + + tb_tcl_log = "write_tb_tcl_by_llvm.log" + + # Write the TCL for TBGEN. 
+ args = [ + os.path.join(self.root_dir, "llvm", "build", "bin", "opt"), + src_file, + "-S", + "-enable-new-pm=0", + '-load "{}"'.format( + os.path.join(self.root_dir, "build", "lib", "VhlsLLVMRewriter.so") + ), + f'-xlntop="{top_func}"', + "-xlntbgen", + "-xln-ap-enabled" if xln_ap_enabled else "", + "-xlntbfilesettings=$'{}'".format( + TBGEN_VITIS_TCL_FILES.format( + src_dir=base_dir, + src_base=os.path.basename(src_file).split(".")[0], + work_dir=self.work_dir, + pb_dataset=self.options.dataset, + ) + ), + f'-xlntbtclnames="{tbgen_vitis_tcl}"', + ] + + self.run_command( + cmd=" ".join(args), + shell=True, + stdout=open(tb_tcl_log, "w"), + env=self.env, + ) + + return self + + def run_vitis_on_phism(self): + """Just run vitis_hls on the LLVM generated from Phism.""" + if self.options.skip_vitis: + self.logger.warn("Vitis won't run since --skip-vitis has been set.") + return self + + src_file = self.cur_file + base_dir = os.path.dirname(src_file) + top_func = get_top_func(src_file) + + phism_vitis_tcl = os.path.join(base_dir, "phism.tcl") + run_config = "config_bind -effort high" + if self.options.debug: + run_config = "" + + # Generate dummy C code as the interface for the top function. + dummy_src = src_file.replace(".llvm", ".dummy.c") + with open(dummy_src, "w") as f: + f.write("void {}() {{}}".format(top_func)) + + # Write the TCL for Phism. 
+ with open(phism_vitis_tcl, "w") as f: + phism_run_config = [str(run_config)] + f.write( + PHISM_VITIS_TCL.format( + src_file=src_file, + dummy_src=dummy_src, + top_func=top_func, + config="\n".join(phism_run_config), + ) + ) + + log_file = os.path.join(base_dir, "phism.vitis_hls.stdout.log") + + # Clean up old results + shutil.rmtree(os.path.join(base_dir, "proj"), ignore_errors=True) + if os.path.isfile(log_file): + os.remove(log_file) + + if self.options.dry_run: + return self + + self.run_command( + cmd_list=["vitis_hls", phism_vitis_tcl], + stdout=open(log_file, "w"), + stderr=open(os.path.join(base_dir, "phism.vitis_hls.stderr.log"), "w"), + env=self.env, + ) + + return self + + def run_tbgen_csim(self): + """Run the tbgen.tcl file. Assuming the Tcl file has been written.""" + if not self.options.cosim: + self.logger.warn("Cosim won't run due to the input setting.") + return self + if self.options.skip_csim: + self.logger.warn("CSim is set to be skipped.") + return self + + src_file = self.cur_file + base_dir = os.path.dirname(src_file) + + tbgen_vitis_tcl = os.path.join(base_dir, "tbgen.tcl") + assert os.path.isfile(tbgen_vitis_tcl), f"{tbgen_vitis_tcl} should exist." + + if self.options.dry_run: + return self + + shutil.rmtree(os.path.join(base_dir, "tb"), ignore_errors=True) + log_file = os.path.join(base_dir, "tbgen.vitis_hls.stdout.log") + if os.path.isfile(log_file): + os.remove(log_file) + + self.run_command( + cmd_list=["vitis_hls", tbgen_vitis_tcl], + stdout=open(log_file, "w"), + stderr=open(os.path.join(base_dir, "tbgen.vitis_hls.stderr.log"), "w"), + env=self.env, + ) + + return self + + def backup_csim_results(self): + """Create a backup for the csim results.""" + # TODO: make this --dry-run compatible + base_dir = os.path.dirname(self.cur_file) + tbgen_dir = os.path.join(base_dir, "tb") + assert os.path.isdir( + tbgen_dir + ), f"tbgen_dir={tbgen_dir} isn't there, please don't skip csim in this case." 
+ + csim_dir = os.path.join(base_dir, "tb.csim") + if os.path.isdir(csim_dir): + self.logger.debug(f"csim_dir={csim_dir} exists, deleting it ...") + shutil.rmtree(csim_dir) + + # Backup the tbgen (csim) results. + shutil.copytree(tbgen_dir, csim_dir) + + return self + + def copy_design_from_phism_to_tb(self): + """Move design files from Phism output to the testbench directory.""" + # TODO: make this --dry-run compatible + src_file = self.cur_file + base_dir = os.path.dirname(src_file) + top_func = get_top_func(src_file) + + # Check results + phism_syn_verilog_dir = os.path.join( + base_dir, "proj", "solution1", "syn", "verilog" + ) + assert os.path.isdir( + phism_syn_verilog_dir + ), f"{phism_syn_verilog_dir} doens't exist." + + tbgen_syn_verilog_dir = os.path.join( + base_dir, "tb", "solution1", "syn", "verilog" + ) + assert os.path.isdir( + tbgen_syn_verilog_dir + ), f"{tbgen_syn_verilog_dir} doens't exist." + + tbgen_sim_verilog_dir = os.path.join( + base_dir, "tb", "solution1", "sim", "verilog" + ) + assert os.path.isdir( + tbgen_sim_verilog_dir + ), f"{tbgen_sim_verilog_dir} doens't exist." + + # Copy and paste the design files. + design_files = glob.glob(os.path.join(phism_syn_verilog_dir, "*.*")) + assert design_files, "There should exist design files." + for f in design_files: + shutil.copy(f, tbgen_syn_verilog_dir) + + self.logger.debug(f"Design files found: \n" + "\n".join(design_files)) + + # Fix the inconsistency between the testbench and the design top. + phism_top = os.path.join(tbgen_syn_verilog_dir, f"{top_func}.v") + assert os.path.isfile(phism_top), f"The top module {phism_top} should exist." + autotb = os.path.join(tbgen_sim_verilog_dir, f"{top_func}.autotb.v") + assert os.path.isfile(autotb), f"The autotb file {autotb} should exist." 
+ + phism_params = get_module_parameters(phism_top, top_func) + self.logger.debug( + f"Parameters parsed from {phism_top}:\n" + "\n".join(phism_params) + ) + autotb_params = get_autotb_parameters(autotb) + self.logger.debug( + f"Parameters parsed from {autotb}:\n" + "\n".join(autotb_params) + ) + + phism_mems = get_memory_interfaces(phism_params) + self.logger.debug( + f"Parsed memory interfaces from {phism_top}:\n" + + "\n".join([str(m) for m in phism_mems]) + ) + autotb_mems = get_memory_interfaces(autotb_params) + self.logger.debug( + f"Parsed memory interfaces from {autotb}:\n" + + "\n".join([str(m) for m in autotb_mems]) + ) + + if not is_cosim_interface_matched(phism_mems, autotb_mems): + print(get_cosim_fix_strategy(top_func, phism_mems, autotb_mems)) + + return self + + def run_cosim(self): + """Run cosim.tcl""" + if not self.options.cosim: + self.logger.debug("cosim is skipped since --cosim has not been set.") + return self + + src_file = self.cur_file + base_dir = os.path.dirname(src_file) + + cosim_vitis_tcl = os.path.join(base_dir, "cosim.tcl") + with open(cosim_vitis_tcl, "w") as f: + f.write(COSIM_VITIS_TCL) + + log_file = os.path.join(base_dir, "cosim.vitis_hls.stdout.log") + + self.run_command( + cmd_list=["vitis_hls", cosim_vitis_tcl], + stdout=open(log_file, "w"), + stderr=open(os.path.join(base_dir, "cosim.vitis_hls.stderr.log"), "w"), env=self.env, ) @@ -996,6 +1352,9 @@ def vitis_opt(self): def run_vitis(self, strategy: Optional[CosimFixStrategy] = None): """Run synthesize/testbench generation/co-simulation.""" + if self.options.skip_vitis: + return self + src_file = self.cur_file base_dir = os.path.dirname(src_file) top_func = get_top_func(src_file) @@ -1028,22 +1387,23 @@ def run_vitis(self, strategy: Optional[CosimFixStrategy] = None): ) ) - # Write the TCL for TBGEN. 
- with open(tbgen_vitis_tcl, "w") as f: - tbgen_run_config = [str(run_config)] - if strategy: - tbgen_run_config.extend(strategy.tbgen_directives) - - f.write( - TBGEN_VITIS_TCL.format( - src_dir=base_dir, - src_base=os.path.basename(src_file).split(".")[0], - top_func=top_func, - work_dir=self.work_dir, - config="\n".join(tbgen_run_config), - pb_dataset=self.options.dataset, - ) - ) + # Keep it for now in case we need C baseline simulation? + # with open(tbgen_vitis_tcl, "w") as f: + # tbgen_run_config = [str(run_config)] + # if strategy: + # tbgen_run_config.extend(strategy.tbgen_directives) + # f.write( + # TBGEN_VITIS_TCL.format( + # src_dir=base_dir, + # src_base=os.path.basename(src_file).split(".")[0], + # top_func=top_func, + # work_dir=self.work_dir, + # config="\n".join(tbgen_run_config), + # pb_dataset=self.options.dataset, + # ) + # ) + + # Write the TCL for COSIM. with open(cosim_vitis_tcl, "w") as f: f.write(COSIM_VITIS_TCL) @@ -1180,6 +1540,9 @@ def pb_flow_runner(options: PbFlowOptions): """Run pb-flow with the provided arguments.""" assert os.path.isdir(options.pb_dir) + if not options.examples: + options.examples = POLYBENCH_EXAMPLES + # Copy all the files from the source pb_dir to a target temporary directory. if not options.work_dir: options.work_dir = os.path.join( @@ -1204,5 +1567,7 @@ def pb_flow_runner(options: PbFlowOptions): end = timer() print("Elapsed time: {:.6f} sec".format(end - start)) - print(">>> Dumping report ... ") - pb_flow_dump_report(options) + # Will only dump report if Vitis has been run. + if not options.skip_vitis: + print(">>> Dumping report ... 
") + pb_flow_dump_report(options) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000000..42596e21844 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,66 @@ +argon2-cffi==21.1.0 +attrs==21.2.0 +backcall==0.2.0 +black==21.8b0 +bleach==4.1.0 +cffi==1.14.6 +click==8.0.1 +debugpy==1.4.3 +decorator==5.1.0 +defusedxml==0.7.1 +entrypoints==0.3 +ipykernel==6.4.1 +ipython==7.27.0 +ipython-genutils==0.2.0 +ipywidgets==7.6.4 +jedi==0.18.0 +Jinja2==3.0.1 +jsonschema==3.2.0 +jupyter==1.0.0 +jupyter-client==7.0.2 +jupyter-console==6.4.0 +jupyter-core==4.7.1 +jupyterlab-pygments==0.1.2 +jupyterlab-widgets==1.0.1 +MarkupSafe==2.0.1 +matplotlib-inline==0.1.3 +mistune==0.8.4 +mypy-extensions==0.4.3 +nbclient==0.5.4 +nbconvert==6.1.0 +nbformat==5.1.3 +nest-asyncio==1.5.1 +notebook==6.4.3 +numpy==1.21.2 +packaging==21.0 +pandas==1.3.3 +pandocfilters==1.4.3 +parso==0.8.2 +pathspec==0.9.0 +pexpect==4.8.0 +pickleshare==0.7.5 +platformdirs==2.3.0 +prometheus-client==0.11.0 +prompt-toolkit==3.0.20 +ptyprocess==0.7.0 +pycparser==2.20 +Pygments==2.10.0 +pyparsing==2.4.7 +pyrsistent==0.18.0 +python-dateutil==2.8.2 +pytz==2021.1 +pyzmq==22.2.1 +qtconsole==5.1.1 +QtPy==1.11.0 +regex==2021.8.28 +Send2Trash==1.8.0 +six==1.16.0 +terminado==0.12.1 +testpath==0.5.0 +tomli==1.2.1 +tornado==6.1 +traitlets==5.1.0 +typing-extensions==3.10.0.2 +wcwidth==0.2.5 +webencodings==0.5.1 +widgetsnbextension==3.5.1 diff --git a/scripts/build-llvm.sh b/scripts/build-llvm.sh index d5b714ecfc2..66a1760b534 100755 --- a/scripts/build-llvm.sh +++ b/scripts/build-llvm.sh @@ -36,26 +36,27 @@ mkdir -p build cd build # Configure CMake -export CC=gcc -export CXX=g++ -cmake ../llvm \ - -DLLVM_ENABLE_PROJECTS="mlir;llvm;clang" \ - -DCMAKE_BUILD_TYPE=RELEASE \ - -DLLVM_BUILD_EXAMPLES=OFF \ - -DLLVM_TARGETS_TO_BUILD="host" \ - -DLLVM_OPTIMIZED_TABLEGEN=ON \ - -DLLVM_ENABLE_OCAMLDOC=OFF \ - -DLLVM_ENABLE_BINDINGS=OFF \ - -DLLVM_INSTALL_UTILS=ON \ - -DLLVM_ENABLE_ASSERTIONS=ON \ - 
-DBUILD_POLYMER=ON \ - -DPLUTO_LIBCLANG_PREFIX="$(llvm-config --prefix)" \ - -G "${CMAKE_GENERATOR}" +if [ ! -f "CMakeCache.txt" ]; then + export CC=gcc + export CXX=g++ + cmake ../llvm \ + -DLLVM_ENABLE_PROJECTS="mlir;llvm;clang" \ + -DCMAKE_BUILD_TYPE=RELEASE \ + -DLLVM_BUILD_EXAMPLES=OFF \ + -DLLVM_TARGETS_TO_BUILD="host" \ + -DLLVM_OPTIMIZED_TABLEGEN=ON \ + -DLLVM_ENABLE_OCAMLDOC=OFF \ + -DLLVM_ENABLE_BINDINGS=OFF \ + -DLLVM_INSTALL_UTILS=ON \ + -DLLVM_ENABLE_ASSERTIONS=ON \ + -DBUILD_POLYMER=ON \ + -DPLUTO_LIBCLANG_PREFIX="$(llvm-config --prefix)" \ + -G "${CMAKE_GENERATOR}" +fi # Run building -cmake --build . --target all -- -j "$(nproc)" - -if [ "${TARGET}" == "ci" ]; then - # Run test - cmake --build . --target check-llvm -- -j "$(nproc)" +if [ "${CMAKE_GENERATOR}" == "Ninja" ]; then + ninja +else + make -j "$(nproc)" fi diff --git a/scripts/pb-flow.py b/scripts/pb-flow.py index 4d2970de534..19a848aee50 100755 --- a/scripts/pb-flow.py +++ b/scripts/pb-flow.py @@ -66,6 +66,10 @@ def main(): parser.add_argument( "--array-partition", action="store_true", help="Use array partition." ) + parser.add_argument("--skip-vitis", action="store_true", help="Don't run Vitis.") + parser.add_argument( + "--skip-csim", action="store_true", help="Don't run tbgen (csim)." 
+ ) args = parser.parse_args() options = pb_utils.PbFlowOptions(**vars(args)) diff --git a/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir b/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir index bcf0db62aad..9adbffe6db4 100644 --- a/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir +++ b/test/llvm/Transforms/VhlsLLVMRewriter/matmul.mlir @@ -1,6 +1,7 @@ // RUN: mlir-opt -lower-affine -convert-scf-to-std -convert-std-to-llvm='use-bare-ptr-memref-call-conv=1' %s | mlir-translate -mlir-to-llvmir | opt -enable-new-pm=0 -load ${PHISM_LIBS_DIR}/VhlsLLVMRewriter.so -mem2arr -instcombine -strip-debug -S | FileCheck %s -// CHECK: define void @matmul([200 x [300 x float]]* %[[A:.*]], [300 x [400 x float]]* %[[B:.*]], [200 x [400 x float]]* %[[C:.*]]) { +// CHECK: noinline +// CHECK: define void @matmul([200 x [300 x float]]* %[[A:.*]], [300 x [400 x float]]* %[[B:.*]], [200 x [400 x float]]* %[[C:.*]]) #[[ATTR:.*]] func @matmul(%A: memref<200x300xf32>, %B: memref<300x400xf32>, %C: memref<200x400xf32>) { affine.for %i = 0 to 200 { affine.for %j = 0 to 400 { @@ -16,22 +17,9 @@ func @matmul(%A: memref<200x300xf32>, %B: memref<300x400xf32>, %C: memref<200x40 return } -// CHECK: %[[VAL14:.*]] = mul i64 %[[I:.*]], 300 -// CHECK: %[[VAL15:.*]] = add i64 %[[VAL14]], %[[K:.*]] -// CHECK: %[[GEP0IDX0:.*]] = udiv i64 %[[VAL15]], 300 -// CHECK: %[[GEP0ADDR1:.*]] = urem i64 %[[VAL15]], 300 -// CHECK: %[[GEP0:.*]] = getelementptr inbounds [200 x [300 x float]], [200 x [300 x float]]* %[[A]], i64 0, i64 %[[GEP0IDX0]], i64 %[[GEP0ADDR1]] -// CHECK: %[[VAL16:.*]] = load float, float* %[[GEP0]], align 4 -// CHECK: %[[VAL17:.*]] = mul i64 %[[K]], 400 -// CHECK: %[[VAL18:.*]] = add i64 %[[VAL17]], %[[J:.*]] -// CHECK: %[[GEP1IDX0:.*]] = udiv i64 %[[VAL18]], 400 -// CHECK: %[[GEP1ADDR1:.*]] = urem i64 %[[VAL18]], 400 -// CHECK: %[[GEP1:.*]] = getelementptr inbounds [300 x [400 x float]], [300 x [400 x float]]* %[[B]], i64 0, i64 %[[GEP1IDX0]], i64 %[[GEP1ADDR1]] -// CHECK: %[[VAL19:.*]] = 
load float, float* %[[GEP1]], align 4 -// CHECK: %[[VAL20:.*]] = fmul float %[[VAL16]], %[[VAL19]] -// CHECK: %[[VAL21:.*]] = mul i64 %[[I]], 400 -// CHECK: %[[VAL22:.*]] = add i64 %[[VAL21]], %[[J]] -// CHECK: %[[GEP2IDX0:.*]] = udiv i64 %[[VAL22]], 400 -// CHECK: %[[GEP2ADDR1:.*]] = urem i64 %[[VAL22]], 400 -// CHECK: %[[GEP2:.*]] = getelementptr inbounds [200 x [400 x float]], [200 x [400 x float]]* %[[C]], i64 0, i64 %[[GEP2IDX0]], i64 %[[GEP2ADDR1]] -// CHECK: store float %[[VAL20]], float* %[[GEP2]], align 4 +// CHECK: %[[i:.*]] = phi i64 +// CHECK: %[[j:.*]] = phi i64 +// CHECK: %[[k:.*]] = phi i64 +// CHECK: getelementptr inbounds [200 x [300 x float]], [200 x [300 x float]]* %[[A]], i64 0, i64 %[[i]], i64 %[[k]] +// CHECK: getelementptr inbounds [300 x [400 x float]], [300 x [400 x float]]* %[[B]], i64 0, i64 %[[k]], i64 %[[j]] +// CHECK: getelementptr inbounds [200 x [400 x float]], [200 x [400 x float]]* %[[C]], i64 0, i64 %[[i]], i64 %[[j]] diff --git a/test/mlir/Transforms/ArrayPartition/no-scop-pe-caller.mlir b/test/mlir/Transforms/ArrayPartition/no-scop-pe-caller.mlir new file mode 100644 index 00000000000..2237f1332f2 --- /dev/null +++ b/test/mlir/Transforms/ArrayPartition/no-scop-pe-caller.mlir @@ -0,0 +1,27 @@ +// RUN: phism-opt -simple-array-partition %s -verify-diagnostics | FileCheck %s + +#map0 = affine_map<()[s0] -> (s0 * 32)> +#map1 = affine_map<()[s0] -> (s0 * 32 + 32)> + +// expected-remark@below {{No top function found}} +module { + +// CHECK-LABEL: @bar +func @bar(%A: memref<64xf32>, %i: index) { + affine.for %j = #map0()[%i] to #map1()[%i] { + %0 = affine.load %A[%j] : memref<64xf32> + %1 = addf %0, %0 : f32 + affine.store %1, %A[%j] : memref<64xf32> + } + return +} + +// CHECK-LABEL: @foo +func @foo(%A: memref<64xf32>) { + affine.for %i = 0 to 2 { + call @bar(%A, %i) : (memref<64xf32>, index) -> () + } + return +} + +} diff --git a/test/mlir/Transforms/ArrayPartition/simple-partition.mlir 
b/test/mlir/Transforms/ArrayPartition/simple-partition.mlir new file mode 100644 index 00000000000..80a7d6903a8 --- /dev/null +++ b/test/mlir/Transforms/ArrayPartition/simple-partition.mlir @@ -0,0 +1,30 @@ +// RUN: phism-opt -simple-array-partition %s | FileCheck %s + +#map0 = affine_map<()[s0] -> (s0 * 32)> +#map1 = affine_map<()[s0] -> (s0 * 32 + 32)> +// CHECK: #[[MAP2:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> + +// CHECK: func @bar(%[[ARG0:.*]]: memref<32xf32>, %[[ARG1:.*]]: index) +func @bar(%A: memref<64xf32>, %i: index) { + // CHECK: affine.for %[[ARG2:.*]] = + affine.for %j = #map0()[%i] to #map1()[%i] { + // CHECK: affine.load %[[ARG0]][%[[ARG2]] mod 32] + %0 = affine.load %A[%j] : memref<64xf32> + %1 = addf %0, %0 : f32 + // CHECK: affine.store %{{.*}}, %[[ARG0]][%[[ARG2]] mod 32] + affine.store %1, %A[%j] : memref<64xf32> + } + return +} + +// CHECK: func @foo(%[[ARG0:.*]]: memref<2x32xf32>) +func @foo(%A: memref<64xf32>) { + // CHECK: affine.for %[[ARG1:.*]] = 0 to 2 + affine.for %i = 0 to 2 { + // CHECK: %[[VAL0:.*]] = memref.subview %[[ARG0]][%[[ARG1]], 0] [1, 32] [1, 1] : memref<2x32xf32> to memref<32xf32, #[[MAP2]]> + // CHECK-NEXT: %[[VAL1:.*]] = memref.cast %[[VAL0]] : memref<32xf32, #[[MAP2]]> to memref<32xf32> + // CHECK-NEXT: call @bar(%[[VAL1]], %[[ARG1]]) {scop.pe} : (memref<32xf32>, index) -> () + call @bar(%A, %i) {scop.pe} : (memref<64xf32>, index) -> () + } + return +} diff --git a/test/mlir/Transforms/LoopTransforms/loop-merge.mlir b/test/mlir/Transforms/LoopTransforms/loop-merge.mlir new file mode 100644 index 00000000000..1a099e15723 --- /dev/null +++ b/test/mlir/Transforms/LoopTransforms/loop-merge.mlir @@ -0,0 +1,27 @@ +// RUN: phism-opt -loop-merge %s | FileCheck %s + +func @S0(%A: memref<32xf32>, %i: index) attributes {scop.stmt} { + %0 = affine.load %A[%i] : memref<32xf32> + %1 = addf %0, %0 : f32 + affine.store %1, %A[%i] : memref<32xf32> + return +} + +func @two_loops(%A: memref<32xf32>) { + affine.for %i = 0 to 16 { + call 
@S0(%A, %i) : (memref<32xf32>, index) -> () + } + affine.for %i = 16 to 32 { + call @S0(%A, %i) : (memref<32xf32>, index) -> () + } + return +} + +// CHECK: func @two_loops +// CHECK: affine.for %[[ARG0:.*]] = 0 to 32 +// CHECK: call @S0(%{{.*}}, %[[ARG0]]) + +func @top(%A : memref<32xf32>) { + call @two_loops(%A) {scop.pe} : (memref<32xf32>) -> () + return +} diff --git a/test/mlir/Transforms/LoopTransforms/redis-scop-stmts.mlir b/test/mlir/Transforms/LoopTransforms/redis-scop-stmts.mlir new file mode 100644 index 00000000000..3ed19d9546a --- /dev/null +++ b/test/mlir/Transforms/LoopTransforms/redis-scop-stmts.mlir @@ -0,0 +1,40 @@ +// RUN: phism-opt %s -redis-scop-stmts | FileCheck %s + +func @S0(%A: memref<32xf32>, %i: index) attributes {scop.stmt} { + %0 = affine.load %A[%i] : memref<32xf32> + %1 = addf %0, %0 : f32 + affine.store %1, %A[%i] : memref<32xf32> + return +} + +func @S1(%A: memref<32xf32>, %i: index) attributes {scop.stmt} { + %0 = affine.load %A[%i] : memref<32xf32> + %1 = mulf %0, %0 : f32 + affine.store %1, %A[%i] : memref<32xf32> + return +} + +func @two_stmts(%A: memref<32xf32>, %B: memref<32xf32>) { + affine.for %i = 0 to 32 { + call @S0(%A, %i) : (memref<32xf32>, index) -> () + call @S1(%B, %i) : (memref<32xf32>, index) -> () + } + return +} + +// CHECK: func @two_stmts__cloned_for__S1(%[[ARG0:.*]]: memref<32xf32>) +// CHECK: affine.for %[[ARG1:.*]] = 0 to 32 +// CHECK: call @S1(%[[ARG0]], %[[ARG1]]) + +// CHECK: func @two_stmts__cloned_for__S0(%[[ARG0:.*]]: memref<32xf32>) +// CHECK: affine.for %[[ARG1:.*]] = 0 to 32 +// CHECK: call @S0(%[[ARG0]], %[[ARG1]]) + +// CHECK: func @top(%[[ARG0:.*]]: memref<32xf32>, %[[ARG1:.*]]: memref<32xf32>) +// CHECK: call @two_stmts__cloned_for__S0(%[[ARG0]]) +// CHECK: call @two_stmts__cloned_for__S1(%[[ARG1]]) + +func @top(%A : memref<32xf32>, %B : memref<32xf32>) { + call @two_stmts(%A, %B) {scop.pe} : (memref<32xf32>, memref<32xf32>) -> () + return +}