From a430f9fbc50d8ad84d9140ff04566bdfd9744e0e Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Thu, 7 Nov 2019 21:36:18 +0000 Subject: [PATCH 01/11] Generalise POLite edge properties --- include/POLite/Graph.h | 19 ------------------- include/POLite/PDevice.h | 2 +- include/POLite/PGraph.h | 23 ++++++++++++++++------- 3 files changed, 17 insertions(+), 27 deletions(-) diff --git a/include/POLite/Graph.h b/include/POLite/Graph.h index b4b2c9cf..4ca9dc44 100644 --- a/include/POLite/Graph.h +++ b/include/POLite/Graph.h @@ -9,7 +9,6 @@ typedef uint32_t NodeId; typedef int32_t PinId; typedef uint32_t NodeLabel; -typedef int32_t EdgeLabel; // TODO: generalise struct Graph { // Incoming and outgoing edges @@ -21,10 +20,6 @@ struct Graph { // Invariant: this sequence always has the same structure as 'outgoing' Seq*>* pins; - // Each edge has a label - // Invariant: this sequence always has the same structure as 'outgoing' - Seq*>* edgeLabels; - // Each node has a label Seq* labels; @@ -34,7 +29,6 @@ struct Graph { incoming = new Seq*> (initialCapacity); outgoing = new Seq*> (initialCapacity); pins = new Seq*> (initialCapacity); - edgeLabels = new Seq*> (initialCapacity); labels = new Seq (initialCapacity); } @@ -44,12 +38,10 @@ struct Graph { delete incoming->elems[i]; delete outgoing->elems[i]; delete pins->elems[i]; - delete edgeLabels->elems[i]; } delete incoming; delete outgoing; delete pins; - delete edgeLabels; delete labels; } @@ -59,7 +51,6 @@ struct Graph { incoming->append(new Seq (initialCapacity)); outgoing->append(new Seq (initialCapacity)); pins->append(new Seq (initialCapacity)); - edgeLabels->append(new Seq (initialCapacity)); labels->append(incoming->numElems - 1); return incoming->numElems - 1; } @@ -74,7 +65,6 @@ struct Graph { void addEdge(NodeId x, NodeId y) { outgoing->elems[x]->append(y); pins->elems[x]->append(0); - edgeLabels->elems[x]->append(0); incoming->elems[y]->append(x); } @@ -82,15 +72,6 @@ struct Graph { void addEdge(NodeId x, PinId p, 
NodeId y) { outgoing->elems[x]->append(y); pins->elems[x]->append(p); - edgeLabels->elems[x]->append(0); - incoming->elems[y]->append(x); - } - - // Add labelled edge using given output pin - void addLabelledEdge(EdgeLabel label, NodeId x, PinId p, NodeId y) { - outgoing->elems[x]->append(y); - pins->elems[x]->append(p); - edgeLabels->elems[x]->append(label); incoming->elems[y]->append(x); } diff --git a/include/POLite/PDevice.h b/include/POLite/PDevice.h index 036fd5d3..f080c7d7 100644 --- a/include/POLite/PDevice.h +++ b/include/POLite/PDevice.h @@ -265,7 +265,7 @@ template * m = (PMessage*) tinselSlot(0); - dev.recv(&m->payload, &m->edge); + dev.recv(&m->payload, &neighbour->edge); // Insert device into a senders array, if not already there if (oldReadyToSend == No && *dev.readyToSend != No) *(sendersTop++) = id; diff --git a/include/POLite/PGraph.h b/include/POLite/PGraph.h index 10a1ef16..d83b9cfc 100644 --- a/include/POLite/PGraph.h +++ b/include/POLite/PGraph.h @@ -77,6 +77,9 @@ template *> edgeLabels; + // Mapping from device id to device state // (Not valid until the mapper is called) PState** devices; @@ -114,6 +117,7 @@ template ); numDevices++; return graph.newNode(); } @@ -121,11 +125,14 @@ template append(edge); } // Add labelled edge using given output pin - void addLabelledEdge(EdgeLabel label, PDeviceId x, PinId pin, PDeviceId y) { - graph.addLabelledEdge(label, x, pin, y); + void addLabelledEdge(E edge, PDeviceId x, PinId pin, PDeviceId y) { + graph.addEdge(x, pin, y); + edgeLabels.elems[x]->append(edge); } // Allocate SRAM and DRAM partitions @@ -286,12 +293,12 @@ template ::value) { - if (i >= graph.edgeLabels->elems[id]->numElems) { + if (i >= edgeLabels.elems[id]->numElems) { printf("Edge weight not specified\n"); exit(EXIT_FAILURE); } - memcpy(&edgeArray[base+offset].edge, - &graph.edgeLabels->elems[id]->elems[i], sizeof(E)); + edgeArray[base+offset].edge = + edgeLabels.elems[id]->elems[i]; } offset++; } @@ -299,8 +306,8 @@ template )); + 
nextEMem = nextEMem + (numPins+1) * + POLITE_MAX_FANOUT * sizeof(PNeighbour); nextNeighbours += (numPins+1); } // At this point, check that next pointers line up with heap sizes @@ -442,6 +449,8 @@ template Date: Thu, 7 Nov 2019 21:38:06 +0000 Subject: [PATCH 02/11] Further refinement of idle release --- rtl/Network.bsv | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/rtl/Network.bsv b/rtl/Network.bsv index 013f46b9..6a06a101 100644 --- a/rtl/Network.bsv +++ b/rtl/Network.bsv @@ -439,22 +439,30 @@ module mkMailboxMesh#( // Detect inter-board activity // --------------------------- + // Latch to improve timing + Reg#(Bool) activityReg <- mkReg(False); + + // Determine when a flit arrives on a link, + // provided that flit is not a stage 1 idle token + function Bool active(BoardLink link); + Flit flit = link.flitOut.value; + IdleToken in = unpack(truncate(flit.payload)); + return (link.flitOut.valid && (flit.isIdleToken ? !in.stage1 : True)); + endfunction + // For barrier release phase rule informIdleDetector; Bool activity = False; - for (Integer i = 0; i < `NumNorthSouthLinks; i=i+1) begin - Flit flit = southLink[i].flitOut.value; - IdleToken in = unpack(truncate(flit.payload)); - activity = activity || (southLink[i].flitOut.valid && - (flit.isIdleToken ? !in.stage1 : True)); - end - for (Integer i = 0; i < `NumEastWestLinks; i=i+1) begin - Flit flit = westLink[i].flitOut.value; - IdleToken in = unpack(truncate(flit.payload)); - activity = activity || (westLink[i].flitOut.valid && - (flit.isIdleToken ? 
!in.stage1 : True)); - end - idle.idle.interBoardActivity(activity); + for (Integer i = 0; i < `NumNorthSouthLinks; i=i+1) + activity = activity || active(southLink[i]); + for (Integer i = 0; i < `NumNorthSouthLinks; i=i+1) + activity = activity || active(northLink[i]); + for (Integer i = 0; i < `NumEastWestLinks; i=i+1) + activity = activity || active(westLink[i]); + for (Integer i = 0; i < `NumEastWestLinks; i=i+1) + activity = activity || active(eastLink[i]); + activityReg <= activity; + idle.idle.interBoardActivity(activityReg); endrule `ifndef SIMULATE From 2c4a9afee54137acb4134038ee45a7ac72a81308 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Fri, 8 Nov 2019 08:14:44 +0000 Subject: [PATCH 03/11] Prevent optimising of writes to mailbox scratchpad Problem: messages are often written to the mailbox scratchpad, a region of memory that is never read by software (only hardware). Previous solution: make the pointer to the message slot volatile. The downside is that ugly "volatile" tags spread into high-level user code. New solution: I believe the simple change in this commit removes the need for volatile. It basically tells the compiler that a call to tinselSend may read the message slot. 
--- include/tinsel.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/tinsel.h b/include/tinsel.h index 18556747..8f0254ca 100644 --- a/include/tinsel.h +++ b/include/tinsel.h @@ -151,7 +151,7 @@ INLINE void tinselSetLen(int n) // Send message at addr to dest INLINE void tinselSend(int dest, volatile void* addr) { - asm volatile("csrrw zero, " CSR_SEND_PTR ", %0" : : "r"(addr)); + asm volatile("csrrw zero, " CSR_SEND_PTR ", %0" : : "r"(addr) : "memory"); asm volatile("csrrw zero, " CSR_SEND ", %0" : : "r"(dest)); } From b3b99b4e32dcbe4f00f284acd26b35587f089f91 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Sat, 9 Nov 2019 20:59:41 +0000 Subject: [PATCH 04/11] Fix bug in boot loader The boot loader has support for writing to data memory, and uses a cache flush to publish these writes before triggering the application. To ensure that the flush has completed, we always issued a load to a predefined address. Problem is, that predefined address (or the cache line that contains it) could be written to by a boot loader on another core leading to a race. I had never observed this bug actually being triggered until implementing the hardware multicast work, which improves efficiency of message delivery to the boot loader. Thanks to @jrbeaumont for helping to track this down. --- README.md | 16 ++++++++------ apps/boot/boot.c | 23 ++++++-------------- apps/linktest/Makefile | 28 ------------------------ apps/linktest/run.cpp | 48 ------------------------------------------ config.py | 2 +- include/boot.h | 5 ----- include/tinsel.h | 12 ++--------- 7 files changed, 19 insertions(+), 115 deletions(-) delete mode 100644 apps/linktest/Makefile delete mode 100644 apps/linktest/run.cpp diff --git a/README.md b/README.md index 66e17540..eafb444a 100644 --- a/README.md +++ b/README.md @@ -359,16 +359,17 @@ A *cache flush* function is provided that evicts all cache lines owned by the calling thread. 
```c -// Full cache flush -// (Issues flush request for every line, and waits until all requests are done) +// Full cache flush (non-blocking) inline void tinselCacheFlush(); -// Flush given cache line -// (Issues flush request, but doesn't wait until request is done) +// Flush given cache line (non-blocking) inline void tinselFlushLine(uint32_t lineNum, uint32_t way) ``` -These functions are implemented using the following CSR. +These functions do not block until the flushed lines have reached +memory (that can be achieved by issuing a subsequent load instruction +and waiting for the response). The flush functions are implemented +using the following CSR. CSR Name | CSR | R/W | Function ------------ | ------ | --- | -------- @@ -1231,9 +1232,12 @@ inline uint32_t tinselCycleCount(); // Write 32-bit word to instruction memory inline void tinselWriteInstr(uint32_t addr, uint32_t word); -// Cache flush +// Full cache flush (non-blocking) inline void tinselCacheFlush(); +// Cache line flush (non-blocking) +inline void tinselFlushLine(uint32_t lineNum, uint32_t way) + // Get pointer to nth message-aligned slot in mailbox scratchpad inline volatile void* tinselSlot(uint32_t n); diff --git a/apps/boot/boot.c b/apps/boot/boot.c index 319c10ee..0c30aac1 100644 --- a/apps/boot/boot.c +++ b/apps/boot/boot.c @@ -24,10 +24,10 @@ int main() if (threadId == 0) { // State uint32_t addrReg = 0; // Address register + uint32_t lastDataStoreAddr = 0; // Get mailbox message slot for send and receive volatile BootReq* msgIn = tinselSlot(0); - volatile BootReq* reqOut; volatile uint32_t* msgOut = tinselSlot(1); // Command loop @@ -55,6 +55,7 @@ int main() for (int i = 0; i < n; i++) { uint32_t* ptr = (uint32_t*) addrReg; *ptr = msgIn->args[i]; + lastDataStoreAddr = addrReg; addrReg += 4; } } @@ -80,6 +81,10 @@ int main() else if (cmd == StartCmd) { // Cache flush tinselCacheFlush(); + // Wait until lines written back, by issuing a load + if (lastDataStoreAddr != 0) { + volatile 
uint32_t* ptr = (uint32_t*) lastDataStoreAddr; ptr[0]; + } // Send response tinselWaitUntil(TINSEL_CAN_SEND); msgOut[0] = tinselId(); @@ -92,22 +97,6 @@ int main() tinselCreateThread(i+1); break; } - else if (cmd == PingCmd) { - // Respond to ping - tinselWaitUntil(TINSEL_CAN_SEND); - reqOut = (volatile BootReq*) msgOut; - if (msgIn->args[0] != 0) { - // If number of hops is one - reqOut->cmd = PingCmd; - reqOut->args[0] = 0; - tinselSend(msgIn->args[1], reqOut); - } - else { - // If number of hops is zero - msgOut[0] = tinselId(); - tinselSend(hostId, msgOut); - } - } } } diff --git a/apps/linktest/Makefile b/apps/linktest/Makefile deleted file mode 100644 index 90eec66b..00000000 --- a/apps/linktest/Makefile +++ /dev/null @@ -1,28 +0,0 @@ -# SPDX-License-Identifier: BSD-2-Clause -TINSEL_ROOT = ../.. - -ifndef QUARTUS_ROOTDIR - $(error Please set QUARTUS_ROOTDIR) -endif - -include $(TINSEL_ROOT)/globals.mk - -# Local compiler flags -CFLAGS = $(RV_CFLAGS) -O2 -I $(INC) -LDFLAGS = -melf32lriscv -G 0 - -.PHONY: all -all: run - -$(HL)/%.o: - make -C $(HL) - -run: run.cpp $(HL)/*.o - g++ -O2 -I $(INC) -I $(HL) -o run run.cpp $(HL)/*.o - -sim: run.cpp $(HL)/sim/*.o - g++ -O2 -I $(INC) -I $(HL) -o sim run.cpp $(HL)/sim/*.o - -.PHONY: clean -clean: - rm -f *.o run sim diff --git a/apps/linktest/run.cpp b/apps/linktest/run.cpp deleted file mode 100644 index 074ae66d..00000000 --- a/apps/linktest/run.cpp +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: BSD-2-Clause -// Simple non-pipelined connectivity test using boot loader -// Check that every core can send a message to every other core - -#include -#include -#include -#include - -int main() -{ - HostLink hostLink; - - // Create ping command - BootReq req; - req.cmd = PingCmd; - req.args[0] = 1; - - // Send a ping between every pair of cores - uint32_t count = 0; - for (int x1 = 0; x1 < TinselMeshXLenWithinBox; x1++) - for (int y1 = 0; y1 < TinselMeshYLenWithinBox; y1++) - for (int i1 = 0; i1 < (1 << 
TinselLogCoresPerBoard); i1++) - for (int x2 = 0; x2 < TinselMeshXLenWithinBox; x2++) - for (int y2 = 0; y2 < TinselMeshYLenWithinBox; y2++) - for (int i2 = 0; i2 < (1 << TinselLogCoresPerBoard); i2++) { - uint32_t dest = hostLink.toAddr(x1, y1, i1, 0); - uint32_t resp[1 << TinselLogWordsPerMsg]; - req.args[1] = hostLink.toAddr(x2, y2, i2, 0); - hostLink.send(dest, 1, &req); - hostLink.recv(resp); - if (resp[0] == req.args[1]) { - printf("."); - //fflush(stdout); - } - else { - printf("\nInvalid result from ping %d -> %d\n", - dest, req.args[1]); - return -1; - } - count++; - if ((count%64) == 0) printf("\n"); - } - - printf("\nPassed\n"); - - return 0; -} diff --git a/config.py b/config.py index bda5372d..d15b995d 100755 --- a/config.py +++ b/config.py @@ -88,7 +88,7 @@ def quoted(s): return "'\"" + s + "\"'" p["LogMailboxesPerBoard"] = p["MailboxMeshXBits"] + p["MailboxMeshYBits"] # Maximum size of boot loader (in bytes) -p["MaxBootImageBytes"] = 576 +p["MaxBootImageBytes"] = 512 # Size of transmit buffer in a reliable link p["LogTransmitBufferSize"] = 10 diff --git a/include/boot.h b/include/boot.h index e3b42649..31dabff4 100644 --- a/include/boot.h +++ b/include/boot.h @@ -41,11 +41,6 @@ typedef enum { // to start. 
StartCmd, - // Ping a thread with an optional thread->thread hop - // Argument 0: number of hops (zero or one) - // Argument 1: next hop address (when number of hops is one) - PingCmd, - } BootCmd; diff --git a/include/tinsel.h b/include/tinsel.h index 8f0254ca..b6972ca0 100644 --- a/include/tinsel.h +++ b/include/tinsel.h @@ -52,27 +52,19 @@ INLINE uint32_t tinselCycleCount() return n; } -// Flush cache line +// Flush cache line (non-blocking) INLINE void tinselFlushLine(uint32_t lineNum, uint32_t way) { uint32_t arg = (lineNum << TinselDCacheLogNumWays) | way; asm volatile("csrrw zero, " CSR_FLUSH ", %0" : : "r"(arg)); } -// Cache flush +// Cache flush (non-blocking) INLINE void tinselCacheFlush() { for (uint32_t i = 0; i < (1< Date: Sat, 9 Nov 2019 22:00:08 +0000 Subject: [PATCH 05/11] Let's not change the boot mem size For backwards compatibility --- config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config.py b/config.py index d15b995d..bda5372d 100755 --- a/config.py +++ b/config.py @@ -88,7 +88,7 @@ def quoted(s): return "'\"" + s + "\"'" p["LogMailboxesPerBoard"] = p["MailboxMeshXBits"] + p["MailboxMeshYBits"] # Maximum size of boot loader (in bytes) -p["MaxBootImageBytes"] = 512 +p["MaxBootImageBytes"] = 576 # Size of transmit buffer in a reliable link p["LogTransmitBufferSize"] = 10 From 2e7a916c9a3ba9b9cb6dc20cfa71a31840b0a268 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Mon, 11 Nov 2019 16:49:18 +0000 Subject: [PATCH 06/11] Specify clock frequency in config.py Various software tools may want to know the clock frequency, and config.py seems like a good place to define it. --- README.md | 1 + config.py | 3 +++ 2 files changed, 4 insertions(+) diff --git a/README.md b/README.md index eafb444a..25e0ac7b 100644 --- a/README.md +++ b/README.md @@ -1137,6 +1137,7 @@ the DE5-Net*. 
`MeshXLenWithinBox` | 3 | Boards in X dimension within box `MeshYLenWithinBox` | 2 | Boards in Y dimension within box `EnablePerfCount` | True | Enable performance counters + `ClockFreq` | 240 | Clock frequency in MHz Further parameters can be found in [config.py](config.py). diff --git a/config.py b/config.py index bda5372d..18b506b7 100755 --- a/config.py +++ b/config.py @@ -171,6 +171,9 @@ def quoted(s): return "'\"" + s + "\"'" # Enable custom accelerators (experimental feature) p["UseCustomAccelerator"] = False +# Clock frequency (in MHz) +p["ClockFreq"] = 240 + #============================================================================== # Derived Parameters #============================================================================== From 8e9fa76671ca34cc9bf23e8bdda4e5575f3d6146 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 12 Nov 2019 07:35:16 +0000 Subject: [PATCH 07/11] Update Makefile --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index dd8bc7da..4276db5e 100644 --- a/Makefile +++ b/Makefile @@ -26,7 +26,6 @@ clean: make -C apps/inter clean make -C apps/benchmarks clean make -C apps/linkrate clean - make -C apps/linktest clean make -C apps/multiprog clean make -C apps/sync clean make -C apps/web clean From 0b804b1fa1661fab9a7ca9bba60ac242def5c678 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Wed, 13 Nov 2019 08:03:16 +0000 Subject: [PATCH 08/11] New POLite app: discrete pressure simulator --- Makefile | 1 + apps/POLite/pressure-sync/Makefile | 7 ++ apps/POLite/pressure-sync/Pressure.cpp | 23 ++++++ apps/POLite/pressure-sync/Pressure.h | 109 +++++++++++++++++++++++++ apps/POLite/pressure-sync/Run.cpp | 95 +++++++++++++++++++++ 5 files changed, 235 insertions(+) create mode 100644 apps/POLite/pressure-sync/Makefile create mode 100644 apps/POLite/pressure-sync/Pressure.cpp create mode 100644 apps/POLite/pressure-sync/Pressure.h create mode 100644 apps/POLite/pressure-sync/Run.cpp diff --git a/Makefile 
b/Makefile index 4276db5e..054fb700 100644 --- a/Makefile +++ b/Makefile @@ -41,5 +41,6 @@ clean: make -C apps/POLite/sssp-async clean make -C apps/POLite/ping-test clean make -C apps/POLite/clocktree-async clean + make -C apps/POLite/pressure-sync clean make -C bin clean make -C tests clean diff --git a/apps/POLite/pressure-sync/Makefile b/apps/POLite/pressure-sync/Makefile new file mode 100644 index 00000000..30114cb9 --- /dev/null +++ b/apps/POLite/pressure-sync/Makefile @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: BSD-2-Clause +APP_CPP = Pressure.cpp +APP_HDR = Pressure.h +RUN_CPP = Run.cpp +RUN_H = + +include ../util/polite.mk diff --git a/apps/POLite/pressure-sync/Pressure.cpp b/apps/POLite/pressure-sync/Pressure.cpp new file mode 100644 index 00000000..5ee05407 --- /dev/null +++ b/apps/POLite/pressure-sync/Pressure.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: BSD-2-Clause +#include "Pressure.h" + +#include +#include + +typedef PThread< + PressureDevice, + PressureState, // State + Dir, // Edge label + PressureMessage // Message + > PressureThread; + +int main() +{ + // Point thread structure at base of thread's heap + PressureThread* thread = (PressureThread*) tinselHeapBaseSRAM(); + + // Invoke interpreter + thread->run(); + + return 0; +} diff --git a/apps/POLite/pressure-sync/Pressure.h b/apps/POLite/pressure-sync/Pressure.h new file mode 100644 index 00000000..4d19aec0 --- /dev/null +++ b/apps/POLite/pressure-sync/Pressure.h @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: BSD-2-Clause + +// Discrete pressure simulator +// (Assumes at most 32 neighbours per device) + +#ifndef _PRESSURE_H_ +#define _PRESSURE_H_ + +#define POLITE_DUMP_STATS +#define POLITE_COUNT_MSGS +#include + +// Number of neighbours per device +// (Assumed to be an even number) +#define NUM_NEIGHBOURS 26 + +// Modes +#define SHARE 0 +#define MOVE 1 + +struct PressureMessage { + // Number of beads at sender + int32_t pressure; + // Move vector: one bit per neighbour + uint32_t move; 
+}; + +struct PressureState { + // Mode: sharing data with neighbours, or migrating beads? + uint8_t mode; + // Number of time steps left in simulation + int32_t numSteps; + // Number of beads at device + int32_t pressure, newPressure; + // One bit per neighbour + uint32_t move; + // State of random number generator + uint32_t prng; +}; + +// Each neighbour has a different direction +typedef uint16_t Dir; + +// Given a direction, return the opposite direction +INLINE Dir opposite(Dir dir) { + Dir newDir = (dir + NUM_NEIGHBOURS/2); + if (newDir >= NUM_NEIGHBOURS) newDir -= NUM_NEIGHBOURS; + return newDir; +} + +// Random number generator +INLINE float rand(uint32_t* seed) { + *seed = (*seed * 1103515245 + 12345) & 0x7fffffff; + return ((float) *seed) / ((float) 0x7fffffff); +} + +struct PressureDevice : PDevice { + + inline void init() { + #ifdef TINSEL + s->prng = tinselId(); + #endif + s->mode = SHARE; + s->newPressure = s->pressure; + *readyToSend = Pin(0); + } + + inline void send(volatile PressureMessage *msg){ + msg->pressure = s->pressure; + msg->move = s->move; + *readyToSend = No; + } + + inline void recv(PressureMessage *msg, Dir* dir) { + if (s->mode == SHARE) { + int32_t diff = s->newPressure - msg->pressure; + if (diff > 0) { + if (rand(&s->prng) < float(diff)/float(NUM_NEIGHBOURS)) { + s->move |= 1 << opposite(*dir); + s->newPressure--; + } + } + } + else if (msg->move & (1 << *dir)) + s->newPressure++; + } + + inline bool step() { + if (s->numSteps > 0) { + s->pressure = s->newPressure; + s->mode = s->mode == SHARE ? 
MOVE : SHARE; + if (s->mode == SHARE) s->move = 0; + *readyToSend = Pin(0); + s->numSteps--; + return true; + } + else { + *readyToSend = No; + return false; + } + } + + inline bool finish(volatile PressureMessage* msg) { + msg->pressure = s->pressure; + return true; + } +}; + +#endif diff --git a/apps/POLite/pressure-sync/Run.cpp b/apps/POLite/pressure-sync/Run.cpp new file mode 100644 index 00000000..0a7b53da --- /dev/null +++ b/apps/POLite/pressure-sync/Run.cpp @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: BSD-2-Clause +// Regression test: on each step, every device sends to its 26 3D neighbours + +#include "Pressure.h" +#include +#include +#include + +// Number of time steps +#define T 1000 + +// Volume dimensions +#define D 40 + +int main() +{ + HostLink hostLink; + PGraph graph; + //graph.mapVerticesToDRAM = true; + + int devs[D][D][D]; + for (int x = 0; x < D; x++) + for (int y = 0; y < D; y++) + for (int z = 0; z < D; z++) + devs[x][y][z] = graph.newDevice(); + + for (int x = 0; x < D; x++) + for (int y = 0; y < D; y++) + for (int z = 0; z < D; z++) { + int label = 0; + for (int i = -1; i < 2; i++) + for (int j = -1; j < 2; j++) + for (int k = -1; k < 2; k++) { + if (! (i == 0 && j == 0 && k == 0)) { + int xd = (x+i) < 0 ? (D-1) : ((x+i) >= D ? 0 : (x+i)); + int yd = (y+j) < 0 ? (D-1) : ((y+j) >= D ? 0 : (y+j)); + int zd = (z+k) < 0 ? (D-1) : ((z+k) >= D ? 
0 : (z+k)); + graph.addLabelledEdge(label, devs[x][y][z], 0, + devs[xd][yd][zd]); + label++; + } + } + } + + // Prepare mapping from graph to hardware + graph.map(); + + // Initialise devices + srand(0); + for (int i = 0; i < D*D*D; i++) { + graph.devices[i]->state.numSteps = T; + graph.devices[i]->state.pressure = rand() % 100; + } + + // Write graph down to tinsel machine via HostLink + graph.write(&hostLink); + + // Load code and trigger execution + hostLink.boot("code.v", "data.v"); + hostLink.go(); + printf("Starting\n"); + + // Start timer + struct timeval start, finish, diff; + gettimeofday(&start, NULL); + + // Consume performance stats + politeSaveStats(&hostLink, "stats.txt"); + + int64_t* pressures = new int64_t [D*D*D]; + PMessage msg; + int64_t total = 0; + for (int i = 0; i < D*D*D; i++) { + hostLink.recvMsg(&msg, sizeof(PMessage)); + if (i == 0) gettimeofday(&finish, NULL); + pressures[i] = msg.payload.pressure; + total += msg.payload.pressure; + } + int64_t average = total/(D*D*D); + printf("Average: %ld\n", average); + + total = 0; + for (int i = 0; i < D*D*D; i++) { + int64_t diff = pressures[i] - average; + total += diff*diff; + } + printf("Standard deviation: %lf\n", sqrt((double) total / (double) (D*D*D))); + + // Display time + timersub(&finish, &start, &diff); + double duration = (double) diff.tv_sec + (double) diff.tv_usec / 1000000.0; + printf("Time = %lf\n", duration); + + return 0; +} From a81a2972cfdaeea8692ad34293824b7474df0b31 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 19 Nov 2019 09:28:16 +0000 Subject: [PATCH 09/11] Allow use of Scotch placer in POLite Thanks to David for this. At this stage, I only incorporated the bare minimum functionality. David's branch contains much more stuff for actually evaluating placement methods. 
--- apps/POLite/util/polite.mk | 2 +- include/POLite/Placer.h | 196 ++++++++++++++++++++++++++++++++----- 2 files changed, 172 insertions(+), 26 deletions(-) diff --git a/apps/POLite/util/polite.mk b/apps/POLite/util/polite.mk index a1d96f83..9d8880c8 100644 --- a/apps/POLite/util/polite.mk +++ b/apps/POLite/util/polite.mk @@ -51,7 +51,7 @@ $(HL)/%.o: $(BUILD)/run: $(RUN_CPP) $(RUN_H) $(HL)/*.o g++ -std=c++11 -O2 -I $(INC) -I $(HL) -o $(BUILD)/run $(RUN_CPP) $(HL)/*.o \ - -lmetis -fno-exceptions + -lmetis -lscotch -fno-exceptions $(BUILD)/sim: $(RUN_CPP) $(RUN_H) $(HL)/sim/*.o g++ -O2 -I $(INC) -I $(HL) -o $(BUILD)/sim $(RUN_CPP) $(HL)/sim/*.o \ diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h index 32aec831..84da5728 100644 --- a/include/POLite/Placer.h +++ b/include/POLite/Placer.h @@ -3,13 +3,24 @@ #define _PLACER_H_ #include +#include #include +#include #include +#include typedef uint32_t PartitionId; // Partition and place a graph on a 2D mesh struct Placer { + // Select between different methods + enum Method { + Default, + Metis, + Scotch + }; + const Method defaultMethod=Metis; + // The graph being placed Graph* graph; @@ -41,8 +52,31 @@ struct Placer { uint32_t* yCoordSaved; uint64_t savedCost; + // Controls which strategy is used + Method method = Default; + + // Select placer method + void chooseMethod() + { + auto e = getenv("POLITE_PLACER"); + if (e) { + if (!strcmp(e, "metis")) + method=Metis; + else if (!strcmp(e, "scotch")) + method=Scotch; + else if (!strcmp(e, "default") || *e == '\0') + method=Default; + else { + fprintf(stderr, "Don't understand placer method : %s\n", e); + exit(EXIT_FAILURE); + } + } + if (method == Default) + method = defaultMethod; + } + // Partition the graph using Metis - void partition() { + void partitionMetis() { // Compute total number of edges uint32_t numEdges = 0; for (uint32_t i = 0; i < graph->incoming->numElems; i++) { @@ -116,6 +150,103 @@ struct Placer { free(parts); } + // Place the graph using 
Scotch + void partitionScotch() { + idx_t nvtxs = (idx_t) graph->incoming->numElems; + idx_t nparts = (idx_t) (width * height); + + SCOTCH_Arch *archptr=(SCOTCH_Arch *)malloc(sizeof(SCOTCH_Arch)); + if (SCOTCH_archInit(archptr)) { + fprintf(stderr, "Couldn't init scotch arch\n"); + exit(EXIT_FAILURE); + } + + if (SCOTCH_archMesh2(archptr, width, height)) { + fprintf(stderr, "Couldn't create 2d mesh in scotch\n"); + exit(EXIT_FAILURE); + } + + std::vector verttab; + std::vector edgetab; + + for (uint32_t i = 0; i < nvtxs; i++) { + verttab.push_back(edgetab.size()); + + const Seq* in = graph->incoming->elems[i]; + for (uint32_t j = 0; j < in->numElems; j++) { + if (in->elems[j] != i) + edgetab.push_back(in->elems[j]); + } + + const Seq* out = graph->outgoing->elems[i]; + for (uint32_t j = 0; j < out->numElems; j++) { + if (out->elems[j] != i) + edgetab.push_back(out->elems[j]); + } + } + verttab.push_back(edgetab.size()); + + SCOTCH_Graph *grafptr = (SCOTCH_Graph *) malloc(sizeof(SCOTCH_Graph)); + + if (SCOTCH_graphBuild(grafptr, + 0, // baseval - where do array indices start + graph->incoming->numElems, // vertnbr + &verttab[0], + &verttab[1], // vendtab, means it is a compact edge array + 0, // velotab, Integer load per vertex. Not used here. + 0, // vlbltab, vertex label tab (?) 
+ edgetab.size(), // edgenbr, + &edgetab[0], + 0 // edlotab, load on each arc + )) { + fprintf(stderr, "Scotch didn't want to build a graph.\n"); + exit(EXIT_FAILURE); + } + + if (SCOTCH_graphCheck (grafptr)) { + fprintf(stderr, "Scotch does not like the graph we built.\n"); + exit(EXIT_FAILURE); + } + + SCOTCH_Strat *stratptr=(SCOTCH_Strat *)malloc(sizeof(SCOTCH_Strat)); + if (SCOTCH_stratInit(stratptr)) { + fprintf(stderr, "Scotch won't make a strategy.\n"); + exit(EXIT_FAILURE); + } + + std::vector parttab(nvtxs); + + if (SCOTCH_graphMap (grafptr, archptr, stratptr, &parttab[0])) { + fprintf(stderr, "Scotch couldn't map the graph.\n"); + exit(EXIT_FAILURE); + } + + // Populate result array + for (uint32_t i = 0; i < graph->incoming->numElems; i++){ + partitions[i] = (uint32_t) parttab[i]; + } + + SCOTCH_archExit(archptr); + free(archptr); + SCOTCH_graphExit(grafptr); + free(grafptr); + SCOTCH_stratInit(stratptr); + free(stratptr); + } + + void partition() + { + switch(method){ + case Default: + case Metis: + partitionMetis(); + break; + case Scotch: + partitionScotch(); + break; + } + } + // Create subgraph for each partition void computeSubgraphs() { uint32_t numPartitions = width*height; @@ -262,31 +393,44 @@ struct Placer { // Very simple local search algorithm for placement // Repeatedly swap a mesh node with it's neighbour if it lowers cost void place(uint32_t numAttempts) { - // Initialise best cost - savedCost = ~0; - - for (uint32_t n = 0; n < numAttempts; n++) { - randomPlacement(); - currentCost = cost(); - - bool change; - do { - change = false; - // Loop over mesh - for (uint32_t y = 0; y < height-1; y++) { - for (uint32_t x = 0; x < width-1; x++) { - change = trySwap(x, y, x+1, y) || - trySwap(x, y, x, y+1) || - trySwap(x, y, x+1, y+1) || - change; - } + if (method == Scotch) { + // Use Scotch's placement + for (uint32_t y = 0; y < height; y++) { + for (uint32_t x = 0; x < width; x++) { + unsigned p = y*width+x; + mapping[y][x] = p; + xCoord[p] = x; 
+ yCoord[p] = y; } - } while (change); - - if (currentCost <= savedCost) - save(); - else - restore(); + } + } + else { + // Initialise best cost + savedCost = ~0; + + for (uint32_t n = 0; n < numAttempts; n++) { + randomPlacement(); + currentCost = cost(); + + bool change; + do { + change = false; + // Loop over mesh + for (uint32_t y = 0; y < height-1; y++) { + for (uint32_t x = 0; x < width-1; x++) { + change = trySwap(x, y, x+1, y) || + trySwap(x, y, x, y+1) || + trySwap(x, y, x+1, y+1) || + change; + } + } + } while (change); + + if (currentCost <= savedCost) + save(); + else + restore(); + } } } @@ -322,6 +466,8 @@ struct Placer { computeSubgraphs(); // Count connections between each pair of partitions computeInterPartitionCounts(); + // Pick a placement method, or select default + chooseMethod(); } // Deconstructor From 4a57c0de3ef09428d6245de9074d5144e8aee8d0 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Tue, 19 Nov 2019 10:12:30 +0000 Subject: [PATCH 10/11] Avoid duplicate arcs in Scotch placer --- include/POLite/Placer.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/POLite/Placer.h b/include/POLite/Placer.h index 84da5728..7ecb0897 100644 --- a/include/POLite/Placer.h +++ b/include/POLite/Placer.h @@ -172,12 +172,6 @@ struct Placer { for (uint32_t i = 0; i < nvtxs; i++) { verttab.push_back(edgetab.size()); - const Seq* in = graph->incoming->elems[i]; - for (uint32_t j = 0; j < in->numElems; j++) { - if (in->elems[j] != i) - edgetab.push_back(in->elems[j]); - } - const Seq* out = graph->outgoing->elems[i]; for (uint32_t j = 0; j < out->numElems; j++) { if (out->elems[j] != i) @@ -190,7 +184,7 @@ struct Placer { if (SCOTCH_graphBuild(grafptr, 0, // baseval - where do array indices start - graph->incoming->numElems, // vertnbr + nvtxs, // vertnbr &verttab[0], &verttab[1], // vendtab, means it is a compact edge array 0, // velotab, Integer load per vertex. Not used here. 
@@ -460,14 +454,14 @@ struct Placer { yCoord = new uint32_t [width*height]; xCoordSaved = new uint32_t [width*height]; yCoordSaved = new uint32_t [width*height]; + // Pick a placement method, or select default + chooseMethod(); // Partition the graph using Metis partition(); // Compute subgraphs, one per partition computeSubgraphs(); // Count connections between each pair of partitions computeInterPartitionCounts(); - // Pick a placement method, or select default - chooseMethod(); } // Deconstructor From 9224ac4e577dc907f67cbfad343143e61ba94b15 Mon Sep 17 00:00:00 2001 From: Matthew Naylor Date: Sun, 24 Nov 2019 17:25:41 +0000 Subject: [PATCH 11/11] Update checkelf.sh --- bin/checkelf.sh | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/checkelf.sh b/bin/checkelf.sh index dc0775f4..9cf9fb6c 100755 --- a/bin/checkelf.sh +++ b/bin/checkelf.sh @@ -10,11 +10,12 @@ fi DUMP=$(riscv64-unknown-elf-objdump -d $1) # Errors -ES="\secall\s|ebreak\s" -ES="$ES|\scsrrs\s|csrrc\s|\scsrrwi\s|\scsrrsi\s|\scsrrci\s" -ES="$ES|\s[^f]div\s|divu\s|\srem\s|\sremu\s" -ES="$ES|\sfsqrt\s|fmin\s|fmax\s|\sfclassify\s" -ES="$ES|\sfmadd\s|fmsub\s|fnmadd\s|\sfnmsub\s" +T="[\.\s]" +ES="\secall$T|ebreak$T" +ES="$ES|\scsrrs$T|\scsrrc$T|\scsrrwi$T|\scsrrsi$T|\scsrrci$T" +ES="$ES|\s[^f]div$T|divu$T|\srem$T|\sremu$T" +ES="$ES|\sfsqrt$T|fmin$T|fmax$T|\sfclassify$T" +ES="$ES|\sfmadd$T|fmsub$T|fnmadd$T|\sfnmsub$T" if echo "$DUMP" | grep -q -E "$ES"; then echo "ERROR: $1 uses unsupported instructions:"