diff --git a/configs/accl/archived/sega_detailed.py b/configs/accl/archived/sega_detailed.py new file mode 100644 index 0000000000..795089579a --- /dev/null +++ b/configs/accl/archived/sega_detailed.py @@ -0,0 +1,280 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from math import log2

from m5.objects import *


def interleave_addresses(plain_range, num_channels, cache_line_size):
    """Split *plain_range* into *num_channels* cache-line-interleaved ranges.

    The interleave-select bits sit directly above the cache-line offset
    bits.  Returns the list of interleaved ``AddrRange`` objects together
    with the highest interleave bit position (used below as the HBM
    pseudo-channel select bit).
    """
    # Bit positions must be integers.  cache_line_size and num_channels
    # are expected to be powers of two, so log2 is exact here; int() keeps
    # the values from being passed to AddrRange as floats.
    intlv_low_bit = int(log2(cache_line_size))
    intlv_bits = int(log2(num_channels))
    ret = []
    for i in range(num_channels):
        ret.append(
            AddrRange(
                start=plain_range.start,
                size=plain_range.size(),
                intlvHighBit=intlv_low_bit + intlv_bits - 1,
                xorHighBit=0,
                intlvBits=intlv_bits,
                intlvMatch=i,
            )
        )
    return ret, intlv_low_bit + intlv_bits - 1


class GPT(SubSystem):
    """One graph-processing tile.

    Bundles a worklist engine, a coalesce engine, and a push engine
    (wrapped in an MPU) with a private HBM vertex memory.  Edge memory is
    attached externally through setEdgeMemPort.
    """

    def __init__(self, register_file_size: int, cache_size: str):
        super().__init__()
        self.wl_engine = WLEngine(
            update_queue_size=64,
            register_file_size=register_file_size,
            examine_window=8,
            rd_per_cycle=4,
            reduce_per_cycle=32,
            wr_per_cycle=4,
        )
        self.coalesce_engine = CoalesceEngine(
            attached_memory_atom_size=32,
            cache_size=cache_size,
            max_resp_per_cycle=8,
            pending_pull_limit=64,
            active_buffer_size=80,
            post_push_wb_queue_size=64,
            transitions_per_cycle=4,
        )
        self.push_engine = PushEngine(
            push_req_queue_size=32,
            attached_memory_atom_size=64,
            resp_queue_size=1024,
            examine_window=12,
            max_propagates_per_cycle=8,
            update_queue_size=64,
        )

        # Vertex data lives in a two-pseudo-channel HBM stack private to
        # this GPT.
        self.vertex_mem_ctrl = HBMCtrl(
            dram=HBM_2000_4H_1x64(),
            dram_2=HBM_2000_4H_1x64(),
        )
        self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port

        self.mpu = MPU(
            wl_engine=self.wl_engine,
            coalesce_engine=self.coalesce_engine,
            push_engine=self.push_engine,
        )

    def getRespPort(self):
        return self.wl_engine.in_ports

    def setRespPort(self, port):
        self.wl_engine.in_ports = port

    def getReqPort(self):
        return self.push_engine.out_ports

    def setReqPort(self, port):
        self.push_engine.out_ports = port

    def getEdgeMemPort(self):
        return self.push_engine.mem_port

    def setEdgeMemPort(self, port):
        self.push_engine.mem_port = port

    def set_vertex_range(self, vertex_ranges):
        # One range per HBM pseudo channel.
        self.vertex_mem_ctrl.dram.range = vertex_ranges[0]
        self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1]

    def set_vertex_pch_bit(self, pch_bit):
        # Address bit that selects between the two pseudo channels.
        self.vertex_mem_ctrl.pch_bit = pch_bit


class EdgeMemory(SubSystem):
    """DDR4-backed edge-list storage, shared by a pair of GPTs."""

    def __init__(self, size: str):
        super().__init__()
        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "2.4GHz"
        self.clk_domain.voltage_domain = VoltageDomain()

        self.mem_ctrl = MemCtrl(
            dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False)
        )
        self.xbar = NoncoherentXBar(
            width=64, frontend_latency=1, forward_latency=1, response_latency=1
        )
        self.xbar.mem_side_ports = self.mem_ctrl.port

    def set_image(self, image):
        self.mem_ctrl.dram.image_file = image

    def getPort(self):
        return self.xbar.cpu_side_ports

    def setPort(self, port):
        self.xbar.cpu_side_ports = port


class SEGAController(SubSystem):
    """Central controller plus the mirror and mirror-map backing memories."""

    def __init__(self, mirror_bw):
        super().__init__()
        self.map_mem = SimpleMemory(
            latency="0ns",
            latency_var="0ns",
            bandwidth="1024GiB/s",
            range=AddrRange(start=0, size="4GiB"),
            in_addr_map=False,
        )
        self.controller = CenteralController(
            choose_best=False,
            mirrors_mem=SimpleMemory(
                latency="0ns",
                latency_var="0ns",
                bandwidth=mirror_bw,
                range=AddrRange(start=0, size="16GiB"),
                in_addr_map=False,
            ),
        )
        self.controller.mem_port = self.controller.mirrors_mem.port
        self.controller.mirrors_map_mem = self.map_mem.port

    def set_choose_best(self, choose_best):
        self.controller.choose_best = choose_best

    def set_vertices_image(self, vertices):
        self.controller.vertex_image_file = vertices

    def set_aux_images(self, mirrors, mirrors_map):
        self.controller.mirrors_mem.image_file = mirrors
        self.map_mem.image_file = mirrors_map

    def set_mpu_vector(self, mpu_vector):
        self.controller.mpu_vector = mpu_vector


class SEGA(System):
    """Full SEGA system: one controller, num_gpts GPTs clocked at 2GHz,
    and num_gpts/2 edge memories (each shared by a pair of GPTs)."""

    def __init__(self, num_gpts, num_registers, cache_size, graph_path):
        super().__init__()
        # GPT pairs share an edge memory and the vertex space is
        # interleaved over 2 * num_gpts pseudo channels, so num_gpts must
        # be a non-zero, even power of two.
        assert num_gpts != 0
        assert num_gpts % 2 == 0
        assert (num_gpts & (num_gpts - 1)) == 0

        self._num_gpts = num_gpts

        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "2GHz"
        self.clk_domain.voltage_domain = VoltageDomain()
        self.cache_line_size = 32
        self.mem_mode = "timing"

        self.ctrl = SEGAController("256GiB/s")
        self.ctrl.set_vertices_image(f"{graph_path}/vertices")

        edge_mem = []
        for i in range(num_gpts // 2):
            mem = EdgeMemory("4GiB")
            mem.set_image(f"{graph_path}/edgelist_{i}")
            edge_mem.append(mem)
        self.edge_mem = edge_mem
        # Building the GPTs.  Each GPT owns two of the 2 * num_gpts
        # interleaved vertex ranges (one per HBM pseudo channel).
        vertex_ranges, pch_bit = interleave_addresses(
            AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32
        )
        gpts = []
        for i in range(num_gpts):
            gpt = GPT(num_registers, cache_size)
            gpt.set_vertex_range(
                [vertex_ranges[i], vertex_ranges[i + num_gpts]]
            )
            gpt.set_vertex_pch_bit(pch_bit)
            gpt.setEdgeMemPort(self.edge_mem[i % (num_gpts // 2)].getPort())
            gpts.append(gpt)
        # All-to-all interconnect among the MPUs (including self-loops).
        for gpt_0 in gpts:
            for gpt_1 in gpts:
                gpt_0.setReqPort(gpt_1.getRespPort())
        self.gpts = gpts

        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])

    def work_count(self):
        return self.ctrl.controller.workCount()

    def set_async_mode(self):
        self.ctrl.controller.setAsyncMode()

    def set_bsp_mode(self):
        self.ctrl.controller.setBSPMode()

    def set_pg_mode(self):
        self.ctrl.controller.setPGMode()

    def set_aux_images(self, mirrors, mirrors_map):
        self.ctrl.set_aux_images(mirrors, mirrors_map)

    def set_choose_best(self, choose_best):
        self.ctrl.set_choose_best(choose_best)

    def create_pop_count_directory(self, atoms_per_block):
        self.ctrl.controller.createPopCountDirectory(atoms_per_block)

    def create_bfs_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSWorkload(init_addr, init_value)

    def create_bfs_visited_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)

    def create_sssp_workload(self, init_addr, init_value):
        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)

    def create_cc_workload(self):
        self.ctrl.controller.createCCWorkload()

    def create_async_pr_workload(self, alpha, threshold):
        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)

    def create_pr_workload(self, num_nodes, alpha):
        self.ctrl.controller.createPRWorkload(num_nodes, alpha)

    def get_pr_error(self):
        return self.ctrl.controller.getPRError()

    def create_bc_workload(self, init_addr, init_value):
        self.ctrl.controller.createBCWorkload(init_addr, init_value)

    def print_answer(self):
        self.ctrl.controller.printAnswerToHostSimout()

    def get_num_gpts(self):
        return self._num_gpts
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from math import log2

from m5.objects import *


def interleave_addresses(plain_range, num_channels, cache_line_size):
    """Split *plain_range* into *num_channels* cache-line-interleaved ranges.

    The interleave-select bits sit directly above the cache-line offset
    bits.  Returns the list of interleaved ``AddrRange`` objects.
    """
    # Bit positions must be integers.  cache_line_size and num_channels
    # are expected to be powers of two, so log2 is exact here; int() keeps
    # the values from being passed to AddrRange as floats.
    intlv_low_bit = int(log2(cache_line_size))
    intlv_bits = int(log2(num_channels))
    ret = []
    for i in range(num_channels):
        ret.append(
            AddrRange(
                start=plain_range.start,
                size=plain_range.size(),
                intlvHighBit=intlv_low_bit + intlv_bits - 1,
                xorHighBit=0,
                intlvBits=intlv_bits,
                intlvMatch=i,
            )
        )
    return ret


class GPT(SubSystem):
    """One graph-processing tile with a simple (fixed latency/bandwidth)
    vertex memory instead of detailed HBM."""

    def __init__(self, register_file_size: int, cache_size: str):
        super().__init__()
        self.wl_engine = WLEngine(
            update_queue_size=64,
            register_file_size=register_file_size,
            examine_window=8,
            rd_per_cycle=4,
            reduce_per_cycle=32,
            wr_per_cycle=4,
        )
        self.coalesce_engine = CoalesceEngine(
            attached_memory_atom_size=32,
            cache_size=cache_size,
            max_resp_per_cycle=8,
            pending_pull_limit=64,
            active_buffer_size=80,
            post_push_wb_queue_size=64,
            transitions_per_cycle=4,
        )
        self.push_engine = PushEngine(
            push_req_queue_size=32,
            attached_memory_atom_size=64,
            resp_queue_size=1024,
            examine_window=12,
            max_propagates_per_cycle=8,
            update_queue_size=64,
        )

        # Abstract vertex memory model for this GPT.
        self.vertex_mem_ctrl = SimpleMemory(
            latency="120ns", bandwidth="32GiB/s"
        )
        self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port

        self.mpu = MPU(
            wl_engine=self.wl_engine,
            coalesce_engine=self.coalesce_engine,
            push_engine=self.push_engine,
        )

    def getRespPort(self):
        return self.wl_engine.in_ports

    def setRespPort(self, port):
        self.wl_engine.in_ports = port

    def getReqPort(self):
        return self.push_engine.out_ports

    def setReqPort(self, port):
        self.push_engine.out_ports = port

    def getEdgeMemPort(self):
        return self.push_engine.mem_port

    def setEdgeMemPort(self, port):
        self.push_engine.mem_port = port

    def set_vertex_range(self, vertex_range):
        self.vertex_mem_ctrl.range = vertex_range


class EdgeMemory(SubSystem):
    """Abstract edge-list storage, shared by a pair of GPTs."""

    def __init__(self, size: str):
        super().__init__()
        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "9.6GHz"
        self.clk_domain.voltage_domain = VoltageDomain()

        self.mem_ctrl = SimpleMemory(
            latency="90ns",
            bandwidth="76.8GiB/s",
            range=AddrRange(size),
            in_addr_map=False,
        )
        self.xbar = NoncoherentXBar(
            width=64, frontend_latency=1, forward_latency=1, response_latency=1
        )
        self.xbar.mem_side_ports = self.mem_ctrl.port

    def set_image(self, image):
        self.mem_ctrl.image_file = image

    def getPort(self):
        return self.xbar.cpu_side_ports

    def setPort(self, port):
        self.xbar.cpu_side_ports = port


class SEGAController(SubSystem):
    """Central controller plus the mirror and mirror-map backing memories."""

    def __init__(self, mirror_bw):
        super().__init__()
        self.map_mem = SimpleMemory(
            latency="0ns",
            latency_var="0ns",
            bandwidth="1024GiB/s",
            range=AddrRange(start=0, size="4GiB"),
            in_addr_map=False,
        )
        self.controller = CenteralController(
            choose_best=False,
            mirrors_mem=SimpleMemory(
                latency="0ns",
                latency_var="0ns",
                bandwidth=mirror_bw,
                range=AddrRange(start=0, size="32GiB"),
                in_addr_map=False,
            ),
        )
        self.controller.mem_port = self.controller.mirrors_mem.port
        self.controller.mirrors_map_mem = self.map_mem.port

    def set_choose_best(self, choose_best):
        self.controller.choose_best = choose_best

    def set_vertices_image(self, vertices):
        self.controller.vertex_image_file = vertices

    def set_aux_images(self, mirrors, mirrors_map):
        self.controller.mirrors_mem.image_file = mirrors
        self.map_mem.image_file = mirrors_map

    def set_mpu_vector(self, mpu_vector):
        self.controller.mpu_vector = mpu_vector


class SEGA(System):
    """Full SEGA system built from the simple-memory GPT variant."""

    def __init__(self, num_gpts, num_registers, cache_size, graph_path):
        super().__init__()
        # GPT pairs share an edge memory, so num_gpts must be a non-zero,
        # even power of two.
        assert num_gpts != 0
        assert num_gpts % 2 == 0
        assert (num_gpts & (num_gpts - 1)) == 0

        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "2GHz"
        self.clk_domain.voltage_domain = VoltageDomain()
        self.cache_line_size = 32
        self.mem_mode = "timing"

        self.ctrl = SEGAController("256GiB/s")
        self.ctrl.set_vertices_image(f"{graph_path}/vertices")

        edge_mem = []
        for i in range(num_gpts // 2):
            mem = EdgeMemory("16GiB")
            mem.set_image(f"{graph_path}/edgelist_{i}")
            edge_mem.append(mem)
        self.edge_mem = edge_mem
        # Building the GPTs; one interleaved vertex range per GPT.
        vertex_ranges = interleave_addresses(
            AddrRange(start=0, size="4GiB"), num_gpts, 32
        )
        gpts = []
        for i in range(num_gpts):
            gpt = GPT(num_registers, cache_size)
            gpt.set_vertex_range(vertex_ranges[i])
            gpt.setEdgeMemPort(self.edge_mem[i % (num_gpts // 2)].getPort())
            gpts.append(gpt)
        # All-to-all interconnect among the MPUs (including self-loops).
        for gpt_0 in gpts:
            for gpt_1 in gpts:
                gpt_0.setReqPort(gpt_1.getRespPort())
        self.gpts = gpts

        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])

    def work_count(self):
        return self.ctrl.controller.workCount()

    def set_async_mode(self):
        self.ctrl.controller.setAsyncMode()

    def set_bsp_mode(self):
        self.ctrl.controller.setBSPMode()

    def set_pg_mode(self):
        self.ctrl.controller.setPGMode()

    def set_aux_images(self, mirrors, mirrors_map):
        self.ctrl.set_aux_images(mirrors, mirrors_map)

    def set_choose_best(self, choose_best):
        self.ctrl.set_choose_best(choose_best)

    def create_pop_count_directory(self, atoms_per_block):
        self.ctrl.controller.createPopCountDirectory(atoms_per_block)

    def create_bfs_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSWorkload(init_addr, init_value)

    def create_bfs_visited_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)

    def create_sssp_workload(self, init_addr, init_value):
        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)

    def create_cc_workload(self):
        self.ctrl.controller.createCCWorkload()

    def create_async_pr_workload(self, alpha, threshold):
        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)

    def create_pr_workload(self, num_nodes, alpha):
        self.ctrl.controller.createPRWorkload(num_nodes, alpha)

    def get_pr_error(self):
        return self.ctrl.controller.getPRError()

    def create_bc_workload(self, init_addr, init_value):
        self.ctrl.controller.createBCWorkload(init_addr, init_value)

    def print_answer(self):
        self.ctrl.controller.printAnswerToHostSimout()
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + sample, + verify, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_async_pr_workload(alpha, threshold) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/bc.py b/configs/accl/bc.py new file mode 100644 index 0000000000..c100068aa2 --- /dev/null +++ b/configs/accl/bc.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_bc_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iterations = 0 + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iterations += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iterations}") + if verify: + system.print_answer() diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..6c33c93f59 --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 


import m5
import argparse

from m5.objects import *
from sega import SEGA


def get_inputs():
    """Parse the command line for the BFS run.

    Returns (num_gpts, cache_size, graph, init_addr, init_value, tile,
    best, visited, sample, verify).
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("num_gpts", type=int)
    argparser.add_argument("cache_size", type=str)
    argparser.add_argument("graph", type=str)
    argparser.add_argument("init_addr", type=int)
    argparser.add_argument("init_value", type=int)
    # store_true is equivalent to store_const/const=True/default=False.
    argparser.add_argument(
        "--tile",
        action="store_true",
        help="Whether to use temporal partitioning",
    )
    argparser.add_argument(
        "--best",
        action="store_true",
        help="Whether to use best update value for switching slices",
    )
    argparser.add_argument(
        "--visited",
        action="store_true",
        help="Use visitation version of BFS",
    )
    argparser.add_argument(
        "--sample",
        action="store_true",
        help="Sample sim stats every 100us",
    )
    argparser.add_argument(
        "--verify",
        action="store_true",
        help="Print final answer",
    )

    args = argparser.parse_args()

    return (
        args.num_gpts,
        args.cache_size,
        args.graph,
        args.init_addr,
        args.init_value,
        args.tile,
        args.best,
        args.visited,
        args.sample,
        args.verify,
    )


if __name__ == "__m5_main__":
    (
        num_gpts,
        cache_size,
        graph,
        init_addr,
        init_value,
        tile,
        best,
        visited,
        sample,
        verify,
    ) = get_inputs()

    system = SEGA(num_gpts, cache_size, graph)
    if tile:
        system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map")

    if best:
        system.set_choose_best(True)

    root = Root(full_system=False, system=system)

    m5.instantiate()

    if tile:
        system.set_pg_mode()
    else:
        system.set_async_mode()

    system.create_pop_count_directory(32)
    if visited:
        system.create_bfs_visited_workload(init_addr, init_value)
    else:
        system.create_bfs_workload(init_addr, init_value)
    if sample:
        # NOTE(review): 5e7 ticks is 50us, while the --sample help text
        # says 100us -- confirm which one is intended.
        while True:
            exit_event = m5.simulate(50000000)
            print(
                f"Exited simulation at tick {m5.curTick()} "
                + f"because {exit_event.getCause()}"
            )
            if exit_event.getCause() == "simulate() limit reached":
                m5.stats.dump()
                m5.stats.reset()
            elif exit_event.getCause() == "Done with all the slices.":
                break
            elif exit_event.getCause() == "no update left to process.":
                break
    else:
        while True:
            exit_event = m5.simulate()
            print(
                f"Exited simulation at tick {m5.curTick()} "
                + f"because {exit_event.getCause()}"
            )
            if exit_event.getCause() == "Done with all the slices.":
                break
            if exit_event.getCause() == "no update left to process.":
                break
    if verify:
        system.print_answer()
# IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


import m5
import argparse

from m5.objects import *
from sega import SEGA


def get_inputs():
    """Parse the command line for the connected-components run.

    Returns (num_gpts, cache_size, graph, sample, verify).
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("num_gpts", type=int)
    argparser.add_argument("cache_size", type=str)
    argparser.add_argument("graph", type=str)
    # store_true is equivalent to store_const/const=True/default=False.
    argparser.add_argument(
        "--sample",
        action="store_true",
        help="Sample sim stats every 100us",
    )
    argparser.add_argument(
        "--verify",
        action="store_true",
        help="Print final answer",
    )

    args = argparser.parse_args()

    return (
        args.num_gpts,
        args.cache_size,
        args.graph,
        args.sample,
        args.verify,
    )


if __name__ == "__m5_main__":
    (
        num_gpts,
        cache_size,
        graph,
        sample,
        verify,
    ) = get_inputs()

    system = SEGA(num_gpts, cache_size, graph)
    root = Root(full_system=False, system=system)

    m5.instantiate()

    system.set_async_mode()
    system.create_pop_count_directory(64)
    system.create_cc_workload()
    if sample:
        # Dump/reset stats every 100us (1e8 ticks) until the workload
        # finishes for a reason other than the simulate() quantum.
        while True:
            exit_event = m5.simulate(100000000)
            print(
                f"Exited simulation at tick {m5.curTick()} "
                + f"because {exit_event.getCause()}"
            )
            m5.stats.dump()
            m5.stats.reset()
            if exit_event.getCause() != "simulate() limit reached":
                break
    else:
        exit_event = m5.simulate()
        print(
            f"Exited simulation at tick {m5.curTick()} "
            + f"because {exit_event.getCause()}"
        )
    if verify:
        system.print_answer()
+ + +import m5 +import argparse + +from m5.objects import * +from disagg_sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument("fend", type=int) + argparser.add_argument("mem_ctrl_lat", type=str) + + argparser.add_argument( + "--tile", + dest="tile", + action="store_const", + const=True, + default=False, + help="Whether to use temporal partitioning", + ) + argparser.add_argument( + "--best", + dest="best", + action="store_const", + const=True, + default=False, + help="Whether to use best update value for switching slices", + ) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + + + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.tile, + args.best, + args.visited, + args.sample, + args.verify, + args.fend, + args.mem_ctrl_lat, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + tile, + best, + visited, + sample, + verify, + fend, + mem_ctrl_lat, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph,fend, mem_ctrl_lat) + if tile: + system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map") + + if best: + system.set_choose_best(True) + + root = Root(full_system=False, system=system) + + m5.instantiate() + + if tile: + 
system.set_pg_mode() + else: + system.set_async_mode() + + system.create_pop_count_directory(32) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(743598075) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "simulate() limit reached": + m5.stats.dump() + m5.stats.reset() + elif exit_event.getCause() == "Done with all the slices.": + break + elif exit_event.getCause() == "no update left to process.": + break + else: + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "Done with all the slices.": + break + if exit_event.getCause() == "no update left to process.": + break + if verify: + system.print_answer() diff --git a/configs/accl/disagg_sega.py b/configs/accl/disagg_sega.py new file mode 100644 index 0000000000..f3b4c44c31 --- /dev/null +++ b/configs/accl/disagg_sega.py @@ -0,0 +1,272 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + base_addr=4294967296, + ) + + 
self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="28GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, num_channels: str, fend_latency: int, mem_ctrl_lat: int): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + print("fend_latency is: ", fend_latency) + print("mem_ctrl_lat is: ", mem_ctrl_lat) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=fend_latency, forward_latency=1, response_latency=1 + ) + + addr_ranges = interleave_addresses(AddrRange(start="4GiB", size="64GiB"), num_channels, 64) # in_addr_map was False + self.mem_ctrls = [MemCtrl(dram=DDR4_2400_8x8(range=addr_ranges[i], in_addr_map=True), static_frontend_latency=mem_ctrl_lat, port=self.xbar.mem_side_ports) for i in range(num_channels)] + [print(f"{self.mem_ctrls[i]} range is: {addr_ranges[i]}") for i in range(num_channels)] + def get_abs_mems(self): + return [ctrl.dram for ctrl in self.mem_ctrls] + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + 
range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start="0GiB", size="16GiB"),# was 16 GiB + in_addr_map=False, + ), + edge_base = 0, + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_edges_image(self, edges): + self.controller.edge_image_file = edges + + def set_abs_mems(self, abs_mems): + self.controller.abstract_mem_vector = abs_mems + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + cache_size, + graph_path, + fend_latency, + mem_ctrl_lat, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.edge_mem = EdgeMemory(4, fend_latency, mem_ctrl_lat) + + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + self.ctrl.set_edges_image(f"{graph_path}/edgelist_0") + self.ctrl.set_abs_mems(self.edge_mem.get_abs_mems()) + num_registers = 128 + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 # was 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + 
gpt.setEdgeMemPort(self.edge_mem.getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..723f122908 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.num_nodes, + args.error_threshold, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + iterations, + alpha, + num_nodes, + error_threshold, + sample, + verify, + ) = get_inputs() + + print(f"error_threshold: {error_threshold}") + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(num_nodes, alpha) + iteration = 0 + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + print(f"error: {system.get_pr_error()}") + if 
system.get_pr_error() < error_threshold: + break + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..bd3ffe567f --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,266 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="256GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + 
super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="16GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + cache_size, + graph_path, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + 
self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + num_registers = 128 + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def 
create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py new file mode 100644 index 0000000000..e23ebfb365 --- /dev/null +++ b/configs/accl/sssp.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != 
"simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..ebfca7e794 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/BaseReduceEngine.py b/src/accl/graph/base/BaseReduceEngine.py new file mode 100644 index 0000000000..0585c36e48 --- /dev/null +++ b/src/accl/graph/base/BaseReduceEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReduceEngine(ClockedObject): + abstract = True + type = 'BaseReduceEngine' + cxx_header = "accl/graph/base/base_reduce_engine.hh" + cxx_class = 'gem5::BaseReduceEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript new file mode 100644 index 0000000000..35111c34d2 --- /dev/null +++ b/src/accl/graph/base/SConscript @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) + +Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..ade95800d2 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..268bb60b76 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ + +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + protected: + + const RequestorID _requestorId; + + public: + PARAMS(BaseReduceEngine); + BaseReduceEngine(const Params ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh new file mode 100644 index 0000000000..f1a26f6ac2 --- /dev/null +++ b/src/accl/graph/base/data_structs.hh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ + +#include "base/cprintf.hh" +#include "base/intmath.hh" + +#include +#include +#include +#include + +namespace gem5 +{ + +struct __attribute__ ((packed)) WorkListItem +{ + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; + + std::string to_string() + { + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? "true" : "false"); + } + + WorkListItem(): + tempProp(0), + prop(0), + edgeIndex(0), + degree(0), + activeNow(false), + activeFuture(false) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) + {} + +}; + +struct __attribute__ ((packed)) Edge +{ + uint16_t weight : 16; + uint64_t neighbor : 48; + + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } + + Edge(): weight(0), neighbor(0) {} + + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; + +struct __attribute__ ((packed)) MirrorVertex +{ + uint32_t vertexId : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeNext: 1; + + std::string to_string() + { + return csprintf("MirrorVertex{vertexId: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeNext: %s}", + vertexId, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeNext ? 
"true" : "false"); + } + MirrorVertex(): + vertexId(-1), + prop(-1), + edgeIndex(-1), + degree(-1), + activeNow(false), + activeNext(false) + {} + + MirrorVertex(uint32_t vertex_id, uint32_t prop, uint32_t degree, + uint32_t edge_index, bool active_now, bool active_next): + vertexId(vertex_id), prop(prop), edgeIndex(edge_index), + degree(degree), activeNow(active_now), activeNext(active_next) + {} + +}; + +static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); +static_assert(isPowerOf2(sizeof(MirrorVertex))); + +struct MetaEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + uint32_t value; + + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) + {} + + std::string to_string() + { + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); + } +}; + +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + +template +class UniqueFIFO +{ + private: + int cap; + int pop; + + int* added; + int* deleted; + std::deque container; + + public: + UniqueFIFO() { + cap = 0; + pop = 0; + added = nullptr; + deleted = nullptr; + } + + UniqueFIFO(int size) { + cap = size; + pop = 0; + + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + ~UniqueFIFO() { + delete [] added; + delete [] deleted; + } + + void fix_front() { + while(true) { + T elem = container.front(); + if (deleted[elem] > 0) { + deleted[elem]--; + added[elem]--; + container.pop_front(); + } else { + assert(deleted[elem] == 0); + 
assert(added[elem] == 1); + break; + } + } + } + + T front() { + fix_front(); + return container.front(); + } + + size_t size() { + return pop; + } + + void clear() { + pop = 0; + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + bool empty() { + return size() == 0; + } + + bool find(T item) { + assert(added[item] >= 0); + assert(deleted[item] >= 0); + int diff = added[item] - deleted[item]; + assert((diff == 0) || (diff == 1)); + return (diff == 1); + } + + void push_back(T item) { + if (!find(item)) { + added[item]++; + pop++; + container.push_back(item); + } + } + + void pop_front() { + T elem = front(); + assert(added[elem] == 1); + added[elem] = 0; + pop--; + container.pop_front(); + } + + void erase(T item) { + assert(find(item)); + deleted[item]++; + pop--; + } + + void operator=(const UniqueFIFO& rhs) { + cap = rhs.cap; + pop = rhs.pop; + container = rhs.container; + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + std::memcpy(added, rhs.added, cap * sizeof(int)); + std::memcpy(deleted, rhs.deleted, cap * sizeof(int)); + } +}; + +} + +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..fd802cf275 --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/base/graph_workload.hh" + +#include + +#include "base/cprintf.hh" +#include "base/intmath.hh" + +namespace gem5 +{ + +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + +void +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); +} + +uint32_t +BFSWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + +uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; +} + +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float 
weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + +void +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +BSPPRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + return readFromFloat(alpha * value_float); +} + +bool +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (old_wl.degree > 0); +} + +uint32_t +BSPPRWorkload::apply(WorkListItem& wl) +{ + float prop_float = writeToFloat(wl.prop); + float delta = prop_float / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + return delta_uint; +} + +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat((1 - alpha) / numNodes); + wl.activeFuture = (wl.degree > 0); +} + +std::string +BSPPRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, 
prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. + prop &= countMask; + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = currentDepth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777125." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performane metrics could be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. Here we reset the depth section of ret. + ret &= countMask; + // NOTE: Now that the depth is securely reset we can copy the correct value. 
+ ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth) && (new_wl.degree > 0); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..481cfc146f --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" +#include "mem/packet.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool betterThan(uint32_t lhs, uint32_t rhs) { return true; } + virtual void iterate() = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool betterThan(uint32_t lhs, uint32_t rhs) override { return lhs < rhs; } + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {} + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BFSVisitedWorkload : public BFSWorkload +{ + public: + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): 
BFSVisitedWorkload(0, 0) {} + virtual void init(PacketPtr pkt, WorkDirectory* dir); +}; + +class SSSPWorkload : public BFSWorkload +{ + public: + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BSPPRWorkload : public GraphWorkload +{ + private: + int numNodes; + float alpha; + float prevError; + float error; + + public: + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} + + ~BSPPRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { prevError = error; error = 0; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return prevError; } +}; + +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): 
+ initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..0cdd11d251 --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+from m5.params import *
+from m5.proxy import *
+from m5.util.pybind import PyBindMethod
+from m5.objects.AbstractMemory import AbstractMemory
+from m5.objects.BaseMemoryEngine import BaseMemoryEngine
+
+
+class CenteralController(BaseMemoryEngine):
+    type = "CenteralController"
+    cxx_header = "accl/graph/sega/centeral_controller.hh"
+    cxx_class = "gem5::CenteralController"
+
+    mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.")
+
+    choose_best = Param.Bool(
+        "Whether to prefer the best update "
+        "value for choosing the next slice"
+    )
+
+    vertex_image_file = Param.String("Path to the vertex image file.")
+
+    mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.")
+
+    mpu_vector = VectorParam.MPU("All mpus in the system.")
+
+    edge_image_file = Param.String("Path to the edge image file.")
+
+    abstract_mem_vector = VectorParam.AbstractMemory(
+        "Abstract Memories to be initialized by edge_image_file."
+    )
+    abstract_mem_atom_size = Param.Int(
+        64, "burst size of the abstract memories."
+    )
+
+    edge_base = Param.UInt64("Addr of base address range")
+
+
+
+    cxx_exports = [
+        PyBindMethod("setAsyncMode"),
+        PyBindMethod("setBSPMode"),
+        PyBindMethod("setPGMode"),
+        PyBindMethod("createPopCountDirectory"),
+        PyBindMethod("createBFSWorkload"),
+        PyBindMethod("createBFSVisitedWorkload"),
+        PyBindMethod("createSSSPWorkload"),
+        PyBindMethod("createCCWorkload"),
+        PyBindMethod("createAsyncPRWorkload"),
+        PyBindMethod("createPRWorkload"),
+        PyBindMethod("createBCWorkload"),
+        PyBindMethod("workCount"),
+        PyBindMethod("getPRError"),
+        PyBindMethod("printAnswerToHostSimout"),
+    ]
diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py
new file mode 100644
index 0000000000..bb45802c1d
--- /dev/null
+++ b/src/accl/graph/sega/CoalesceEngine.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2017 Jason Lowe-Power
+# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + transitions_per_cycle = Param.Int("Max number of transitions in a cycle") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..8d2453b01c --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") + diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..50e240808e --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class PushEngine(BaseMemoryEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + push_req_queue_size = Param.Int("Size of the queue to " + "queue push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory.") + + examine_window = Param.Int("Number of edges at the front of the edge queue" + " to examine in order to propagate.") + + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") + + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") + + base_addr = Param.UInt64("Addr of base address range") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..b3e1a838fb --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) + +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") +Source("enums.cc") +Source("mpu.cc") +Source("push_engine.cc") +Source("wl_engine.cc") + +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") +DebugFlag("WLEngine") + +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..f9ea4488df --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+from m5.params import *
+from m5.proxy import *
+from m5.objects.BaseReduceEngine import BaseReduceEngine
+
+class WLEngine(BaseReduceEngine):
+    type = 'WLEngine'
+    cxx_header = "accl/graph/sega/wl_engine.hh"
+    cxx_class = 'gem5::WLEngine'
+
+    in_ports = VectorResponsePort("Incoming Ports to receive updates from "
+                                "remote outside")
+
+    update_queue_size = Param.Int("Size of the queue WLEngine stores "
+                                "the incoming updates")
+
+    register_file_size = Param.Int("Number of internal registers the "
+                                "WLEngine has. It can service as "
+                                "many updates as this queue has "
+                                "entries at the same time.")
+
+    examine_window = Param.Int("Number of updates at the front of update "
+                                "queue examined for reading.")
+    rd_per_cycle = Param.Int("Maximum number of reads per cycle.")
+    reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.")
+    wr_per_cycle = Param.Int("Maximum number of writes per cycle.")
+
+
diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc
new file mode 100644
index 0000000000..9f704f71e9
--- /dev/null
+++ b/src/accl/graph/sega/base_memory_engine.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..31e7d85bef --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ + +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseMemoryEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseMemoryEngine : public ClockedObject +{ + protected: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + int _prevState; + + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), + _pending(false), _prevState(0) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + void setPrevState(int state) { _prevState = state; } + int getPrevState() { return _prevState; } + }; + + class MemPort : public RequestPort + { + private: + BaseMemoryEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseMemoryEngine* owner): + RequestPort(name), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + const RequestorID _requestorId; + + MemPort memPort; + AddrRange peerMemoryRange; + size_t peerMemoryAtomSize; + + virtual void recvMemRetry() = 0; + virtual bool handleMemResp(PacketPtr pkt) = 0; + + PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + + public: + PARAMS(BaseMemoryEngine); + + BaseMemoryEngine(const Params ¶ms); + ~BaseMemoryEngine(); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } + + virtual void recvFunctional(PacketPtr pkt) = 0; + + virtual void 
init() override; +}; + +} + +#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr new file mode 100644 index 0000000000..316fcd37d9 --- /dev/null +++ b/src/accl/graph/sega/busyMaskErr @@ -0,0 +1,16 @@ +gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0 + +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. 
+32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. +assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..a2970a9013 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/centeral_controller.hh" + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/cprintf.hh" +#include "base/loader/memory_image.hh" +#include "base/loader/object_file.hh" +#include "debug/CenteralController.hh" +#include "mem/abstract_mem.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +using memory::AbstractMemory; + +CenteralController::CenteralController(const Params& params): + BaseMemoryEngine(params), + mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), + mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), + chooseBest(params.choose_best), + edgeBase(params.edge_base), + nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()), + stats(*this) +{ + uint64_t total_cache_size = 0; + for (auto mpu : params.mpu_vector) { + mpuVector.push_back(mpu); + mpu->registerCenteralController(this); + total_cache_size += mpu->getCacheSize(); + } + verticesPerSlice = std::floor(total_cache_size / sizeof(WorkListItem)); +} + +Port& +CenteralController::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "mirrors_map_mem") { + return mapPort; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort("mem_port", idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + +void +CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSVisitedWorkload(init_addr, init_value); +} + +void +CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new SSSPWorkload(init_addr, init_value); +} + +void +CenteralController::createCCWorkload() +{ + workload = new CCWorkload(); +} + +void +CenteralController::createAsyncPRWorkload(float alpha, float threshold) +{ + workload = new 
PRWorkload(alpha, threshold); +} + +void +CenteralController::createPRWorkload(int num_nodes, float alpha) +{ + workload = new BSPPRWorkload(num_nodes, alpha); +} + +void +CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BSPBCWorkload(init_addr, init_value); +} + +void +CenteralController::createPopCountDirectory(int atoms_per_block) +{ + fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " + "mode by calling either setAsyncMode or setBSPMode."); + if (mode == ProcessingMode::ASYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createBSPPopCountDirectory(atoms_per_block); + } + } + if (mode == ProcessingMode::POLY_GRAPH) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } +} + +void +CenteralController::startup() +{ + DPRINTF(CenteralController, "Startup 1!\n"); + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + for (auto mpu: mpuVector) { + for (auto range: mpu->getAddrRanges()) { + mpuAddrMap.insert(range, mpu); + } + mpu->setProcessingMode(mode); + mpu->recvWorkload(workload); + } + DPRINTF(CenteralController, "Startup 2!\n"); + + const auto& vertex_file = params().vertex_image_file; + if (vertex_file == "") + return; + DPRINTF(CenteralController, "Startup 3!\n"); + + auto* object = loader::createObjectFile(vertex_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), vertex_file); + + loader::debugSymbolTable.insert(*object->symtab().globals()); + loader::MemoryImage vertex_image = object->buildImage(); + maxVertexAddr = vertex_image.maxAddr(); + + int num_total_vertices = (maxVertexAddr / sizeof(WorkListItem)); + numTotalSlices = std::ceil((double) num_total_vertices / verticesPerSlice); + DPRINTF(CenteralController, "Startup 4!\n"); + + numPendingUpdates = new int [numTotalSlices]; + 
bestPendingUpdate = new uint32_t [numTotalSlices]; + for (int i = 0; i < numTotalSlices; i++) { + numPendingUpdates[i] = 0; + bestPendingUpdate[i] = -1; + } + DPRINTF(CenteralController, "Startup 5!\n"); + + PortProxy vertex_proxy( + [this](PacketPtr pkt) { + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); + }, vertex_atom); + + panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); + + for (auto mpu: mpuVector) { + mpu->postMemInitSetup(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } + workload->iterate(); + DPRINTF(CenteralController, "Startup 6!\n"); + DPRINTF(CenteralController, "params().edge_image_file = %s\n", params().edge_image_file); + const auto& edge_file = params().edge_image_file; + DPRINTF(CenteralController, "edge_file = %s\n", edge_file); + + // if (edge_file == "") {} // commented this out + // return; + + DPRINTF(CenteralController, "Startup 7!\n"); + + AddrRangeMap abs_mem_range_map; + for (auto abs_mem: params().abstract_mem_vector) { + abs_mem_range_map.insert(abs_mem->getAddrRange(), abs_mem); + } + // DPRINTF(CenteralController, "%s, Edge memory ranges: %s", __func__, abs_mem_range_map); + auto* edge_object = loader::createObjectFile(edge_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), edge_file); + + loader::debugSymbolTable.insert(*edge_object->symtab().globals()); + loader::MemoryImage edge_image = edge_object->buildImage(); + DPRINTF(CenteralController, "Startup 8!\n"); + + PortProxy edge_proxy( + [abs_mem_range_map, this](PacketPtr pkt) { + pkt->setAddr(pkt->getAddr() + mpuVector[0]->getBaseAddr()); + auto routing_entry = abs_mem_range_map.contains(pkt->getAddr()); + routing_entry->second->functionalAccess(pkt); + }, params().abstract_mem_atom_size); + + DPRINTF(CenteralController, "%s, mpuVector[0]->getBaseAddr(): %lu", __func__, mpuVector[0]->getBaseAddr()); + + panic_if(!edge_image.write(edge_proxy), "%s: 
Unable to write image."); +} + +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "is blocked.\n", __func__, _id, pkt->print()); + blockedPacket = pkt; + } else { + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "sent.\n", __func__, _id, pkt->print()); + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp should not be called at all"); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic("recvReqRetry should not be called at all"); +} + +void +CenteralController::recvDoneSignal() +{ + bool done = true; + for (auto mpu : mpuVector) { + done &= mpu->done(); + } + + if (done && mode == ProcessingMode::ASYNCHRONOUS) { + exitSimLoopNow("no update left to process."); + } + + if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->postConsumeProcess(); + mpu->swapDirectories(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } + workload->iterate(); + exitSimLoopNow("finished an iteration."); + } + + if (done && mode == ProcessingMode::POLY_GRAPH) { + DPRINTF(CenteralController, "%s: Received done signal.\n", __func__); + exitSimLoopNow("Finished processing a slice."); + if (!nextSliceSwitchEvent.scheduled()) { + schedule(nextSliceSwitchEvent, nextCycle()); + } + } +} + +int +CenteralController::chooseNextSlice() +{ + int ret_slice_id = -1; + int max_pending_count = 0; + // TODO: Make this generalizable for all workloads. 
+ uint32_t best_update = -1; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_pending_count) { + max_pending_count = numPendingUpdates[i]; + } + if (numPendingUpdates[i] > 0 && + workload->betterThan(bestPendingUpdate[i], best_update)) { + best_update = bestPendingUpdate[i]; + } + } + if (chooseBest) { + int max_count = 0; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_count && + bestPendingUpdate[i] == best_update) { + max_count = numPendingUpdates[i]; + ret_slice_id = i; + } + } + } else { + uint32_t best_value = -1; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] == max_pending_count && + workload->betterThan(bestPendingUpdate[i], best_value)) { + best_value = bestPendingUpdate[i]; + ret_slice_id = i; + } + } + } + return ret_slice_id; +} + +void +CenteralController::processNextSliceSwitchEvent() +{ + int vertex_atom = mpuVector.front()->vertexAtomSize(); + int vertices_per_atom = (int) vertex_atom / sizeof(WorkListItem); + int bytes_accessed = 0; + int updates_generated_total = 0; + for (int dst_id = 0; dst_id < numTotalSlices; dst_id++) { + if (dst_id == currentSliceId) { + continue; + } + int updates_generated = 0; + Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t); + Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, + currentSliceId, dst_id, start_addr, end_addr); + + uint64_t num_bytes = end_addr - start_addr; + uint64_t num_mirrors = (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; + + 
PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + + WorkListItem vertices [vertices_per_atom]; + for (int i = 0; i < num_mirrors; i++) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (int) (org_addr - aligned_org_addr) / sizeof(WorkListItem); + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + auto routing_entry = mpuAddrMap.contains(aligned_org_addr); + routing_entry->second->recvFunctional(read_org); + read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); + delete read_org; + if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { + assert(vertices[wl_offset].degree == 0); + vertices[wl_offset].prop = vertices[wl_offset].tempProp; + } + if (mirrors[i].prop != vertices[wl_offset].prop) { + mirrors[i].prop = vertices[wl_offset].prop; + if (!mirrors[i].activeNow) { + mirrors[i].activeNow = true; + numPendingUpdates[dst_id]++; + totalUpdatesLeft++; + updates_generated++; + } + bestPendingUpdate[dst_id] = + workload->betterThan(mirrors[i].prop, bestPendingUpdate[dst_id]); + } + } + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + delete [] mirrors; + DPRINTF(CenteralController, "%s: Done scattering updates from slice " + "%d to slice %d.\n", __func__, currentSliceId, dst_id); + DPRINTF(CenteralController, "%s: Generated %d updates from slice " + "%d to slice %d.\n", __func__, + updates_generated, currentSliceId, dst_id); + updates_generated_total += updates_generated; + bytes_accessed += 2 * num_bytes; + } + DPRINTF(CenteralController, "%s: Done with slice %d.\n", __func__, currentSliceId); + DPRINTF(CenteralController, "%s: Generated a total of %d updates.\n", + __func__, updates_generated_total); + 
DPRINTF(CenteralController, "%s: There are a total of %d " + "updates left.\n", __func__, totalUpdatesLeft); + if (totalUpdatesLeft > 0) { + currentSliceId = chooseNextSlice(); + } else { + exitSimLoopNow("Done with all the slices."); + return; + } + DPRINTF(CenteralController, "%s: Chose %d as the " + "next slice.\n", __func__, currentSliceId); + + for (int src_id = 0; src_id < numTotalSlices; src_id++) { + if (src_id == currentSliceId) { + continue; + } + Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t); + Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, + src_id, currentSliceId, start_addr, end_addr); + + uint64_t num_bytes = end_addr - start_addr; + uint64_t num_mirrors = (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; + + PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + for (int i = 0; i < num_mirrors; i++) { + if (mirrors[i].activeNow) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + auto routing_entry = mpuAddrMap.contains(org_addr); + routing_entry->second->recvMirrorPush(org_addr, mirrors[i].prop, + mirrors[i].edgeIndex, mirrors[i].degree); + mirrors[i].activeNow = false; + numPendingUpdates[currentSliceId]--; + totalUpdatesLeft--; + } + } + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + delete [] mirrors; + 
DPRINTF(CenteralController, "%s: Done gathering updates from slice " + "%d to slice %d.\n", __func__, src_id, currentSliceId); + bytes_accessed += num_bytes; + } + + double mirror_mem_bw = mirrorsMem->getBW(); + Tick time_to_switch = bytes_accessed * mirror_mem_bw; + stats.switchTicks += time_to_switch; + stats.switchedBytes += bytes_accessed; + stats.numSwitches++; + for (auto mpu: mpuVector) { + mpu->startProcessingMirrors(time_to_switch); + } + exitSimLoopNow("Done with slice switch."); +} + +bool +CenteralController::handleMemResp(PacketPtr pkt) +{ + panic("handleMemResp should not be called at all"); +} + +void +CenteralController::recvMemRetry() +{ + panic("recvMemRetry should not be called at all"); +} + +void +CenteralController::recvFunctional(PacketPtr pkt) +{ + panic("recvFunctional should not be called at all"); +} + +int +CenteralController::workCount() +{ + int work_count = 0; + for (auto mpu: mpuVector) { + work_count += mpu->workCount(); + } + return work_count; +} + +float +CenteralController::getPRError() +{ + BSPPRWorkload* pr_workload = dynamic_cast(workload); + return pr_workload->getError(); +} + +void +CenteralController::printAnswerToHostSimout() +{ + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + int num_items = vertex_atom / sizeof(WorkListItem); + WorkListItem items[num_items]; + for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) + { + PacketPtr pkt = createReadPacket(addr, vertex_atom); + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); + pkt->writeDataToBlock((uint8_t*) items, vertex_atom); + for (int i = 0; i < num_items; i++) { + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, + workload->printWorkListItem(items[i])); + + std::cout << print << std::endl; + } + delete pkt; + } +} + +CenteralController::ControllerStats::ControllerStats(CenteralController& _ctrl): + statistics::Group(&_ctrl), ctrl(_ctrl), + ADD_STAT(numSwitches, 
statistics::units::Byte::get(), + "Number of slices switches completed."), + ADD_STAT(switchedBytes, statistics::units::Byte::get(), + "Number of bytes accessed during slice switching."), + ADD_STAT(switchTicks, statistics::units::Tick::get(), + "Number of ticks spent switching slices."), + ADD_STAT(switchSeconds, statistics::units::Second::get(), + "Traversed Edges Per Second.") +{ +} + +void +CenteralController::ControllerStats::regStats() +{ + using namespace statistics; + + switchSeconds = switchTicks / simFreq; +} + +} diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh new file mode 100644 index 0000000000..74f8124380 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.hh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/mpu.hh" +#include "base/addr_range.hh" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "mem/simple_mem.hh" +#include "params/CenteralController.hh" + +namespace gem5 +{ + +class CenteralController : public BaseMemoryEngine +{ + private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, CenteralController* owner, PortID id): + RequestPort(name), owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort mapPort; + Addr maxVertexAddr; + ProcessingMode mode; + + memory::SimpleMemory* mirrorsMem; + + // AddrRangeMap abs_mem_range_map; // moved here from .cc file + Addr edgeBase; + std::vector mpuVector; + AddrRangeMap mpuAddrMap; + + int currentSliceId; + int numTotalSlices; + int verticesPerSlice; + int totalUpdatesLeft; + + bool chooseBest; + int* numPendingUpdates; + 
uint32_t* bestPendingUpdate; + int chooseNextSlice(); + + EventFunctionWrapper nextSliceSwitchEvent; + void processNextSliceSwitchEvent(); + + struct ControllerStats : public statistics::Group + { + ControllerStats(CenteralController& ctrl); + + void regStats() override; + + CenteralController& ctrl; + + statistics::Scalar numSwitches; + statistics::Scalar switchedBytes; + statistics::Scalar switchTicks; + statistics::Formula switchSeconds; + }; + ControllerStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + GraphWorkload* workload; + + PARAMS(CenteralController); + CenteralController(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + + virtual void startup() override; + + virtual void recvFunctional(PacketPtr pkt) override; + + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } + void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + void setPGMode() { mode = ProcessingMode::POLY_GRAPH; } + + void createPopCountDirectory(int atoms_per_block); + + void createBFSWorkload(Addr init_addr, uint32_t init_value); + void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value); + void createSSSPWorkload(Addr init_addr, uint32_t init_value); + void createCCWorkload(); + void createAsyncPRWorkload(float alpha, float threshold); + void createPRWorkload(int num_nodes, float alpha); + void createBCWorkload(Addr init_addr, uint32_t init_value); + + void recvDoneSignal(); + + int workCount(); + float getPRError(); + void printAnswerToHostSimout(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc new file mode 100644 index 0000000000..afb0695206 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -0,0 +1,1327 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + numReceivedPulls(0), numScheduledPulls(0), pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + transitionsPerCycle(params.transitions_per_cycle), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + numActiveBlocksNow = UniqueFIFO(numLines); + numActiveBlocksNext = UniqueFIFO(numLines); + + activeBuffer.clear(); + postPushWBQueue.clear(); + blocksTouchedThisTick.clear(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + + +// NOTE: Used for initializing memory and reading the final answer +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + 
int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].state == CacheState::IDLE); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + graphWorkload->init(pkt, currentDirectory); + if (pkt->getAddr() > lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } + memPort.sendFunctional(pkt); + } +} + +void +CoalesceEngine::postMemInitSetup() +{ + currentDirectory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::postConsumeProcess() +{ + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + numActiveBlocksNext.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { 
+ numActiveBlocksNext.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(numActiveBlocksNow.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + numActiveBlocksNow.clear(); + numActiveBlocksNow = numActiveBlocksNext; + numActiveBlocksNext.clear(); +} + +bool 
+CoalesceEngine::done() +{ + return memAccBuffer.empty() && numActiveBlocksNow.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + numScheduledPulls) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = enoughSpace(); + bool schedule_limit = numScheduledPulls < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + if (lastReadTick < curTick()) { + blocksTouchedThisTick.clear(); + lastReadTick = curTick(); + } + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. 
" + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. 
For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + blocksTouchedThisTick.insert(block_index); + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + blocksTouchedThisTick.insert(block_index); + + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + stats.readMisses++; + if (blocksTouchedThisTick.find(block_index) != blocksTouchedThisTick.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has already been " + "accessed this tick.\n", __func__, block_index); + return ReadReturnStatus::REJECT_ROLL; + } + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if 
(cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is dirty.\n", + __func__, block_index); + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memAccBuffer.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is now " + "pending write back.\n", __func__, block_index); + } else { + // NOTE: The cache block could still be active but + // not dirty. If active we only have to active tracking + // but can throw the data away. + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not dirty.\n", + __func__, block_index); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n", + __func__, block_index); + numActiveBlocksNow.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + } + if (atom_active_future) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n", + __func__, block_index); + numActiveBlocksNext.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + // NOTE: Bring the cache line to invalid state. 
+ // NOTE: Above line where we set hasConflict to true + // does not matter anymore since we reset the cache line. + cacheBlocks[block_index].reset(); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is reset.\n", + __func__, block_index); + } + blocksTouchedThisTick.insert(block_index); + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + blocksTouchedThisTick.insert(block_index); + stats.numConflicts++; + return ReadReturnStatus::REJECT_ROLL; + } + } else { + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memAccBuffer.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + blocksTouchedThisTick.insert(block_index); + return ReadReturnStatus::ACCEPT; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. 
+ // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. 
Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + numActiveBlocksNow.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + numActiveBlocksNext.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, -1, curTick()); + 
if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + numScheduledPulls++; + } + } + delete purpose; + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. 
+ if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + + stats.vertexActivations += active ? 
1 : 0; + if (mode == ProcessingMode::ASYNCHRONOUS || mode == ProcessingMode::POLY_GRAPH) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!numActiveBlocksNow.find(block_index))) { + numActiveBlocksNow.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!numActiveBlocksNext.find(block_index))) { + numActiveBlocksNext.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memAccBuffer.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + numActiveBlocksNow.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + 
stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + } + if (atom_active_future) { + numActiveBlocksNext.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + int num_transitions = 0; + std::unordered_set transitions; + MemoryFunctionDeque temp_deque; + temp_deque.clear(); + + while (true) { + if (memPort.blocked()) { + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function function; + int input; + Tick tick; + std::tie(function, input, tick) = memAccBuffer.front(); + if ((transitions.find(input) == transitions.end()) || (input == -1)) { + function(input, tick); + memAccBuffer.pop_front(); + transitions.insert(input); + stats.memAccBufferLat.sample((curTick() - tick) * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memAccBuffer. 
" + "memAccBuffer.size = %d.\n", __func__, memAccBuffer.size()); + num_transitions++; + } else { + temp_deque.emplace_back(function, input, tick); + memAccBuffer.pop_front(); + } + if ((num_transitions >= transitionsPerCycle) || memAccBuffer.empty()) { + break; + } + } + + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memAccBuffer.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + numActiveBlocksNext.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + numActiveBlocksNow.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + numActiveBlocksNext.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, -1, curTick()); + numScheduledPulls++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + numActiveBlocksNext.erase(block_index); + } + if (atom_active_now) { + numActiveBlocksNow.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); + numScheduledPulls--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + 
pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +int +CoalesceEngine::workCount() +{ + return numActiveBlocksNow.size() + currentDirectory->workCount() + activeBuffer.size(); +} + +void +CoalesceEngine::recvVertexPull() +{ + numReceivedPulls++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. numReceivedPulls: %d.\n", __func__, numReceivedPulls); + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { + if (items[index].activeNow) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + numReceivedPulls--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + } + // NOTE: If the atom is not active anymore. 
+ if (!atom_active_now) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + activeBuffer.pop_front(); + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, -1, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!numActiveBlocksNow.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = numActiveBlocksNow.size(); + while (true) { + int block_index = numActiveBlocksNow.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { + if (cacheBlocks[block_index].items[index].activeNow) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + numReceivedPulls--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + } + // NOTE: If we have reached the last item in the cache block + if (!atom_active_now) { + numActiveBlocksNow.erase(block_index); + } + break; + } + // NOTE: If the block with index at the front of activeCacheBlocks + // is not in IDLE state, then roll the that index to the back + numActiveBlocksNow.pop_front(); + numActiveBlocksNow.push_back(block_index); + // NOTE: If we have visited all the items initially in the FIFO. 
+ num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; + } + + if (pullCondition()) { + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, -1, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + numScheduledPulls++; + } + + if ((numReceivedPulls > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine& _coalesce): + statistics::Group(&_coalesce), coalesce(_coalesce), lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. 
"), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), + ADD_STAT(vertexActivations, statistics::units::Count::get(), + "Number of times a vertex has become active. " + "Only meaningful in async mode"), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(countActiveBlocksNow, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(countActiveBlocksNext, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memAccBufferLat, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + countActiveBlocksNow.init(64); + countActiveBlocksNext.init(64); + responseQueueLatency.init(64); + memAccBufferLat.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..b7e3821dd7 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + +namespace gem5 +{ + +typedef std::deque, int, Tick>> MemoryFunctionDeque; + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool dirty; + bool hasConflict; + CacheState state; + Tick lastChangedTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + dirty(false), + hasConflict(false), + state(CacheState::INVALID), + lastChangedTick(0) + { + items = new WorkListItem [num_elements]; + } + + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + + std::string to_string() { + return 
csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? "true" : "false", + cacheStateStrings[state], lastChangedTick); + } + }; + + struct ReadPurpose : public Packet::SenderState + { + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } + }; + + MPU* owner; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; + GraphWorkload* graphWorkload; + + Addr lastAtomAddr; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + Tick lastReadTick; + std::unordered_set blocksTouchedThisTick; + + int onTheFlyReqs; + std::unordered_map> MSHR; + + // Response route to WLEngine + int maxRespPerCycle; + std::deque> responseQueue; + + // Tracking work in cache + int numReceivedPulls; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO numActiveBlocksNow; + UniqueFIFO numActiveBlocksNext; + + int numScheduledPulls; + int pendingPullLimit; + int pendingPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. 
+ std::unordered_set pendingPullAddrs; + + int activeBufferSize; + int postPushWBQueueSize; + std::deque> activeBuffer; + std::deque> postPushWBQueue; + + bool enoughSpace(); + bool pullCondition(); + int getBlockIndex(Addr addr); + + int transitionsPerCycle; + MemoryFunctionDeque memAccBuffer; + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine& coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; + statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; + statistics::Scalar vertexActivations; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar worklessCycles; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram countActiveBlocksNow; + statistics::Histogram countActiveBlocksNext; + statistics::Histogram responseQueueLatency; + 
statistics::Histogram memAccBufferLat; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params ¶ms); + void registerMPU(MPU* mpu); + + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + + virtual void recvFunctional(PacketPtr pkt) override; + void postMemInitSetup(); + void postConsumeProcess(); + void swapDirectories(); + + ReadReturnStatus recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount(); + int futureWorkCount(); + void recvVertexPull(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..ba57b387f4 --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* registerStateStrings[NUM_REGISTER_STATE] = { + "PENDING_READ", + "PENDING_REDUCE", + "PENDING_WRITE" +}; + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB" +}; + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..0f654c5386 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum RegisterState +{ + PENDING_READ, + PENDING_REDUCE, + PENDING_WRITE, + NUM_REGISTER_STATE +}; +extern const char* registerStateStrings[NUM_REGISTER_STATE]; + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + POLY_GRAPH, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..a5063cf685 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + +void +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t 
degree) +{ + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvMirrorPush(addr, delta, edge_index, degree); +} + +void +MPU::recvDoneSignal() +{ + if (done()) { + centeralController->recvDoneSignal(); + } +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..7d6d7d4003 --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + public: + PARAMS(MPU); + MPU(const Params& params); + void registerCenteralController(CenteralController* centeral_controller); + + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + uint64_t getCacheSize() { return coalesceEngine->params().cache_size; } + void recvFunctional(PacketPtr pkt) { 
coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } + + bool handleIncomingUpdate(PacketPtr pkt); + + void handleIncomingWL(Addr addr, WorkListItem wl); + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait) { pushEngine->startProcessingMirrors(time_to_wait); } + + void recvDoneSignal(); + bool done(); + + uint64_t getBaseAddr() {return pushEngine->params().base_addr;}; +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..6173fd67e8 --- /dev/null +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,593 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/PushEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const Params& params): + BaseMemoryEngine(params), + _running(false), + lastIdleEntranceTick(0), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), outstandingEdgeReqs(0), maxOutstandingEdgeReqs(0), + edgeQueueSize(params.resp_queue_size), + examineWindow(params.examine_window), + maxPropagatesPerCycle(params.max_propagates_per_cycle), + updateQueueSize(params.update_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), + base(params.base_addr), + stats(*this) +{ + destinationQueues.clear(); + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); + destinationQueues.emplace_back(); + destinationQueues[i].clear(); + sourceAndValueMaps.emplace_back(); + sourceAndValueMaps[i].clear(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + AddrRangeList range_list = outPorts[i].getAddrRanges(); + assert(range_list.size() == 1); + AddrRange range = outPorts[i].getAddrRanges().front(); + portAddrMap.insert(range, i); + } +} + +void +PushEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void 
+PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. " + "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((owner->workCount() - numPendingPulls) > 0); +} + +bool +PushEngine::done() +{ + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= destinationQueues[i].empty(); + } + return empty_update_queues && metaEdgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); +} + +void +PushEngine::start() +{ + assert(!_running); + // assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // NOTE: We might have to check for size availability here. 
+ assert(workLeft()); + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { + schedule(nextVertexPullEvent, nextCycle()); + } +} + +void +PushEngine::processNextVertexPullEvent() +{ + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { + _running = false; + lastIdleEntranceTick = curTick(); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); + } +} + +void +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + assert(degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); + + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + numPendingPulls--; + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); +} + +void +PushEngine::startProcessingMirrors(Tick time_to_wait) +{ + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + Cycles wait = ticksToCycles(time_to_wait); + if (!edgePointerQueue.empty()) 
{ + schedule(nextMemoryReadEvent, clockEdge(wait)); + } +} + +void +PushEngine::processNextMemoryReadEvent() +{ + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; + } + Addr aligned_addr, offset; + int num_edges; + + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + aligned_addr += base; + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { + DPRINTF(PushEngine, "%s: Current packet information generated by " + "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; + reqTickMap[pkt->req] = curTick(); //added this for stats + memPort.sendPacket(pkt); + onTheFlyMemReqs += num_edges; + outstandingEdgeReqs++; + maxOutstandingEdgeReqs = std::max(maxOutstandingEdgeReqs, outstandingEdgeReqs); + stats.maxOutstandingEdgeRequests = maxOutstandingEdgeReqs; + stats.outstandingEdgeRequests.sample(outstandingEdgeReqs); + // stats.outstandingEdgeRequests.sample(onTheFlyMemReqs); + + curr_info.iterate(); + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); + } + } + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::recvMemRetry() +{ + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + // TODO: in case we need to edit edges, get rid of second statement. + assert(pkt->isResponse() && (!pkt->isWrite())); + // here check tick and update the stats + // DPRINTF(PushEngine, "%s: Received a memory response.\n", __func__); + + uint8_t pkt_data [peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + Tick entrance_tick = reqTickMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + } + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; + + onTheFlyMemReqs -= push_info.numElements; + outstandingEdgeReqs--; + stats.avgEdgeAccessLatency.sample( + (curTick() - entrance_tick)); + reqTickMap.erase(pkt->req); // added this for edgeAcccessLatency + reqInfoMap.erase(pkt->req); + + delete pkt; + + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); + } + return true; +} + +void 
+PushEngine::processNextPropagateEvent() +{ + int num_propagates = 0; + int num_tries = 0; + int num_reads = 0; + std::deque> temp_edge; + for (int i = 0; i < examineWindow; i++) { + if (metaEdgeQueue.empty()) { + break; + } + temp_edge.push_back(metaEdgeQueue.front()); + metaEdgeQueue.pop_front(); + } + int max_visits = temp_edge.size(); + + while(true) { + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = temp_edge.front(); + + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, meta_edge.to_string()); + + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); + temp_edge.pop_front(); + num_tries++; + + if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) { + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", + __func__, meta_edge.to_string()); + num_reads++; + stats.numPropagates++; + stats.edgeQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + } else { + temp_edge.emplace_back(meta_edge, entrance_tick); + stats.updateQueueFull++; + } + num_propagates++; + + if (temp_edge.empty()) { + break; + } + if (num_tries >= max_visits) { + break; + } + } + + while (!temp_edge.empty()) { + metaEdgeQueue.push_front(temp_edge.back()); + temp_edge.pop_back(); + } + + stats.numPropagatesHist.sample(num_propagates); + + assert(!nextPropagateEvent.scheduled()); + if (!metaEdgeQueue.empty()) { + schedule(nextPropagateEvent, nextCycle()); + } +} + +bool +PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) +{ + Addr aligned_dst = roundDown(dst, owner->vertexAtomSize()); + AddrRange update_range(aligned_dst, aligned_dst + owner->vertexAtomSize()); + auto entry = portAddrMap.contains(update_range); + PortID port_id = entry->second; + + DPRINTF(PushEngine, "%s: Update{src: %lu, dst:%lu, value: %u} " + "belongs to port %d.\n", + __func__, src, dst, value, port_id); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", 
__func__, + destinationQueues[port_id].size(), port_id); + + assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size()); + + int num_updates = 0; + for (auto queue: destinationQueues) { + num_updates += queue.size(); + } + + if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) { + DPRINTF(PushEngine, "%s: Found an existing update " + "for dst: %lu.\n", __func__, dst); + Addr prev_src; + uint32_t prev_val; + std::tie(prev_src, prev_val) = sourceAndValueMaps[port_id][dst]; + uint32_t new_val = graphWorkload->reduce(value, prev_val); + sourceAndValueMaps[port_id][dst] = std::make_tuple(prev_src, new_val); + DPRINTF(PushEngine, "%s: Coalesced Update{src: %lu, dst:%lu, value: %u} " + "with Update{src: %lu, dst:%lu, value: %u} to" + "Update{src: %lu, dst:%lu, value: %u}.\n", __func__, + src, dst, value, prev_src, dst, prev_val, + prev_src, dst, new_val); + stats.updateQueueCoalescions++; + return true; + } else if (num_updates < (updateQueueSize * destinationQueues.size())) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue for port %d.\n", __func__, port_id); + destinationQueues[port_id].emplace_back(dst, curTick()); + sourceAndValueMaps[port_id][dst] = std::make_tuple(src, value); + DPRINTF(PushEngine, "%s: Emplaced Update{src: %lu, dst:%lu, value: %u} " + "at the back of queue for port %d. 
" + "Size of queue for port %d is %d.\n", __func__, + src, dst, value, port_id, port_id, + destinationQueues[port_id].size()); + stats.updateQueueLength.sample(destinationQueues[port_id].size()); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } + return true; + } + DPRINTF(PushEngine, "%s: DestinationQueue for pot %d is blocked.\n", + __func__, port_id); + return false; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, i); + continue; + } + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, i); + if (destinationQueues[i].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for " + "port %d is empty.\n", __func__, i); + continue; + } + Addr dst; + Tick entrance_tick; + std::tie(dst, entrance_tick) = destinationQueues[i].front(); + Addr src; + uint32_t value; + std::tie(src, value) = sourceAndValueMaps[i][dst]; + + PacketPtr pkt = createUpdatePacket(dst, value); + outPorts[i].sendPacket(pkt); + destinationQueues[i].pop_front(); + sourceAndValueMaps[i].erase(dst); + DPRINTF(PushEngine, "%s: Sent Update{src: %lu, dst:%lu, value: %u} to " + "port %d. 
Respective queue size is %d.\n", __func__, + src, dst, value, i, destinationQueues[i].size()); + if (destinationQueues[i].size() > 0) { + next_time_send += 1; + } + stats.numUpdates++; + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +PushEngine::PushStats::PushStats(PushEngine& _push): + statistics::Group(&_push), push(_push), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), + ADD_STAT(updateQueueFull, statistics::units::Count::get(), + "Number of times the update queue returns false."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), + "Histogram of number of propagates sent."), + ADD_STAT(avgEdgeAccessLatency, 
statistics::units::Second::get(), + "Histogram of edgeAccessLatency."), + ADD_STAT(outstandingEdgeRequests, statistics::units::Count::get(), + "Histogram of the size of the outstanding edge requests."), + ADD_STAT(maxOutstandingEdgeRequests, statistics::units::Count::get(), + "Histogram of the size of the outstanding edge requests.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; + + TEPS = numPropagates / simSeconds; + + edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); + edgeQueueLatency.init(64); + edgeQueueLength.init(64); + updateQueueLength.init(64); + numPropagatesHist.init(1 + push.params().max_propagates_per_cycle); + + // need to check what these init values mean + avgEdgeAccessLatency.init(64); + outstandingEdgeRequests.init(64); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..41fb6391cd --- /dev/null +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "params/PushEngine.hh" + +namespace gem5 +{ + +class MPU; + +class PushEngine : public BaseMemoryEngine +{ + private: + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, PushEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class EdgeReadInfoGen { + private: + Addr _src; + uint32_t _delta; + + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + + public: + EdgeReadInfoGen(Addr src, uint32_t delta, Addr start, + Addr end, size_t step, size_t atom): + 
_src(src), _delta(delta), _start(start), + _end(end), _step(step), _atom(atom) + {} + + Addr src() { return _src; } + uint32_t delta() { return _delta; } + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (aligned_addr + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + + return std::make_tuple(aligned_addr, offset, num_items); + } + + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + + bool done() { return (_start >= _end); } + }; + + struct PushInfo { + Addr src; + uint32_t value; + Addr offset; + int numElements; + }; + + MPU* owner; + GraphWorkload* graphWorkload; + + bool _running; + Tick lastIdleEntranceTick; + + AddrRangeList localAddrRange; + Addr base; + int numPendingPulls; + int edgePointerQueueSize; + std::deque> edgePointerQueue; + std::unordered_map reqInfoMap; + std::unordered_map reqTickMap; + + + int onTheFlyMemReqs; + int outstandingEdgeReqs; + int maxOutstandingEdgeReqs; + int edgeQueueSize; + int examineWindow; + int maxPropagatesPerCycle; + std::deque> metaEdgeQueue; + + int updateQueueSize; + template PacketPtr createUpdatePacket(Addr addr, T value); + bool enqueueUpdate(Addr src, Addr dst, uint32_t value); + std::vector>> destinationQueues; + std::vector>> sourceAndValueMaps; + AddrRangeMap portAddrMap; + std::vector outPorts; + + bool vertexSpace(); + bool workLeft(); + + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); + + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + + struct 
PushStats : public statistics::Group + { + PushStats(PushEngine& push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numMemoryBlocks; + statistics::Scalar numPropagates; + statistics::Scalar updateQueueFull; + statistics::Scalar numNetBlocks; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; + + statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; + statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; + statistics::Histogram updateQueueLength; + statistics::Histogram numPropagatesHist; + + statistics::Histogram avgEdgeAccessLatency; + statistics::Histogram outstandingEdgeRequests; + statistics::Scalar maxOutstandingEdgeRequests = 0; + }; + + PushStats stats; + + protected: + virtual void recvMemRetry(); + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(PushEngine); + PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + void start(); + bool running() { return _running; } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait); + + void recvReqRetry(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file diff 
--git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..8e5ccc9ebe --- /dev/null +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,499 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" + +#include +#include +#include + +#include "accl/graph/sega/mpu.hh" +#include "debug/SEGAStructureSize.hh" +#include "debug/WLEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams& params): + BaseReduceEngine(params), + updateQueueSize(params.update_queue_size), + examineWindow(params.examine_window), + maxReadsPerCycle(params.rd_per_cycle), + maxReducesPerCycle(params.reduce_per_cycle), + maxWritesPerCycle(params.wr_per_cycle), + registerFileSize(params.register_file_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextWriteEvent([this] { processNextWriteEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), + stats(*this) +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} + +void +WLEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick 
+WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + std::vector random_shuffle; + for (int i = 0; i < inPorts.size(); i++) { + random_shuffle.push_back(i); + } + std::random_device rd; + std::mt19937 gen(rd()); + std::shuffle(random_shuffle.begin(), random_shuffle.end(), gen); + + for (int i = 0; i < inPorts.size(); i++) { + inPorts[random_shuffle[i]].checkRetryReq(); + } +} + +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + +bool +WLEngine::handleIncomingUpdate(PacketPtr pkt) +{ + Addr update_addr = pkt->getAddr(); + uint32_t update_value = pkt->getLE(); + + if (valueMap.find(update_addr) != valueMap.end()) { + assert((updateQueueSize == 0) || + (updateQueue.size() <= updateQueueSize)); + DPRINTF(WLEngine, "%s: Found an already queued update to %u. ", + "Current value is: %u.\n", __func__, + update_addr, valueMap[update_addr]); + valueMap[update_addr] = + graphWorkload->reduce(update_value, valueMap[update_addr]); + stats.numIncomingUpdates++; + stats.updateQueueCoalescions++; + } else { + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } else { + updateQueue.emplace_back(update_addr, curTick()); + valueMap[update_addr] = update_value; + stats.numIncomingUpdates++; + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. 
updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + } + } + + // delete the packet since it's not needed anymore. + delete pkt; + + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; +} + +void +WLEngine::processNextReadEvent() +{ + std::deque> temp_queue; + for (int i = 0; i < examineWindow; i++) { + if (updateQueue.empty()) { + break; + } + temp_queue.push_back(updateQueue.front()); + updateQueue.pop_front(); + } + + int num_reads = 0; + int num_popped = 0; + int num_tries = 0; + int max_visits = temp_queue.size(); + while (true) { + Addr update_addr; + Tick enter_tick; + std::tie(update_addr, enter_tick) = temp_queue.front(); + + uint32_t update_value = valueMap[update_addr]; + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = std::make_tuple(RegisterState::PENDING_READ, update_value); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. 
registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_reads++; + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, temp_queue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject with no roll " + "from cache. Rolling the update anyway.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + stats.registerShortage++; + } + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, + update_addr, update_addr, std::get<1>(registerFile[update_addr])); + RegisterState state = std::get<0>(registerFile[update_addr]); + if (state == RegisterState::PENDING_WRITE) { + // NOTE: If it's pending write, let it be written. + DPRINTF(WLEngine, "%s: Respective register for addr: " + "%lu is pending a write to the cache. 
Rolling " + "the update.\n", __func__, update_addr); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + } else { + uint32_t curr_value = std::get<1>(registerFile[update_addr]); + uint32_t new_value = graphWorkload->reduce(update_value, curr_value); + registerFile[update_addr] = std::make_tuple(state, new_value); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, std::get<1>(registerFile[update_addr])); + stats.registerFileCoalescions++; + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + } + } + + num_tries++; + if (num_reads >= maxReadsPerCycle) { + if (!temp_queue.empty()) { + stats.numReadPortShortage++; + } + break; + } + if (num_tries >= max_visits) { + break; + } + if (temp_queue.empty()) { + break; + } + } + + while (!temp_queue.empty()) { + updateQueue.push_front(temp_queue.back()); + temp_queue.pop_back(); + } + if (num_popped > 0) { + checkRetryReq(); + } + if (!updateQueue.empty() && !nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + assert(workListFile.size() <= registerFileSize); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_READ); + + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. 
workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + + uint32_t value = std::get<1>(registerFile[addr]); + registerFile[addr] = std::make_tuple(RegisterState::PENDING_REDUCE, value); + toReduce.push_back(addr); + + stats.vertexReadLatency.sample( + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); + vertexReadTime.erase(addr); + + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } +} + +void +WLEngine::processNextReduceEvent() +{ + int num_reduces = 0; + while (true) { + Addr addr = toReduce.front(); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_REDUCE); + uint32_t update_value = std::get<1>(registerFile[addr]); + DPRINTF(WLEngine, "%s: Reducing for addr: %lu, update_value: %u, " + "temp_prop: %s.\n", __func__, addr, + update_value, workListFile[addr].tempProp); + workListFile[addr].tempProp = + graphWorkload->reduce(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction result: %s", __func__, + graphWorkload->printWorkListItem(workListFile[addr])); + registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); + num_reduces++; + stats.numReductions++; + toReduce.pop_front(); + toWrite.push_back(addr); + + if (num_reduces >= maxReducesPerCycle) { + if (!toReduce.empty()) { + stats.numReducerShortage++; + } + break; + } + if (toReduce.empty()) { + break; + } + } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } + + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } +} + +void +WLEngine::processNextWriteEvent() +{ + int num_writes = 0; + while (true) { + Addr addr = toWrite.front(); + 
assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_WRITE); + owner->recvWLWrite(addr, workListFile[addr]); + registerFile.erase(addr); + workListFile.erase(addr); + toWrite.pop_front(); + num_writes++; + if (num_writes >= maxWritesPerCycle) { + if (!toWrite.empty()) { + stats.numWritePortShortage++; + } + break; + } + if (toWrite.empty()) { + break; + } + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } +} + +void +WLEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +WLEngine::WorkListStats::WorkListStats(WLEngine& _wl): + statistics::Group(&_wl), wl(_wl), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(registerShortage, statistics::units::Count::get(), + "Number of times updates were " + "stalled because of register shortage"), + ADD_STAT(numUpdateRolls, statistics::units::Count::get(), + "Number of times an update has been rolled back " + "to the back of the update queue due to cache reject."), + ADD_STAT(numReadPortShortage, statistics::units::Count::get(), + "Number of times limited by read per cycle."), + ADD_STAT(registerFileCoalescions, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numReductions, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numReducerShortage, statistics::units::Count::get(), + "Number of times limited by number of reducers."), + ADD_STAT(numWritePortShortage, statistics::units::Count::get(), + "Number of times limited by write per cycle."), + ADD_STAT(numIncomingUpdates, statistics::units::Count::get(), + "Number of inocoming updates for each GPT."), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a 
vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using namespace statistics; + + vertexReadLatency.init(64); + updateQueueLatency.init(64); + +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..b1a85a2465 --- /dev/null +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/enums.hh" +#include "base/statistics.hh" +#include "params/WLEngine.hh" + +namespace gem5 +{ + +class MPU; + +class WLEngine : public BaseReduceEngine +{ + private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + bool needSendRetryReq; + PortID _id; + + public: + RespPort(const std::string& name, WLEngine* owner, PortID id): + ResponsePort(name), owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + MPU* owner; + GraphWorkload* graphWorkload; + + std::vector inPorts; + + int updateQueueSize; + std::deque> updateQueue; + std::unordered_map valueMap; + + int examineWindow; + int maxReadsPerCycle; + int maxReducesPerCycle; + int maxWritesPerCycle; + + int registerFileSize; + std::unordered_map> registerFile; + std::unordered_map workListFile; + std::deque toReduce; + std::deque toWrite; + + std::unordered_map vertexReadTime; + + EventFunctionWrapper 
nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextReduceEvent; + void processNextReduceEvent(); + + EventFunctionWrapper nextWriteEvent; + void processNextWriteEvent(); + + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + + struct WorkListStats : public statistics::Group + { + WorkListStats(WLEngine& worklist); + + void regStats() override; + + WLEngine &wl; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar registerShortage; + statistics::Scalar numUpdateRolls; + statistics::Scalar numReadPortShortage; + statistics::Scalar registerFileCoalescions; + statistics::Scalar numReductions; + statistics::Scalar numReducerShortage; + statistics::Scalar numWritePortShortage; + statistics::Scalar numIncomingUpdates; + + statistics::Histogram vertexReadLatency; + statistics::Histogram updateQueueLatency; + }; + + WorkListStats stats; + + public: + PARAMS(WLEngine); + WLEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + AddrRangeList getAddrRanges(); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + void recvFunctional(PacketPtr pkt); + + bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); + + void checkRetryReq(); + + bool done(); +}; + +// virtual AddrRangeList getAddrRanges() const; + +// protected: +// virtual bool recvTimingReq(PacketPtr pkt); +// virtual Tick recvAtomic(PacketPtr pkt); +// virtual void recvFunctional(PacketPtr pkt); +// virtual void recvRespRetry(); + + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh new file mode 100644 index 0000000000..620e97f654 --- /dev/null +++ b/src/accl/graph/sega/work_directory.hh @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "base/addr_range.hh" +#include "base/types.hh" + +namespace gem5 +{ + +class WorkDirectory +{ + public: + virtual int activate(Addr atom_addr) = 0; + virtual int deactivate(Addr atom_addr) = 0; + virtual Addr getNextWork() = 0; + + virtual int workCount() = 0; + bool empty() { return workCount() == 0; } + + virtual void setLastAtomAddr(Addr atom_addr) = 0; +}; + +class PopCountDirectory: public WorkDirectory +{ + private: + AddrRange memoryRange; + + int numAtomsPerBlock; + int memoryAtomSize; + int blockSize; + + uint32_t _workCount; + + int numCounters; + int lastCounterIndex; + uint32_t* popCount; + + uint32_t prevIndex; + uint32_t currentCounter; + + UniqueFIFO activeBlockIndices; + + int getIndexFromAtomAddr(Addr atom_addr) + { + assert((atom_addr % memoryAtomSize) == 0); + Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr); + int index = (int) (trimmed_addr / blockSize); + return index; + } + + Addr getAtomAddrFromIndex(int block_index, int atom_index) + { + Addr block_addr = block_index * blockSize; + Addr trimmed_addr = block_addr + atom_index * memoryAtomSize; + return memoryRange.addIntlvBits(trimmed_addr); + } + + public: + PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size): + WorkDirectory(), + memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), + memoryAtomSize(atom_size), _workCount(0), + prevIndex(-1), currentCounter(0) + { + blockSize = numAtomsPerBlock * memoryAtomSize; + int numCounters = (int) (memoryRange.size() / blockSize); + lastCounterIndex = numCounters - 1; + popCount = new uint32_t [numCounters]; + for (int index = 0; index < numCounters; index++) { + popCount[index] = 0; + } + activeBlockIndices = UniqueFIFO(numCounters); + } + + // CAUTION: This should only be called when the work + // directory **is not** tracking the the atom with 
atom_addr + virtual int activate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]++; + _workCount++; + activeBlockIndices.push_back(index); + assert(popCount[index] > prev_count); + assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; + } + + // CAUTION: This should only be called when the work + // directory **is** tracking the the atom with atom_addr + virtual int deactivate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]--; + _workCount--; + if (popCount[index] == 0) { + activeBlockIndices.erase(index); + } + assert(popCount[index] < prev_count); + assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; + } + + virtual int workCount() { return _workCount; } + + void setLastAtomAddr(Addr atom_addr) + { + lastCounterIndex = getIndexFromAtomAddr(atom_addr); + } + + // CAUTION: This directory only tracks active vertices in the memory + // and it does not have any information on the state of the cache and/or + // the active buffer or the write buffer. Therefore, it might generate a + // read request to an address that might be in any of those. In that case, + // the generated address should be ignored. + virtual Addr getNextWork() + { + // Why ask directory if it's empty? 
+ assert(!activeBlockIndices.empty()); + int front_index = activeBlockIndices.front(); + assert(popCount[front_index] > 0); + if ((prevIndex != -1) && (prevIndex != front_index)) { + currentCounter = 0; + } + if (currentCounter == numAtomsPerBlock) { + currentCounter = 0; + activeBlockIndices.pop_front(); + activeBlockIndices.push_back(front_index); + } + int current_index = activeBlockIndices.front(); + Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter); + prevIndex = current_index; + currentCounter++; + return ret_addr; + } +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 11fb1cd668..b314bfefe1 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -48,6 +48,7 @@ #include "base/bitfield.hh" #include "base/cprintf.hh" +#include "base/intmath.hh" #include "base/logging.hh" #include "base/types.hh" @@ -748,6 +749,40 @@ class AddrRange } return AddrRange(start, end); } + + friend AddrRange + mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit) + { + assert(left.interleaved()); + assert(right.interleaved()); + assert(left.mergesWith(right)); + + uint8_t old_left_match = left.intlvMatch; + uint8_t new_left_match = 0; + uint8_t old_right_match = right.intlvMatch; + uint8_t new_right_match = 0; + int new_bits = left.masks.size() - 1; + + // assumption: masks is sorted in ascending order + std::vector new_masks; + for (auto mask: left.masks) { + uint64_t lsb_mask = (mask ^ (mask - 1)) + 1; + if ((lsb_mask >> 1) != (1 << pch_bit)) { + new_masks.push_back(mask); + new_left_match |= ((old_left_match & 1) << new_bits); + new_left_match >>= 1; + new_right_match |= ((old_right_match & 1) << new_bits); + new_right_match >>= 1; + } + old_left_match >>= 1; + old_right_match >>= 1; + } + panic_if(new_left_match != new_right_match, + "The two ranges can not be a pseudo channel pair " + "given the pseudochannel bit position of 
params.pch_bit."); + + return AddrRange(left._start, left._end, new_masks, new_left_match); + } }; static inline AddrRangeList @@ -833,6 +868,16 @@ RangeSize(Addr start, Addr size) return AddrRange(start, start + size); } +inline bool +contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 8156be5a79..22be74ec90 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1051,7 +1051,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 45d89a76c9..f32ffe6f0a 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class HBMCtrl(MemCtrl): # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index f87fa2dcbb..6e7a0761d5 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,7 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), - retryRdReqPC1(false), retryWrReqPC1(false), + retryRdReqPC1(false), retryWrReqPC1(false), pchBit(p.pch_bit), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, name()), @@ -226,7 +226,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) 
{ is_pc0 = true; } else { is_pc0 = false; @@ -487,8 +487,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index b17caa6b49..657851eaa6 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -79,6 +79,8 @@ class HBMCtrl : public MemCtrl bool retryRdReqPC1; bool retryWrReqPC1; + int pchBit; + /** * Remove commands that have already issued from rowBurstTicks * and colBurstTicks diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index 9a3600f331..b43c1b8366 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -211,7 +211,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; @@ -1213,8 +1213,8 @@ MemCtrl::CtrlStats::CtrlStats(MemCtrl &_ctrl) ADD_STAT(numWrRetry, statistics::units::Count::get(), "Number of times write queue was full causing retry"), - ADD_STAT(readPktSize, statistics::units::Count::get(), - "Read request sizes (log2)"), + // ADD_STAT(readPktSize, statistics::units::Count::get(), + // "Read request sizes (log2)"), ADD_STAT(writePktSize, statistics::units::Count::get(), "Write request sizes (log2)"), @@ -1286,7 +1286,7 @@ MemCtrl::CtrlStats::regStats() avgRdQLen.precision(2); avgWrQLen.precision(2); - readPktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); + // readPktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); writePktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); 
rdQLenPdf.init(ctrl.readBufferSize); diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index 917798ffa7..d33724e327 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -581,7 +581,7 @@ class MemCtrl : public qos::MemCtrl statistics::Scalar numRdRetry; statistics::Scalar numWrRetry; - statistics::Vector readPktSize; + // statistics::Vector readPktSize; statistics::Vector writePktSize; statistics::Vector rdQLenPdf; statistics::Vector wrQLenPdf; diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange diff --git a/src/mem/packet.hh b/src/mem/packet.hh index ed7a94f4fb..69686e7835 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,6 +149,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int 
size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) 
failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) failed", addr); diff --git a/src/mem/simple_mem.hh b/src/mem/simple_mem.hh index 75a03fbe0e..0be85e9d86 100644 --- a/src/mem/simple_mem.hh +++ b/src/mem/simple_mem.hh @@ -180,7 +180,6 @@ class SimpleMemory : public AbstractMemory std::unique_ptr pendingDelete; public: - SimpleMemory(const SimpleMemoryParams &p); DrainState drain() override; @@ -189,6 +188,8 @@ class SimpleMemory : public AbstractMemory PortID idx=InvalidPortID) override; void init() override; + double getBW() { return bandwidth; } + protected: Tick recvAtomic(PacketPtr pkt); Tick recvAtomicBackdoor(PacketPtr pkt, MemBackdoorPtr &_backdoor); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { 
constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. 
*/ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5 diff --git a/src/python/m5/SimObject.py b/src/python/m5/SimObject.py index 08105d8833..cdfd0f367a 100644 --- a/src/python/m5/SimObject.py +++ b/src/python/m5/SimObject.py @@ -1048,6 +1048,7 @@ def find_any(self, ptype): def find_all(self, ptype): all = {} # search children + # print(f"I {self._name} am self at entry.") for child in self._children.values(): # a child could be a list, so ensure we visit each item if isinstance(child, list): @@ -1064,34 +1065,54 @@ def find_all(self, ptype): all[child] = True if isSimObject(child): # also add results from the child itself + # print(f"I {self._name} am child to {self._parent}") + # print(f"Me children are {self._children}") + # print(f"Me looking for {ptype}") child_all, done = child.find_all(ptype) + # print(f"My ({self._name}) children are {child_all}") all.update(dict(zip(child_all, [done] * len(child_all)))) # search param space for pname, pdesc in self._params.items(): if issubclass(pdesc.ptype, ptype): match_obj = self._values[pname] if not isproxy(match_obj) and not isNullPointer(match_obj): - all[match_obj] = True + # print(f"I {match_obj} be match_object") # maybe we can either make the DRAM interfaces children? 
+ # print(type(match_obj)) # or we can maybe check if isSimObjectVector, then serialize it + # print(f" here is all: {type(all)} {all}") + if type(match_obj) is SimObjectVector: + # print("sim object vector!!!") + for simobj in match_obj: + print(simobj) + all[simobj] = True + else: + all[match_obj] = True + # print(f"post all is true") # Also make sure to sort the keys based on the objects' path to # ensure that the order is the same on all hosts + # print(f"I {self._name} am self at exit.") return sorted(all.keys(), key=lambda o: o.path()), True def unproxy(self, base): return self def unproxyParams(self): + print(f"Me be {self._name} at the entry of unproxyParams.") for param in self._params.keys(): value = self._values.get(param) + print(f"me value is {value}") + if value != None and isproxy(value): try: + print(f"me type im trying to unproxy is {type(value)}") value = value.unproxy(self) except: + print(f"Me be {param} when hit error") print( f"Error in unproxying param '{param}' of {self.path()}" ) raise setattr(self, param, value) - + print(f"Me be {self._name} at the exit of unproxyParams.") # Unproxy ports in sorted order so that 'append' operations on # vector ports are done in a deterministic fashion. port_names = list(self._ports.keys()) @@ -1190,7 +1211,21 @@ def getCCParams(self): self.path(), param, ) - + if (not isinstance(value, EthernetAddr)) and isproxy(value): + # At the time of adding this error unproxying params happens + # in simulate.py at lines 103-104 (commit hash: f56459470a) + # To understand how attributes are handled for SimObjects + # refer to SimObject::__setattr__. + fatal( + f"Param {param} for {self._name} has value = {value}. " + "This value is a not a valid value. This could be caused " + f"by {param} not having been unproxied correctly. " + "One reason why this might happen is if you have " + "mistakenly added a child SimObject as an attr and not a " + "child by giving it a name that starts with an underscore " + f"`_`. 
{self.path()} should not say 'orphan.'" + ) + value = value.getValue() if isinstance(self._params[param], VectorParamDesc): assert isinstance(value, list) diff --git a/src/python/m5/params.py b/src/python/m5/params.py index 2ca6dfcc14..65f87ff3df 100644 --- a/src/python/m5/params.py +++ b/src/python/m5/params.py @@ -260,11 +260,14 @@ def getValue(self): return [v.getValue() for v in self] def unproxy(self, base): + print(f" me Unproxying vector param, me is {self}") if len(self) == 1 and isinstance(self[0], proxy.BaseProxy): + print(f"me in if params.py, about to try to unproxy {self[0]}") # The value is a proxy (e.g. Parent.any, Parent.all or # Parent.x) therefore try resolve it return self[0].unproxy(base) else: + print("me in else params.py") return [v.unproxy(base) for v in self]