diff --git a/configs/accl/archived/sega_detailed.py b/configs/accl/archived/sega_detailed.py new file mode 100644 index 0000000000..795089579a --- /dev/null +++ b/configs/accl/archived/sega_detailed.py @@ -0,0 +1,280 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from math import log2

from m5.objects import *


def interleave_addresses(plain_range, num_channels, cache_line_size):
    """Split *plain_range* into *num_channels* cache-line-interleaved ranges.

    The interleave-select bits sit directly above the cache-line offset
    bits.  Returns the list of interleaved ``AddrRange`` objects together
    with the highest interleave bit position (used below as the HBM
    pseudo-channel select bit).
    """
    # Bit positions must be integers.  cache_line_size and num_channels
    # are expected to be powers of two, so log2 is exact here; int() keeps
    # the values from being passed to AddrRange as floats.
    intlv_low_bit = int(log2(cache_line_size))
    intlv_bits = int(log2(num_channels))
    ret = []
    for i in range(num_channels):
        ret.append(
            AddrRange(
                start=plain_range.start,
                size=plain_range.size(),
                intlvHighBit=intlv_low_bit + intlv_bits - 1,
                xorHighBit=0,
                intlvBits=intlv_bits,
                intlvMatch=i,
            )
        )
    return ret, intlv_low_bit + intlv_bits - 1


class GPT(SubSystem):
    """One graph-processing tile.

    Bundles a worklist engine, a coalesce engine, and a push engine
    (wrapped in an MPU) with a private HBM vertex memory.  Edge memory is
    attached externally through setEdgeMemPort.
    """

    def __init__(self, register_file_size: int, cache_size: str):
        super().__init__()
        self.wl_engine = WLEngine(
            update_queue_size=64,
            register_file_size=register_file_size,
            examine_window=8,
            rd_per_cycle=4,
            reduce_per_cycle=32,
            wr_per_cycle=4,
        )
        self.coalesce_engine = CoalesceEngine(
            attached_memory_atom_size=32,
            cache_size=cache_size,
            max_resp_per_cycle=8,
            pending_pull_limit=64,
            active_buffer_size=80,
            post_push_wb_queue_size=64,
            transitions_per_cycle=4,
        )
        self.push_engine = PushEngine(
            push_req_queue_size=32,
            attached_memory_atom_size=64,
            resp_queue_size=1024,
            examine_window=12,
            max_propagates_per_cycle=8,
            update_queue_size=64,
        )

        # Vertex data lives in a two-pseudo-channel HBM stack private to
        # this GPT.
        self.vertex_mem_ctrl = HBMCtrl(
            dram=HBM_2000_4H_1x64(),
            dram_2=HBM_2000_4H_1x64(),
        )
        self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port

        self.mpu = MPU(
            wl_engine=self.wl_engine,
            coalesce_engine=self.coalesce_engine,
            push_engine=self.push_engine,
        )

    def getRespPort(self):
        return self.wl_engine.in_ports

    def setRespPort(self, port):
        self.wl_engine.in_ports = port

    def getReqPort(self):
        return self.push_engine.out_ports

    def setReqPort(self, port):
        self.push_engine.out_ports = port

    def getEdgeMemPort(self):
        return self.push_engine.mem_port

    def setEdgeMemPort(self, port):
        self.push_engine.mem_port = port

    def set_vertex_range(self, vertex_ranges):
        # One range per HBM pseudo channel.
        self.vertex_mem_ctrl.dram.range = vertex_ranges[0]
        self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1]

    def set_vertex_pch_bit(self, pch_bit):
        # Address bit that selects between the two pseudo channels.
        self.vertex_mem_ctrl.pch_bit = pch_bit


class EdgeMemory(SubSystem):
    """DDR4-backed edge-list storage, shared by a pair of GPTs."""

    def __init__(self, size: str):
        super().__init__()
        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "2.4GHz"
        self.clk_domain.voltage_domain = VoltageDomain()

        self.mem_ctrl = MemCtrl(
            dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False)
        )
        self.xbar = NoncoherentXBar(
            width=64, frontend_latency=1, forward_latency=1, response_latency=1
        )
        self.xbar.mem_side_ports = self.mem_ctrl.port

    def set_image(self, image):
        self.mem_ctrl.dram.image_file = image

    def getPort(self):
        return self.xbar.cpu_side_ports

    def setPort(self, port):
        self.xbar.cpu_side_ports = port


class SEGAController(SubSystem):
    """Central controller plus the mirror and mirror-map backing memories."""

    def __init__(self, mirror_bw):
        super().__init__()
        self.map_mem = SimpleMemory(
            latency="0ns",
            latency_var="0ns",
            bandwidth="1024GiB/s",
            range=AddrRange(start=0, size="4GiB"),
            in_addr_map=False,
        )
        self.controller = CenteralController(
            choose_best=False,
            mirrors_mem=SimpleMemory(
                latency="0ns",
                latency_var="0ns",
                bandwidth=mirror_bw,
                range=AddrRange(start=0, size="16GiB"),
                in_addr_map=False,
            ),
        )
        self.controller.mem_port = self.controller.mirrors_mem.port
        self.controller.mirrors_map_mem = self.map_mem.port

    def set_choose_best(self, choose_best):
        self.controller.choose_best = choose_best

    def set_vertices_image(self, vertices):
        self.controller.vertex_image_file = vertices

    def set_aux_images(self, mirrors, mirrors_map):
        self.controller.mirrors_mem.image_file = mirrors
        self.map_mem.image_file = mirrors_map

    def set_mpu_vector(self, mpu_vector):
        self.controller.mpu_vector = mpu_vector


class SEGA(System):
    """Full SEGA system: one controller, num_gpts GPTs clocked at 2GHz,
    and num_gpts/2 edge memories (each shared by a pair of GPTs)."""

    def __init__(self, num_gpts, num_registers, cache_size, graph_path):
        super().__init__()
        # GPT pairs share an edge memory and the vertex space is
        # interleaved over 2 * num_gpts pseudo channels, so num_gpts must
        # be a non-zero, even power of two.
        assert num_gpts != 0
        assert num_gpts % 2 == 0
        assert (num_gpts & (num_gpts - 1)) == 0

        self._num_gpts = num_gpts

        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "2GHz"
        self.clk_domain.voltage_domain = VoltageDomain()
        self.cache_line_size = 32
        self.mem_mode = "timing"

        self.ctrl = SEGAController("256GiB/s")
        self.ctrl.set_vertices_image(f"{graph_path}/vertices")

        edge_mem = []
        for i in range(num_gpts // 2):
            mem = EdgeMemory("4GiB")
            mem.set_image(f"{graph_path}/edgelist_{i}")
            edge_mem.append(mem)
        self.edge_mem = edge_mem
        # Building the GPTs.  Each GPT owns two of the 2 * num_gpts
        # interleaved vertex ranges (one per HBM pseudo channel).
        vertex_ranges, pch_bit = interleave_addresses(
            AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32
        )
        gpts = []
        for i in range(num_gpts):
            gpt = GPT(num_registers, cache_size)
            gpt.set_vertex_range(
                [vertex_ranges[i], vertex_ranges[i + num_gpts]]
            )
            gpt.set_vertex_pch_bit(pch_bit)
            gpt.setEdgeMemPort(self.edge_mem[i % (num_gpts // 2)].getPort())
            gpts.append(gpt)
        # All-to-all interconnect among the MPUs (including self-loops).
        for gpt_0 in gpts:
            for gpt_1 in gpts:
                gpt_0.setReqPort(gpt_1.getRespPort())
        self.gpts = gpts

        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])

    def work_count(self):
        return self.ctrl.controller.workCount()

    def set_async_mode(self):
        self.ctrl.controller.setAsyncMode()

    def set_bsp_mode(self):
        self.ctrl.controller.setBSPMode()

    def set_pg_mode(self):
        self.ctrl.controller.setPGMode()

    def set_aux_images(self, mirrors, mirrors_map):
        self.ctrl.set_aux_images(mirrors, mirrors_map)

    def set_choose_best(self, choose_best):
        self.ctrl.set_choose_best(choose_best)

    def create_pop_count_directory(self, atoms_per_block):
        self.ctrl.controller.createPopCountDirectory(atoms_per_block)

    def create_bfs_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSWorkload(init_addr, init_value)

    def create_bfs_visited_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)

    def create_sssp_workload(self, init_addr, init_value):
        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)

    def create_cc_workload(self):
        self.ctrl.controller.createCCWorkload()

    def create_async_pr_workload(self, alpha, threshold):
        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)

    def create_pr_workload(self, num_nodes, alpha):
        self.ctrl.controller.createPRWorkload(num_nodes, alpha)

    def get_pr_error(self):
        return self.ctrl.controller.getPRError()

    def create_bc_workload(self, init_addr, init_value):
        self.ctrl.controller.createBCWorkload(init_addr, init_value)

    def print_answer(self):
        self.ctrl.controller.printAnswerToHostSimout()

    def get_num_gpts(self):
        return self._num_gpts
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from math import log2

from m5.objects import *


def interleave_addresses(plain_range, num_channels, cache_line_size):
    """Split *plain_range* into *num_channels* cache-line-interleaved ranges.

    The interleave-select bits sit directly above the cache-line offset
    bits.  Returns the list of interleaved ``AddrRange`` objects.
    """
    # Bit positions must be integers.  cache_line_size and num_channels
    # are expected to be powers of two, so log2 is exact here; int() keeps
    # the values from being passed to AddrRange as floats.
    intlv_low_bit = int(log2(cache_line_size))
    intlv_bits = int(log2(num_channels))
    ret = []
    for i in range(num_channels):
        ret.append(
            AddrRange(
                start=plain_range.start,
                size=plain_range.size(),
                intlvHighBit=intlv_low_bit + intlv_bits - 1,
                xorHighBit=0,
                intlvBits=intlv_bits,
                intlvMatch=i,
            )
        )
    return ret


class GPT(SubSystem):
    """One graph-processing tile with a simple (fixed latency/bandwidth)
    vertex memory instead of detailed HBM."""

    def __init__(self, register_file_size: int, cache_size: str):
        super().__init__()
        self.wl_engine = WLEngine(
            update_queue_size=64,
            register_file_size=register_file_size,
            examine_window=8,
            rd_per_cycle=4,
            reduce_per_cycle=32,
            wr_per_cycle=4,
        )
        self.coalesce_engine = CoalesceEngine(
            attached_memory_atom_size=32,
            cache_size=cache_size,
            max_resp_per_cycle=8,
            pending_pull_limit=64,
            active_buffer_size=80,
            post_push_wb_queue_size=64,
            transitions_per_cycle=4,
        )
        self.push_engine = PushEngine(
            push_req_queue_size=32,
            attached_memory_atom_size=64,
            resp_queue_size=1024,
            examine_window=12,
            max_propagates_per_cycle=8,
            update_queue_size=64,
        )

        # Abstract vertex memory model for this GPT.
        self.vertex_mem_ctrl = SimpleMemory(
            latency="120ns", bandwidth="32GiB/s"
        )
        self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port

        self.mpu = MPU(
            wl_engine=self.wl_engine,
            coalesce_engine=self.coalesce_engine,
            push_engine=self.push_engine,
        )

    def getRespPort(self):
        return self.wl_engine.in_ports

    def setRespPort(self, port):
        self.wl_engine.in_ports = port

    def getReqPort(self):
        return self.push_engine.out_ports

    def setReqPort(self, port):
        self.push_engine.out_ports = port

    def getEdgeMemPort(self):
        return self.push_engine.mem_port

    def setEdgeMemPort(self, port):
        self.push_engine.mem_port = port

    def set_vertex_range(self, vertex_range):
        self.vertex_mem_ctrl.range = vertex_range


class EdgeMemory(SubSystem):
    """Abstract edge-list storage, shared by a pair of GPTs."""

    def __init__(self, size: str):
        super().__init__()
        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "9.6GHz"
        self.clk_domain.voltage_domain = VoltageDomain()

        self.mem_ctrl = SimpleMemory(
            latency="90ns",
            bandwidth="76.8GiB/s",
            range=AddrRange(size),
            in_addr_map=False,
        )
        self.xbar = NoncoherentXBar(
            width=64, frontend_latency=1, forward_latency=1, response_latency=1
        )
        self.xbar.mem_side_ports = self.mem_ctrl.port

    def set_image(self, image):
        self.mem_ctrl.image_file = image

    def getPort(self):
        return self.xbar.cpu_side_ports

    def setPort(self, port):
        self.xbar.cpu_side_ports = port


class SEGAController(SubSystem):
    """Central controller plus the mirror and mirror-map backing memories."""

    def __init__(self, mirror_bw):
        super().__init__()
        self.map_mem = SimpleMemory(
            latency="0ns",
            latency_var="0ns",
            bandwidth="1024GiB/s",
            range=AddrRange(start=0, size="4GiB"),
            in_addr_map=False,
        )
        self.controller = CenteralController(
            choose_best=False,
            mirrors_mem=SimpleMemory(
                latency="0ns",
                latency_var="0ns",
                bandwidth=mirror_bw,
                range=AddrRange(start=0, size="32GiB"),
                in_addr_map=False,
            ),
        )
        self.controller.mem_port = self.controller.mirrors_mem.port
        self.controller.mirrors_map_mem = self.map_mem.port

    def set_choose_best(self, choose_best):
        self.controller.choose_best = choose_best

    def set_vertices_image(self, vertices):
        self.controller.vertex_image_file = vertices

    def set_aux_images(self, mirrors, mirrors_map):
        self.controller.mirrors_mem.image_file = mirrors
        self.map_mem.image_file = mirrors_map

    def set_mpu_vector(self, mpu_vector):
        self.controller.mpu_vector = mpu_vector


class SEGA(System):
    """Full SEGA system built from the simple-memory GPT variant."""

    def __init__(self, num_gpts, num_registers, cache_size, graph_path):
        super().__init__()
        # GPT pairs share an edge memory, so num_gpts must be a non-zero,
        # even power of two.
        assert num_gpts != 0
        assert num_gpts % 2 == 0
        assert (num_gpts & (num_gpts - 1)) == 0

        self.clk_domain = SrcClockDomain()
        self.clk_domain.clock = "2GHz"
        self.clk_domain.voltage_domain = VoltageDomain()
        self.cache_line_size = 32
        self.mem_mode = "timing"

        self.ctrl = SEGAController("256GiB/s")
        self.ctrl.set_vertices_image(f"{graph_path}/vertices")

        edge_mem = []
        for i in range(num_gpts // 2):
            mem = EdgeMemory("16GiB")
            mem.set_image(f"{graph_path}/edgelist_{i}")
            edge_mem.append(mem)
        self.edge_mem = edge_mem
        # Building the GPTs; one interleaved vertex range per GPT.
        vertex_ranges = interleave_addresses(
            AddrRange(start=0, size="4GiB"), num_gpts, 32
        )
        gpts = []
        for i in range(num_gpts):
            gpt = GPT(num_registers, cache_size)
            gpt.set_vertex_range(vertex_ranges[i])
            gpt.setEdgeMemPort(self.edge_mem[i % (num_gpts // 2)].getPort())
            gpts.append(gpt)
        # All-to-all interconnect among the MPUs (including self-loops).
        for gpt_0 in gpts:
            for gpt_1 in gpts:
                gpt_0.setReqPort(gpt_1.getRespPort())
        self.gpts = gpts

        self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts])

    def work_count(self):
        return self.ctrl.controller.workCount()

    def set_async_mode(self):
        self.ctrl.controller.setAsyncMode()

    def set_bsp_mode(self):
        self.ctrl.controller.setBSPMode()

    def set_pg_mode(self):
        self.ctrl.controller.setPGMode()

    def set_aux_images(self, mirrors, mirrors_map):
        self.ctrl.set_aux_images(mirrors, mirrors_map)

    def set_choose_best(self, choose_best):
        self.ctrl.set_choose_best(choose_best)

    def create_pop_count_directory(self, atoms_per_block):
        self.ctrl.controller.createPopCountDirectory(atoms_per_block)

    def create_bfs_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSWorkload(init_addr, init_value)

    def create_bfs_visited_workload(self, init_addr, init_value):
        self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value)

    def create_sssp_workload(self, init_addr, init_value):
        self.ctrl.controller.createSSSPWorkload(init_addr, init_value)

    def create_cc_workload(self):
        self.ctrl.controller.createCCWorkload()

    def create_async_pr_workload(self, alpha, threshold):
        self.ctrl.controller.createAsyncPRWorkload(alpha, threshold)

    def create_pr_workload(self, num_nodes, alpha):
        self.ctrl.controller.createPRWorkload(num_nodes, alpha)

    def get_pr_error(self):
        return self.ctrl.controller.getPRError()

    def create_bc_workload(self, init_addr, init_value):
        self.ctrl.controller.createBCWorkload(init_addr, init_value)

    def print_answer(self):
        self.ctrl.controller.printAnswerToHostSimout()
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + sample, + verify, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_async_pr_workload(alpha, threshold) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/bc.py b/configs/accl/bc.py new file mode 100644 index 0000000000..c100068aa2 --- /dev/null +++ b/configs/accl/bc.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_bc_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iterations = 0 + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iterations += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iterations}") + if verify: + system.print_answer() diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..6c33c93f59 --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 


import m5
import argparse

from m5.objects import *
from sega import SEGA


def get_inputs():
    """Parse the command line for the BFS run.

    Returns (num_gpts, cache_size, graph, init_addr, init_value, tile,
    best, visited, sample, verify).
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("num_gpts", type=int)
    argparser.add_argument("cache_size", type=str)
    argparser.add_argument("graph", type=str)
    argparser.add_argument("init_addr", type=int)
    argparser.add_argument("init_value", type=int)
    # store_true is equivalent to store_const/const=True/default=False.
    argparser.add_argument(
        "--tile",
        action="store_true",
        help="Whether to use temporal partitioning",
    )
    argparser.add_argument(
        "--best",
        action="store_true",
        help="Whether to use best update value for switching slices",
    )
    argparser.add_argument(
        "--visited",
        action="store_true",
        help="Use visitation version of BFS",
    )
    argparser.add_argument(
        "--sample",
        action="store_true",
        help="Sample sim stats every 100us",
    )
    argparser.add_argument(
        "--verify",
        action="store_true",
        help="Print final answer",
    )

    args = argparser.parse_args()

    return (
        args.num_gpts,
        args.cache_size,
        args.graph,
        args.init_addr,
        args.init_value,
        args.tile,
        args.best,
        args.visited,
        args.sample,
        args.verify,
    )


if __name__ == "__m5_main__":
    (
        num_gpts,
        cache_size,
        graph,
        init_addr,
        init_value,
        tile,
        best,
        visited,
        sample,
        verify,
    ) = get_inputs()

    system = SEGA(num_gpts, cache_size, graph)
    if tile:
        system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map")

    if best:
        system.set_choose_best(True)

    root = Root(full_system=False, system=system)

    m5.instantiate()

    if tile:
        system.set_pg_mode()
    else:
        system.set_async_mode()

    system.create_pop_count_directory(32)
    if visited:
        system.create_bfs_visited_workload(init_addr, init_value)
    else:
        system.create_bfs_workload(init_addr, init_value)
    if sample:
        # NOTE(review): 5e7 ticks is 50us, while the --sample help text
        # says 100us -- confirm which one is intended.
        while True:
            exit_event = m5.simulate(50000000)
            print(
                f"Exited simulation at tick {m5.curTick()} "
                + f"because {exit_event.getCause()}"
            )
            if exit_event.getCause() == "simulate() limit reached":
                m5.stats.dump()
                m5.stats.reset()
            elif exit_event.getCause() == "Done with all the slices.":
                break
            elif exit_event.getCause() == "no update left to process.":
                break
    else:
        while True:
            exit_event = m5.simulate()
            print(
                f"Exited simulation at tick {m5.curTick()} "
                + f"because {exit_event.getCause()}"
            )
            if exit_event.getCause() == "Done with all the slices.":
                break
            if exit_event.getCause() == "no update left to process.":
                break
    if verify:
        system.print_answer()
# IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


import m5
import argparse

from m5.objects import *
from sega import SEGA


def get_inputs():
    """Parse the command line for the connected-components run.

    Returns (num_gpts, cache_size, graph, sample, verify).
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument("num_gpts", type=int)
    argparser.add_argument("cache_size", type=str)
    argparser.add_argument("graph", type=str)
    # store_true is equivalent to store_const/const=True/default=False.
    argparser.add_argument(
        "--sample",
        action="store_true",
        help="Sample sim stats every 100us",
    )
    argparser.add_argument(
        "--verify",
        action="store_true",
        help="Print final answer",
    )

    args = argparser.parse_args()

    return (
        args.num_gpts,
        args.cache_size,
        args.graph,
        args.sample,
        args.verify,
    )


if __name__ == "__m5_main__":
    (
        num_gpts,
        cache_size,
        graph,
        sample,
        verify,
    ) = get_inputs()

    system = SEGA(num_gpts, cache_size, graph)
    root = Root(full_system=False, system=system)

    m5.instantiate()

    system.set_async_mode()
    system.create_pop_count_directory(64)
    system.create_cc_workload()
    if sample:
        # Dump/reset stats every 100us (1e8 ticks) until the workload
        # finishes for a reason other than the simulate() quantum.
        while True:
            exit_event = m5.simulate(100000000)
            print(
                f"Exited simulation at tick {m5.curTick()} "
                + f"because {exit_event.getCause()}"
            )
            m5.stats.dump()
            m5.stats.reset()
            if exit_event.getCause() != "simulate() limit reached":
                break
    else:
        exit_event = m5.simulate()
        print(
            f"Exited simulation at tick {m5.curTick()} "
            + f"because {exit_event.getCause()}"
        )
    if verify:
        system.print_answer()
+ + +import m5 +import argparse + +from m5.objects import * +from disagg_sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument("fend", type=int) + argparser.add_argument("mem_ctrl_lat", type=str) + + argparser.add_argument( + "--tile", + dest="tile", + action="store_const", + const=True, + default=False, + help="Whether to use temporal partitioning", + ) + argparser.add_argument( + "--best", + dest="best", + action="store_const", + const=True, + default=False, + help="Whether to use best update value for switching slices", + ) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + + + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.tile, + args.best, + args.visited, + args.sample, + args.verify, + args.fend, + args.mem_ctrl_lat, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + tile, + best, + visited, + sample, + verify, + fend, + mem_ctrl_lat, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph,fend, mem_ctrl_lat) + if tile: + system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map") + + if best: + system.set_choose_best(True) + + root = Root(full_system=False, system=system) + + m5.instantiate() + + if tile: + 
system.set_pg_mode() + else: + system.set_async_mode() + + system.create_pop_count_directory(32) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(743598075) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "simulate() limit reached": + m5.stats.dump() + m5.stats.reset() + elif exit_event.getCause() == "Done with all the slices.": + break + elif exit_event.getCause() == "no update left to process.": + break + else: + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "Done with all the slices.": + break + if exit_event.getCause() == "no update left to process.": + break + if verify: + system.print_answer() diff --git a/configs/accl/disagg_sega.py b/configs/accl/disagg_sega.py new file mode 100644 index 0000000000..f3b4c44c31 --- /dev/null +++ b/configs/accl/disagg_sega.py @@ -0,0 +1,272 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + base_addr=4294967296, + ) + + 
self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="28GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, num_channels: str, fend_latency: int, mem_ctrl_lat: int): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + print("fend_latency is: ", fend_latency) + print("mem_ctrl_lat is: ", mem_ctrl_lat) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=fend_latency, forward_latency=1, response_latency=1 + ) + + addr_ranges = interleave_addresses(AddrRange(start="4GiB", size="64GiB"), num_channels, 64) # in_addr_map was False + self.mem_ctrls = [MemCtrl(dram=DDR4_2400_8x8(range=addr_ranges[i], in_addr_map=True), static_frontend_latency=mem_ctrl_lat, port=self.xbar.mem_side_ports) for i in range(num_channels)] + [print(f"{self.mem_ctrls[i]} range is: {addr_ranges[i]}") for i in range(num_channels)] + def get_abs_mems(self): + return [ctrl.dram for ctrl in self.mem_ctrls] + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + 
range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start="0GiB", size="16GiB"),# was 16 GiB + in_addr_map=False, + ), + edge_base = 0, + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_edges_image(self, edges): + self.controller.edge_image_file = edges + + def set_abs_mems(self, abs_mems): + self.controller.abstract_mem_vector = abs_mems + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + cache_size, + graph_path, + fend_latency, + mem_ctrl_lat, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.edge_mem = EdgeMemory(4, fend_latency, mem_ctrl_lat) + + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + self.ctrl.set_edges_image(f"{graph_path}/edgelist_0") + self.ctrl.set_abs_mems(self.edge_mem.get_abs_mems()) + num_registers = 128 + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 # was 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + 
gpt.setEdgeMemPort(self.edge_mem.getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..723f122908 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.num_nodes, + args.error_threshold, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + iterations, + alpha, + num_nodes, + error_threshold, + sample, + verify, + ) = get_inputs() + + print(f"error_threshold: {error_threshold}") + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(num_nodes, alpha) + iteration = 0 + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + print(f"error: {system.get_pr_error()}") + if 
system.get_pr_error() < error_threshold: + break + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..bd3ffe567f --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,266 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="256GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + 
super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="16GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + cache_size, + graph_path, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + 
self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + num_registers = 128 + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def 
create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py new file mode 100644 index 0000000000..e23ebfb365 --- /dev/null +++ b/configs/accl/sssp.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * +from sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != 
"simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..ebfca7e794 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/BaseReduceEngine.py b/src/accl/graph/base/BaseReduceEngine.py new file mode 100644 index 0000000000..0585c36e48 --- /dev/null +++ b/src/accl/graph/base/BaseReduceEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReduceEngine(ClockedObject): + abstract = True + type = 'BaseReduceEngine' + cxx_header = "accl/graph/base/base_reduce_engine.hh" + cxx_class = 'gem5::BaseReduceEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript new file mode 100644 index 0000000000..35111c34d2 --- /dev/null +++ b/src/accl/graph/base/SConscript @@ -0,0 +1,33 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) + +Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..ade95800d2 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..268bb60b76 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ + +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + protected: + + const RequestorID _requestorId; + + public: + PARAMS(BaseReduceEngine); + BaseReduceEngine(const Params ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh new file mode 100644 index 0000000000..f1a26f6ac2 --- /dev/null +++ b/src/accl/graph/base/data_structs.hh @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ + +#include "base/cprintf.hh" +#include "base/intmath.hh" + +#include +#include +#include +#include + +namespace gem5 +{ + +struct __attribute__ ((packed)) WorkListItem +{ + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; + + std::string to_string() + { + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? "true" : "false"); + } + + WorkListItem(): + tempProp(0), + prop(0), + edgeIndex(0), + degree(0), + activeNow(false), + activeFuture(false) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) + {} + +}; + +struct __attribute__ ((packed)) Edge +{ + uint16_t weight : 16; + uint64_t neighbor : 48; + + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } + + Edge(): weight(0), neighbor(0) {} + + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; + +struct __attribute__ ((packed)) MirrorVertex +{ + uint32_t vertexId : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeNext: 1; + + std::string to_string() + { + return csprintf("MirrorVertex{vertexId: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeNext: %s}", + vertexId, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeNext ? 
"true" : "false"); + } + MirrorVertex(): + vertexId(-1), + prop(-1), + edgeIndex(-1), + degree(-1), + activeNow(false), + activeNext(false) + {} + + MirrorVertex(uint32_t vertex_id, uint32_t prop, uint32_t degree, + uint32_t edge_index, bool active_now, bool active_next): + vertexId(vertex_id), prop(prop), edgeIndex(edge_index), + degree(degree), activeNow(active_now), activeNext(active_next) + {} + +}; + +static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); +static_assert(isPowerOf2(sizeof(MirrorVertex))); + +struct MetaEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + uint32_t value; + + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) + {} + + std::string to_string() + { + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); + } +}; + +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + +template +class UniqueFIFO +{ + private: + int cap; + int pop; + + int* added; + int* deleted; + std::deque container; + + public: + UniqueFIFO() { + cap = 0; + pop = 0; + added = nullptr; + deleted = nullptr; + } + + UniqueFIFO(int size) { + cap = size; + pop = 0; + + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + ~UniqueFIFO() { + delete [] added; + delete [] deleted; + } + + void fix_front() { + while(true) { + T elem = container.front(); + if (deleted[elem] > 0) { + deleted[elem]--; + added[elem]--; + container.pop_front(); + } else { + assert(deleted[elem] == 0); + 
assert(added[elem] == 1); + break; + } + } + } + + T front() { + fix_front(); + return container.front(); + } + + size_t size() { + return pop; + } + + void clear() { + pop = 0; + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); + } + + bool empty() { + return size() == 0; + } + + bool find(T item) { + assert(added[item] >= 0); + assert(deleted[item] >= 0); + int diff = added[item] - deleted[item]; + assert((diff == 0) || (diff == 1)); + return (diff == 1); + } + + void push_back(T item) { + if (!find(item)) { + added[item]++; + pop++; + container.push_back(item); + } + } + + void pop_front() { + T elem = front(); + assert(added[elem] == 1); + added[elem] = 0; + pop--; + container.pop_front(); + } + + void erase(T item) { + assert(find(item)); + deleted[item]++; + pop--; + } + + void operator=(const UniqueFIFO& rhs) { + cap = rhs.cap; + pop = rhs.pop; + container = rhs.container; + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + std::memcpy(added, rhs.added, cap * sizeof(int)); + std::memcpy(deleted, rhs.deleted, cap * sizeof(int)); + } +}; + +} + +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..fd802cf275 --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,411 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/base/graph_workload.hh" + +#include + +#include "base/cprintf.hh" +#include "base/intmath.hh" + +namespace gem5 +{ + +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + +void +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); +} + +uint32_t +BFSWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + +uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; +} + +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float 
weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + +void +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +BSPPRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + return readFromFloat(alpha * value_float); +} + +bool +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + return (old_wl.degree > 0); +} + +uint32_t +BSPPRWorkload::apply(WorkListItem& wl) +{ + float prop_float = writeToFloat(wl.prop); + float delta = prop_float / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + return delta_uint; +} + +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat((1 - alpha) / numNodes); + wl.activeFuture = (wl.degree > 0); +} + +std::string +BSPPRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, 
prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. + prop &= countMask; + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = currentDepth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777125." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performane metrics could be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. Here we reset the depth section of ret. + ret &= countMask; + // NOTE: Now that the depth is securely reset we can copy the correct value. 
+ ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth) && (new_wl.degree > 0); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..481cfc146f --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" +#include "mem/packet.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool betterThan(uint32_t lhs, uint32_t rhs) { return true; } + virtual void iterate() = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool betterThan(uint32_t lhs, uint32_t rhs) override { return lhs < rhs; } + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {} + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BFSVisitedWorkload : public BFSWorkload +{ + public: + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): 
BFSVisitedWorkload(0, 0) {} + virtual void init(PacketPtr pkt, WorkDirectory* dir); +}; + +class SSSPWorkload : public BFSWorkload +{ + public: + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) + {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class BSPPRWorkload : public GraphWorkload +{ + private: + int numNodes; + float alpha; + float prevError; + float error; + + public: + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} + + ~BSPPRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { prevError = error; error = 0; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return prevError; } +}; + +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): 
+ initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..0cdd11d251 --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+from m5.params import *
+from m5.proxy import *
+from m5.util.pybind import PyBindMethod
+from m5.objects.AbstractMemory import AbstractMemory
+from m5.objects.BaseMemoryEngine import BaseMemoryEngine
+
+
+class CenteralController(BaseMemoryEngine):
+    type = "CenteralController"
+    cxx_header = "accl/graph/sega/centeral_controller.hh"
+    cxx_class = "gem5::CenteralController"
+
+    mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.")
+
+    choose_best = Param.Bool(
+        "Whether to prefer the best update "
+        "value for choosing the next slice"
+    )
+
+    vertex_image_file = Param.String("Path to the vertex image file.")
+
+    mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.")
+
+    mpu_vector = VectorParam.MPU("All mpus in the system.")
+
+    edge_image_file = Param.String("Path to the edge image file.")
+
+    abstract_mem_vector = VectorParam.AbstractMemory(
+        "Abstract Memories to be initialized by edge_image_file."
+    )
+    abstract_mem_atom_size = Param.Int(
+        64, "burst size of the abstract memories."
+    )
+
+    edge_base = Param.UInt64("Addr of base address range")
+
+
+
+    cxx_exports = [
+        PyBindMethod("setAsyncMode"),
+        PyBindMethod("setBSPMode"),
+        PyBindMethod("setPGMode"),
+        PyBindMethod("createPopCountDirectory"),
+        PyBindMethod("createBFSWorkload"),
+        PyBindMethod("createBFSVisitedWorkload"),
+        PyBindMethod("createSSSPWorkload"),
+        PyBindMethod("createCCWorkload"),
+        PyBindMethod("createAsyncPRWorkload"),
+        PyBindMethod("createPRWorkload"),
+        PyBindMethod("createBCWorkload"),
+        PyBindMethod("workCount"),
+        PyBindMethod("getPRError"),
+        PyBindMethod("printAnswerToHostSimout"),
+    ]
diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py
new file mode 100644
index 0000000000..bb45802c1d
--- /dev/null
+++ b/src/accl/graph/sega/CoalesceEngine.py
@@ -0,0 +1,50 @@
+# -*- coding: utf-8 -*-
+# Copyright (c) 2017 Jason Lowe-Power
+# All rights reserved.
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + transitions_per_cycle = Param.Int("Max number of transitions in a cycle") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..8d2453b01c --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") + diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..50e240808e --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class PushEngine(BaseMemoryEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + push_req_queue_size = Param.Int("Size of the queue to " + "queue push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory.") + + examine_window = Param.Int("Number of edges at the front of the edge queue" + " to examine in order to propagate.") + + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") + + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") + + base_addr = Param.UInt64("Addr of base address range") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..b3e1a838fb --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,55 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import("*") + +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) + +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") +Source("enums.cc") +Source("mpu.cc") +Source("push_engine.cc") +Source("wl_engine.cc") + +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") +DebugFlag("WLEngine") + +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..f9ea4488df --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,54 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+
+from m5.params import *
+from m5.proxy import *
+from m5.objects.BaseReduceEngine import BaseReduceEngine
+
+class WLEngine(BaseReduceEngine):
+    type = 'WLEngine'
+    cxx_header = "accl/graph/sega/wl_engine.hh"
+    cxx_class = 'gem5::WLEngine'
+
+    in_ports = VectorResponsePort("Incoming Ports to receive updates from "
+                                "remote outside")
+
+    update_queue_size = Param.Int("Size of the queue WLEngine stores "
+                                "the incoming updates")
+
+    register_file_size = Param.Int("Number of internal registers the "
+                                "WLEngine has. It can service as "
+                                "many updates as this queue has "
+                                "entries at the same time.")
+
+    examine_window = Param.Int("Number of updates at the front of update "
+                                "queue examined for reading.")
+    rd_per_cycle = Param.Int("Maximum number of reads per cycle.")
+    reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.")
+    wr_per_cycle = Param.Int("Maximum number of writes per cycle.")
+
+
diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc
new file mode 100644
index 0000000000..9f704f71e9
--- /dev/null
+++ b/src/accl/graph/sega/base_memory_engine.cc
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2020 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met: redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer;
+ * redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution;
+ * neither the name of the copyright holders nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..31e7d85bef --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ + +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseMemoryEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseMemoryEngine : public ClockedObject +{ + protected: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + int _prevState; + + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), + _pending(false), _prevState(0) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + void setPrevState(int state) { _prevState = state; } + int getPrevState() { return _prevState; } + }; + + class MemPort : public RequestPort + { + private: + BaseMemoryEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseMemoryEngine* owner): + RequestPort(name), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + const RequestorID _requestorId; + + MemPort memPort; + AddrRange peerMemoryRange; + size_t peerMemoryAtomSize; + + virtual void recvMemRetry() = 0; + virtual bool handleMemResp(PacketPtr pkt) = 0; + + PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + + public: + PARAMS(BaseMemoryEngine); + + BaseMemoryEngine(const Params ¶ms); + ~BaseMemoryEngine(); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } + + virtual void recvFunctional(PacketPtr pkt) = 0; + + virtual void 
init() override; +}; + +} + +#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr new file mode 100644 index 0000000000..316fcd37d9 --- /dev/null +++ b/src/accl/graph/sega/busyMaskErr @@ -0,0 +1,16 @@ +gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0 + +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. 
+32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. +assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..a2970a9013 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,551 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/centeral_controller.hh" + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/cprintf.hh" +#include "base/loader/memory_image.hh" +#include "base/loader/object_file.hh" +#include "debug/CenteralController.hh" +#include "mem/abstract_mem.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +using memory::AbstractMemory; + +CenteralController::CenteralController(const Params& params): + BaseMemoryEngine(params), + mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), + mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), + chooseBest(params.choose_best), + edgeBase(params.edge_base), + nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()), + stats(*this) +{ + uint64_t total_cache_size = 0; + for (auto mpu : params.mpu_vector) { + mpuVector.push_back(mpu); + mpu->registerCenteralController(this); + total_cache_size += mpu->getCacheSize(); + } + verticesPerSlice = std::floor(total_cache_size / sizeof(WorkListItem)); +} + +Port& +CenteralController::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "mirrors_map_mem") { + return mapPort; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort("mem_port", idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + +void +CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSVisitedWorkload(init_addr, init_value); +} + +void +CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new SSSPWorkload(init_addr, init_value); +} + +void +CenteralController::createCCWorkload() +{ + workload = new CCWorkload(); +} + +void +CenteralController::createAsyncPRWorkload(float alpha, float threshold) +{ + workload = new 
PRWorkload(alpha, threshold); +} + +void +CenteralController::createPRWorkload(int num_nodes, float alpha) +{ + workload = new BSPPRWorkload(num_nodes, alpha); +} + +void +CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BSPBCWorkload(init_addr, init_value); +} + +void +CenteralController::createPopCountDirectory(int atoms_per_block) +{ + fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " + "mode by calling either setAsyncMode or setBSPMode."); + if (mode == ProcessingMode::ASYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createBSPPopCountDirectory(atoms_per_block); + } + } + if (mode == ProcessingMode::POLY_GRAPH) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } +} + +void +CenteralController::startup() +{ + DPRINTF(CenteralController, "Startup 1!\n"); + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + for (auto mpu: mpuVector) { + for (auto range: mpu->getAddrRanges()) { + mpuAddrMap.insert(range, mpu); + } + mpu->setProcessingMode(mode); + mpu->recvWorkload(workload); + } + DPRINTF(CenteralController, "Startup 2!\n"); + + const auto& vertex_file = params().vertex_image_file; + if (vertex_file == "") + return; + DPRINTF(CenteralController, "Startup 3!\n"); + + auto* object = loader::createObjectFile(vertex_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), vertex_file); + + loader::debugSymbolTable.insert(*object->symtab().globals()); + loader::MemoryImage vertex_image = object->buildImage(); + maxVertexAddr = vertex_image.maxAddr(); + + int num_total_vertices = (maxVertexAddr / sizeof(WorkListItem)); + numTotalSlices = std::ceil((double) num_total_vertices / verticesPerSlice); + DPRINTF(CenteralController, "Startup 4!\n"); + + numPendingUpdates = new int [numTotalSlices]; + 
bestPendingUpdate = new uint32_t [numTotalSlices]; + for (int i = 0; i < numTotalSlices; i++) { + numPendingUpdates[i] = 0; + bestPendingUpdate[i] = -1; + } + DPRINTF(CenteralController, "Startup 5!\n"); + + PortProxy vertex_proxy( + [this](PacketPtr pkt) { + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); + }, vertex_atom); + + panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); + + for (auto mpu: mpuVector) { + mpu->postMemInitSetup(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } + workload->iterate(); + DPRINTF(CenteralController, "Startup 6!\n"); + DPRINTF(CenteralController, "params().edge_image_file = %s\n", params().edge_image_file); + const auto& edge_file = params().edge_image_file; + DPRINTF(CenteralController, "edge_file = %s\n", edge_file); + + // if (edge_file == "") {} // commented this out + // return; + + DPRINTF(CenteralController, "Startup 7!\n"); + + AddrRangeMap abs_mem_range_map; + for (auto abs_mem: params().abstract_mem_vector) { + abs_mem_range_map.insert(abs_mem->getAddrRange(), abs_mem); + } + // DPRINTF(CenteralController, "%s, Edge memory ranges: %s", __func__, abs_mem_range_map); + auto* edge_object = loader::createObjectFile(edge_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), edge_file); + + loader::debugSymbolTable.insert(*edge_object->symtab().globals()); + loader::MemoryImage edge_image = edge_object->buildImage(); + DPRINTF(CenteralController, "Startup 8!\n"); + + PortProxy edge_proxy( + [abs_mem_range_map, this](PacketPtr pkt) { + pkt->setAddr(pkt->getAddr() + mpuVector[0]->getBaseAddr()); + auto routing_entry = abs_mem_range_map.contains(pkt->getAddr()); + routing_entry->second->functionalAccess(pkt); + }, params().abstract_mem_atom_size); + + DPRINTF(CenteralController, "%s, mpuVector[0]->getBaseAddr(): %lu", __func__, mpuVector[0]->getBaseAddr()); + + panic_if(!edge_image.write(edge_proxy), "%s: 
Unable to write image."); +} + +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "is blocked.\n", __func__, _id, pkt->print()); + blockedPacket = pkt; + } else { + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "sent.\n", __func__, _id, pkt->print()); + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp should not be called at all"); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic("recvReqRetry should not be called at all"); +} + +void +CenteralController::recvDoneSignal() +{ + bool done = true; + for (auto mpu : mpuVector) { + done &= mpu->done(); + } + + if (done && mode == ProcessingMode::ASYNCHRONOUS) { + exitSimLoopNow("no update left to process."); + } + + if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->postConsumeProcess(); + mpu->swapDirectories(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } + workload->iterate(); + exitSimLoopNow("finished an iteration."); + } + + if (done && mode == ProcessingMode::POLY_GRAPH) { + DPRINTF(CenteralController, "%s: Received done signal.\n", __func__); + exitSimLoopNow("Finished processing a slice."); + if (!nextSliceSwitchEvent.scheduled()) { + schedule(nextSliceSwitchEvent, nextCycle()); + } + } +} + +int +CenteralController::chooseNextSlice() +{ + int ret_slice_id = -1; + int max_pending_count = 0; + // TODO: Make this generalizable for all workloads. 
+ uint32_t best_update = -1; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_pending_count) { + max_pending_count = numPendingUpdates[i]; + } + if (numPendingUpdates[i] > 0 && + workload->betterThan(bestPendingUpdate[i], best_update)) { + best_update = bestPendingUpdate[i]; + } + } + if (chooseBest) { + int max_count = 0; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_count && + bestPendingUpdate[i] == best_update) { + max_count = numPendingUpdates[i]; + ret_slice_id = i; + } + } + } else { + uint32_t best_value = -1; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] == max_pending_count && + workload->betterThan(bestPendingUpdate[i], best_value)) { + best_value = bestPendingUpdate[i]; + ret_slice_id = i; + } + } + } + return ret_slice_id; +} + +void +CenteralController::processNextSliceSwitchEvent() +{ + int vertex_atom = mpuVector.front()->vertexAtomSize(); + int vertices_per_atom = (int) vertex_atom / sizeof(WorkListItem); + int bytes_accessed = 0; + int updates_generated_total = 0; + for (int dst_id = 0; dst_id < numTotalSlices; dst_id++) { + if (dst_id == currentSliceId) { + continue; + } + int updates_generated = 0; + Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t); + Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, + currentSliceId, dst_id, start_addr, end_addr); + + uint64_t num_bytes = end_addr - start_addr; + uint64_t num_mirrors = (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; + + 
PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + + WorkListItem vertices [vertices_per_atom]; + for (int i = 0; i < num_mirrors; i++) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (int) (org_addr - aligned_org_addr) / sizeof(WorkListItem); + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + auto routing_entry = mpuAddrMap.contains(aligned_org_addr); + routing_entry->second->recvFunctional(read_org); + read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); + delete read_org; + if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { + assert(vertices[wl_offset].degree == 0); + vertices[wl_offset].prop = vertices[wl_offset].tempProp; + } + if (mirrors[i].prop != vertices[wl_offset].prop) { + mirrors[i].prop = vertices[wl_offset].prop; + if (!mirrors[i].activeNow) { + mirrors[i].activeNow = true; + numPendingUpdates[dst_id]++; + totalUpdatesLeft++; + updates_generated++; + } + bestPendingUpdate[dst_id] = + workload->betterThan(mirrors[i].prop, bestPendingUpdate[dst_id]); + } + } + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + delete [] mirrors; + DPRINTF(CenteralController, "%s: Done scattering updates from slice " + "%d to slice %d.\n", __func__, currentSliceId, dst_id); + DPRINTF(CenteralController, "%s: Generated %d updates from slice " + "%d to slice %d.\n", __func__, + updates_generated, currentSliceId, dst_id); + updates_generated_total += updates_generated; + bytes_accessed += 2 * num_bytes; + } + DPRINTF(CenteralController, "%s: Done with slice %d.\n", __func__, currentSliceId); + DPRINTF(CenteralController, "%s: Generated a total of %d updates.\n", + __func__, updates_generated_total); + 
DPRINTF(CenteralController, "%s: There are a total of %d " + "updates left.\n", __func__, totalUpdatesLeft); + if (totalUpdatesLeft > 0) { + currentSliceId = chooseNextSlice(); + } else { + exitSimLoopNow("Done with all the slices."); + return; + } + DPRINTF(CenteralController, "%s: Chose %d as the " + "next slice.\n", __func__, currentSliceId); + + for (int src_id = 0; src_id < numTotalSlices; src_id++) { + if (src_id == currentSliceId) { + continue; + } + Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t); + Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, + src_id, currentSliceId, start_addr, end_addr); + + uint64_t num_bytes = end_addr - start_addr; + uint64_t num_mirrors = (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; + + PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + for (int i = 0; i < num_mirrors; i++) { + if (mirrors[i].activeNow) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + auto routing_entry = mpuAddrMap.contains(org_addr); + routing_entry->second->recvMirrorPush(org_addr, mirrors[i].prop, + mirrors[i].edgeIndex, mirrors[i].degree); + mirrors[i].activeNow = false; + numPendingUpdates[currentSliceId]--; + totalUpdatesLeft--; + } + } + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + delete [] mirrors; + 
DPRINTF(CenteralController, "%s: Done gathering updates from slice " + "%d to slice %d.\n", __func__, src_id, currentSliceId); + bytes_accessed += num_bytes; + } + + double mirror_mem_bw = mirrorsMem->getBW(); + Tick time_to_switch = bytes_accessed * mirror_mem_bw; + stats.switchTicks += time_to_switch; + stats.switchedBytes += bytes_accessed; + stats.numSwitches++; + for (auto mpu: mpuVector) { + mpu->startProcessingMirrors(time_to_switch); + } + exitSimLoopNow("Done with slice switch."); +} + +bool +CenteralController::handleMemResp(PacketPtr pkt) +{ + panic("handleMemResp should not be called at all"); +} + +void +CenteralController::recvMemRetry() +{ + panic("recvMemRetry should not be called at all"); +} + +void +CenteralController::recvFunctional(PacketPtr pkt) +{ + panic("recvFunctional should not be called at all"); +} + +int +CenteralController::workCount() +{ + int work_count = 0; + for (auto mpu: mpuVector) { + work_count += mpu->workCount(); + } + return work_count; +} + +float +CenteralController::getPRError() +{ + BSPPRWorkload* pr_workload = dynamic_cast(workload); + return pr_workload->getError(); +} + +void +CenteralController::printAnswerToHostSimout() +{ + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + int num_items = vertex_atom / sizeof(WorkListItem); + WorkListItem items[num_items]; + for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) + { + PacketPtr pkt = createReadPacket(addr, vertex_atom); + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); + pkt->writeDataToBlock((uint8_t*) items, vertex_atom); + for (int i = 0; i < num_items; i++) { + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, + workload->printWorkListItem(items[i])); + + std::cout << print << std::endl; + } + delete pkt; + } +} + +CenteralController::ControllerStats::ControllerStats(CenteralController& _ctrl): + statistics::Group(&_ctrl), ctrl(_ctrl), + ADD_STAT(numSwitches, 
statistics::units::Byte::get(), + "Number of slices switches completed."), + ADD_STAT(switchedBytes, statistics::units::Byte::get(), + "Number of bytes accessed during slice switching."), + ADD_STAT(switchTicks, statistics::units::Tick::get(), + "Number of ticks spent switching slices."), + ADD_STAT(switchSeconds, statistics::units::Second::get(), + "Traversed Edges Per Second.") +{ +} + +void +CenteralController::ControllerStats::regStats() +{ + using namespace statistics; + + switchSeconds = switchTicks / simFreq; +} + +} diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh new file mode 100644 index 0000000000..74f8124380 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.hh @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/mpu.hh" +#include "base/addr_range.hh" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "mem/simple_mem.hh" +#include "params/CenteralController.hh" + +namespace gem5 +{ + +class CenteralController : public BaseMemoryEngine +{ + private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, CenteralController* owner, PortID id): + RequestPort(name), owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort mapPort; + Addr maxVertexAddr; + ProcessingMode mode; + + memory::SimpleMemory* mirrorsMem; + + // AddrRangeMap abs_mem_range_map; // moved here from .cc file + Addr edgeBase; + std::vector mpuVector; + AddrRangeMap mpuAddrMap; + + int currentSliceId; + int numTotalSlices; + int verticesPerSlice; + int totalUpdatesLeft; + + bool chooseBest; + int* numPendingUpdates; + 
uint32_t* bestPendingUpdate; + int chooseNextSlice(); + + EventFunctionWrapper nextSliceSwitchEvent; + void processNextSliceSwitchEvent(); + + struct ControllerStats : public statistics::Group + { + ControllerStats(CenteralController& ctrl); + + void regStats() override; + + CenteralController& ctrl; + + statistics::Scalar numSwitches; + statistics::Scalar switchedBytes; + statistics::Scalar switchTicks; + statistics::Formula switchSeconds; + }; + ControllerStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + GraphWorkload* workload; + + PARAMS(CenteralController); + CenteralController(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + + virtual void startup() override; + + virtual void recvFunctional(PacketPtr pkt) override; + + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } + void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + void setPGMode() { mode = ProcessingMode::POLY_GRAPH; } + + void createPopCountDirectory(int atoms_per_block); + + void createBFSWorkload(Addr init_addr, uint32_t init_value); + void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value); + void createSSSPWorkload(Addr init_addr, uint32_t init_value); + void createCCWorkload(); + void createAsyncPRWorkload(float alpha, float threshold); + void createPRWorkload(int num_nodes, float alpha); + void createBCWorkload(Addr init_addr, uint32_t init_value); + + void recvDoneSignal(); + + int workCount(); + float getPRError(); + void printAnswerToHostSimout(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc new file mode 100644 index 0000000000..afb0695206 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -0,0 +1,1327 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + numReceivedPulls(0), numScheduledPulls(0), pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + transitionsPerCycle(params.transitions_per_cycle), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + numActiveBlocksNow = UniqueFIFO(numLines); + numActiveBlocksNext = UniqueFIFO(numLines); + + activeBuffer.clear(); + postPushWBQueue.clear(); + blocksTouchedThisTick.clear(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + + +// NOTE: Used for initializing memory and reading the final answer +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + 
int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].state == CacheState::IDLE); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + graphWorkload->init(pkt, currentDirectory); + if (pkt->getAddr() > lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } + memPort.sendFunctional(pkt); + } +} + +void +CoalesceEngine::postMemInitSetup() +{ + currentDirectory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::postConsumeProcess() +{ + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + numActiveBlocksNext.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { 
+ numActiveBlocksNext.erase(block_index); + } + } else { + WorkListItem items[numElementsPerLine]; + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete read_pkt; + delete write_pkt; + } + } +} + +void +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void +CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(numActiveBlocksNow.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + numActiveBlocksNow.clear(); + numActiveBlocksNow = numActiveBlocksNext; + numActiveBlocksNext.clear(); +} + +bool 
+CoalesceEngine::done() +{ + return memAccBuffer.empty() && numActiveBlocksNow.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); +} + +bool +CoalesceEngine::enoughSpace() +{ + return (activeBuffer.size() + pendingPullReads + numScheduledPulls) < activeBufferSize; +} + +bool +CoalesceEngine::pullCondition() +{ + bool enough_space = enoughSpace(); + bool schedule_limit = numScheduledPulls < pendingPullLimit; + return enough_space && schedule_limit; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + if (lastReadTick < curTick()) { + blocksTouchedThisTick.clear(); + lastReadTick = curTick(); + } + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. 
" + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + // Hit + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(cacheBlocks[block_index].state != CacheState::INVALID); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].state = CacheState::BUSY; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. 
For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + blocksTouchedThisTick.insert(block_index); + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].dirty); + + assert(MSHR.find(block_index) != MSHR.end()); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + blocksTouchedThisTick.insert(block_index); + + stats.numVertexReads++; + return ReadReturnStatus::ACCEPT; + } else { + // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + stats.readMisses++; + if (blocksTouchedThisTick.find(block_index) != blocksTouchedThisTick.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has already been " + "accessed this tick.\n", __func__, block_index); + return ReadReturnStatus::REJECT_ROLL; + } + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if 
(cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is dirty.\n", + __func__, block_index); + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memAccBuffer.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is now " + "pending write back.\n", __func__, block_index); + } else { + // NOTE: The cache block could still be active but + // not dirty. If active we only have to active tracking + // but can throw the data away. + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not dirty.\n", + __func__, block_index); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n", + __func__, block_index); + numActiveBlocksNow.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + } + if (atom_active_future) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n", + __func__, block_index); + numActiveBlocksNext.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + // NOTE: Bring the cache line to invalid state. 
+ // NOTE: Above line where we set hasConflict to true + // does not matter anymore since we reset the cache line. + cacheBlocks[block_index].reset(); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is reset.\n", + __func__, block_index); + } + blocksTouchedThisTick.insert(block_index); + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + blocksTouchedThisTick.insert(block_index); + stats.numConflicts++; + return ReadReturnStatus::REJECT_ROLL; + } + } else { + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); + memAccBuffer.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + blocksTouchedThisTick.insert(block_index); + return ReadReturnStatus::ACCEPT; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + + onTheFlyReqs--; + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. 
+ // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); + } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. + // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } + + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. 
Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + numActiveBlocksNow.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + } + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + numActiveBlocksNext.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; + } + if (atom_active_now) { + int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + if (atom_active_future) { + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + activeBuffer.emplace_back(pkt, curTick()); + } else { + stats.wastefulBytesRead += pkt->getSize(); + delete pkt; + } + + if (pullCondition()) { + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, -1, curTick()); + 
if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + numScheduledPulls++; + } + } + delete purpose; + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. 
+ if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + + // NOTE: Design does not allow for write misses. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].busyMask != 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); + + // respective bit in busyMask for wl is set. + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].dirty |= true; + } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].items[wl_offset] = wl; + + stats.vertexActivations += active ? 
1 : 0; + if (mode == ProcessingMode::ASYNCHRONOUS || mode == ProcessingMode::POLY_GRAPH) { + cacheBlocks[block_index].items[wl_offset].activeNow |= active; + if (active && (!numActiveBlocksNow.find(block_index))) { + numActiveBlocksNow.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; + if (active && (!numActiveBlocksNext.find(block_index))) { + numActiveBlocksNext.push_back(block_index); + } + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (cacheBlocks[block_index].busyMask == 0) { + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); + memAccBuffer.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + numActiveBlocksNow.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + 
stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + } + if (atom_active_future) { + numActiveBlocksNext.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + cacheBlocks[block_index].lastChangedTick = curTick(); + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + int num_transitions = 0; + std::unordered_set transitions; + MemoryFunctionDeque temp_deque; + temp_deque.clear(); + + while (true) { + if (memPort.blocked()) { + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function function; + int input; + Tick tick; + std::tie(function, input, tick) = memAccBuffer.front(); + if ((transitions.find(input) == transitions.end()) || (input == -1)) { + function(input, tick); + memAccBuffer.pop_front(); + transitions.insert(input); + stats.memAccBufferLat.sample((curTick() - tick) * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memAccBuffer. 
" + "memAccBuffer.size = %d.\n", __func__, memAccBuffer.size()); + num_transitions++; + } else { + temp_deque.emplace_back(function, input, tick); + memAccBuffer.pop_front(); + } + if ((num_transitions >= transitionsPerCycle) || memAccBuffer.empty()) { + break; + } + } + + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memAccBuffer.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // TODO: Figure out if this is still necessary. + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + assert(!cacheBlocks[block_index].items[index].activeNow); + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + numActiveBlocksNext.push_back(block_index); + } + + need_send_pkt = false; + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + } else { + wb++; + } + } + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + numActiveBlocksNow.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + numActiveBlocksNext.push_back(block_index); + } + + need_send_pkt = false; + ab = activeBuffer.erase(ab); + delete ab_pkt; + if (pullCondition()) { + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, -1, curTick()); + numScheduledPulls++; + } + } else { + ab++; + } + } + if (!need_send_pkt) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } + + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. 
+ bool atom_active_now = false; + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + if (atom_active_future) { + numActiveBlocksNext.erase(block_index); + } + if (atom_active_now) { + numActiveBlocksNow.erase(block_index); + if (enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.countActiveBlocksNow.sample(count); + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.countActiveBlocksNext.sample(count); + } + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + cacheBlocks[block_index].reset(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + if (!postPushWBQueue.empty()) { + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } + } +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); + numScheduledPulls--; + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); + } + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); + } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + 
pendingPullAddrs.insert(addr); + } + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +int +CoalesceEngine::workCount() +{ + return numActiveBlocksNow.size() + currentDirectory->workCount() + activeBuffer.size(); +} + +void +CoalesceEngine::recvVertexPull() +{ + numReceivedPulls++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. numReceivedPulls: %d.\n", __func__, numReceivedPulls); + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { + if (items[index].activeNow) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + numReceivedPulls--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= items[index].activeNow; + } + // NOTE: If the atom is not active anymore. 
+ if (!atom_active_now) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + activeBuffer.pop_front(); + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, -1, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!numActiveBlocksNow.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = numActiveBlocksNow.size(); + while (true) { + int block_index = numActiveBlocksNow.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { + if (cacheBlocks[block_index].items[index].activeNow) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + numReceivedPulls--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + + bool atom_active_now = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + } + // NOTE: If we have reached the last item in the cache block + if (!atom_active_now) { + numActiveBlocksNow.erase(block_index); + } + break; + } + // NOTE: If the block with index at the front of activeCacheBlocks + // is not in IDLE state, then roll the that index to the back + numActiveBlocksNow.pop_front(); + numActiveBlocksNow.push_back(block_index); + // NOTE: If we have visited all the items initially in the FIFO. 
+ num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; + } + + if (pullCondition()) { + memAccBuffer.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, -1, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + numScheduledPulls++; + } + + if ((numReceivedPulls > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine& _coalesce): + statistics::Group(&_coalesce), coalesce(_coalesce), lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. 
"), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), + ADD_STAT(vertexActivations, statistics::units::Count::get(), + "Number of times a vertex has become active. " + "Only meaningful in async mode"), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(countActiveBlocksNow, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(countActiveBlocksNext, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memAccBufferLat, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + currentFrontierSize.init(64); + futureFrontierSize.init(64); + countActiveBlocksNow.init(64); + countActiveBlocksNext.init(64); + responseQueueLatency.init(64); + memAccBufferLat.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..b7e3821dd7 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + +namespace gem5 +{ + +typedef std::deque, int, Tick>> MemoryFunctionDeque; + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool dirty; + bool hasConflict; + CacheState state; + Tick lastChangedTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + dirty(false), + hasConflict(false), + state(CacheState::INVALID), + lastChangedTick(0) + { + items = new WorkListItem [num_elements]; + } + + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + + std::string to_string() { + return 
csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? "true" : "false", + cacheStateStrings[state], lastChangedTick); + } + }; + + struct ReadPurpose : public Packet::SenderState + { + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } + }; + + MPU* owner; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; + GraphWorkload* graphWorkload; + + Addr lastAtomAddr; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + Tick lastReadTick; + std::unordered_set blocksTouchedThisTick; + + int onTheFlyReqs; + std::unordered_map> MSHR; + + // Response route to WLEngine + int maxRespPerCycle; + std::deque> responseQueue; + + // Tracking work in cache + int numReceivedPulls; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO numActiveBlocksNow; + UniqueFIFO numActiveBlocksNext; + + int numScheduledPulls; + int pendingPullLimit; + int pendingPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. 
+ std::unordered_set pendingPullAddrs; + + int activeBufferSize; + int postPushWBQueueSize; + std::deque> activeBuffer; + std::deque> postPushWBQueue; + + bool enoughSpace(); + bool pullCondition(); + int getBlockIndex(Addr addr); + + int transitionsPerCycle; + MemoryFunctionDeque memAccBuffer; + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine& coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; + statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; + statistics::Scalar vertexActivations; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar worklessCycles; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram countActiveBlocksNow; + statistics::Histogram countActiveBlocksNext; + statistics::Histogram responseQueueLatency; + 
statistics::Histogram memAccBufferLat; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params ¶ms); + void registerMPU(MPU* mpu); + + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + + virtual void recvFunctional(PacketPtr pkt) override; + void postMemInitSetup(); + void postConsumeProcess(); + void swapDirectories(); + + ReadReturnStatus recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount(); + int futureWorkCount(); + void recvVertexPull(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..ba57b387f4 --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* registerStateStrings[NUM_REGISTER_STATE] = { + "PENDING_READ", + "PENDING_REDUCE", + "PENDING_WRITE" +}; + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB" +}; + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..0f654c5386 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum RegisterState +{ + PENDING_READ, + PENDING_REDUCE, + PENDING_WRITE, + NUM_REGISTER_STATE +}; +extern const char* registerStateStrings[NUM_REGISTER_STATE]; + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + POLY_GRAPH, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..a5063cf685 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,111 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + +void +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t 
degree) +{ + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvMirrorPush(addr, delta, edge_index, degree); +} + +void +MPU::recvDoneSignal() +{ + if (done()) { + centeralController->recvDoneSignal(); + } +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..7d6d7d4003 --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + public: + PARAMS(MPU); + MPU(const Params& params); + void registerCenteralController(CenteralController* centeral_controller); + + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + uint64_t getCacheSize() { return coalesceEngine->params().cache_size; } + void recvFunctional(PacketPtr pkt) { 
coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } + + bool handleIncomingUpdate(PacketPtr pkt); + + void handleIncomingWL(Addr addr, WorkListItem wl); + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait) { pushEngine->startProcessingMirrors(time_to_wait); } + + void recvDoneSignal(); + bool done(); + + uint64_t getBaseAddr() {return pushEngine->params().base_addr;}; +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..6173fd67e8 --- /dev/null +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,593 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/PushEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const Params& params): + BaseMemoryEngine(params), + _running(false), + lastIdleEntranceTick(0), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), outstandingEdgeReqs(0), maxOutstandingEdgeReqs(0), + edgeQueueSize(params.resp_queue_size), + examineWindow(params.examine_window), + maxPropagatesPerCycle(params.max_propagates_per_cycle), + updateQueueSize(params.update_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), + base(params.base_addr), + stats(*this) +{ + destinationQueues.clear(); + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); + destinationQueues.emplace_back(); + destinationQueues[i].clear(); + sourceAndValueMaps.emplace_back(); + sourceAndValueMaps[i].clear(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + AddrRangeList range_list = outPorts[i].getAddrRanges(); + assert(range_list.size() == 1); + AddrRange range = outPorts[i].getAddrRanges().front(); + portAddrMap.insert(range, i); + } +} + +void +PushEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void 
+PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. " + "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((owner->workCount() - numPendingPulls) > 0); +} + +bool +PushEngine::done() +{ + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= destinationQueues[i].empty(); + } + return empty_update_queues && metaEdgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); +} + +void +PushEngine::start() +{ + assert(!_running); + // assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // NOTE: We might have to check for size availability here. 
+ assert(workLeft()); + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { + schedule(nextVertexPullEvent, nextCycle()); + } +} + +void +PushEngine::processNextVertexPullEvent() +{ + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { + _running = false; + lastIdleEntranceTick = curTick(); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); + } +} + +void +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + assert(degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); + + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + numPendingPulls--; + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); +} + +void +PushEngine::startProcessingMirrors(Tick time_to_wait) +{ + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + Cycles wait = ticksToCycles(time_to_wait); + if (!edgePointerQueue.empty()) 
{ + schedule(nextMemoryReadEvent, clockEdge(wait)); + } +} + +void +PushEngine::processNextMemoryReadEvent() +{ + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; + } + Addr aligned_addr, offset; + int num_edges; + + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + aligned_addr += base; + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { + DPRINTF(PushEngine, "%s: Current packet information generated by " + "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; + reqTickMap[pkt->req] = curTick(); //added this for stats + memPort.sendPacket(pkt); + onTheFlyMemReqs += num_edges; + outstandingEdgeReqs++; + maxOutstandingEdgeReqs = std::max(maxOutstandingEdgeReqs, outstandingEdgeReqs); + stats.maxOutstandingEdgeRequests = maxOutstandingEdgeReqs; + stats.outstandingEdgeRequests.sample(outstandingEdgeReqs); + // stats.outstandingEdgeRequests.sample(onTheFlyMemReqs); + + curr_info.iterate(); + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); + } + } + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +void +PushEngine::recvMemRetry() +{ + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + // TODO: in case we need to edit edges, get rid of second statement. + assert(pkt->isResponse() && (!pkt->isWrite())); + // here check tick and update the stats + // DPRINTF(PushEngine, "%s: Received a memory response.\n", __func__); + + uint8_t pkt_data [peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + Tick entrance_tick = reqTickMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + } + stats.edgeQueueLength.sample(metaEdgeQueue.size()); + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; + + onTheFlyMemReqs -= push_info.numElements; + outstandingEdgeReqs--; + stats.avgEdgeAccessLatency.sample( + (curTick() - entrance_tick)); + reqTickMap.erase(pkt->req); // added this for edgeAcccessLatency + reqInfoMap.erase(pkt->req); + + delete pkt; + + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); + } + return true; +} + +void 
+PushEngine::processNextPropagateEvent() +{ + int num_propagates = 0; + int num_tries = 0; + int num_reads = 0; + std::deque> temp_edge; + for (int i = 0; i < examineWindow; i++) { + if (metaEdgeQueue.empty()) { + break; + } + temp_edge.push_back(metaEdgeQueue.front()); + metaEdgeQueue.pop_front(); + } + int max_visits = temp_edge.size(); + + while(true) { + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = temp_edge.front(); + + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, meta_edge.to_string()); + + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); + temp_edge.pop_front(); + num_tries++; + + if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) { + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", + __func__, meta_edge.to_string()); + num_reads++; + stats.numPropagates++; + stats.edgeQueueLatency.sample( + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + } else { + temp_edge.emplace_back(meta_edge, entrance_tick); + stats.updateQueueFull++; + } + num_propagates++; + + if (temp_edge.empty()) { + break; + } + if (num_tries >= max_visits) { + break; + } + } + + while (!temp_edge.empty()) { + metaEdgeQueue.push_front(temp_edge.back()); + temp_edge.pop_back(); + } + + stats.numPropagatesHist.sample(num_propagates); + + assert(!nextPropagateEvent.scheduled()); + if (!metaEdgeQueue.empty()) { + schedule(nextPropagateEvent, nextCycle()); + } +} + +bool +PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) +{ + Addr aligned_dst = roundDown(dst, owner->vertexAtomSize()); + AddrRange update_range(aligned_dst, aligned_dst + owner->vertexAtomSize()); + auto entry = portAddrMap.contains(update_range); + PortID port_id = entry->second; + + DPRINTF(PushEngine, "%s: Update{src: %lu, dst:%lu, value: %u} " + "belongs to port %d.\n", + __func__, src, dst, value, port_id); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", 
__func__, + destinationQueues[port_id].size(), port_id); + + assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size()); + + int num_updates = 0; + for (auto queue: destinationQueues) { + num_updates += queue.size(); + } + + if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) { + DPRINTF(PushEngine, "%s: Found an existing update " + "for dst: %lu.\n", __func__, dst); + Addr prev_src; + uint32_t prev_val; + std::tie(prev_src, prev_val) = sourceAndValueMaps[port_id][dst]; + uint32_t new_val = graphWorkload->reduce(value, prev_val); + sourceAndValueMaps[port_id][dst] = std::make_tuple(prev_src, new_val); + DPRINTF(PushEngine, "%s: Coalesced Update{src: %lu, dst:%lu, value: %u} " + "with Update{src: %lu, dst:%lu, value: %u} to" + "Update{src: %lu, dst:%lu, value: %u}.\n", __func__, + src, dst, value, prev_src, dst, prev_val, + prev_src, dst, new_val); + stats.updateQueueCoalescions++; + return true; + } else if (num_updates < (updateQueueSize * destinationQueues.size())) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue for port %d.\n", __func__, port_id); + destinationQueues[port_id].emplace_back(dst, curTick()); + sourceAndValueMaps[port_id][dst] = std::make_tuple(src, value); + DPRINTF(PushEngine, "%s: Emplaced Update{src: %lu, dst:%lu, value: %u} " + "at the back of queue for port %d. 
" + "Size of queue for port %d is %d.\n", __func__, + src, dst, value, port_id, port_id, + destinationQueues[port_id].size()); + stats.updateQueueLength.sample(destinationQueues[port_id].size()); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } + return true; + } + DPRINTF(PushEngine, "%s: DestinationQueue for pot %d is blocked.\n", + __func__, port_id); + return false; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, i); + continue; + } + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, i); + if (destinationQueues[i].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for " + "port %d is empty.\n", __func__, i); + continue; + } + Addr dst; + Tick entrance_tick; + std::tie(dst, entrance_tick) = destinationQueues[i].front(); + Addr src; + uint32_t value; + std::tie(src, value) = sourceAndValueMaps[i][dst]; + + PacketPtr pkt = createUpdatePacket(dst, value); + outPorts[i].sendPacket(pkt); + destinationQueues[i].pop_front(); + sourceAndValueMaps[i].erase(dst); + DPRINTF(PushEngine, "%s: Sent Update{src: %lu, dst:%lu, value: %u} to " + "port %d. 
Respective queue size is %d.\n", __func__, + src, dst, value, i, destinationQueues[i].size()); + if (destinationQueues[i].size() > 0) { + next_time_send += 1; + } + stats.numUpdates++; + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + +PushEngine::PushStats::PushStats(PushEngine& _push): + statistics::Group(&_push), push(_push), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), + ADD_STAT(updateQueueFull, statistics::units::Count::get(), + "Number of times the update queue returns false."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), + "Histogram of number of propagates sent."), + ADD_STAT(avgEdgeAccessLatency, 
statistics::units::Second::get(), + "Histogram of edgeAccessLatency."), + ADD_STAT(outstandingEdgeRequests, statistics::units::Count::get(), + "Histogram of the size of the outstanding edge requests."), + ADD_STAT(maxOutstandingEdgeRequests, statistics::units::Count::get(), + "Histogram of the size of the outstanding edge requests.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; + + TEPS = numPropagates / simSeconds; + + edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); + edgeQueueLatency.init(64); + edgeQueueLength.init(64); + updateQueueLength.init(64); + numPropagatesHist.init(1 + push.params().max_propagates_per_cycle); + + // need to check what these init values mean + avgEdgeAccessLatency.init(64); + outstandingEdgeRequests.init(64); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..41fb6391cd --- /dev/null +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "base/addr_range_map.hh" +#include "base/intmath.hh" +#include "params/PushEngine.hh" + +namespace gem5 +{ + +class MPU; + +class PushEngine : public BaseMemoryEngine +{ + private: + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, PushEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class EdgeReadInfoGen { + private: + Addr _src; + uint32_t _delta; + + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + + public: + EdgeReadInfoGen(Addr src, uint32_t delta, Addr start, + Addr end, size_t step, size_t atom): + 
_src(src), _delta(delta), _start(start), + _end(end), _step(step), _atom(atom) + {} + + Addr src() { return _src; } + uint32_t delta() { return _delta; } + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (aligned_addr + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + + return std::make_tuple(aligned_addr, offset, num_items); + } + + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + + bool done() { return (_start >= _end); } + }; + + struct PushInfo { + Addr src; + uint32_t value; + Addr offset; + int numElements; + }; + + MPU* owner; + GraphWorkload* graphWorkload; + + bool _running; + Tick lastIdleEntranceTick; + + AddrRangeList localAddrRange; + Addr base; + int numPendingPulls; + int edgePointerQueueSize; + std::deque> edgePointerQueue; + std::unordered_map reqInfoMap; + std::unordered_map reqTickMap; + + + int onTheFlyMemReqs; + int outstandingEdgeReqs; + int maxOutstandingEdgeReqs; + int edgeQueueSize; + int examineWindow; + int maxPropagatesPerCycle; + std::deque> metaEdgeQueue; + + int updateQueueSize; + template PacketPtr createUpdatePacket(Addr addr, T value); + bool enqueueUpdate(Addr src, Addr dst, uint32_t value); + std::vector>> destinationQueues; + std::vector>> sourceAndValueMaps; + AddrRangeMap portAddrMap; + std::vector outPorts; + + bool vertexSpace(); + bool workLeft(); + + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); + + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + + struct 
PushStats : public statistics::Group + { + PushStats(PushEngine& push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numMemoryBlocks; + statistics::Scalar numPropagates; + statistics::Scalar updateQueueFull; + statistics::Scalar numNetBlocks; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; + + statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; + statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; + statistics::Histogram updateQueueLength; + statistics::Histogram numPropagatesHist; + + statistics::Histogram avgEdgeAccessLatency; + statistics::Histogram outstandingEdgeRequests; + statistics::Scalar maxOutstandingEdgeRequests = 0; + }; + + PushStats stats; + + protected: + virtual void recvMemRetry(); + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(PushEngine); + PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + void start(); + bool running() { return _running; } + void recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait); + + void recvReqRetry(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file diff 
--git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..8e5ccc9ebe --- /dev/null +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,499 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" + +#include +#include +#include + +#include "accl/graph/sega/mpu.hh" +#include "debug/SEGAStructureSize.hh" +#include "debug/WLEngine.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams& params): + BaseReduceEngine(params), + updateQueueSize(params.update_queue_size), + examineWindow(params.examine_window), + maxReadsPerCycle(params.rd_per_cycle), + maxReducesPerCycle(params.reduce_per_cycle), + maxWritesPerCycle(params.wr_per_cycle), + registerFileSize(params.register_file_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextWriteEvent([this] { processNextWriteEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), + stats(*this) +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} + +void +WLEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick 
+WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + std::vector random_shuffle; + for (int i = 0; i < inPorts.size(); i++) { + random_shuffle.push_back(i); + } + std::random_device rd; + std::mt19937 gen(rd()); + std::shuffle(random_shuffle.begin(), random_shuffle.end(), gen); + + for (int i = 0; i < inPorts.size(); i++) { + inPorts[random_shuffle[i]].checkRetryReq(); + } +} + +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + +bool +WLEngine::handleIncomingUpdate(PacketPtr pkt) +{ + Addr update_addr = pkt->getAddr(); + uint32_t update_value = pkt->getLE(); + + if (valueMap.find(update_addr) != valueMap.end()) { + assert((updateQueueSize == 0) || + (updateQueue.size() <= updateQueueSize)); + DPRINTF(WLEngine, "%s: Found an already queued update to %u. ", + "Current value is: %u.\n", __func__, + update_addr, valueMap[update_addr]); + valueMap[update_addr] = + graphWorkload->reduce(update_value, valueMap[update_addr]); + stats.numIncomingUpdates++; + stats.updateQueueCoalescions++; + } else { + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } else { + updateQueue.emplace_back(update_addr, curTick()); + valueMap[update_addr] = update_value; + stats.numIncomingUpdates++; + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. 
updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + } + } + + // delete the packet since it's not needed anymore. + delete pkt; + + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; +} + +void +WLEngine::processNextReadEvent() +{ + std::deque> temp_queue; + for (int i = 0; i < examineWindow; i++) { + if (updateQueue.empty()) { + break; + } + temp_queue.push_back(updateQueue.front()); + updateQueue.pop_front(); + } + + int num_reads = 0; + int num_popped = 0; + int num_tries = 0; + int max_visits = temp_queue.size(); + while (true) { + Addr update_addr; + Tick enter_tick; + std::tie(update_addr, enter_tick) = temp_queue.front(); + + uint32_t update_value = valueMap[update_addr]; + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = std::make_tuple(RegisterState::PENDING_READ, update_value); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. 
registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_reads++; + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, temp_queue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject with no roll " + "from cache. Rolling the update anyway.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + stats.registerShortage++; + } + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, + update_addr, update_addr, std::get<1>(registerFile[update_addr])); + RegisterState state = std::get<0>(registerFile[update_addr]); + if (state == RegisterState::PENDING_WRITE) { + // NOTE: If it's pending write, let it be written. + DPRINTF(WLEngine, "%s: Respective register for addr: " + "%lu is pending a write to the cache. 
Rolling " + "the update.\n", __func__, update_addr); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, enter_tick); + } else { + uint32_t curr_value = std::get<1>(registerFile[update_addr]); + uint32_t new_value = graphWorkload->reduce(update_value, curr_value); + registerFile[update_addr] = std::make_tuple(state, new_value); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, std::get<1>(registerFile[update_addr])); + stats.registerFileCoalescions++; + temp_queue.pop_front(); + valueMap.erase(update_addr); + num_popped++; + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + } + } + + num_tries++; + if (num_reads >= maxReadsPerCycle) { + if (!temp_queue.empty()) { + stats.numReadPortShortage++; + } + break; + } + if (num_tries >= max_visits) { + break; + } + if (temp_queue.empty()) { + break; + } + } + + while (!temp_queue.empty()) { + updateQueue.push_front(temp_queue.back()); + temp_queue.pop_back(); + } + if (num_popped > 0) { + checkRetryReq(); + } + if (!updateQueue.empty() && !nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + assert(workListFile.size() <= registerFileSize); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_READ); + + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. 
workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + graphWorkload->printWorkListItem(wl), workListFile.size()); + + uint32_t value = std::get<1>(registerFile[addr]); + registerFile[addr] = std::make_tuple(RegisterState::PENDING_REDUCE, value); + toReduce.push_back(addr); + + stats.vertexReadLatency.sample( + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); + vertexReadTime.erase(addr); + + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } +} + +void +WLEngine::processNextReduceEvent() +{ + int num_reduces = 0; + while (true) { + Addr addr = toReduce.front(); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_REDUCE); + uint32_t update_value = std::get<1>(registerFile[addr]); + DPRINTF(WLEngine, "%s: Reducing for addr: %lu, update_value: %u, " + "temp_prop: %s.\n", __func__, addr, + update_value, workListFile[addr].tempProp); + workListFile[addr].tempProp = + graphWorkload->reduce(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction result: %s", __func__, + graphWorkload->printWorkListItem(workListFile[addr])); + registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); + num_reduces++; + stats.numReductions++; + toReduce.pop_front(); + toWrite.push_back(addr); + + if (num_reduces >= maxReducesPerCycle) { + if (!toReduce.empty()) { + stats.numReducerShortage++; + } + break; + } + if (toReduce.empty()) { + break; + } + } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } + + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } +} + +void +WLEngine::processNextWriteEvent() +{ + int num_writes = 0; + while (true) { + Addr addr = toWrite.front(); + 
assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_WRITE); + owner->recvWLWrite(addr, workListFile[addr]); + registerFile.erase(addr); + workListFile.erase(addr); + toWrite.pop_front(); + num_writes++; + if (num_writes >= maxWritesPerCycle) { + if (!toWrite.empty()) { + stats.numWritePortShortage++; + } + break; + } + if (toWrite.empty()) { + break; + } + } + + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } +} + +void +WLEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} + +WLEngine::WorkListStats::WorkListStats(WLEngine& _wl): + statistics::Group(&_wl), wl(_wl), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(registerShortage, statistics::units::Count::get(), + "Number of times updates were " + "stalled because of register shortage"), + ADD_STAT(numUpdateRolls, statistics::units::Count::get(), + "Number of times an update has been rolled back " + "to the back of the update queue due to cache reject."), + ADD_STAT(numReadPortShortage, statistics::units::Count::get(), + "Number of times limited by read per cycle."), + ADD_STAT(registerFileCoalescions, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numReductions, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numReducerShortage, statistics::units::Count::get(), + "Number of times limited by number of reducers."), + ADD_STAT(numWritePortShortage, statistics::units::Count::get(), + "Number of times limited by write per cycle."), + ADD_STAT(numIncomingUpdates, statistics::units::Count::get(), + "Number of inocoming updates for each GPT."), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a 
vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using namespace statistics; + + vertexReadLatency.init(64); + updateQueueLatency.init(64); + +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..b1a85a2465 --- /dev/null +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/enums.hh" +#include "base/statistics.hh" +#include "params/WLEngine.hh" + +namespace gem5 +{ + +class MPU; + +class WLEngine : public BaseReduceEngine +{ + private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + bool needSendRetryReq; + PortID _id; + + public: + RespPort(const std::string& name, WLEngine* owner, PortID id): + ResponsePort(name), owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + MPU* owner; + GraphWorkload* graphWorkload; + + std::vector inPorts; + + int updateQueueSize; + std::deque> updateQueue; + std::unordered_map valueMap; + + int examineWindow; + int maxReadsPerCycle; + int maxReducesPerCycle; + int maxWritesPerCycle; + + int registerFileSize; + std::unordered_map> registerFile; + std::unordered_map workListFile; + std::deque toReduce; + std::deque toWrite; + + std::unordered_map vertexReadTime; + + EventFunctionWrapper 
nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextReduceEvent; + void processNextReduceEvent(); + + EventFunctionWrapper nextWriteEvent; + void processNextWriteEvent(); + + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + + struct WorkListStats : public statistics::Group + { + WorkListStats(WLEngine& worklist); + + void regStats() override; + + WLEngine &wl; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar registerShortage; + statistics::Scalar numUpdateRolls; + statistics::Scalar numReadPortShortage; + statistics::Scalar registerFileCoalescions; + statistics::Scalar numReductions; + statistics::Scalar numReducerShortage; + statistics::Scalar numWritePortShortage; + statistics::Scalar numIncomingUpdates; + + statistics::Histogram vertexReadLatency; + statistics::Histogram updateQueueLatency; + }; + + WorkListStats stats; + + public: + PARAMS(WLEngine); + WLEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerMPU(MPU* mpu); + + AddrRangeList getAddrRanges(); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + void recvFunctional(PacketPtr pkt); + + bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); + + void checkRetryReq(); + + bool done(); +}; + +// virtual AddrRangeList getAddrRanges() const; + +// protected: +// virtual bool recvTimingReq(PacketPtr pkt); +// virtual Tick recvAtomic(PacketPtr pkt); +// virtual void recvFunctional(PacketPtr pkt); +// virtual void recvRespRetry(); + + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh new file mode 100644 index 0000000000..620e97f654 --- /dev/null +++ b/src/accl/graph/sega/work_directory.hh @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "base/addr_range.hh" +#include "base/types.hh" + +namespace gem5 +{ + +class WorkDirectory +{ + public: + virtual int activate(Addr atom_addr) = 0; + virtual int deactivate(Addr atom_addr) = 0; + virtual Addr getNextWork() = 0; + + virtual int workCount() = 0; + bool empty() { return workCount() == 0; } + + virtual void setLastAtomAddr(Addr atom_addr) = 0; +}; + +class PopCountDirectory: public WorkDirectory +{ + private: + AddrRange memoryRange; + + int numAtomsPerBlock; + int memoryAtomSize; + int blockSize; + + uint32_t _workCount; + + int numCounters; + int lastCounterIndex; + uint32_t* popCount; + + uint32_t prevIndex; + uint32_t currentCounter; + + UniqueFIFO activeBlockIndices; + + int getIndexFromAtomAddr(Addr atom_addr) + { + assert((atom_addr % memoryAtomSize) == 0); + Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr); + int index = (int) (trimmed_addr / blockSize); + return index; + } + + Addr getAtomAddrFromIndex(int block_index, int atom_index) + { + Addr block_addr = block_index * blockSize; + Addr trimmed_addr = block_addr + atom_index * memoryAtomSize; + return memoryRange.addIntlvBits(trimmed_addr); + } + + public: + PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size): + WorkDirectory(), + memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), + memoryAtomSize(atom_size), _workCount(0), + prevIndex(-1), currentCounter(0) + { + blockSize = numAtomsPerBlock * memoryAtomSize; + int numCounters = (int) (memoryRange.size() / blockSize); + lastCounterIndex = numCounters - 1; + popCount = new uint32_t [numCounters]; + for (int index = 0; index < numCounters; index++) { + popCount[index] = 0; + } + activeBlockIndices = UniqueFIFO(numCounters); + } + + // CAUTION: This should only be called when the work + // directory **is not** tracking the the atom with 
atom_addr + virtual int activate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]++; + _workCount++; + activeBlockIndices.push_back(index); + assert(popCount[index] > prev_count); + assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; + } + + // CAUTION: This should only be called when the work + // directory **is** tracking the the atom with atom_addr + virtual int deactivate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]--; + _workCount--; + if (popCount[index] == 0) { + activeBlockIndices.erase(index); + } + assert(popCount[index] < prev_count); + assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; + } + + virtual int workCount() { return _workCount; } + + void setLastAtomAddr(Addr atom_addr) + { + lastCounterIndex = getIndexFromAtomAddr(atom_addr); + } + + // CAUTION: This directory only tracks active vertices in the memory + // and it does not have any information on the state of the cache and/or + // the active buffer or the write buffer. Therefore, it might generate a + // read request to an address that might be in any of those. In that case, + // the generated address should be ignored. + virtual Addr getNextWork() + { + // Why ask directory if it's empty? 
+ assert(!activeBlockIndices.empty()); + int front_index = activeBlockIndices.front(); + assert(popCount[front_index] > 0); + if ((prevIndex != -1) && (prevIndex != front_index)) { + currentCounter = 0; + } + if (currentCounter == numAtomsPerBlock) { + currentCounter = 0; + activeBlockIndices.pop_front(); + activeBlockIndices.push_back(front_index); + } + int current_index = activeBlockIndices.front(); + Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter); + prevIndex = current_index; + currentCounter++; + return ret_addr; + } +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 11fb1cd668..b314bfefe1 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -48,6 +48,7 @@ #include "base/bitfield.hh" #include "base/cprintf.hh" +#include "base/intmath.hh" #include "base/logging.hh" #include "base/types.hh" @@ -748,6 +749,40 @@ class AddrRange } return AddrRange(start, end); } + + friend AddrRange + mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit) + { + assert(left.interleaved()); + assert(right.interleaved()); + assert(left.mergesWith(right)); + + uint8_t old_left_match = left.intlvMatch; + uint8_t new_left_match = 0; + uint8_t old_right_match = right.intlvMatch; + uint8_t new_right_match = 0; + int new_bits = left.masks.size() - 1; + + // assumption: masks is sorted in ascending order + std::vector new_masks; + for (auto mask: left.masks) { + uint64_t lsb_mask = (mask ^ (mask - 1)) + 1; + if ((lsb_mask >> 1) != (1 << pch_bit)) { + new_masks.push_back(mask); + new_left_match |= ((old_left_match & 1) << new_bits); + new_left_match >>= 1; + new_right_match |= ((old_right_match & 1) << new_bits); + new_right_match >>= 1; + } + old_left_match >>= 1; + old_right_match >>= 1; + } + panic_if(new_left_match != new_right_match, + "The two ranges can not be a pseudo channel pair " + "given the pseudochannel bit position of 
params.pch_bit."); + + return AddrRange(left._start, left._end, new_masks, new_left_match); + } }; static inline AddrRangeList @@ -833,6 +868,16 @@ RangeSize(Addr start, Addr size) return AddrRange(start, start + size); } +inline bool +contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 8156be5a79..22be74ec90 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1051,7 +1051,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 45d89a76c9..f32ffe6f0a 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class HBMCtrl(MemCtrl): # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index f87fa2dcbb..6e7a0761d5 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,7 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), - retryRdReqPC1(false), retryWrReqPC1(false), + retryRdReqPC1(false), retryWrReqPC1(false), pchBit(p.pch_bit), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, name()), @@ -226,7 +226,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) 
{ is_pc0 = true; } else { is_pc0 = false; @@ -487,8 +487,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index b17caa6b49..657851eaa6 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -79,6 +79,8 @@ class HBMCtrl : public MemCtrl bool retryRdReqPC1; bool retryWrReqPC1; + int pchBit; + /** * Remove commands that have already issued from rowBurstTicks * and colBurstTicks diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index 9a3600f331..b43c1b8366 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -211,7 +211,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; @@ -1213,8 +1213,8 @@ MemCtrl::CtrlStats::CtrlStats(MemCtrl &_ctrl) ADD_STAT(numWrRetry, statistics::units::Count::get(), "Number of times write queue was full causing retry"), - ADD_STAT(readPktSize, statistics::units::Count::get(), - "Read request sizes (log2)"), + // ADD_STAT(readPktSize, statistics::units::Count::get(), + // "Read request sizes (log2)"), ADD_STAT(writePktSize, statistics::units::Count::get(), "Write request sizes (log2)"), @@ -1286,7 +1286,7 @@ MemCtrl::CtrlStats::regStats() avgRdQLen.precision(2); avgWrQLen.precision(2); - readPktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); + // readPktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); writePktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); 
rdQLenPdf.init(ctrl.readBufferSize); diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index 917798ffa7..d33724e327 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -581,7 +581,7 @@ class MemCtrl : public qos::MemCtrl statistics::Scalar numRdRetry; statistics::Scalar numWrRetry; - statistics::Vector readPktSize; + // statistics::Vector readPktSize; statistics::Vector writePktSize; statistics::Vector rdQLenPdf; statistics::Vector wrQLenPdf; diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange diff --git a/src/mem/packet.hh b/src/mem/packet.hh index ed7a94f4fb..69686e7835 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,6 +149,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int 
size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) 
failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) failed", addr); diff --git a/src/mem/simple_mem.hh b/src/mem/simple_mem.hh index 75a03fbe0e..0be85e9d86 100644 --- a/src/mem/simple_mem.hh +++ b/src/mem/simple_mem.hh @@ -180,7 +180,6 @@ class SimpleMemory : public AbstractMemory std::unique_ptr pendingDelete; public: - SimpleMemory(const SimpleMemoryParams &p); DrainState drain() override; @@ -189,6 +188,8 @@ class SimpleMemory : public AbstractMemory PortID idx=InvalidPortID) override; void init() override; + double getBW() { return bandwidth; } + protected: Tick recvAtomic(PacketPtr pkt); Tick recvAtomicBackdoor(PacketPtr pkt, MemBackdoorPtr &_backdoor); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { 
constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. 
*/ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5 diff --git a/src/python/m5/SimObject.py b/src/python/m5/SimObject.py index 08105d8833..cdfd0f367a 100644 --- a/src/python/m5/SimObject.py +++ b/src/python/m5/SimObject.py @@ -1048,6 +1048,7 @@ def find_any(self, ptype): def find_all(self, ptype): all = {} # search children + # print(f"I {self._name} am self at entry.") for child in self._children.values(): # a child could be a list, so ensure we visit each item if isinstance(child, list): @@ -1064,34 +1065,54 @@ def find_all(self, ptype): all[child] = True if isSimObject(child): # also add results from the child itself + # print(f"I {self._name} am child to {self._parent}") + # print(f"Me children are {self._children}") + # print(f"Me looking for {ptype}") child_all, done = child.find_all(ptype) + # print(f"My ({self._name}) children are {child_all}") all.update(dict(zip(child_all, [done] * len(child_all)))) # search param space for pname, pdesc in self._params.items(): if issubclass(pdesc.ptype, ptype): match_obj = self._values[pname] if not isproxy(match_obj) and not isNullPointer(match_obj): - all[match_obj] = True + # print(f"I {match_obj} be match_object") # maybe we can either make the DRAM interfaces children? 
+ # print(type(match_obj)) # or we can maybe check if isSimObjectVector, then serialize it + # print(f" here is all: {type(all)} {all}") + if type(match_obj) is SimObjectVector: + # print("sim object vector!!!") + for simobj in match_obj: + print(simobj) + all[simobj] = True + else: + all[match_obj] = True + # print(f"post all is true") # Also make sure to sort the keys based on the objects' path to # ensure that the order is the same on all hosts + # print(f"I {self._name} am self at exit.") return sorted(all.keys(), key=lambda o: o.path()), True def unproxy(self, base): return self def unproxyParams(self): + print(f"Me be {self._name} at the entry of unproxyParams.") for param in self._params.keys(): value = self._values.get(param) + print(f"me value is {value}") + if value != None and isproxy(value): try: + print(f"me type im trying to unproxy is {type(value)}") value = value.unproxy(self) except: + print(f"Me be {param} when hit error") print( f"Error in unproxying param '{param}' of {self.path()}" ) raise setattr(self, param, value) - + print(f"Me be {self._name} at the exit of unproxyParams.") # Unproxy ports in sorted order so that 'append' operations on # vector ports are done in a deterministic fashion. port_names = list(self._ports.keys()) @@ -1190,7 +1211,21 @@ def getCCParams(self): self.path(), param, ) - + if (not isinstance(value, EthernetAddr)) and isproxy(value): + # At the time of adding this error unproxying params happens + # in simulate.py at lines 103-104 (commit hash: f56459470a) + # To understand how attributes are handled for SimObjects + # refer to SimObject::__setattr__. + fatal( + f"Param {param} for {self._name} has value = {value}. " + "This value is a not a valid value. This could be caused " + f"by {param} not having been unproxied correctly. " + "One reason why this might happen is if you have " + "mistakenly added a child SimObject as an attr and not a " + "child by giving it a name that starts with an underscore " + f"`_`. 
{self.path()} should not say 'orphan.'" + ) + value = value.getValue() if isinstance(self._params[param], VectorParamDesc): assert isinstance(value, list) diff --git a/src/python/m5/params.py b/src/python/m5/params.py index 2ca6dfcc14..65f87ff3df 100644 --- a/src/python/m5/params.py +++ b/src/python/m5/params.py @@ -260,11 +260,14 @@ def getValue(self): return [v.getValue() for v in self] def unproxy(self, base): + print(f" me Unproxying vector param, me is {self}") if len(self) == 1 and isinstance(self[0], proxy.BaseProxy): + print(f"me in if params.py, about to try to unproxy {self[0]}") # The value is a proxy (e.g. Parent.any, Parent.all or # Parent.x) therefore try resolve it return self[0].unproxy(base) else: + print("me in else params.py") return [v.unproxy(base) for v in self]