From 8e086a06b7e27cf7c39d7230e3392cea3c29e9c1 Mon Sep 17 00:00:00 2001
From: Xi Luo <dycz0fx@gmail.com>
Date: Tue, 8 Mar 2022 00:59:10 +0000
Subject: [PATCH] realm: add FPGA module

---
 cmake/legion_defines.h.in            |    2 +
 cmake/realm_defines.h.in             |    2 +
 examples/fpga/Makefile               |   82 ++
 examples/fpga/README.md              |   32 +
 examples/fpga/fpga.cc                |  409 ++++++
 examples/fpga/vadd.cpp               |   45 +
 examples/realm_fpga/Makefile         |  158 +++
 examples/realm_fpga/README.md        |   33 +
 examples/realm_fpga/fpga_vadd.cc     |  317 +++++
 examples/realm_fpga/vadd.cpp         |   45 +
 runtime/legion/runtime.cc            |    5 +
 runtime/mappers/default_mapper.cc    |  137 +-
 runtime/mappers/default_mapper.h     |   32 +-
 runtime/mappers/mapping_utilities.cc |    2 +
 runtime/realm/fpga/fpga_module.cc    | 1825 ++++++++++++++++++++++++++
 runtime/realm/fpga/fpga_module.h     |  395 ++++++
 runtime/realm/fpga/fpga_utils.cc     |   64 +
 runtime/realm/fpga/fpga_utils.h      |   14 +
 runtime/realm/fpga/xcl2.cc           |  157 +++
 runtime/realm/fpga/xcl2.hpp          |  137 ++
 runtime/realm/machine_impl.cc        |    1 +
 runtime/realm/mem_impl.h             |    3 +
 runtime/realm/module.cc              |    5 +
 runtime/realm/realm_c.h              |    6 +-
 runtime/realm/transfer/channel.h     |    5 +
 runtime/runtime.mk                   |   25 +
 26 files changed, 3922 insertions(+), 16 deletions(-)
 create mode 100644 examples/fpga/Makefile
 create mode 100644 examples/fpga/README.md
 create mode 100644 examples/fpga/fpga.cc
 create mode 100644 examples/fpga/vadd.cpp
 create mode 100644 examples/realm_fpga/Makefile
 create mode 100644 examples/realm_fpga/README.md
 create mode 100644 examples/realm_fpga/fpga_vadd.cc
 create mode 100644 examples/realm_fpga/vadd.cpp
 create mode 100644 runtime/realm/fpga/fpga_module.cc
 create mode 100644 runtime/realm/fpga/fpga_module.h
 create mode 100644 runtime/realm/fpga/fpga_utils.cc
 create mode 100644 runtime/realm/fpga/fpga_utils.h
 create mode 100644 runtime/realm/fpga/xcl2.cc
 create mode 100644 runtime/realm/fpga/xcl2.hpp

diff --git a/cmake/legion_defines.h.in b/cmake/legion_defines.h.in
index 2517ec32fa..852162078d 100644
--- a/cmake/legion_defines.h.in
+++ b/cmake/legion_defines.h.in
@@ -41,6 +41,8 @@
 
 #cmakedefine LEGION_USE_HDF5
 
+#cmakedefine LEGION_USE_FPGA
+
 #cmakedefine LEGION_SPY
 
 #cmakedefine LEGION_USE_HIP
diff --git a/cmake/realm_defines.h.in b/cmake/realm_defines.h.in
index 90d1fa70f0..f5fc71e659 100644
--- a/cmake/realm_defines.h.in
+++ b/cmake/realm_defines.h.in
@@ -59,6 +59,8 @@
 
 #cmakedefine REALM_USE_HDF5
 
+#cmakedefine REALM_USE_FPGA
+
 #cmakedefine REALM_USE_LIBDL
 #cmakedefine REALM_USE_DLMOPEN
 
diff --git a/examples/fpga/Makefile b/examples/fpga/Makefile
new file mode 100644
index 0000000000..20db046ebc
--- /dev/null
+++ b/examples/fpga/Makefile
@@ -0,0 +1,82 @@
+# Copyright 2021 Stanford University
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+ifndef LG_RT_DIR
+$(error LG_RT_DIR variable is not defined, aborting build)
+endif
+
+# Flags for directing the runtime makefile what to include
+DEBUG           ?= 1		# Include debugging symbols
+MAX_DIM         ?= 3		# Maximum number of dimensions
+OUTPUT_LEVEL    ?= LEVEL_DEBUG	# Compile time logging level
+USE_GASNET      ?= 0		# Include GASNet support (requires GASNet)
+USE_HDF         ?= 0		# Include HDF5 support (requires HDF5)
+ALT_MAPPERS     ?= 0		# Include alternative mappers (not recommended)
+
+# Include FPGA support (requires Xilinx FPGA)
+USE_FPGA        ?= 1
+# Path to the Xilinx platform definition file
+PLATFORM        ?= /home/centos/src/project_data/aws-fpga/Vitis/aws_platform/xilinx_aws-vu9p-f1_shell-v04261818_201920_2/xilinx_aws-vu9p-f1_shell-v04261818_201920_2.xpfm
+# FPGA kenrel compilation target: Software emulation (sw_emu), hardware emulation (hw_emu), hardware (hw)
+TARGET          ?= sw_emu
+# v++ flags
+VPPFLAGS = -g -I.
+
+# Put the binary file name here
+OUTFILE		?= fpga
+# List all the application source files here
+GEN_SRC		?= fpga.cc	# .cc files
+
+# You can modify these variables, some will be appended to by the runtime makefile
+INC_FLAGS	?=
+CC_FLAGS	?=
+NVCC_FLAGS	?=
+HIPCC_FLAGS     ?=
+GASNET_FLAGS	?=
+LD_FLAGS	?=
+
+.PHONY: run
+run: kernel all
+	XCL_EMULATION_MODE=$(TARGET) ./fpga -ll:fpga 1 -ll:fpga_size 4 -ll:fpga_xclbin vadd.xclbin
+
+kernel: vadd.xclbin emconfig
+
+vadd.xo: vadd.cpp
+	v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -c -k vadd -o vadd.xo vadd.cpp
+
+# create 4 CUs
+vadd.xclbin: vadd.xo
+	v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -l --connectivity.nk vadd:4 --connectivity.sp vadd_1.m_axi_gmem:DDR[0] --connectivity.sp vadd_2.m_axi_gmem:DDR[0] --connectivity.sp vadd_3.m_axi_gmem:DDR[0] --connectivity.sp vadd_4.m_axi_gmem:DDR[0] -o vadd.xclbin vadd.xo
+
+emconfig: emconfig.json
+emconfig.json:
+	emconfigutil --platform $(PLATFORM)
+
+.PHONY: cleankernel
+cleankernel:
+	rm -rf _x *.xo *.xclbin *.log *.xclbin.* *.xo.* emconfig.json .Xil .run
+
+.PHONY: cleanall
+cleanall: clean cleankernel
+
+###########################################################################
+#
+#   Don't change anything below here
+#
+###########################################################################
+
+include $(LG_RT_DIR)/runtime.mk
+
diff --git a/examples/fpga/README.md b/examples/fpga/README.md
new file mode 100644
index 0000000000..60c727d73e
--- /dev/null
+++ b/examples/fpga/README.md
@@ -0,0 +1,32 @@
+This is an example of using Xilinx FPGAs as accelators in Realm.
+===============	
+## Prerequisite:
+Xilinx XRT 2021.1 and Xilinx Vitis 2021.1.
+
+This example is tested on an F1 node on AWS. Please refer to `https://github.com/aws/aws-fpga` for more information about using FPGA on AWS.
+ 
+## Steps:
+1. Create an AWS instance with FPGA Developer AMI (1.11.0 tested) in AWS Marketplace
+
+2. Get AWS FGPA Development Kit: 
+```
+    git clone https://github.com/aws/aws-fpga.git $AWS_FPGA_REPO_DIR
+```
+
+3. Set up environment:
+```
+    source /home/centos/src/project_data/aws-fpga/vitis_setup.sh
+    source /opt/Xilinx/Vivado/2021.1/settings64.sh
+
+    export LG_RT_DIR=/home/centos/programs/legion_fpga/runtime
+```
+
+4. Build and run the example
+```
+    make run
+```
+
+5. Clean
+```
+    make cleanall
+```
diff --git a/examples/fpga/fpga.cc b/examples/fpga/fpga.cc
new file mode 100644
index 0000000000..ddb6b16cac
--- /dev/null
+++ b/examples/fpga/fpga.cc
@@ -0,0 +1,409 @@
+/* Copyright 2021 Stanford University
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cstdio>
+#include <cassert>
+#include <cstdlib>
+#include "legion.h"
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include "realm/fpga/fpga_utils.h"
+#include "realm/fpga/xcl2.hpp"
+
+using namespace Legion;
+
+#define DEFAULT_COMP_UNITS 1
+
+enum TaskIDs
+{
+    TOP_LEVEL_TASK_ID,
+    INIT_FIELD_TASK_ID,
+    VADD_TASK_ID,
+    CHECK_TASK_ID,
+};
+
+enum FieldIDs
+{
+    FID_X,
+    FID_Y,
+    FID_Z,
+};
+
+#define DATA_TYPE int
+
+void top_level_task(const Task *task,
+                    const std::vector<PhysicalRegion> &regions,
+                    Context ctx, Runtime *runtime)
+{
+    int num_elements = 8;
+    int num_subregions = 4;
+    // See if we have any command line arguments to parse
+    // Note we now have a new command line parameter which specifies
+    // how many subregions we should make.
+    {
+        const InputArgs &command_args = Runtime::get_input_args();
+        for (int i = 1; i < command_args.argc; i++)
+        {
+            if (!strcmp(command_args.argv[i], "-n"))
+                num_elements = atoi(command_args.argv[++i]);
+            if (!strcmp(command_args.argv[i], "-b"))
+                num_subregions = atoi(command_args.argv[++i]);
+        }
+    }
+    printf("Running vadd for %d elements...\n", num_elements);
+    printf("Partitioning data into %d sub-regions...\n", num_subregions);
+
+    // Create our logical regions using the same schemas as earlier examples
+    Rect<1> elem_rect(0, num_elements - 1);
+    IndexSpace is = runtime->create_index_space(ctx, elem_rect);
+    runtime->attach_name(is, "is");
+    FieldSpace input_fs = runtime->create_field_space(ctx);
+    runtime->attach_name(input_fs, "input_fs");
+    {
+        FieldAllocator allocator =
+            runtime->create_field_allocator(ctx, input_fs);
+        allocator.allocate_field(sizeof(DATA_TYPE), FID_X);
+        runtime->attach_name(input_fs, FID_X, "X");
+        allocator.allocate_field(sizeof(DATA_TYPE), FID_Y);
+        runtime->attach_name(input_fs, FID_Y, "Y");
+    }
+    FieldSpace output_fs = runtime->create_field_space(ctx);
+    runtime->attach_name(output_fs, "output_fs");
+    {
+        FieldAllocator allocator =
+            runtime->create_field_allocator(ctx, output_fs);
+        allocator.allocate_field(sizeof(DATA_TYPE), FID_Z);
+        runtime->attach_name(output_fs, FID_Z, "Z");
+    }
+    LogicalRegion input_lr = runtime->create_logical_region(ctx, is, input_fs);
+    runtime->attach_name(input_lr, "input_lr");
+    LogicalRegion output_lr = runtime->create_logical_region(ctx, is, output_fs);
+    runtime->attach_name(output_lr, "output_lr");
+
+    // In addition to using rectangles and domains for launching index spaces
+    // of tasks (see example 02), Legion also uses them for performing
+    // operations on logical regions.  Here we create a rectangle and a
+    // corresponding domain for describing the space of subregions that we
+    // want to create.  Each subregion is assigned a 'color' which is why
+    // we name the variables 'color_bounds' and 'color_domain'.  We'll use
+    // these below when we partition the region.
+    Rect<1> color_bounds(0, num_subregions - 1);
+    IndexSpace color_is = runtime->create_index_space(ctx, color_bounds);
+
+    // Parallelism in Legion is implicit.  This means that rather than
+    // explicitly saying what should run in parallel, Legion applications
+    // partition up data and tasks specify which regions they access.
+    // The Legion runtime computes non-interference as a function of
+    // regions, fields, and privileges and then determines which tasks
+    // are safe to run in parallel.
+    //
+    // Data partitioning is performed on index spaces.  The partitioning
+    // operation is used to break an index space of points into subsets
+    // of points each of which will become a sub index space.  Partitions
+    // created on an index space are then transitively applied to all the
+    // logical regions created using the index space.  We will show how
+    // to get names to the subregions later in this example.
+    //
+    // Here we want to create the IndexPartition 'ip'.  We'll illustrate
+    // two ways of creating an index partition depending on whether the
+    // array being partitioned can be evenly partitioned into subsets
+    // or not.  There are other methods to partitioning index spaces
+    // which are not covered here.  We'll cover the case of coloring
+    // individual points in an index space in our capstone circuit example.
+    IndexPartition ip = runtime->create_equal_partition(ctx, is, color_is);
+    runtime->attach_name(ip, "ip");
+
+    // The index space 'is' was used in creating two logical regions: 'input_lr'
+    // and 'output_lr'.  By creating an IndexPartitiong of 'is' we implicitly
+    // created a LogicalPartition for each of the logical regions created using
+    // 'is'.  The Legion runtime provides several ways of getting the names for
+    // these LogicalPartitions.  We'll look at one of them here.  The
+    // 'get_logical_partition' method takes a LogicalRegion and an IndexPartition
+    // and returns the LogicalPartition of the given LogicalRegion that corresponds
+    // to the given IndexPartition.
+    LogicalPartition input_lp = runtime->get_logical_partition(ctx, input_lr, ip);
+    runtime->attach_name(input_lp, "input_lp");
+    LogicalPartition output_lp = runtime->get_logical_partition(ctx, output_lr, ip);
+    runtime->attach_name(output_lp, "output_lp");
+
+    // Create our launch domain.  Note that is the same as color domain
+    // as we are going to launch one task for each subregion we created.
+    ArgumentMap arg_map;
+
+    // As in previous examples, we now want to launch tasks for initializing
+    // both the fields.  However, to increase the amount of parallelism
+    // exposed to the runtime we will launch separate sub-tasks for each of
+    // the logical subregions created by our partitioning.  To express this
+    // we create an IndexLauncher for launching an index space of tasks
+    // the same as example 02.
+    IndexLauncher init_launcher(INIT_FIELD_TASK_ID, color_is,
+                                TaskArgument(NULL, 0), arg_map);
+    // For index space task launches we don't want to have to explicitly
+    // enumerate separate region requirements for all points in our launch
+    // domain.  Instead Legion allows applications to place an upper bound
+    // on privileges required by subtasks and then specify which privileges
+    // each subtask receives using a projection function.  In the case of
+    // the field initialization task, we say that all the subtasks will be
+    // using some subregion of the LogicalPartition 'input_lp'.  Applications
+    // may also specify upper bounds using logical regions and not partitions.
+    //
+    // The Legion implementation assumes that all all points in an index
+    // space task launch request non-interfering privileges and for performance
+    // reasons this is unchecked.  This means if two tasks in the same index
+    // space are accessing aliased data, then they must either both be
+    // with read-only or reduce privileges.
+    //
+    // When the runtime enumerates the launch_domain, it will invoke the
+    // projection function for each point in the space and use the resulting
+    // LogicalRegion computed for each point in the index space of tasks.
+    // The projection ID '0' is reserved and corresponds to the identity
+    // function which simply zips the space of tasks with the space of
+    // subregions in the partition.  Applications can register their own
+    // projections functions via the 'register_region_projection' and
+    // 'register_partition_projection' functions before starting
+    // the runtime similar to how tasks are registered.
+    init_launcher.add_region_requirement(
+        RegionRequirement(input_lp, 0 /*projection ID*/,
+                          WRITE_DISCARD, EXCLUSIVE, input_lr));
+    init_launcher.region_requirements[0].add_field(FID_X);
+    runtime->execute_index_space(ctx, init_launcher);
+
+    // Modify our region requirement to initialize the other field
+    // in the same way.  Note that after we do this we have exposed
+    // 2*num_subregions task-level parallelism to the runtime because
+    // we have launched tasks that are both data-parallel on
+    // sub-regions and task-parallel on accessing different fields.
+    // The power of Legion is that it allows programmers to express
+    // these data usage patterns and automatically extracts both
+    // kinds of parallelism in a unified programming framework.
+    init_launcher.region_requirements[0].privilege_fields.clear();
+    init_launcher.region_requirements[0].instance_fields.clear();
+    init_launcher.region_requirements[0].add_field(FID_Y);
+    runtime->execute_index_space(ctx, init_launcher);
+
+    // We launch the subtasks for performing the vadd computation
+    // in a similar way to the initialize field tasks.  Note we
+    // again make use of two RegionRequirements which use a
+    // partition as the upper bound for the privileges for the task.
+    IndexLauncher vadd_launcher(VADD_TASK_ID, color_is,
+                                TaskArgument(NULL, 0), arg_map);
+    vadd_launcher.add_region_requirement(
+        RegionRequirement(input_lp, 0 /*projection ID*/,
+                          READ_ONLY, EXCLUSIVE, input_lr));
+    vadd_launcher.region_requirements[0].add_field(FID_X);
+    vadd_launcher.region_requirements[0].add_field(FID_Y);
+    vadd_launcher.add_region_requirement(
+        RegionRequirement(output_lp, 0 /*projection ID*/,
+                          WRITE_DISCARD, EXCLUSIVE, output_lr));
+    vadd_launcher.region_requirements[1].add_field(FID_Z);
+    runtime->execute_index_space(ctx, vadd_launcher);
+
+    // While we could also issue parallel subtasks for the checking
+    // task, we only issue a single task launch to illustrate an
+    // important Legion concept.  Note the checking task operates
+    // on the entire 'input_lr' and 'output_lr' regions and not
+    // on the subregions.  Even though the previous tasks were
+    // all operating on subregions, Legion will correctly compute
+    // data dependences on all the subtasks that generated the
+    // data in these two regions.
+    TaskLauncher check_launcher(CHECK_TASK_ID, TaskArgument(NULL, 0));
+    check_launcher.add_region_requirement(
+        RegionRequirement(input_lr, READ_ONLY, EXCLUSIVE, input_lr));
+    check_launcher.region_requirements[0].add_field(FID_X);
+    check_launcher.region_requirements[0].add_field(FID_Y);
+    check_launcher.add_region_requirement(
+        RegionRequirement(output_lr, READ_ONLY, EXCLUSIVE, output_lr));
+    check_launcher.region_requirements[1].add_field(FID_Z);
+    runtime->execute_task(ctx, check_launcher);
+
+    runtime->destroy_logical_region(ctx, input_lr);
+    runtime->destroy_logical_region(ctx, output_lr);
+    runtime->destroy_field_space(ctx, input_fs);
+    runtime->destroy_field_space(ctx, output_fs);
+    runtime->destroy_index_space(ctx, is);
+    runtime->destroy_index_space(ctx, color_is);
+}
+
+void init_field_task(const Task *task,
+                     const std::vector<PhysicalRegion> &regions,
+                     Context ctx, Runtime *runtime)
+{
+    assert(regions.size() == 1);
+    assert(task->regions.size() == 1);
+    assert(task->regions[0].privilege_fields.size() == 1);
+
+    FieldID fid = *(task->regions[0].privilege_fields.begin());
+    const int point = task->index_point.point_data[0];
+    printf("Initializing field %d for block %d... with ( ", fid, point);
+
+    const FieldAccessor<WRITE_DISCARD, DATA_TYPE, 1> acc(regions[0], fid);
+    // Note here that we get the domain for the subregion for
+    // this task from the runtime which makes it safe for running
+    // both as a single task and as part of an index space of tasks.
+    Rect<1> rect = runtime->get_index_space_domain(ctx,
+                                                   task->regions[0].region.get_index_space());
+    for (PointInRectIterator<1> pir(rect); pir(); pir++)
+    {
+        int rand = drand48() * 100;
+        acc[*pir] = rand;
+        printf("%d ", rand);
+    }
+    printf(")\n");
+}
+
+void vadd_task(const Task *task,
+               const std::vector<PhysicalRegion> &regions,
+               Context ctx, Runtime *runtime)
+{
+    assert(regions.size() == 2);
+    assert(task->regions.size() == 2);
+    const int point = task->index_point.point_data[0];
+
+    const FieldAccessor<READ_ONLY, DATA_TYPE, 1, coord_t, Realm::AffineAccessor<DATA_TYPE, 1, coord_t>> acc_x(regions[0], FID_X);
+    const FieldAccessor<READ_ONLY, DATA_TYPE, 1, coord_t, Realm::AffineAccessor<DATA_TYPE, 1, coord_t>> acc_y(regions[0], FID_Y);
+    const FieldAccessor<WRITE_DISCARD, DATA_TYPE, 1, coord_t, Realm::AffineAccessor<DATA_TYPE, 1, coord_t>> acc_z(regions[1], FID_Z);
+    printf("Running vadd computation for point %d...\n", point);
+    Rect<1> rect = runtime->get_index_space_domain(ctx,
+                                                   task->regions[0].region.get_index_space());
+    printf("rect.volume = %lu, acc_x.ptr = %p, acc_y.ptr = %p, acc_z.ptr = %p \n", rect.volume(), acc_x.accessor.ptr(rect.lo), acc_y.accessor.ptr(rect.lo), acc_z.accessor.ptr(rect.lo));
+    // for (PointInRectIterator<1> pir(rect); pir(); pir++)
+    //   acc_z[*pir] = acc_x[*pir] + acc_y[*pir];
+
+    size_t data_size = rect.volume();
+    int num_cu = DEFAULT_COMP_UNITS;
+    std::vector<cl::Kernel> krnls(num_cu);
+    cl::Program program = FPGAGetCurrentProgram();
+    cl_int err;
+    // Creating Kernel objects
+    for (int i = 0; i < num_cu; i++)
+    {
+        OCL_CHECK(err, krnls[i] = cl::Kernel(program, "vadd", &err));
+    }
+
+    // Creating sub-buffers
+    cl::Buffer device_buff = FPGAGetCurrentBuffer();
+    void *base_ptr_sys = FPGAGetBasePtrSys();
+    auto chunk_size = data_size / num_cu;
+    size_t vector_size_bytes = sizeof(int) * chunk_size;
+    std::vector<cl::Buffer> buffer_in1(num_cu);
+    std::vector<cl::Buffer> buffer_in2(num_cu);
+    std::vector<cl::Buffer> buffer_output(num_cu);
+
+    for (int i = 0; i < num_cu; i++) {
+        cl_buffer_region buffer_in1_info = {(uint64_t)acc_x.ptr(rect.lo) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes};
+        OCL_CHECK(err, buffer_in1[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in1_info, &err));
+        cl_buffer_region buffer_in2_info = {(uint64_t)acc_y.ptr(rect.lo) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes};
+        OCL_CHECK(err, buffer_in2[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in2_info, &err));
+        cl_buffer_region buffer_output_info = {(uint64_t)acc_z.ptr(rect.lo) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes};
+        OCL_CHECK(err, buffer_output[i] = device_buff.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_output_info, &err));
+    }
+
+    for (int i = 0; i < num_cu; i++) {
+        int narg = 0;
+
+        // Setting kernel arguments
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in1[i]));
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in2[i]));
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_output[i]));
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, (int)chunk_size));
+    }
+
+    cl::Event task_events[num_cu];
+    cl::CommandQueue command_queue = FPGAGetCurrentCommandQueue();
+    for (int i = 0; i < num_cu; i++) {
+        // Launch the kernel
+        OCL_CHECK(err, err = command_queue.enqueueTask(krnls[i], nullptr, &task_events[i]));
+    }
+
+    std::vector<cl::Event> wait_events[num_cu];
+    // Copy result from device global memory to host local memory
+    for (int i = 0; i < num_cu; i++) {
+        wait_events[i].push_back(task_events[i]);
+        OCL_CHECK(err, err = command_queue.enqueueMigrateMemObjects({buffer_output[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &wait_events[i], nullptr));
+    }
+    // OCL_CHECK(err, err = command_queue.finish());
+    OCL_CHECK(err, err = command_queue.flush());
+
+    printf("fpga kernels flushed\n");
+}
+
+void check_task(const Task *task,
+                const std::vector<PhysicalRegion> &regions,
+                Context ctx, Runtime *runtime)
+{
+    assert(regions.size() == 2);
+    assert(task->regions.size() == 2);
+
+    const FieldAccessor<READ_ONLY, DATA_TYPE, 1> acc_x(regions[0], FID_X);
+    const FieldAccessor<READ_ONLY, DATA_TYPE, 1> acc_y(regions[0], FID_Y);
+    const FieldAccessor<READ_ONLY, DATA_TYPE, 1> acc_z(regions[1], FID_Z);
+
+    Rect<1> rect = runtime->get_index_space_domain(ctx,
+                                                   task->regions[0].region.get_index_space());
+    printf("Checking results...\n");
+    bool all_passed = true;
+    for (PointInRectIterator<1> pir(rect); pir(); pir++)
+    {
+        DATA_TYPE expected = acc_x[*pir] + acc_y[*pir];
+        DATA_TYPE received = acc_z[*pir];
+        // Probably shouldn't check for floating point equivalence but
+        // the order of operations are the same should they should
+        // be bitwise equal.
+        if (expected != received)
+        {
+            all_passed = false;
+        }
+        printf("expected = %d received = %d\n", expected, received);
+    }
+    if (all_passed)
+        printf("SUCCESS!\n");
+    else
+        printf("FAILURE!\n");
+}
+
+int main(int argc, char **argv)
+{
+    Runtime::set_top_level_task_id(TOP_LEVEL_TASK_ID);
+
+    {
+        TaskVariantRegistrar registrar(TOP_LEVEL_TASK_ID, "top_level");
+        registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+        Runtime::preregister_task_variant<top_level_task>(registrar, "top_level");
+    }
+
+    {
+        TaskVariantRegistrar registrar(INIT_FIELD_TASK_ID, "init_field");
+        registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+        registrar.set_leaf();
+        Runtime::preregister_task_variant<init_field_task>(registrar, "init_field");
+    }
+
+    {
+        TaskVariantRegistrar registrar(VADD_TASK_ID, "vadd");
+        registrar.add_constraint(ProcessorConstraint(Processor::FPGA_PROC));
+        registrar.set_leaf();
+        Runtime::preregister_task_variant<vadd_task>(registrar, "vadd");
+    }
+
+    {
+        TaskVariantRegistrar registrar(CHECK_TASK_ID, "check");
+        registrar.add_constraint(ProcessorConstraint(Processor::LOC_PROC));
+        registrar.set_leaf();
+        Runtime::preregister_task_variant<check_task>(registrar, "check");
+    }
+
+    return Runtime::start(argc, argv);
+}
diff --git a/examples/fpga/vadd.cpp b/examples/fpga/vadd.cpp
new file mode 100644
index 0000000000..81b95028f6
--- /dev/null
+++ b/examples/fpga/vadd.cpp
@@ -0,0 +1,45 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+// TRIPCOUNT indentifier
+const unsigned int c_min = 4096;
+const unsigned int c_max = 4 * 1024 * 1024;
+
+/*
+    Vector Addition Kernel Implementation
+    Arguments:
+        in1   (input)     --> Input Vector1
+        in2   (input)     --> Input Vector2
+        out_r   (output)    --> Output Vector
+        size  (input)     --> Size of Vector in Integer
+*/
+
+extern "C" {
+void vadd(const int* in1, // Read-Only Vector 1
+          const int* in2, // Read-Only Vector 2
+          int* out_r,     // Output Result
+          int size                 // Size in integer
+          ) {
+// Unoptimized vector addition kernel to increase the kernel execution time
+// Large execution time required to showcase parallel execution of multiple
+// compute units in this example.
+vadd1:
+    for (int i = 0; i < size; i++) {
+#pragma HLS LOOP_TRIPCOUNT min = c_min max = c_max
+        out_r[i] = in1[i] + in2[i];
+    }
+}
+}
diff --git a/examples/realm_fpga/Makefile b/examples/realm_fpga/Makefile
new file mode 100644
index 0000000000..16e55b3e41
--- /dev/null
+++ b/examples/realm_fpga/Makefile
@@ -0,0 +1,158 @@
+# Copyright 2021 Stanford University
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+ifndef LG_RT_DIR
+$(error LG_RT_DIR variable is not defined, aborting build)
+endif
+# vpath stuff below doesn't like a trailing slash in LG_RT_DIR
+override LG_RT_DIR := $(patsubst %/,%,$(LG_RT_DIR))
+
+# Flags for directing the runtime makefile what to include
+DEBUG           ?= 1		# Include debugging symbols
+MAX_DIM         ?= 3		# Maximum number of dimensions
+OUTPUT_LEVEL    ?= LEVEL_DEBUG	# Compile time logging level
+USE_CUDA        ?= 0		# Include CUDA support (requires CUDA)
+USE_GASNET      ?= 0		# Include GASNet support (requires GASNet)
+USE_HDF         ?= 0		# Include HDF5 support (requires HDF5)
+ALT_MAPPERS     ?= 0		# Include alternative mappers (not recommended)
+
+# Include FPGA support (requires Xilinx FPGA)
+USE_FPGA        ?= 1
+# Path to the Xilinx platform definition file
+PLATFORM        ?= /home/centos/src/project_data/aws-fpga/Vitis/aws_platform/xilinx_aws-vu9p-f1_shell-v04261818_201920_2/xilinx_aws-vu9p-f1_shell-v04261818_201920_2.xpfm
+# FPGA kenrel compilation target: Software emulation (sw_emu), hardware emulation (hw_emu), hardware (hw)
+TARGET          ?= sw_emu
+# v++ flags
+VPPFLAGS = -g -I.
+
+
+# Put the binary file name here
+OUTFILE		:=
+# List all the application source files here
+GEN_SRC		:= 				# .cc files
+GEN_GPU_SRC	:=				# .cu files
+
+# You can modify these variables, some will be appended to by the runtime makefile
+INC_FLAGS	?=
+CC_FLAGS	?=
+NVCC_FLAGS	?=
+GASNET_FLAGS	?=
+LD_FLAGS	?=
+CXX ?= g++
+
+# we're going to include runtime.mk to get variable settings, but then
+#  do our own build steps
+NO_BUILD_RULES=1
+include $(LG_RT_DIR)/runtime.mk
+
+TESTS := fpga_vadd
+
+# can set arguments to be passed to a test when running
+TESTARGS_fpga_vadd :=  -ll:fpga 1 -ll:fpga_size 4 -ll:fpga_xclbin vadd.xclbin -level fpga=1 -level app=1
+
+REALM_OBJS := $(patsubst %.cc,%.o,$(notdir $(REALM_SRC))) \
+              $(patsubst %.cc.o,%.o,$(notdir $(REALM_INST_OBJS))) \
+              $(patsubst %.S,%.o,$(notdir $(ASM_SRC)))
+EMPTY :=
+SPACE := $(EMPTY) $(EMPTY)
+RUNTIME_VPATH := $(subst $(SPACE),:,$(sort $(dir $(REALM_SRC))))
+vpath %.cc .:$(RUNTIME_VPATH)
+vpath %.S .:$(RUNTIME_VPATH)
+
+REALM_LIB := librealm.a
+
+TEST_OBJS := $(TESTS:%=%.o)
+
+.PHONY: run_all
+run_all : $(TESTS:%=run_%)
+
+.PHONY: run_%
+run_% : % vadd.xclbin emconfig
+	@# this echos exactly once, even if -s was specified
+	@echo XCL_EMULATION_MODE=$(TARGET) $(LAUNCHER) ./$* $(TESTARGS_$*)
+	@XCL_EMULATION_MODE=$(TARGET) $(LAUNCHER) ./$* $(TESTARGS_$*)
+
+.PHONY: build
+build : $(TESTS) vadd.xclbin emconfig
+
+.PHONY: cleankernel
+cleankernel:
+	rm -rf _x *.xo *.xclbin *.log *.xclbin.* *.xo.* emconfig.json .Xil .run
+
+.PHONY: clean
+clean : cleankernel
+	rm -f $(REALM_LIB) $(REALM_OBJS) $(TESTS) $(TEST_OBJS)
+	rm -f legion_defines.h realm_defines.h
+
+$(TESTS) : % : %.o librealm.a
+	$(CXX) -o $@ $< -L. -lrealm $(LEGION_LD_FLAGS) $(LD_FLAGS)
+
+$(REALM_LIB) : $(REALM_OBJS)
+	rm -f $(REALM_LIB)
+	ar rc $(REALM_LIB) $(REALM_OBJS)
+
+%.o : %.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER)
+	$(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(CC_FLAGS)
+
+$(REALM_OBJS) : CC_FLAGS+=$(REALM_SYMBOL_VISIBILITY)
+
+%.o : %.S
+	$(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(CC_FLAGS)
+
+# deppart-related stuff
+ifneq ($(USE_PGI),1)
+image_%.o : image_tmpl.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER)
+	$(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(REALM_SYMBOL_VISIBILITY) $(CC_FLAGS) -DINST_N1=$(word 1,$(subst _, ,$*)) -DINST_N2=$(word 2,$(subst _, ,$*))
+
+preimage_%.o : preimage_tmpl.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER)
+	$(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(REALM_SYMBOL_VISIBILITY) $(CC_FLAGS) -DINST_N1=$(word 1,$(subst _, ,$*)) -DINST_N2=$(word 2,$(subst _, ,$*))
+
+byfield_%.o : byfield_tmpl.cc $(REALM_DEFINES_HEADER) $(LEGION_DEFINES_HEADER)
+	$(CXX) -fPIC -o $@ -c $< $(INC_FLAGS) $(REALM_SYMBOL_VISIBILITY) $(CC_FLAGS) -DINST_N1=$(word 1,$(subst _, ,$*)) -DINST_N2=$(word 2,$(subst _, ,$*))
+else
+# nvc++ names some symbols based on the source filename, so the trick above
+#  of compiling multiple things from the same template with different defines
+#  causes linker errors - work around by generating a different source file for
+#  each case, but don't leave them lying around
+image_%.cc :
+	echo '#define' INST_N1 $(word 1,$(subst _, ,$(notdir $*))) > $@
+	echo '#define' INST_N2 $(word 2,$(subst _, ,$(notdir $*))) >> $@
+	echo '#include' '"realm/deppart/image_tmpl.cc"' >> $@
+
+preimage_%.cc :
+	echo '#define' INST_N1 $(word 1,$(subst _, ,$(notdir $*))) > $@
+	echo '#define' INST_N2 $(word 2,$(subst _, ,$(notdir $*))) >> $@
+	echo '#include' '"realm/deppart/preimage_tmpl.cc"' >> $@
+
+byfield_%.cc :
+	echo '#define' INST_N1 $(word 1,$(subst _, ,$(notdir $*))) > $@
+	echo '#define' INST_N2 $(word 2,$(subst _, ,$(notdir $*))) >> $@
+	echo '#include' '"realm/deppart/byfield_tmpl.cc"' >> $@
+
+.INTERMEDIATE: $(patsubst %.cc.o,%.cc,$(notdir $(REALM_INST_OBJS)))
+endif
+
+# Create FPGA kernel (vadd with 4 CUs)
+vadd.xo: vadd.cpp
+	v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -c -k vadd -o vadd.xo vadd.cpp
+
+vadd.xclbin: vadd.xo
+	v++ $(VPPFLAGS) -t $(TARGET) --platform $(PLATFORM) -l --connectivity.nk vadd:4 --connectivity.sp vadd_1.m_axi_gmem:DDR[0] --connectivity.sp vadd_2.m_axi_gmem:DDR[0] --connectivity.sp vadd_3.m_axi_gmem:DDR[0] --connectivity.sp vadd_4.m_axi_gmem:DDR[0] -o vadd.xclbin vadd.xo
+
+# Create emulation config file
+emconfig: emconfig.json
+emconfig.json:
+	emconfigutil --platform $(PLATFORM)
diff --git a/examples/realm_fpga/README.md b/examples/realm_fpga/README.md
new file mode 100644
index 0000000000..4b6acfd66c
--- /dev/null
+++ b/examples/realm_fpga/README.md
@@ -0,0 +1,33 @@
+This is an example of using Xilinx FPGAs as accelators in Realm.
+===============	
+## Prerequisite:
+Xilinx XRT 2021.1 and Xilinx Vitis 2021.1.
+
+This example is tested on an F1 node on AWS. Please refer to `https://github.com/aws/aws-fpga` for more information about using FPGA on AWS.
+ 
+## Steps:
+1. Create an AWS instance with FPGA Developer AMI (1.11.0 tested) in AWS Marketplace
+
+2. Get AWS FGPA Development Kit: 
+```
+    git clone https://github.com/aws/aws-fpga.git $AWS_FPGA_REPO_DIR
+```
+
+3. Set up environment:
+```
+    source /home/centos/src/project_data/aws-fpga/vitis_setup.sh
+    source /opt/Xilinx/Vivado/2021.1/settings64.sh
+
+    export LG_RT_DIR=/home/centos/programs/legion_fpga/runtime
+```
+
+4. Build and run the example
+```
+    make build
+    make run_fpga_vadd
+```
+
+5. Clean
+```
+    make clean
+```
diff --git a/examples/realm_fpga/fpga_vadd.cc b/examples/realm_fpga/fpga_vadd.cc
new file mode 100644
index 0000000000..5b9e6b051a
--- /dev/null
+++ b/examples/realm_fpga/fpga_vadd.cc
@@ -0,0 +1,317 @@
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <unistd.h>  //sleep
+
+#include "realm.h"
+#include "realm/fpga/fpga_utils.h"
+
+#include "realm/fpga/xcl2.hpp"
+
+using namespace Realm;
+
+#define DEFAULT_COMP_UNITS 1
+#define DATA_SIZE 12
+
+enum
+{
+    FID_X = 101,
+    FID_Y = 102,
+    FID_Z = 103,
+};
+
+// execute a task on FPGA Processor
+Logger log_app("app");
+
+enum
+{
+    TOP_LEVEL_TASK = Processor::TASK_ID_FIRST_AVAILABLE + 0,
+    FPGA_TASK,
+};
+
+struct FPGAArgs
+{
+    RegionInstance x_inst, y_inst, z_inst;
+    Rect<1> bounds;
+};
+
+void fpga_task(const void *args, size_t arglen,
+               const void *userdata, size_t userlen, Processor p)
+{
+    log_app.print() << "fpga task started";
+    const FPGAArgs &local_args = *(const FPGAArgs *)args;
+
+    // get affine accessors for each of our three instances
+    AffineAccessor<int, 1> ra_x = AffineAccessor<int, 1>(local_args.x_inst,
+                                                         FID_X);
+    AffineAccessor<int, 1> ra_y = AffineAccessor<int, 1>(local_args.y_inst,
+                                                         FID_Y);
+    AffineAccessor<int, 1> ra_z = AffineAccessor<int, 1>(local_args.z_inst,
+                                                         FID_Z);
+    size_t data_size = local_args.bounds.volume();
+
+    int num_cu = DEFAULT_COMP_UNITS;
+    std::vector<cl::Kernel> krnls(num_cu);
+    cl::Program program = FPGAGetCurrentProgram();
+    cl_int err;
+    // Creating Kernel objects
+    for (int i = 0; i < num_cu; i++)
+    {
+        OCL_CHECK(err, krnls[i] = cl::Kernel(program, "vadd", &err));
+    }
+
+    // Creating sub-buffers
+    cl::Buffer device_buff = FPGAGetCurrentBuffer();
+    void *base_ptr_sys = FPGAGetBasePtrSys();
+    auto chunk_size = data_size / num_cu;
+    size_t vector_size_bytes = sizeof(int) * chunk_size;
+    std::vector<cl::Buffer> buffer_in1(num_cu);
+    std::vector<cl::Buffer> buffer_in2(num_cu);
+    std::vector<cl::Buffer> buffer_output(num_cu);
+
+    for (int i = 0; i < num_cu; i++) {
+        cl_buffer_region buffer_in1_info = {(uint64_t)ra_x.ptr(0) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes};
+        OCL_CHECK(err, buffer_in1[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in1_info, &err));
+        cl_buffer_region buffer_in2_info = {(uint64_t)ra_y.ptr(0) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes};
+        OCL_CHECK(err, buffer_in2[i] = device_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in2_info, &err));
+        cl_buffer_region buffer_output_info = {(uint64_t)ra_z.ptr(0) - (uint64_t)base_ptr_sys + i * vector_size_bytes, vector_size_bytes};
+        OCL_CHECK(err, buffer_output[i] = device_buff.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_output_info, &err));
+        log_app.info() << "buffer_output_info " << buffer_output_info.origin << " " << buffer_output_info.size;
+    }
+
+    for (int i = 0; i < num_cu; i++) {
+        int narg = 0;
+
+        // Setting kernel arguments
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in1[i]));
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in2[i]));
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_output[i]));
+        OCL_CHECK(err, err = krnls[i].setArg(narg++, (int)chunk_size));
+    }
+
+    cl::Event task_events[num_cu];
+    cl::CommandQueue command_queue = FPGAGetCurrentCommandQueue();
+    for (int i = 0; i < num_cu; i++) {
+        // Launch the kernel
+        OCL_CHECK(err, err = command_queue.enqueueTask(krnls[i], nullptr, &task_events[i]));
+    }
+
+    std::vector<cl::Event> wait_events[num_cu];
+    // Copy result from device global memory to host local memory
+    for (int i = 0; i < num_cu; i++) {
+        wait_events[i].push_back(task_events[i]);
+        OCL_CHECK(err, err = command_queue.enqueueMigrateMemObjects({buffer_output[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &wait_events[i], nullptr));
+    }
+    // OCL_CHECK(err, err = command_queue.finish());
+    OCL_CHECK(err, err = command_queue.flush());
+
+    log_app.print() << "fpga kernels flushed";
+}
+
+void top_level_task(const void *args, size_t arglen,
+                    const void *userdata, size_t userlen, Processor p)
+{
+    log_app.print() << "top task running on " << p;
+    Machine machine = Machine::get_machine();
+    std::set<Processor> all_processors;
+    machine.get_all_processors(all_processors);
+    for (std::set<Processor>::const_iterator it = all_processors.begin();
+         it != all_processors.end();
+         it++)
+    {
+        Processor pp = (*it);
+        if (pp.kind() == Processor::FPGA_PROC)
+        {
+            Memory cpu_mem = Memory::NO_MEMORY;
+            Memory fpga_mem = Memory::NO_MEMORY;
+            std::set<Memory> visible_mems;
+            machine.get_visible_memories(pp, visible_mems);
+            for (std::set<Memory>::const_iterator it = visible_mems.begin();
+                 it != visible_mems.end(); it++)
+            {
+                if (it->kind() == Memory::FPGA_MEM)
+                {
+                    fpga_mem = *it;
+                    log_app.print() << "fpga memory: " << *it << " capacity="
+                                    << (it->capacity() >> 20) << " MB";
+                }
+                if (it->kind() == Memory::SYSTEM_MEM)
+                {
+                    cpu_mem = *it;
+                    log_app.print() << "sys memory: " << *it << " capacity="
+                                    << (it->capacity() >> 20) << " MB";
+                }
+            }
+
+            int init_x_value = 1;
+            int init_y_value = 2;
+            int init_z_value = 9;
+
+            Rect<1> bounds(0, DATA_SIZE - 1);
+
+            std::map<FieldID, size_t> field_sizes;
+            field_sizes[FID_X] = sizeof(int);
+            field_sizes[FID_Y] = sizeof(int);
+            field_sizes[FID_Z] = sizeof(int);
+
+            RegionInstance cpu_inst;
+            RegionInstance::create_instance(cpu_inst, cpu_mem,
+                                            bounds, field_sizes,
+                                            0 /*SOA*/, ProfilingRequestSet())
+                .wait();
+            log_app.print() << "created cpu memory instance: " << cpu_inst;
+
+            CopySrcDstField cpu_x_field, cpu_y_field, cpu_z_field;
+            cpu_x_field.inst = cpu_inst;
+            cpu_x_field.field_id = FID_X;
+            cpu_x_field.size = sizeof(int);
+
+            cpu_y_field.inst = cpu_inst;
+            cpu_y_field.field_id = FID_Y;
+            cpu_y_field.size = sizeof(int);
+
+            cpu_z_field.inst = cpu_inst;
+            cpu_z_field.field_id = FID_Z;
+            cpu_z_field.size = sizeof(int);
+
+            RegionInstance fpga_inst;
+            RegionInstance::create_instance(fpga_inst, fpga_mem,
+                                            bounds, field_sizes,
+                                            0 /*SOA*/, ProfilingRequestSet())
+                .wait();
+            log_app.print() << "created fpga memory instance: " << fpga_inst;
+
+            CopySrcDstField fpga_x_field, fpga_y_field, fpga_z_field;
+            fpga_x_field.inst = fpga_inst;
+            fpga_x_field.field_id = FID_X;
+            fpga_x_field.size = sizeof(int);
+
+            fpga_y_field.inst = fpga_inst;
+            fpga_y_field.field_id = FID_Y;
+            fpga_y_field.size = sizeof(int);
+
+            fpga_z_field.inst = fpga_inst;
+            fpga_z_field.field_id = FID_Z;
+            fpga_z_field.size = sizeof(int);
+
+            AffineAccessor<int, 1> fpga_ra_x = AffineAccessor<int, 1>(fpga_inst, FID_X);
+            AffineAccessor<int, 1> cpu_ra_y = AffineAccessor<int, 1>(cpu_inst, FID_Y);
+            AffineAccessor<int, 1> fpga_ra_y = AffineAccessor<int, 1>(fpga_inst, FID_Y);
+
+            //Test fill: fill fpga memory directly
+            Event fill_x;
+            {
+                std::vector<CopySrcDstField> fill_vec;
+                fill_vec.push_back(fpga_x_field);
+                fill_x = bounds.fill(fill_vec, ProfilingRequestSet(),
+                                     &init_x_value, sizeof(init_x_value));
+            }
+            fill_x.wait();
+
+            Event fill_z;
+            {
+                std::vector<CopySrcDstField> fill_vec;
+                fill_vec.push_back(fpga_z_field);
+                fill_z = bounds.fill(fill_vec, ProfilingRequestSet(),
+                                     &init_z_value, sizeof(init_z_value));
+            }
+            fill_z.wait();
+
+            printf("fpga_ra_x:\n");
+            for (int i = bounds.lo; i <= bounds.hi; i++)
+            {
+                printf("%d ", fpga_ra_x[i]);
+            }
+            printf("\n");
+
+            // fill cpu mem and copy to fpga mem
+            Event fill_y_cpu;
+            {
+                std::vector<CopySrcDstField> fill_vec;
+                fill_vec.push_back(cpu_y_field);
+                fill_y_cpu = bounds.fill(fill_vec, ProfilingRequestSet(),
+                                         &init_y_value, sizeof(init_y_value));
+            }
+            fill_y_cpu.wait();
+
+            Event copy_y;
+            {
+                std::vector<CopySrcDstField> srcs, dsts;
+                srcs.push_back(cpu_y_field);
+                dsts.push_back(fpga_y_field);
+                copy_y = bounds.copy(srcs, dsts, ProfilingRequestSet());
+            }
+            copy_y.wait();
+
+            printf("cpu_ra_y:\n");
+            for (int i = bounds.lo; i <= bounds.hi; i++)
+            {
+                printf("%d ", cpu_ra_y[i]);
+            }
+            printf("\n");
+            printf("fpga_ra_y:\n");
+            for (int i = bounds.lo; i <= bounds.hi; i++)
+            {
+                printf("%d ", fpga_ra_y[i]);
+            }
+            printf("\n");
+
+            FPGAArgs fpga_args;
+            fpga_args.x_inst = fpga_inst;
+            fpga_args.y_inst = fpga_inst;
+            fpga_args.z_inst = fpga_inst;
+            fpga_args.bounds = bounds;
+            Event e = pp.spawn(FPGA_TASK, &fpga_args, sizeof(fpga_args));
+
+            // Copy back
+            Event z_ready;
+            {
+                std::vector<CopySrcDstField> srcs, dsts;
+                srcs.push_back(fpga_z_field);
+                dsts.push_back(cpu_z_field);
+                z_ready = bounds.copy(srcs, dsts, ProfilingRequestSet(), e);
+            }
+            z_ready.wait();
+
+            AffineAccessor<int, 1> ra_z = AffineAccessor<int, 1>(cpu_inst, FID_Z);
+            for (int i = bounds.lo; i <= bounds.hi; i++)
+            {
+                printf("%d ", ra_z[i]);
+            }
+            printf("\n");
+        }
+    }
+
+    log_app.print() << "all done!";
+}
+
+int main(int argc, char **argv)
+{
+    // sleep(20);
+    Runtime rt;
+    rt.init(&argc, &argv);
+    rt.register_task(TOP_LEVEL_TASK, top_level_task);
+
+    Processor::register_task_by_kind(Processor::FPGA_PROC, false /*!global*/,
+                                     FPGA_TASK,
+                                     CodeDescriptor(fpga_task),
+                                     ProfilingRequestSet())
+        .wait();
+
+    // select a processor to run the top level task on
+    Processor p = Machine::ProcessorQuery(Machine::get_machine())
+                      .only_kind(Processor::LOC_PROC)
+                      .first();
+    assert(p.exists());
+
+    // collective launch of a single task - everybody gets the same finish event
+    Event e = rt.collective_spawn(p, TOP_LEVEL_TASK, 0, 0);
+
+    // request shutdown once that task is complete
+    rt.shutdown(e);
+
+    // now sleep this thread until that shutdown actually happens
+    rt.wait_for_shutdown();
+
+    return 0;
+}
diff --git a/examples/realm_fpga/vadd.cpp b/examples/realm_fpga/vadd.cpp
new file mode 100644
index 0000000000..81b95028f6
--- /dev/null
+++ b/examples/realm_fpga/vadd.cpp
@@ -0,0 +1,45 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+// TRIPCOUNT indentifier
+const unsigned int c_min = 4096;
+const unsigned int c_max = 4 * 1024 * 1024;
+
+/*
+    Vector Addition Kernel Implementation
+    Arguments:
+        in1   (input)     --> Input Vector1
+        in2   (input)     --> Input Vector2
+        out_r   (output)    --> Output Vector
+        size  (input)     --> Size of Vector in Integer
+*/
+
+extern "C" {
+void vadd(const int* in1, // Read-Only Vector 1
+          const int* in2, // Read-Only Vector 2
+          int* out_r,     // Output Result
+          int size                 // Size in integer
+          ) {
+// Unoptimized vector addition kernel to increase the kernel execution time
+// Large execution time required to showcase parallel execution of multiple
+// compute units in this example.
+vadd1:
+    for (int i = 0; i < size; i++) {
+#pragma HLS LOOP_TRIPCOUNT min = c_min max = c_max
+        out_r[i] = in1[i] + in2[i];
+    }
+}
+}
diff --git a/runtime/legion/runtime.cc b/runtime/legion/runtime.cc
index ade87017aa..5184b8eae2 100644
--- a/runtime/legion/runtime.cc
+++ b/runtime/legion/runtime.cc
@@ -12313,6 +12313,11 @@ namespace Legion {
                 LegionSpy::log_processor_kind(kind, "Python");
                 break;
               }
+            case Processor::FPGA_PROC:
+              {
+                LegionSpy::log_processor_kind(kind, "FPGA");
+                break;
+              }
             default:
               assert(false); // unknown processor kind
           }
diff --git a/runtime/mappers/default_mapper.cc b/runtime/mappers/default_mapper.cc
index ec2920bb1c..049d8cd572 100644
--- a/runtime/mappers/default_mapper.cc
+++ b/runtime/mappers/default_mapper.cc
@@ -59,13 +59,15 @@ namespace Legion {
                       own ? name : strdup(name)),
         next_local_gpu(0), next_local_cpu(0), next_local_io(0),
         next_local_procset(0), next_local_omp(0), next_local_py(0),
+        next_local_fpga(0),
         next_global_gpu(Processor::NO_PROC),
         next_global_cpu(Processor::NO_PROC), next_global_io(Processor::NO_PROC),
         next_global_procset(Processor::NO_PROC),
         next_global_omp(Processor::NO_PROC), next_global_py(Processor::NO_PROC),
+        next_global_fpga(Processor::NO_PROC),
         global_gpu_query(NULL), global_cpu_query(NULL), global_io_query(NULL),
         global_procset_query(NULL), global_omp_query(NULL),
-        global_py_query(NULL),
+        global_py_query(NULL), global_fpga_query(NULL),
         max_steals_per_theft(STATIC_MAX_PERMITTED_STEALS),
         max_steal_count(STATIC_MAX_STEAL_COUNT),
         breadth_first_traversal(STATIC_BREADTH_FIRST),
@@ -155,6 +157,11 @@ namespace Legion {
                 local_omps.push_back(*it);
                 break;
               }
+            case Processor::FPGA_PROC:
+              {
+                local_fpgas.push_back(*it);
+                break;
+              }
             default: // ignore anything else
               break;
           }
@@ -215,6 +222,15 @@ namespace Legion {
                 remote_omps[node] = *it;
               break;
             }
+          case Processor::FPGA_PROC:
+            {
+              // See if we already have a target FPGA processor for this node
+              if (node >= remote_fpgas.size())
+                remote_fpgas.resize(node+1, Processor::NO_PROC);
+              if (!remote_fpgas[node].exists())
+                remote_fpgas[node] = *it;
+              break;
+            }
           default: // ignore anything else
             break;
         }
@@ -282,6 +298,19 @@ namespace Legion {
           }
         }
         if (total_nodes == 0) total_nodes = remote_pys.size();
+      } 
+      if (!local_fpgas.empty()) {
+        for (unsigned idx = 0; idx < remote_fpgas.size(); idx++) {
+          if (idx == node_id) continue;  // ignore our own node
+          if (!remote_fpgas[idx].exists()) {
+            log_mapper.error("Default mapper has FPGA procs on node %d, but "
+                             "could not detect FPGA procs on node %d. The "
+                             "current default mapper implementation assumes "
+                             "symmetric heterogeneity.", node_id, idx);
+            assert(false);
+          }
+        }
+        if (total_nodes == 0) total_nodes = remote_fpgas.size();
       }
       // Initialize our random number generator
       const size_t short_bits = 8*sizeof(unsigned short);
@@ -403,6 +432,8 @@ namespace Legion {
             return default_get_next_local_omp();
           case Processor::PY_PROC:
             return default_get_next_local_py();
+          case Processor::FPGA_PROC:
+            return default_get_next_local_fpga();
           default: // make warnings go away
             break;
         }
@@ -432,6 +463,8 @@ namespace Legion {
                 return default_get_next_local_omp();
               case Processor::PY_PROC:
                 return default_get_next_local_py();
+              case Processor::FPGA_PROC:
+                return default_get_next_local_fpga();
               default: // make warnings go away
                 break;
             }
@@ -457,6 +490,8 @@ namespace Legion {
                   return default_get_next_global_omp();
                 case Processor::PY_PROC:
                   return default_get_next_global_py();
+                case Processor::FPGA_PROC:
+                  return default_get_next_global_fpga();
                 default: // make warnings go away
                   break;
               }
@@ -479,6 +514,8 @@ namespace Legion {
                 return default_get_next_local_omp();
               case Processor::PY_PROC:
                 return default_get_next_local_py();
+              case Processor::FPGA_PROC:
+                return default_get_next_local_fpga();
               default: // make warnings go away
                 break;
             }
@@ -691,6 +728,38 @@ namespace Legion {
       return result;
     }
 
+    //--------------------------------------------------------------------------
+    Processor DefaultMapper::default_get_next_local_fpga(void)
+    //--------------------------------------------------------------------------
+    {
+      Processor result = local_fpgas[next_local_fpga++];
+      if (next_local_fpga == local_fpgas.size())
+        next_local_fpga = 0;
+      return result;
+    }
+
+    //--------------------------------------------------------------------------
+    Processor DefaultMapper::default_get_next_global_fpga(void)
+    //--------------------------------------------------------------------------
+    {
+      if (total_nodes == 1)
+        return default_get_next_local_fpga();
+      if (!next_global_fpga.exists())
+      {
+        global_fpga_query = new Machine::ProcessorQuery(machine);
+        global_fpga_query->only_kind(Processor::FPGA_PROC);
+        next_global_fpga = global_fpga_query->first();
+      }
+      Processor result = next_global_fpga;
+      next_global_fpga = global_fpga_query->next(result);
+      if (!next_global_fpga.exists())
+      {
+        delete global_fpga_query;
+        global_fpga_query = NULL;
+      }
+      return result;
+    }
+
     //--------------------------------------------------------------------------
     DefaultMapper::VariantInfo DefaultMapper::default_find_preferred_variant(
                                      const Task &task, MapperContext ctx,
@@ -799,6 +868,13 @@ namespace Legion {
                     continue;
                   break;
                 }
+              case Processor::FPGA_PROC:
+                {
+                  kindString += "FPGA_PROC ";
+                  if (local_fpgas.empty())
+                    continue;
+                  break;
+                }
               default:
                 assert(false); // unknown processor type
             }
@@ -922,8 +998,8 @@ namespace Legion {
     //--------------------------------------------------------------------------
     {
       // Default mapper is ignorant about task IDs so just do whatever:
-      // 1) GPU > OMP > procset > cpu > IO > Python  (default)
-      // 2) OMP > procset > cpu > IO > Python > GPU  (with PREFER_CPU_VARIANT)
+      // 1) GPU > OMP > procset > cpu > IO > Python > FPGA (default)
+      // 2) OMP > procset > cpu > IO > Python > GPU > FPGA (with PREFER_CPU_VARIANT)
       // It is up to the caller to filter out processor kinds that aren't
       // suitable for a given task
       bool prefer_cpu = ((task.tag & PREFER_CPU_VARIANT) != 0);
@@ -936,6 +1012,7 @@ namespace Legion {
       if (local_pys.size() > 0) ranking.push_back(Processor::PY_PROC);
       if ((local_gpus.size() > 0) && prefer_cpu)
        ranking.push_back(Processor::TOC_PROC);
+      if (local_fpgas.size() > 0) ranking.push_back(Processor::FPGA_PROC);
     }
 
     //--------------------------------------------------------------------------
@@ -1095,6 +1172,23 @@ namespace Legion {
                 }
                 break;
               }
+            case Processor::FPGA_PROC:
+              {
+                if (task.index_domain.get_volume() > local_fpgas.size())
+                {
+                  if (!global_memory.exists())
+                  {
+                    log_mapper.error("Default mapper failure. No memory found "
+                        "for FPGA task %s (ID %lld) which is visible "
+                        "for all point in the index space.",
+                        task.get_task_name(), task.get_unique_id());
+                    assert(false);
+                  }
+                  else
+                    target_memory = global_memory;
+                }
+                break;
+              }
             default:
               assert(false); // unrecognized processor kind
           }
@@ -1137,6 +1231,7 @@ namespace Legion {
           case Processor::PROC_SET:
           case Processor::OMP_PROC:
           case Processor::PY_PROC:
+          case Processor::FPGA_PROC:  //TODO: use system mem for FPGA
             {
               visible_memories.only_kind(Memory::SYSTEM_MEM);
               if (visible_memories.count() == 0)
@@ -1284,6 +1379,12 @@ namespace Legion {
                                input, output, omp_slices_cache);
             break;
           }
+        case Processor::FPGA_PROC:
+          {
+            default_slice_task(task, local_fpgas, remote_fpgas,
+                               input, output, fpga_slices_cache);
+            break;
+          }
         default:
           assert(false); // unimplemented processor kind
       }
@@ -1747,6 +1848,16 @@ namespace Legion {
               }
               break;
             }
+          case Processor::FPGA_PROC:
+            {
+              // TODO:
+              if (!task.must_epoch_task)
+                target_procs.insert(target_procs.end(),
+                    local_fpgas.begin(), local_fpgas.end());
+              else
+                target_procs.push_back(task.target_proc);
+              break;
+            }
           default:
             assert(false); // unrecognized processor kind
         }
@@ -3168,6 +3279,11 @@ namespace Legion {
             *result = local_pys.size();
             break;
           }
+        case DEFAULT_TUNABLE_LOCAL_FPGAS:
+          {
+            *result = local_fpgas.size();
+            break;
+          }
         case DEFAULT_TUNABLE_GLOBAL_GPUS:
           {
             // TODO: deal with machine asymmetry here
@@ -3198,6 +3314,12 @@ namespace Legion {
             *result = (local_pys.size() * total_nodes);
             break;
           }
+        case DEFAULT_TUNABLE_GLOBAL_FPGAS:
+          {
+            // TODO: deal with machine asymmetry here
+            *result = (local_fpgas.size() * total_nodes);
+            break;
+          }
         default:
           {
             log_mapper.error("Default mapper error. Unrecognized tunable ID %d "
@@ -3524,6 +3646,15 @@ namespace Legion {
                 }
                 break;
               }
+            case Processor::FPGA_PROC:
+              {
+                if (local_fpgas.empty())
+                {
+                  ++it;
+                  continue;
+                }
+                break;
+              }
             default:
               assert(false); // unknown processor kind
           }
diff --git a/runtime/mappers/default_mapper.h b/runtime/mappers/default_mapper.h
index 6c0c458301..cc17c4bce8 100644
--- a/runtime/mappers/default_mapper.h
+++ b/runtime/mappers/default_mapper.h
@@ -46,12 +46,14 @@ namespace Legion {
         DEFAULT_TUNABLE_LOCAL_IOS = 3,
         DEFAULT_TUNABLE_LOCAL_OMPS = 4,
         DEFAULT_TUNABLE_LOCAL_PYS = 5,
-        DEFAULT_TUNABLE_GLOBAL_CPUS = 6,
-        DEFAULT_TUNABLE_GLOBAL_GPUS = 7,
-        DEFAULT_TUNABLE_GLOBAL_IOS = 8,
-        DEFAULT_TUNABLE_GLOBAL_OMPS = 9,
-        DEFAULT_TUNABLE_GLOBAL_PYS = 10,
-        DEFAULT_TUNABLE_LAST = 11, // this one must always be last and unused
+        DEFAULT_TUNABLE_LOCAL_FPGAS = 6,
+        DEFAULT_TUNABLE_GLOBAL_CPUS = 7,
+        DEFAULT_TUNABLE_GLOBAL_GPUS = 8,
+        DEFAULT_TUNABLE_GLOBAL_IOS = 9,
+        DEFAULT_TUNABLE_GLOBAL_OMPS = 10,
+        DEFAULT_TUNABLE_GLOBAL_PYS = 11,
+        DEFAULT_TUNABLE_GLOBAL_FPGAS = 12,
+        DEFAULT_TUNABLE_LAST = 13, // this one must always be last and unused
       };
       enum MappingKind {
         TASK_MAPPING,
@@ -383,6 +385,8 @@ namespace Legion {
       Processor default_get_next_global_procset(void);
       Processor default_get_next_local_omp(void);
       Processor default_get_next_global_omp(void);
+      Processor default_get_next_local_fpga(void);
+      Processor default_get_next_global_fpga(void);
       VariantInfo default_find_preferred_variant(
                                  const Task &task, MapperContext ctx,
                                  bool needs_tight_bound, bool cache = true,
@@ -472,12 +476,14 @@ namespace Legion {
       std::vector<Processor> local_procsets;
       std::vector<Processor> local_omps;
       std::vector<Processor> local_pys;
+      std::vector<Processor> local_fpgas;
       std::vector<Processor> remote_gpus;
       std::vector<Processor> remote_cpus;
       std::vector<Processor> remote_ios;
       std::vector<Processor> remote_procsets;
       std::vector<Processor> remote_omps;
       std::vector<Processor> remote_pys;
+      std::vector<Processor> remote_fpgas;
       // multipleNumaDomainsPresent is set to true when the target machine
       // has multiple separate NUMA memories (SOCKET_MEM). This controls how
       // processor selection for NUMA aware allocations is performed.
@@ -485,20 +491,24 @@ namespace Legion {
     protected:
       // For doing round-robining of tasks onto processors
       unsigned next_local_gpu, next_local_cpu, next_local_io,
-               next_local_procset, next_local_omp, next_local_py;
+               next_local_procset, next_local_omp, next_local_py,
+               next_local_fpga;
       Processor next_global_gpu, next_global_cpu, next_global_io,
-                next_global_procset, next_global_omp, next_global_py;
+                next_global_procset, next_global_omp, next_global_py,
+                next_global_fpga;
       Machine::ProcessorQuery *global_gpu_query, *global_cpu_query,
                               *global_io_query, *global_procset_query,
-                              *global_omp_query, *global_py_query;
-    protected:
+                              *global_omp_query, *global_py_query,
+                              *global_fpga_query;
+    protected: 
       // Cached mapping information about the application
       std::map<Domain,std::vector<TaskSlice> > gpu_slices_cache,
                                                cpu_slices_cache,
                                                io_slices_cache,
                                                procset_slices_cache,
                                                omp_slices_cache,
-                                               py_slices_cache;
+                                               py_slices_cache,
+                                               fpga_slices_cache;
       std::map<std::pair<TaskID,Processor::Kind>,
                VariantInfo>                    preferred_variants;
       std::map<std::pair<TaskID,Processor>,
diff --git a/runtime/mappers/mapping_utilities.cc b/runtime/mappers/mapping_utilities.cc
index 5520e21a8a..18ef032c1f 100644
--- a/runtime/mappers/mapping_utilities.cc
+++ b/runtime/mappers/mapping_utilities.cc
@@ -1063,6 +1063,7 @@ namespace Legion {
           case Processor::PROC_SET: return "PROC_SET";
           case Processor::OMP_PROC: return "OMP_PROC";
           case Processor::PY_PROC: return "PY_PROC";
+          case Processor::FPGA_PROC: return "FPGA_PROC";
           default: assert(false); return "";
         }
       }
@@ -1084,6 +1085,7 @@ namespace Legion {
           case Memory::LEVEL3_CACHE: return "LEVEL3_CACHE";
           case Memory::LEVEL2_CACHE: return "LEVEL2_CACHE";
           case Memory::LEVEL1_CACHE: return "LEVEL1_CACHE";
+          case Memory::FPGA_MEM: return "FPGA_MEM";
           default: assert(false); return "";
         }
       }
diff --git a/runtime/realm/fpga/fpga_module.cc b/runtime/realm/fpga/fpga_module.cc
new file mode 100644
index 0000000000..f3bbf570aa
--- /dev/null
+++ b/runtime/realm/fpga/fpga_module.cc
@@ -0,0 +1,1825 @@
+#include "realm/fpga/fpga_module.h"
+
+#include "realm/logging.h"
+#include "realm/cmdline.h"
+#include "realm/utils.h"
+
+#include <sys/stat.h>
+#include <sys/mman.h>
+
+namespace Realm
+{
+    namespace FPGA
+    {
+
+        namespace ThreadLocal
+        {
+            static REALM_THREAD_LOCAL FPGAProcessor *current_fpga_proc = NULL;
+        }
+
+        Logger log_fpga("fpga");
+
+        // need types with various powers-of-2 size/alignment - we have up to
+        //  uint64_t as builtins, but we need trivially-copyable 16B and 32B things
+        struct dummy_16b_t
+        {
+            uint64_t a, b;
+        };
+        struct dummy_32b_t
+        {
+            uint64_t a, b, c, d;
+        };
+        REALM_ALIGNED_TYPE_CONST(aligned_16b_t, dummy_16b_t, 16);
+        REALM_ALIGNED_TYPE_CONST(aligned_32b_t, dummy_32b_t, 32);
+
+        template <typename T>
+        static void fpga_memcpy_2d_typed(uintptr_t dst_base, uintptr_t dst_lstride,
+                                         uintptr_t src_base, uintptr_t src_lstride,
+                                         size_t bytes, size_t lines)
+        {
+            for (size_t i = 0; i < lines; i++)
+            {
+                std::copy(reinterpret_cast<const T *>(src_base),
+                          reinterpret_cast<const T *>(src_base + bytes),
+                          reinterpret_cast<T *>(dst_base));
+                // manual strength reduction
+                src_base += src_lstride;
+                dst_base += dst_lstride;
+            }
+        }
+
+        static void fpga_memcpy_2d(uintptr_t dst_base, uintptr_t dst_lstride,
+                                   uintptr_t src_base, uintptr_t src_lstride,
+                                   size_t bytes, size_t lines)
+        {
+            // by subtracting 1 from bases, strides, and lengths, we get LSBs set
+            //  based on the common alignment of every parameter in the copy
+            unsigned alignment = ((dst_base - 1) & (dst_lstride - 1) &
+                                  (src_base - 1) & (src_lstride - 1) &
+                                  (bytes - 1));
+            // TODO: consider jump table approach?
+            if ((alignment & 31) == 31)
+                fpga_memcpy_2d_typed<aligned_32b_t>(dst_base, dst_lstride,
+                                                    src_base, src_lstride,
+                                                    bytes, lines);
+            else if ((alignment & 15) == 15)
+                fpga_memcpy_2d_typed<aligned_16b_t>(dst_base, dst_lstride,
+                                                    src_base, src_lstride,
+                                                    bytes, lines);
+            else if ((alignment & 7) == 7)
+                fpga_memcpy_2d_typed<uint64_t>(dst_base, dst_lstride, src_base, src_lstride,
+                                               bytes, lines);
+            else if ((alignment & 3) == 3)
+                fpga_memcpy_2d_typed<uint32_t>(dst_base, dst_lstride, src_base, src_lstride,
+                                               bytes, lines);
+            else if ((alignment & 1) == 1)
+                fpga_memcpy_2d_typed<uint16_t>(dst_base, dst_lstride, src_base, src_lstride,
+                                               bytes, lines);
+            else
+                fpga_memcpy_2d_typed<uint8_t>(dst_base, dst_lstride, src_base, src_lstride,
+                                              bytes, lines);
+        }
+
+        /**
+         * FPGAWorker: it is responsible for making progress on one or more Command Queues.
+         * This may be done directly by an FPGAProcessor or in a background thread spawned for the purpose.
+         */
+        FPGAWorker::FPGAWorker(void)
+            : BackgroundWorkItem("FPGA device worker"),
+              condvar(lock),
+              core_rsrv(0),
+              worker_thread(0),
+              thread_sleeping(false),
+              worker_shutdown_requested(false)
+        {
+        }
+
+        FPGAWorker::~FPGAWorker(void)
+        {
+            // shutdown should have already been called
+            assert(worker_thread == 0);
+        }
+
+        void FPGAWorker::start_background_thread(
+            Realm::CoreReservationSet &crs,
+            size_t stack_size)
+        {
+            assert(manager == 0);
+            core_rsrv = new Realm::CoreReservation("A worker thread", crs,
+                                                   Realm::CoreReservationParameters());
+            Realm::ThreadLaunchParameters tlp;
+            worker_thread = Realm::Thread::create_kernel_thread<FPGAWorker, &FPGAWorker::thread_main>(this, tlp, *core_rsrv, 0);
+        }
+
+        void FPGAWorker::shutdown_background_thread(void)
+        {
+            {
+                AutoLock<> al(lock);
+                worker_shutdown_requested.store(true);
+                if (thread_sleeping)
+                {
+                    thread_sleeping = false;
+                    condvar.broadcast();
+                }
+            }
+
+            worker_thread->join();
+            delete worker_thread;
+            worker_thread = 0;
+
+            delete core_rsrv;
+            core_rsrv = 0;
+        }
+
+        void FPGAWorker::add_queue(FPGAQueue *queue)
+        {
+            bool was_empty = false;
+            {
+                AutoLock<> al(lock);
+#ifdef DEBUG_REALM
+                // insist that the caller de-duplicate these
+                for (ActiveQueue::iterator it = active_queues.begin();
+                     it != active_queues.end();
+                     ++it)
+                    assert(*it != queue);
+#endif
+                was_empty = active_queues.empty();
+                active_queues.push_back(queue);
+                if (thread_sleeping)
+                {
+                    thread_sleeping = false;
+                    condvar.broadcast();
+                }
+            }
+            // if we're a background work item, request attention if needed
+            if (was_empty && (manager != 0))
+                make_active();
+        }
+
+        bool FPGAWorker::do_work(TimeLimit work_until)
+        {
+            // pop the first queue off the list and immediately become re-activ if more queues remain
+            FPGAQueue *queue = 0;
+            bool still_not_empty = false;
+            {
+                AutoLock<> al(lock);
+
+                assert(!active_queues.empty());
+                queue = active_queues.front();
+                active_queues.pop_front();
+                still_not_empty = !active_queues.empty();
+            }
+            if (still_not_empty)
+                make_active();
+            // do work for the queue we popped, paying attention to the cutoff time
+            bool requeue_q = false;
+
+            if (queue->reap_events(work_until))
+            {
+                // still work (e.g. copies) to do
+                if (work_until.is_expired())
+                {
+                    // out of time - save it for later
+                    requeue_q = true;
+                }
+                else if (queue->issue_copies(work_until))
+                    requeue_q = true;
+            }
+
+            bool was_empty = false;
+            if (requeue_q)
+            {
+                AutoLock<> al(lock);
+                was_empty = active_queues.empty();
+                active_queues.push_back(queue);
+            }
+            // note that we can need requeueing even if we called make_active above!
+            return was_empty;
+        }
+
+        bool FPGAWorker::process_queues(bool sleep_on_empty)
+        {
+            FPGAQueue *cur_queue = 0;
+            FPGAQueue *first_queue = 0;
+            bool requeue_queue = false;
+            while (true)
+            {
+                // grab the front queue in the list
+                {
+                    AutoLock<> al(lock);
+                    // if we didn't finish work on the queue from the previous
+                    // iteration, add it back to the end
+                    if (requeue_queue)
+                        active_queues.push_back(cur_queue);
+
+                    while (active_queues.empty())
+                    {
+                        // sleep only if this was the first attempt to get a queue
+                        if (sleep_on_empty && (first_queue == 0) &&
+                            !worker_shutdown_requested.load())
+                        {
+                            thread_sleeping = true;
+                            condvar.wait();
+                        }
+                        else
+                            return false;
+                    }
+                    cur_queue = active_queues.front();
+                    // did we wrap around?  if so, stop for now
+                    if (cur_queue == first_queue)
+                        return true;
+
+                    active_queues.pop_front();
+                    if (!first_queue)
+                        first_queue = cur_queue;
+                }
+                // and do some work for it
+                requeue_queue = false;
+                // reap_events report whether any kind of work
+                if (!cur_queue->reap_events(TimeLimit()))
+                    continue;
+                if (!cur_queue->issue_copies(TimeLimit()))
+                    continue;
+                // if we fall, the queues never went empty at any time, so it's up to us to requeue
+                requeue_queue = true;
+            }
+        }
+
+        void FPGAWorker::thread_main(void)
+        {
+            // TODO: consider busy-waiting in some cases to reduce latency?
+            while (!worker_shutdown_requested.load())
+            {
+                bool work_left = process_queues(true);
+                // if there was work left, yield our thread
+                // for now to avoid a tight spin loop
+                if (work_left)
+                    Realm::Thread::yield();
+            }
+        }
+
+        /**
+         * FPGAWorkFence: used to determine when a device kernel completes execution
+         */
+        FPGAWorkFence::FPGAWorkFence(Realm::Operation *op)
+            : Realm::Operation::AsyncWorkItem(op)
+        {
+        }
+
+        void FPGAWorkFence::request_cancellation(void)
+        {
+            // ignored - no way to shoot down FPGA work
+        }
+
+        void FPGAWorkFence::print(std::ostream &os) const
+        {
+            os << "FPGAWorkFence";
+        }
+
+        void FPGAWorkFence::enqueue(FPGAQueue *queue)
+        {
+            queue->add_fence(this);
+        }
+
+        /**
+         * FPGAQueue: device command queue
+         */
+        FPGAQueue::FPGAQueue(FPGADevice *fpga_device, FPGAWorker *fpga_worker, cl::CommandQueue &command_queue)
+            : fpga_device(fpga_device), fpga_worker(fpga_device->fpga_worker), command_queue(command_queue)
+        {
+            log_fpga.info() << "Create FPGAQueue ";
+            pending_events.clear();
+            pending_copies.clear();
+        }
+
+        FPGAQueue::~FPGAQueue(void)
+        {
+        }
+
+        cl::CommandQueue &FPGAQueue::get_command_queue() const
+        {
+            return command_queue;
+        }
+
+        void FPGAQueue::add_fence(FPGAWorkFence *fence)
+        {
+            cl::Event opencl_event;
+            cl_int err = 0;
+            OCL_CHECK(err, err = command_queue.enqueueMarkerWithWaitList(nullptr, &opencl_event));
+            add_event(opencl_event, fence, 0);
+        }
+
+        void FPGAQueue::add_notification(FPGACompletionNotification *notification)
+        {
+            cl::Event opencl_event;
+            cl_int err = 0;
+            OCL_CHECK(err, err = command_queue.enqueueMarkerWithWaitList(nullptr, &opencl_event));
+
+            add_event(opencl_event, 0, notification);
+        }
+
+        // add event to worker so it can be progressed
+        void FPGAQueue::add_event(cl::Event opencl_event,
+                                  FPGAWorkFence *fence,
+                                  FPGACompletionNotification *n)
+
+        {
+            bool add_to_worker = false;
+            // assert(opencl_event != nullptr);
+            {
+                AutoLock<> al(mutex);
+                // remember to add ourselves
+                // to the worker if we didn't already have work
+                add_to_worker = pending_events.empty() &&
+                                pending_copies.empty();
+                PendingEvent e;
+                e.opencl_event = opencl_event;
+                e.fence = fence;
+                e.notification = n;
+                pending_events.push_back(e);
+            }
+            if (add_to_worker)
+                fpga_worker->add_queue(this);
+        }
+
+        bool FPGAQueue::has_work(void) const
+        {
+            return (!pending_events.empty() ||
+                    !pending_copies.empty());
+        }
+
+        bool FPGAQueue::reap_events(TimeLimit work_until)
+        {
+            // peek at the first event
+            cl::Event opencl_event;
+            FPGACompletionNotification *notification = 0;
+            bool event_valid = false;
+            {
+                AutoLock<> al(mutex);
+
+                if (pending_events.empty())
+                    // no events left, but command queue
+                    // might have other work left
+                    return has_work();
+                opencl_event = pending_events.front().opencl_event;
+                notification = pending_events.front().notification;
+                event_valid = true;
+            }
+            // we'll keep looking at events
+            // until we find one that hasn't triggered
+            bool work_left = true;
+            while (event_valid)
+            {
+                cl_int err = 0;
+                cl_int status;
+                OCL_CHECK(err, err = opencl_event.getInfo(CL_EVENT_COMMAND_EXECUTION_STATUS, &status));
+                if (status == CL_QUEUED || status == CL_SUBMITTED || status == CL_RUNNING)
+                {
+                    // event is not finished - check again later
+                    return true;
+                }
+                else if (status != CL_COMPLETE)
+                {
+                    log_fpga.fatal() << "Error reported on FPGA " << fpga_device->name;
+                }
+
+                // this event has triggered
+                FPGAWorkFence *fence = 0;
+                {
+                    AutoLock<> al(mutex);
+
+                    const PendingEvent &e = pending_events.front();
+                    assert(e.opencl_event == opencl_event);
+                    fence = e.fence;
+                    notification = e.notification;
+                    pending_events.pop_front();
+
+                    if (pending_events.empty())
+                    {
+                        event_valid = false;
+                        work_left = has_work();
+                    }
+                    else
+                    {
+                        opencl_event = pending_events.front().opencl_event;
+                    }
+                }
+                if (fence)
+                {
+                    fence->mark_finished(true /*successful*/); // set preconditions for next tasks
+                }
+                if (notification)
+                {
+                    notification->request_completed();
+                }
+                if (event_valid && work_until.is_expired())
+                    return true;
+            }
+            // if we get here, we ran out of events, but there might have been
+            // other kinds of work that we need to let the caller know about
+            return work_left;
+        }
+
+        void FPGAQueue::add_copy(FPGADeviceMemcpy *copy)
+        {
+            bool add_to_worker = false;
+            {
+                AutoLock<> al(mutex);
+                // add if we haven't been added yet
+                add_to_worker =
+                    pending_copies.empty() && pending_events.empty();
+                pending_copies.push_back(copy);
+            }
+            if (add_to_worker)
+                fpga_worker->add_queue(this);
+        }
+
+        bool FPGAQueue::issue_copies(TimeLimit work_until)
+        {
+            while (true)
+            {
+                // if we cause the list to go empty,
+                // we stop even if more copies show
+                // up because we don't want to requeue ourselves twice
+                bool list_exhausted = false;
+                FPGADeviceMemcpy *copy = 0;
+                {
+                    AutoLock<> al(mutex);
+                    if (pending_copies.empty())
+                        // no copies left,
+                        // but queue might have other work left
+                        return has_work();
+                    copy = pending_copies.front();
+                    pending_copies.pop_front();
+                    list_exhausted = !has_work();
+                }
+                copy->execute(this);
+                delete copy;
+                // if the list was exhausted, let the caller know
+                if (list_exhausted)
+                    return false;
+
+                // if we still have work, but time's up, return also
+                if (work_until.is_expired())
+                    return true;
+            }
+            return false; // should never reach here
+        }
+
+        FPGADevice::FPGADevice(cl::Device &device, std::string name, std::string xclbin, FPGAWorker *fpga_worker, size_t fpga_coprocessor_num_cu, std::string fpga_coprocessor_kernel)
+            : name(name), fpga_worker(fpga_worker), fpga_coprocessor_num_cu(fpga_coprocessor_num_cu), fpga_coprocessor_kernel(fpga_coprocessor_kernel)
+        {
+            cl_int err;
+            this->device = device;
+            // Create FPGA context and command queue for each device
+            OCL_CHECK(err, context = cl::Context(device, nullptr, nullptr, nullptr, &err));
+            OCL_CHECK(err, command_queue = cl::CommandQueue(context, device,
+                                                            CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err));
+
+            // Program the device
+            auto fileBuf = xcl::read_binary_file(xclbin);
+            cl::Program::Binaries bins{{fileBuf.data(), fileBuf.size()}};
+            log_fpga.info() << "Trying to program device " << device.getInfo<CL_DEVICE_NAME>();
+            OCL_CHECK(err, program = cl::Program(context, {device}, bins, nullptr, &err));
+
+            fpga_mem = nullptr;
+            local_sysmem = nullptr;
+            local_ibmem = nullptr;
+            fpga_queue = nullptr;
+            create_fpga_queues();
+        }
+
+        FPGADevice::~FPGADevice()
+        {
+            if (fpga_queue != nullptr)
+            {
+                delete fpga_queue;
+                fpga_queue = nullptr;
+            }
+            command_queue.finish();
+        }
+
+        void FPGADevice::create_dma_channels(RuntimeImpl *runtime)
+        {
+            if (!fpga_mem)
+            {
+                return;
+            }
+
+            const std::vector<MemoryImpl *> &local_mems = runtime->nodes[Network::my_node_id].memories;
+            for (std::vector<Realm::MemoryImpl *>::const_iterator it = local_mems.begin();
+                 it != local_mems.end();
+                 it++)
+            {
+                if ((*it)->lowlevel_kind == Memory::SYSTEM_MEM)
+                {
+                    this->local_sysmem = *it;
+                    log_fpga.info() << "local_sysmem " << std::hex << (*it)->me.id << std::dec << " kind: " << (*it)->kind << " low-level kind: " << (*it)->lowlevel_kind;
+                    break;
+                }
+            }
+
+            const std::vector<IBMemory *> &local_ib_mems = runtime->nodes[Network::my_node_id].ib_memories;
+            for (std::vector<Realm::IBMemory *>::const_iterator it = local_ib_mems.begin();
+                 it != local_ib_mems.end();
+                 it++)
+            {
+                if ((*it)->lowlevel_kind == Memory::REGDMA_MEM)
+                {
+                    this->local_ibmem = *it;
+                    log_fpga.info() << "local_ibmem " << std::hex << (*it)->me.id << std::dec << " kind: " << (*it)->kind << " low-level kind: " << (*it)->lowlevel_kind;
+                    break;
+                }
+            }
+
+            runtime->add_dma_channel(new FPGAfillChannel(this, &runtime->bgwork));
+            runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_IN_DEV, &runtime->bgwork));
+            runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_TO_DEV, &runtime->bgwork));
+            runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_FROM_DEV, &runtime->bgwork));
+            runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_COMP, &runtime->bgwork));
+
+            Machine::MemoryMemoryAffinity mma;
+            mma.m1 = fpga_mem->me;
+            mma.m2 = local_sysmem->me;
+            mma.bandwidth = 20; // TODO
+            mma.latency = 200;
+            runtime->add_mem_mem_affinity(mma);
+
+            mma.m1 = fpga_mem->me;
+            mma.m2 = local_ibmem->me;
+            mma.bandwidth = 20; // TODO
+            mma.latency = 200;
+            runtime->add_mem_mem_affinity(mma);
+
+            // TODO: create p2p channel
+            // runtime->add_dma_channel(new FPGAChannel(this, XFER_FPGA_PEER_DEV, &runtime->bgwork));
+        }
+
+        void FPGADevice::create_fpga_mem(RuntimeImpl *runtime, size_t size)
+        {
+            // TODO: only use membank 0 for now
+            cl_int err;
+            void *base_ptr_sys = nullptr;
+            posix_memalign((void **)&base_ptr_sys, 4096, size);
+            OCL_CHECK(err, buff = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size, base_ptr_sys, &err));
+            Memory m = runtime->next_local_memory_id();
+            this->fpga_mem = new FPGADeviceMemory(m, this, base_ptr_sys, size);
+            runtime->add_memory(fpga_mem);
+            log_fpga.info() << "create_fpga_mem: "
+                            << device.getInfo<CL_DEVICE_NAME>()
+                            << ", size = "
+                            << (size >> 20) << " MB"
+                            << ", base_ptr_sys = " << base_ptr_sys;
+        }
+
+        void FPGADevice::create_fpga_ib(RuntimeImpl *runtime, size_t size)
+        {
+            // TODO: only use membank 0 for now
+            cl_int err;
+            void *base_ptr_sys = nullptr;
+            posix_memalign((void **)&base_ptr_sys, 4096, size);
+            OCL_CHECK(err, ib_buff = cl::Buffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, size, base_ptr_sys, &err));
+            Memory m = runtime->next_local_ib_memory_id();
+            IBMemory *ib_mem;
+            ib_mem = new IBMemory(m, size,
+                                  MemoryImpl::MKIND_FPGA, Memory::FPGA_MEM,
+                                  base_ptr_sys, 0);
+            this->fpga_ib = ib_mem;
+            runtime->add_ib_memory(ib_mem);
+            log_fpga.info() << "create_fpga_ib: "
+                            << device.getInfo<CL_DEVICE_NAME>()
+                            << ", size = "
+                            << (size >> 20) << " MB"
+                            << ", base_ptr_sys = " << base_ptr_sys;
+        }
+
+        void FPGADevice::create_fpga_queues()
+        {
+            fpga_queue = new FPGAQueue(this, fpga_worker, command_queue);
+        }
+
+        void FPGADevice::copy_to_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification)
+        {
+            log_fpga.info() << "copy_to_fpga: src = " << src << " dst = " << dst
+                            << " src_offset = " << src_offset << "dst_offset = " << dst_offset
+                            << " bytes = " << bytes << " notification = " << notification;
+
+            FPGADeviceMemcpy *copy = new FPGADeviceMemcpy1D(this,
+                                                            dst,
+                                                            src,
+                                                            bytes,
+                                                            dst_offset,
+                                                            FPGA_MEMCPY_HOST_TO_DEVICE,
+                                                            notification);
+            fpga_queue->add_copy(copy);
+        }
+
+        void FPGADevice::copy_from_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification)
+        {
+            log_fpga.info() << "copy_from_fpga: src = " << src << " dst = " << dst
+                            << " src_offset = " << src_offset << "dst_offset = " << dst_offset
+                            << " bytes = " << bytes << " notification = " << notification;
+
+            FPGADeviceMemcpy *copy = new FPGADeviceMemcpy1D(this,
+                                                            dst,
+                                                            src,
+                                                            bytes,
+                                                            src_offset,
+                                                            FPGA_MEMCPY_DEVICE_TO_HOST,
+                                                            notification);
+            fpga_queue->add_copy(copy);
+        }
+
+        void FPGADevice::copy_within_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification)
+        {
+            log_fpga.info() << "copy_within_fpga: src = " << src << " dst = " << dst
+                            << " src_offset = " << src_offset << "dst_offset = " << dst_offset
+                            << " bytes = " << bytes << " notification = " << notification;
+            FPGADeviceMemcpy *copy = new FPGADeviceMemcpy1D(this,
+                                                            dst,
+                                                            src,
+                                                            bytes,
+                                                            dst_offset,
+                                                            FPGA_MEMCPY_DEVICE_TO_DEVICE,
+                                                            notification);
+            fpga_queue->add_copy(copy);
+        }
+
+        void FPGADevice::copy_to_peer(FPGADevice *dst_dev, void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification)
+        {
+            log_fpga.info() << "copy_to_peer(not implemented!): dst_dev = " << dst_dev
+                            << "src = " << src << " dst = " << dst
+                            << " src_offset = " << src_offset << "dst_offset = " << dst_offset
+                            << " bytes = " << bytes << " notification = " << notification;
+            assert(0);
+        }
+
+        // TODO: FPGA coprocessor kernels are invoked in this function
+        void FPGADevice::comp(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification)
+        {
+            log_fpga.info() << "comp: src = " << src << " dst = " << dst
+                            << " src_offset = " << src_offset << "dst_offset = " << dst_offset
+                            << " bytes = " << bytes << " notification = " << notification;
+            // An example of invoking an FPGA coprocessor kernel
+            // program device
+            int num_cu = fpga_coprocessor_num_cu;
+            std::vector<cl::Kernel> krnls(num_cu);
+            cl_int err;
+            // Creating Kernel objects
+            for (int i = 0; i < num_cu; i++)
+            {
+                OCL_CHECK(err, krnls[i] = cl::Kernel(program, fpga_coprocessor_kernel.c_str(), &err));
+            }
+            // Creating sub-buffers
+            auto chunk_size = bytes / num_cu;
+            size_t vector_size_bytes = sizeof(int) * chunk_size;
+            std::vector<cl::Buffer> buffer_in1(num_cu);
+            std::vector<cl::Buffer> buffer_in2(num_cu);
+            std::vector<cl::Buffer> buffer_output(num_cu);
+
+            // I/O data vectors
+            std::vector<int, aligned_allocator<int>> source_in2(bytes, 1);
+
+            for (int i = 0; i < num_cu; i++)
+            {
+                OCL_CHECK(err, buffer_in2[i] = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_size_bytes,
+                                                          source_in2.data() + i * chunk_size, &err));
+            }
+            for (int i = 0; i < num_cu; i++)
+            {
+                cl_buffer_region buffer_in1_info = {src_offset + i * vector_size_bytes, vector_size_bytes};
+                OCL_CHECK(err, buffer_in1[i] = ib_buff.createSubBuffer(CL_MEM_READ_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_in1_info, &err));
+                cl_buffer_region buffer_output_info = {dst_offset + i * vector_size_bytes, vector_size_bytes};
+                OCL_CHECK(err, buffer_output[i] = ib_buff.createSubBuffer(CL_MEM_WRITE_ONLY, CL_BUFFER_CREATE_TYPE_REGION, &buffer_output_info, &err));
+            }
+
+            for (int i = 0; i < num_cu; i++)
+            {
+                int narg = 0;
+
+                // Setting kernel arguments
+                OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in1[i]));
+                OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_in2[i]));
+                OCL_CHECK(err, err = krnls[i].setArg(narg++, buffer_output[i]));
+                OCL_CHECK(err, err = krnls[i].setArg(narg++, (int)chunk_size));
+            }
+
+            cl::Event task_events[num_cu];
+            for (int i = 0; i < num_cu; i++)
+            {
+                // Launch the kernel
+                OCL_CHECK(err, err = command_queue.enqueueTask(krnls[i], nullptr, &task_events[i]));
+            }
+
+            std::vector<cl::Event> wait_events[num_cu];
+            // Copy result from device global memory to host local memory
+            for (int i = 0; i < num_cu; i++)
+            {
+                wait_events[i].push_back(task_events[i]);
+                OCL_CHECK(err, err = command_queue.enqueueMigrateMemObjects({buffer_output[i]}, CL_MIGRATE_MEM_OBJECT_HOST, &wait_events[i], nullptr));
+            }
+
+            OCL_CHECK(err, err = command_queue.flush());
+            fpga_queue->add_notification(notification);
+        }
+
+        bool FPGADevice::is_in_buff(void *ptr)
+        {
+            uint64_t base_ptr = (uint64_t)(fpga_mem->base_ptr_sys);
+            if ((uint64_t)ptr >= base_ptr && (uint64_t)ptr <= base_ptr + fpga_mem->size)
+            {
+                return true;
+            }
+            return false;
+        }
+
+        bool FPGADevice::is_in_ib_buff(void *ptr)
+        {
+            uint64_t base_ptr = (uint64_t)(fpga_ib->get_direct_ptr(0, 0));
+            if ((uint64_t)ptr >= base_ptr && (uint64_t)ptr <= base_ptr + fpga_ib->size)
+            {
+                return true;
+            }
+            return false;
+        }
+
+        /**
+         * Device Memory Copy Operations
+         */
+        FPGADeviceMemcpy::FPGADeviceMemcpy(FPGADevice *fpga_device,
+                                           FPGAMemcpyKind kind,
+                                           FPGACompletionNotification *notification)
+            : fpga_device(fpga_device), kind(kind), notification(notification)
+        {
+        }
+
+        /**
+         * 1D Memory Copy Operation
+         */
+        FPGADeviceMemcpy1D::FPGADeviceMemcpy1D(FPGADevice *fpga_device,
+                                               void *dst,
+                                               const void *src,
+                                               size_t bytes,
+                                               off_t buff_offset,
+                                               FPGAMemcpyKind kind,
+                                               FPGACompletionNotification *notification)
+            : FPGADeviceMemcpy(fpga_device, kind, notification),
+              dst(dst), src(src), elmt_size(bytes), buff_offset(buff_offset)
+        {
+        }
+
+        FPGADeviceMemcpy1D::~FPGADeviceMemcpy1D(void)
+        {
+        }
+
+        void FPGADeviceMemcpy1D::do_span(off_t pos, size_t len)
+        {
+            off_t span_start = pos * elmt_size;
+            size_t span_bytes = len * elmt_size;
+            void *dstptr = ((uint8_t *)dst) + span_start;
+            void *srcptr = ((uint8_t *)src) + span_start;
+            size_t size = span_bytes;
+            cl_int err = 0;
+            log_fpga.debug() << "do_span: buff_offset " << buff_offset << " size " << size << " srcptr " << srcptr << " dstptr " << dstptr;
+            cl::Buffer *temp_buff = nullptr;
+            if (kind == FPGA_MEMCPY_HOST_TO_DEVICE)
+            {
+                if (fpga_device->is_in_ib_buff(dstptr))
+                    temp_buff = &(fpga_device->ib_buff);
+                else if (fpga_device->is_in_buff(dstptr))
+                    temp_buff = &(fpga_device->buff);
+                else
+                {
+                    log_fpga.error() << "dstptr is not in buffer or ib_buffer";
+                    assert(0);
+                }
+                OCL_CHECK(err, err = fpga_queue->get_command_queue().enqueueWriteBuffer(*temp_buff,     // buffer on the FPGA
+                                                                                        CL_FALSE,       // blocking call
+                                                                                        buff_offset,    // buffer offset in bytes
+                                                                                        size,           // Size in bytes
+                                                                                        (void *)srcptr, // Pointer to the data to copy
+                                                                                        nullptr, nullptr));
+            }
+            else if (kind == FPGA_MEMCPY_DEVICE_TO_HOST)
+            {
+                if (fpga_device->is_in_ib_buff(srcptr))
+                    temp_buff = &(fpga_device->ib_buff);
+                else if (fpga_device->is_in_buff(srcptr))
+                    temp_buff = &(fpga_device->buff);
+                else
+                {
+                    log_fpga.error() << "srcptr is not in buffer or ib_buffer";
+                    assert(0);
+                }
+                OCL_CHECK(err, err = fpga_queue->get_command_queue().enqueueReadBuffer(*temp_buff,     // buffer on the FPGA
+                                                                                       CL_FALSE,       // blocking call
+                                                                                       buff_offset,    // buffer offset in bytes
+                                                                                       size,           // Size in bytes
+                                                                                       (void *)dstptr, // Pointer to the data to copy
+                                                                                       nullptr, nullptr));
+            }
+            else if (kind == FPGA_MEMCPY_DEVICE_TO_DEVICE)
+            {
+                if (fpga_device->is_in_ib_buff(dstptr))
+                    temp_buff = &(fpga_device->ib_buff);
+                else if (fpga_device->is_in_buff(dstptr))
+                    temp_buff = &(fpga_device->buff);
+                else
+                {
+                    log_fpga.error() << "dstptr is not in buffer or ib_buffer";
+                    assert(0);
+                }
+                OCL_CHECK(err, err = fpga_queue->get_command_queue().enqueueWriteBuffer(*temp_buff,     // buffer on the FPGA
+                                                                                        CL_FALSE,       // blocking call
+                                                                                        buff_offset,    // buffer offset in bytes
+                                                                                        size,           // Size in bytes
+                                                                                        (void *)srcptr, // Pointer to the data to copy
+                                                                                        nullptr, nullptr));
+            }
+            else if (kind == FPGA_MEMCPY_PEER_TO_PEER)
+            {
+                log_fpga.error() << "FPGA_MEMCPY_PEER_TO_PEER not implemented";
+                assert(0);
+            }
+            else
+            {
+                log_fpga.error() << "FPGADeviceMemcpy kind error";
+                assert(0);
+            }
+            OCL_CHECK(err, err = fpga_queue->get_command_queue().flush());
+        }
+
+        void FPGADeviceMemcpy1D::execute(FPGAQueue *queue)
+        {
+            log_fpga.info("FPGADevice memcpy: dst=%p src=%p bytes=%zd kind=%d",
+                          dst, src, elmt_size, kind);
+            // save queue into local variable
+            // for do_span (which may be called indirectly by ElementMask::forall_ranges)
+            fpga_queue = queue;
+            do_span(0, 1);
+            if (notification)
+            {
+                fpga_queue->add_notification(notification);
+            }
+            log_fpga.info("fpga memcpy 1d issued: dst=%p src=%p bytes=%zd kind=%d",
+                          dst, src, elmt_size, kind);
+        }
+
+        FPGAModule::FPGAModule()
+            : Module("fpga"), cfg_num_fpgas(0), cfg_use_worker_threads(false),
+              cfg_use_shared_worker(true), cfg_fpga_mem_size(4 << 20), cfg_fpga_ib_size(4 << 20),
+              cfg_fpga_coprocessor_num_cu(1)
+        {
+            shared_worker = nullptr;
+            cfg_fpga_xclbin = "";
+            fpga_devices.clear();
+            dedicated_workers.clear();
+            fpga_procs_.clear();
+            cfg_fpga_coprocessor_kernel = "";
+        }
+
+        FPGAModule::~FPGAModule(void)
+        {
+            if (!this->fpga_devices.empty())
+            {
+                for (size_t i = 0; i < fpga_devices.size(); i++)
+                {
+
+                    // xclClose(fpga_devices[i]->dev_handle);
+                    delete this->fpga_devices[i];
+                }
+            }
+        }
+
+        Module *FPGAModule::create_module(RuntimeImpl *runtime, std::vector<std::string> &cmdline)
+        {
+            FPGAModule *m = new FPGAModule;
+            log_fpga.info() << "create_module";
+            // first order of business - read command line parameters
+            {
+                Realm::CommandLineParser cp;
+                cp.add_option_int("-ll:fpga", m->cfg_num_fpgas);
+                cp.add_option_bool("-ll:fpga_work_thread", m->cfg_use_worker_threads);
+                cp.add_option_bool("-ll:fpga_shared_worker", m->cfg_use_shared_worker);
+                cp.add_option_int_units("-ll:fpga_size", m->cfg_fpga_mem_size, 'm');
+                cp.add_option_int_units("-ll:fpga_ib_size", m->cfg_fpga_ib_size, 'm');
+                cp.add_option_string("-ll:fpga_xclbin", m->cfg_fpga_xclbin);
+                cp.add_option_int("-ll:fpga_coprocessor_num_cu", m->cfg_fpga_coprocessor_num_cu);
+                cp.add_option_string("-ll:fpga_coprocessor_kernel", m->cfg_fpga_coprocessor_kernel);
+
+                bool ok = cp.parse_command_line(cmdline);
+                if (!ok)
+                {
+                    log_fpga.error() << "error reading fpga parameters";
+                    exit(1);
+                }
+            }
+            return m;
+        }
+
+        // do any general initialization - this is called after all configuration is complete
+        void FPGAModule::initialize(RuntimeImpl *runtime)
+        {
+            log_fpga.info() << "initialize";
+            Module::initialize(runtime);
+
+            std::vector<cl::Device> devices = xcl::get_xil_devices();
+            if (cfg_num_fpgas > devices.size())
+            {
+                log_fpga.error() << cfg_num_fpgas << " FPGA Processors requested, but only " << devices.size() << " available!";
+                exit(1);
+            }
+
+            // if we are using a shared worker, create that next
+            if (cfg_use_shared_worker)
+            {
+                shared_worker = new FPGAWorker;
+                if (cfg_use_worker_threads)
+                {
+                    shared_worker->start_background_thread(
+                        runtime->core_reservation_set(),
+                        1 << 20); // hardcoded worker stack size
+                }
+                else
+                    shared_worker->add_to_manager(&(runtime->bgwork));
+            }
+
+            // set up fpga_devices
+            for (unsigned int i = 0; i < cfg_num_fpgas; i++)
+            {
+                // for each device record the FPGAWorker
+                FPGAWorker *worker;
+                if (cfg_use_shared_worker)
+                {
+                    worker = shared_worker;
+                }
+                else
+                {
+                    worker = new FPGAWorker;
+                    if (cfg_use_worker_threads)
+                        worker->start_background_thread(
+                            runtime->core_reservation_set(),
+                            1 << 20); // hardcoded worker stack size
+                    else
+                        worker->add_to_manager(&(runtime->bgwork));
+                }
+
+                FPGADevice *fpga_device = new FPGADevice(devices[i], "fpga" + std::to_string(i), cfg_fpga_xclbin, worker, cfg_fpga_coprocessor_num_cu, cfg_fpga_coprocessor_kernel);
+                fpga_devices.push_back(fpga_device);
+
+                if (!cfg_use_shared_worker)
+                {
+                    log_fpga.info() << "add to dedicated workers " << worker;
+                    dedicated_workers[fpga_device] = worker;
+                }
+            }
+        }
+
+        // create any memories provided by this module (default == do nothing)
+        //  (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id)
+        void FPGAModule::create_memories(RuntimeImpl *runtime)
+        {
+            log_fpga.info() << "create_memories";
+            Module::create_memories(runtime);
+            if (cfg_fpga_mem_size > 0)
+            {
+                for (size_t i = 0; i < cfg_num_fpgas; i++)
+                {
+                    fpga_devices[i]->create_fpga_mem(runtime, cfg_fpga_mem_size);
+                }
+            }
+            if (cfg_fpga_ib_size > 0)
+            {
+                for (size_t i = 0; i < cfg_num_fpgas; i++)
+                {
+                    fpga_devices[i]->create_fpga_ib(runtime, cfg_fpga_ib_size);
+                }
+            }
+        }
+
+        // create any processors provided by the module (default == do nothing)
+        //  (each new ProcessorImpl should use a Processor from RuntimeImpl::next_local_processor_id)
+        void FPGAModule::create_processors(RuntimeImpl *runtime)
+        {
+            Module::create_processors(runtime);
+            for (size_t i = 0; i < cfg_num_fpgas; i++)
+            {
+                Processor p = runtime->next_local_processor_id();
+                FPGAProcessor *proc = new FPGAProcessor(fpga_devices[i], p, runtime->core_reservation_set());
+                fpga_procs_.push_back(proc);
+                runtime->add_processor(proc);
+                log_fpga.info() << "create fpga processor " << i;
+
+                // create mem affinities to add a proc to machine model
+                // create affinities between this processor and system/reg memories
+                // if the memory is one we created, use the kernel-reported distance
+                // to adjust the answer
+                std::vector<MemoryImpl *> &local_mems = runtime->nodes[Network::my_node_id].memories;
+                for (std::vector<MemoryImpl *>::iterator it = local_mems.begin();
+                     it != local_mems.end();
+                     ++it)
+                {
+                    Memory::Kind kind = (*it)->get_kind();
+                    if (kind == Memory::SYSTEM_MEM or kind == Memory::FPGA_MEM)
+                    {
+                        Machine::ProcessorMemoryAffinity pma;
+                        pma.p = p;
+                        pma.m = (*it)->me;
+
+                        // use the same made-up numbers as in
+                        //  runtime_impl.cc
+                        if (kind == Memory::SYSTEM_MEM)
+                        {
+                            pma.bandwidth = 100; // "large"
+                            pma.latency = 5;     // "small"
+                        }
+                        else if (kind == Memory::FPGA_MEM)
+                        {
+                            pma.bandwidth = 200; // "large"
+                            pma.latency = 10;    // "small"
+                        }
+                        else
+                        {
+                            assert(0 && "wrong memory kind");
+                        }
+
+                        runtime->add_proc_mem_affinity(pma);
+                    }
+                }
+            }
+        }
+
+        // create any DMA channels provided by the module (default == do nothing)
+        void FPGAModule::create_dma_channels(RuntimeImpl *runtime)
+        {
+            log_fpga.info() << "create_dma_channels";
+            for (std::vector<FPGADevice *>::iterator it = fpga_devices.begin(); it != fpga_devices.end(); it++)
+            {
+                (*it)->create_dma_channels(runtime);
+            }
+
+            Module::create_dma_channels(runtime);
+        }
+
+        // create any code translators provided by the module (default == do nothing)
+        void FPGAModule::create_code_translators(RuntimeImpl *runtime)
+        {
+            log_fpga.info() << "create_code_translators";
+            Module::create_code_translators(runtime);
+        }
+
+        // clean up any common resources created by the module - this will be called
+        //  after all memories/processors/etc. have been shut down and destroyed
+        void FPGAModule::cleanup(void)
+        {
+            log_fpga.info() << "cleanup";
+            // clean up worker(s)
+            if (shared_worker)
+            {
+#ifdef DEBUG_REALM
+                shared_worker->shutdown_work_item();
+#endif
+                if (cfg_use_worker_threads)
+                    shared_worker->shutdown_background_thread();
+
+                delete shared_worker;
+                shared_worker = 0;
+            }
+            for (std::map<FPGADevice *, FPGAWorker *>::iterator it = dedicated_workers.begin();
+                 it != dedicated_workers.end();
+                 it++)
+            {
+                log_fpga.info() << "shutdown worker in cleanup";
+                FPGAWorker *worker = it->second;
+#ifdef DEBUG_REALM
+                worker->shutdown_work_item();
+#endif
+                if (cfg_use_worker_threads)
+                    worker->shutdown_background_thread();
+
+                delete worker;
+            }
+            dedicated_workers.clear();
+            Module::cleanup();
+        }
+
+        template <typename T>
+        class FPGATaskScheduler : public T
+        {
+        public:
+            FPGATaskScheduler(Processor proc, Realm::CoreReservation &core_rsrv, FPGAProcessor *fpga_proc);
+            virtual ~FPGATaskScheduler(void);
+
+        protected:
+            virtual bool execute_task(Task *task);
+            virtual void execute_internal_task(InternalTask *task);
+            FPGAProcessor *fpga_proc_;
+        };
+
+        template <typename T>
+        FPGATaskScheduler<T>::FPGATaskScheduler(Processor proc,
+                                                Realm::CoreReservation &core_rsrv,
+                                                FPGAProcessor *fpga_proc) : T(proc, core_rsrv), fpga_proc_(fpga_proc)
+        {
+        }
+
+        template <typename T>
+        FPGATaskScheduler<T>::~FPGATaskScheduler(void)
+        {
+        }
+
+        template <typename T>
+        bool FPGATaskScheduler<T>::execute_task(Task *task)
+        {
+            assert(ThreadLocal::current_fpga_proc == NULL);
+            ThreadLocal::current_fpga_proc = fpga_proc_;
+            FPGAQueue *queue = fpga_proc_->fpga_device->fpga_queue;
+            log_fpga.info() << "execute_task " << task;
+
+            // we'll use a "work fence" to track when the kernels launched by this task actually
+            //  finish - this must be added to the task _BEFORE_ we execute
+            FPGAWorkFence *fence = new FPGAWorkFence(task);
+            task->add_async_work_item(fence);
+            bool ok = T::execute_task(task);
+            fence->enqueue(queue);
+
+            assert(ThreadLocal::current_fpga_proc == fpga_proc_);
+            ThreadLocal::current_fpga_proc = NULL;
+            return ok;
+        }
+
+        template <typename T>
+        void FPGATaskScheduler<T>::execute_internal_task(InternalTask *task)
+        {
+            assert(ThreadLocal::current_fpga_proc == NULL);
+            ThreadLocal::current_fpga_proc = fpga_proc_;
+            log_fpga.info() << "execute_internal_task";
+            T::execute_internal_task(task);
+            assert(ThreadLocal::current_fpga_proc == fpga_proc_);
+            ThreadLocal::current_fpga_proc = NULL;
+        }
+
+        FPGAProcessor::FPGAProcessor(FPGADevice *fpga_device, Processor me, Realm::CoreReservationSet &crs)
+            : LocalTaskProcessor(me, Processor::FPGA_PROC)
+        {
+            log_fpga.info() << "FPGAProcessor()";
+            this->fpga_device = fpga_device;
+            Realm::CoreReservationParameters params;
+            params.set_num_cores(1);
+            params.set_alu_usage(params.CORE_USAGE_SHARED);
+            params.set_fpu_usage(params.CORE_USAGE_SHARED);
+            params.set_ldst_usage(params.CORE_USAGE_SHARED);
+            params.set_max_stack_size(2 << 20);
+            std::string name = stringbuilder() << "fpga proc " << me;
+            core_rsrv_ = new Realm::CoreReservation(name, crs, params);
+
+#ifdef REALM_USE_USER_THREADS
+            UserThreadTaskScheduler *sched = new FPGATaskScheduler<UserThreadTaskScheduler>(me, *core_rsrv_, this);
+#else
+            KernelThreadTaskScheduler *sched = new FPGATaskScheduler<KernelThreadTaskScheduler>(me, *core_rsrv_, this);
+#endif
+            set_scheduler(sched);
+        }
+
+        FPGAProcessor::~FPGAProcessor(void)
+        {
+            delete core_rsrv_;
+        }
+
+        FPGAProcessor *FPGAProcessor::get_current_fpga_proc(void)
+        {
+            return ThreadLocal::current_fpga_proc;
+        }
+
+        FPGADeviceMemory::FPGADeviceMemory(Memory memory, FPGADevice *device, void *base_ptr_sys, size_t size)
+            : LocalManagedMemory(memory, size, MKIND_FPGA, 512, Memory::FPGA_MEM, NULL), base_ptr_sys(base_ptr_sys)
+        {
+        }
+
+        FPGADeviceMemory::~FPGADeviceMemory(void)
+        {
+            // this function is invoked before ~FPGADevice
+            if (base_ptr_sys != nullptr)
+            {
+                free(base_ptr_sys);
+                base_ptr_sys = nullptr;
+            }
+        }
+
+        void FPGADeviceMemory::get_bytes(off_t src_offset, void *dst, size_t size)
+        {
+
+            FPGACompletionEvent n; // TODO: fix me
+            void *src = (void *)((uint8_t *)(base_ptr_sys) + src_offset);
+            off_t dst_offset = (uint8_t *)dst - (uint8_t *)(base_ptr_sys);
+            get_device()->copy_from_fpga(dst, src, dst_offset, src_offset, size, &n);
+            n.request_completed();
+        }
+
+        void FPGADeviceMemory::put_bytes(off_t dst_offset, const void *src, size_t size)
+        {
+            FPGACompletionEvent n; // TODO: fix me
+            void *dst = (void *)((uint8_t *)(base_ptr_sys) + dst_offset);
+            off_t src_offset = (uint8_t *)src - (uint8_t *)(base_ptr_sys);
+            get_device()->copy_to_fpga(dst, src, dst_offset, src_offset, size, &n);
+            n.request_completed();
+        }
+
+        void *FPGADeviceMemory::get_direct_ptr(off_t offset, size_t size)
+        {
+            return (void *)((uint8_t *)base_ptr_sys + offset);
+        }
+
+        void FPGACompletionEvent::request_completed(void)
+        {
+            log_fpga.info() << "request_completed " << req;
+            req->xd->notify_request_read_done(req);
+            req->xd->notify_request_write_done(req);
+        }
+
+        FPGAXferDes::FPGAXferDes(uintptr_t _dma_op, Channel *_channel,
+                                 NodeID _launch_node, XferDesID _guid,
+                                 const std::vector<XferDesPortInfo> &inputs_info,
+                                 const std::vector<XferDesPortInfo> &outputs_info,
+                                 int _priority, XferDesKind kind)
+            : XferDes(_dma_op, _channel, _launch_node, _guid,
+                      inputs_info, outputs_info,
+                      _priority, 0, 0)
+        {
+            if ((inputs_info.size() >= 1) &&
+                (input_ports[0].mem->kind == MemoryImpl::MKIND_FPGA))
+            {
+                // all input ports should agree on which fpga they target
+                src_fpga = ((FPGADeviceMemory *)(input_ports[0].mem))->device;
+                for (size_t i = 1; i < input_ports.size(); i++)
+                {
+                    // exception: control and indirect ports should be readable from cpu
+                    if ((int(i) == input_control.control_port_idx) ||
+                        (int(i) == output_control.control_port_idx) ||
+                        input_ports[i].is_indirect_port)
+                    {
+                        assert((input_ports[i].mem->kind == MemoryImpl::MKIND_SYSMEM));
+                        continue;
+                    }
+                    assert(input_ports[i].mem == input_ports[0].mem);
+                }
+            }
+            else
+            {
+                src_fpga = 0;
+            }
+
+            if ((outputs_info.size() >= 1) &&
+                (output_ports[0].mem->kind == MemoryImpl::MKIND_FPGA))
+            {
+                // all output ports should agree on which adev they target
+                dst_fpga = ((FPGADeviceMemory *)(output_ports[0].mem))->device;
+                for (size_t i = 1; i < output_ports.size(); i++)
+                    assert(output_ports[i].mem == output_ports[0].mem);
+            }
+            else
+            {
+                dst_fpga = 0;
+            }
+
+            // if we're doing a multi-hop copy, we'll dial down the request
+            //  sizes to improve pipelining
+            bool multihop_copy = false;
+            for (size_t i = 1; i < input_ports.size(); i++)
+                if (input_ports[i].peer_guid != XFERDES_NO_GUID)
+                    multihop_copy = true;
+            for (size_t i = 1; i < output_ports.size(); i++)
+                if (output_ports[i].peer_guid != XFERDES_NO_GUID)
+                    multihop_copy = true;
+
+            log_fpga.info() << "create FPGAXferDes " << kind;
+            this->kind = kind;
+            switch (kind)
+            {
+            case XFER_FPGA_TO_DEV:
+                if (multihop_copy)
+                    max_req_size = 4 << 20;
+                break;
+            case XFER_FPGA_FROM_DEV:
+                if (multihop_copy)
+                    max_req_size = 4 << 20;
+                break;
+            case XFER_FPGA_IN_DEV:
+                max_req_size = 1 << 30;
+                break;
+            case XFER_FPGA_PEER_DEV:
+                max_req_size = 256 << 20;
+                break;
+            case XFER_FPGA_COMP:
+                if (multihop_copy)
+                    max_req_size = 4 << 20;
+                break;
+            default:
+                break;
+            }
+            const int max_nr = 10; // TODO:FIXME
+            for (int i = 0; i < max_nr; i++)
+            {
+                FPGARequest *fpga_req = new FPGARequest;
+                fpga_req->xd = this;
+                fpga_req->event.req = fpga_req;
+                available_reqs.push(fpga_req);
+            }
+        }
+
+        long FPGAXferDes::get_requests(Request **requests, long nr)
+        {
+            FPGARequest **reqs = (FPGARequest **)requests;
+            // no do allow 2D and 3D copies
+            // unsigned flags = (TransferIterator::LINES_OK |
+            //                   TransferIterator::PLANES_OK);
+            unsigned flags = 0;
+            long new_nr = default_get_requests(requests, nr, flags);
+            for (long i = 0; i < new_nr; i++)
+            {
+                switch (kind)
+                {
+                case XFER_FPGA_TO_DEV:
+                {
+                    reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes);
+                    reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes);
+                    assert(reqs[i]->src_base != 0);
+                    assert(reqs[i]->dst_base != 0);
+                    break;
+                }
+                case XFER_FPGA_FROM_DEV:
+                {
+                    reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes);
+                    reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes);
+                    assert(reqs[i]->src_base != 0);
+                    assert(reqs[i]->dst_base != 0);
+                    break;
+                }
+                case XFER_FPGA_IN_DEV:
+                {
+                    reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes);
+                    reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes);
+                    assert(reqs[i]->src_base != 0);
+                    assert(reqs[i]->dst_base != 0);
+                    break;
+                }
+                case XFER_FPGA_PEER_DEV:
+                {
+                    reqs[i]->dst_fpga = dst_fpga;
+                    break;
+                }
+                case XFER_FPGA_COMP:
+                {
+                    reqs[i]->src_base = input_ports[reqs[i]->src_port_idx].mem->get_direct_ptr(reqs[i]->src_off, reqs[i]->nbytes);
+                    reqs[i]->dst_base = output_ports[reqs[i]->dst_port_idx].mem->get_direct_ptr(reqs[i]->dst_off, reqs[i]->nbytes);
+                    assert(reqs[i]->src_base != 0);
+                    assert(reqs[i]->dst_base != 0);
+                    break;
+                }
+                default:
+                    assert(0);
+                }
+            }
+            return new_nr;
+        }
+
+        bool FPGAXferDes::progress_xd(FPGAChannel *channel, TimeLimit work_until)
+        {
+            Request *rq;
+            bool did_work = false;
+            do
+            {
+                long count = get_requests(&rq, 1);
+                if (count > 0)
+                {
+                    channel->submit(&rq, count);
+                    did_work = true;
+                }
+                else
+                    break;
+            } while (!work_until.is_expired());
+            return did_work;
+        }
+
+        void FPGAXferDes::notify_request_read_done(Request *req)
+        {
+            default_notify_request_read_done(req);
+        }
+
+        void FPGAXferDes::notify_request_write_done(Request *req)
+        {
+            default_notify_request_write_done(req);
+        }
+
+        void FPGAXferDes::flush()
+        {
+        }
+
+        FPGAChannel::FPGAChannel(FPGADevice *_src_fpga, XferDesKind _kind, BackgroundWorkManager *bgwork)
+            : SingleXDQChannel<FPGAChannel, FPGAXferDes>(bgwork, _kind, "FPGA channel")
+        {
+            log_fpga.info() << "FPGAChannel(): " << (int)_kind;
+            src_fpga = _src_fpga;
+
+            Memory temp_fpga_mem = src_fpga->fpga_mem->me;
+            Memory temp_fpga_ib_mem = src_fpga->fpga_ib->me;
+            Memory temp_sys_mem = src_fpga->local_sysmem->me;
+            Memory temp_rdma_mem = src_fpga->local_ibmem->me;
+
+            switch (_kind)
+            {
+            case XFER_FPGA_TO_DEV:
+            {
+                unsigned bw = 10; // TODO
+                unsigned latency = 0;
+                unsigned frag_overhead = 0;
+                add_path(temp_sys_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_TO_DEV);
+                add_path(temp_rdma_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_TO_DEV);
+                add_path(temp_sys_mem, temp_fpga_ib_mem, bw, latency, frag_overhead, XFER_FPGA_TO_DEV);
+                break;
+            }
+            case XFER_FPGA_FROM_DEV:
+            {
+                unsigned bw = 10; // TODO
+                unsigned latency = 0;
+                unsigned frag_overhead = 0;
+                add_path(temp_fpga_mem, temp_sys_mem, bw, latency, frag_overhead, XFER_FPGA_FROM_DEV);
+                add_path(temp_fpga_mem, temp_rdma_mem, bw, latency, frag_overhead, XFER_FPGA_FROM_DEV);
+                add_path(temp_fpga_ib_mem, temp_sys_mem, bw, latency, frag_overhead, XFER_FPGA_FROM_DEV);
+                break;
+            }
+            case XFER_FPGA_IN_DEV:
+            {
+                // self-path
+                unsigned bw = 10; // TODO
+                unsigned latency = 0;
+                unsigned frag_overhead = 0;
+                add_path(temp_fpga_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_IN_DEV);
+                add_path(temp_fpga_ib_mem, temp_fpga_ib_mem, bw, latency, frag_overhead, XFER_FPGA_IN_DEV);
+                break;
+            }
+            case XFER_FPGA_PEER_DEV:
+            {
+                // just do paths to peers - they'll do the other side
+                assert(0 && "not implemented");
+                break;
+            }
+            case XFER_FPGA_COMP:
+            {
+                unsigned bw = 10; // TODO
+                unsigned latency = 1000;
+                unsigned frag_overhead = 0;
+                add_path(temp_fpga_mem, temp_fpga_mem, bw, latency, frag_overhead, XFER_FPGA_COMP);
+                add_path(temp_fpga_ib_mem, temp_fpga_ib_mem, bw, latency, frag_overhead, XFER_FPGA_COMP);
+                break;
+            }
+            default:
+                assert(0);
+            }
+        }
+
+        FPGAChannel::~FPGAChannel()
+        {
+        }
+
+        XferDes *FPGAChannel::create_xfer_des(uintptr_t dma_op,
+                                              NodeID launch_node,
+                                              XferDesID guid,
+                                              const std::vector<XferDesPortInfo> &inputs_info,
+                                              const std::vector<XferDesPortInfo> &outputs_info,
+                                              int priority,
+                                              XferDesRedopInfo redop_info,
+                                              const void *fill_data, size_t fill_size)
+        {
+            assert(redop_info.id == 0);
+            assert(fill_size == 0);
+            return new FPGAXferDes(dma_op, this, launch_node, guid,
+                                   inputs_info, outputs_info,
+                                   priority, kind);
+        }
+
+        long FPGAChannel::submit(Request **requests, long nr)
+        {
+            for (long i = 0; i < nr; i++)
+            {
+                FPGARequest *req = (FPGARequest *)requests[i];
+                // no serdez support
+                assert(req->xd->input_ports[req->src_port_idx].serdez_op == 0);
+                assert(req->xd->output_ports[req->dst_port_idx].serdez_op == 0);
+
+                // empty transfers don't need to bounce off the ADevice
+                if (req->nbytes == 0)
+                {
+                    req->xd->notify_request_read_done(req);
+                    req->xd->notify_request_write_done(req);
+                    continue;
+                }
+
+                switch (req->dim)
+                {
+                case Request::DIM_1D:
+                {
+                    switch (kind)
+                    {
+                    case XFER_FPGA_TO_DEV:
+                        src_fpga->copy_to_fpga(req->dst_base, req->src_base,
+                                               req->dst_off, req->src_off,
+                                               req->nbytes, &req->event);
+                        break;
+                    case XFER_FPGA_FROM_DEV:
+                        src_fpga->copy_from_fpga(req->dst_base, req->src_base,
+                                                 req->dst_off, req->src_off,
+                                                 req->nbytes, &req->event);
+                        break;
+                    case XFER_FPGA_IN_DEV:
+                        src_fpga->copy_within_fpga(req->dst_base, req->src_base,
+                                                   req->dst_off, req->src_off,
+                                                   req->nbytes, &req->event);
+                        break;
+                    case XFER_FPGA_PEER_DEV:
+                        src_fpga->copy_to_peer(req->dst_fpga,
+                                               req->dst_base, req->src_base,
+                                               req->dst_off, req->src_off,
+                                               req->nbytes, &req->event);
+                        break;
+                    case XFER_FPGA_COMP:
+                        src_fpga->comp(req->dst_base, req->src_base,
+                                       req->dst_off, req->src_off,
+                                       req->nbytes, &req->event);
+                        break;
+                    default:
+                        assert(0);
+                    }
+                    break;
+                }
+
+                case Request::DIM_2D:
+                {
+                    switch (kind)
+                    {
+                    case XFER_FPGA_TO_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_FROM_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_IN_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_PEER_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_COMP:
+                        assert(0 && "not implemented");
+                        break;
+                    default:
+                        assert(0);
+                    }
+                    break;
+                }
+
+                case Request::DIM_3D:
+                {
+                    switch (kind)
+                    {
+                    case XFER_FPGA_TO_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_FROM_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_IN_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_PEER_DEV:
+                        assert(0 && "not implemented");
+                        break;
+                    case XFER_FPGA_COMP:
+                        assert(0 && "not implemented");
+                        break;
+                    default:
+                        assert(0);
+                    }
+                    break;
+                }
+
+                default:
+                    assert(0);
+                }
+            }
+
+            return nr;
+        }
+
+        FPGAfillXferDes::FPGAfillXferDes(uintptr_t _dma_op, Channel *_channel,
+                                         NodeID _launch_node, XferDesID _guid,
+                                         const std::vector<XferDesPortInfo> &inputs_info,
+                                         const std::vector<XferDesPortInfo> &outputs_info,
+                                         int _priority,
+                                         const void *_fill_data, size_t _fill_size)
+            : XferDes(_dma_op, _channel, _launch_node, _guid,
+                      inputs_info, outputs_info,
+                      _priority, _fill_data, _fill_size)
+        {
+            kind = XFER_FPGA_IN_DEV;
+
+            // no direct input data for us
+            assert(input_control.control_port_idx == -1);
+            input_control.current_io_port = -1;
+        }
+
+        long FPGAfillXferDes::get_requests(Request **requests, long nr)
+        {
+            // unused
+            assert(0);
+            return 0;
+        }
+
+        bool FPGAfillXferDes::progress_xd(FPGAfillChannel *channel,
+                                          TimeLimit work_until)
+        {
+            bool did_work = false;
+            ReadSequenceCache rseqcache(this, 2 << 20);
+            WriteSequenceCache wseqcache(this, 2 << 20);
+
+            while (true)
+            {
+                size_t min_xfer_size = 4096; // TODO: make controllable
+                size_t max_bytes = get_addresses(min_xfer_size, &rseqcache);
+                if (max_bytes == 0)
+                    break;
+
+                XferPort *out_port = 0;
+                size_t out_span_start = 0;
+                if (output_control.current_io_port >= 0)
+                {
+                    out_port = &output_ports[output_control.current_io_port];
+                    out_span_start = out_port->local_bytes_total;
+                }
+
+                size_t total_bytes = 0;
+                if (out_port != 0)
+                {
+                    // input and output both exist - transfer what we can
+                    log_fpga.info() << "memfill chunk: min=" << min_xfer_size
+                                    << " max=" << max_bytes;
+
+                    uintptr_t out_base = reinterpret_cast<uintptr_t>(out_port->mem->get_direct_ptr(0, 0));
+                    uintptr_t initial_out_offset = out_port->addrcursor.get_offset();
+                    while (total_bytes < max_bytes)
+                    {
+                        AddressListCursor &out_alc = out_port->addrcursor;
+
+                        uintptr_t out_offset = out_alc.get_offset();
+
+                        // the reported dim is reduced for partially consumed address
+                        //  ranges - whatever we get can be assumed to be regular
+                        int out_dim = out_alc.get_dim();
+
+                        size_t bytes = 0;
+                        size_t bytes_left = max_bytes - total_bytes;
+                        // memfills don't need to be particularly big to achieve
+                        //  peak efficiency, so trim to something that takes
+                        //  10's of us to be responsive to the time limit
+                        // NOTE: have to be a little careful and make sure the limit
+                        //  is a multiple of the fill size - we'll make it a power-of-2
+                        const size_t TARGET_CHUNK_SIZE = 256 << 10; // 256KB
+                        if (bytes_left > TARGET_CHUNK_SIZE)
+                        {
+                            size_t max_chunk = fill_size;
+                            while (max_chunk < TARGET_CHUNK_SIZE)
+                                max_chunk <<= 1;
+                            bytes_left = std::min(bytes_left, max_chunk);
+                        }
+
+                        if (out_dim > 0)
+                        {
+                            size_t ocount = out_alc.remaining(0);
+
+                            // contig bytes is always the first dimension
+                            size_t contig_bytes = std::min(ocount, bytes_left);
+
+                            // catch simple 1D case first
+                            if ((contig_bytes == bytes_left) ||
+                                ((contig_bytes == ocount) && (out_dim == 1)))
+                            {
+                                bytes = contig_bytes;
+                                // we only have one element worth of data, so fill
+                                //  multiple elements by using a "2d" copy with a
+                                //  source stride of 0
+                                size_t repeat_count = contig_bytes / fill_size;
+#ifdef DEBUG_REALM
+                                assert((contig_bytes % fill_size) == 0);
+#endif
+                                fpga_memcpy_2d(out_base + out_offset, fill_size,
+                                               reinterpret_cast<uintptr_t>(fill_data), 0,
+                                               fill_size, repeat_count);
+                                out_alc.advance(0, bytes);
+                            }
+                            else
+                            {
+                                // grow to a 2D fill
+                                assert(0 && "FPGA 2D fill not implemented");
+                            }
+                        }
+                        else
+                        {
+                            // scatter adddress list
+                            assert(0);
+                        }
+
+#ifdef DEBUG_REALM
+                        assert(bytes <= bytes_left);
+#endif
+                        total_bytes += bytes;
+
+                        // stop if it's been too long, but make sure we do at least the
+                        //  minimum number of bytes
+                        if ((total_bytes >= min_xfer_size) && work_until.is_expired())
+                            break;
+                    }
+                    for (size_t i = 0; i < 50; i++)
+                    {
+                        printf("%d ", ((int *)(channel->fpga->fpga_mem->base_ptr_sys))[i]);
+                    }
+                    printf("\n");
+
+                    log_fpga.info() << "before write buffer, initial_out_offset " << initial_out_offset << " total_bytes " << total_bytes;
+
+                    cl_int err = 0;
+                    OCL_CHECK(err, err = channel->fpga->command_queue.enqueueWriteBuffer(channel->fpga->buff,                                                              // buffer on the FPGA
+                                                                                         CL_TRUE,                                                                          // blocking call
+                                                                                         initial_out_offset,                                                               // buffer offset in bytes
+                                                                                         total_bytes,                                                                      // Size in bytes
+                                                                                         (void *)((uint64_t)(channel->fpga->fpga_mem->base_ptr_sys) + initial_out_offset), // Pointer to the data to copy
+                                                                                         nullptr, nullptr));
+                    log_fpga.info() << "after write buffer";
+                }
+                else
+                {
+                    // fill with no output, so just count the bytes
+                    total_bytes = max_bytes;
+                }
+
+                // mem fill is always immediate, so handle both skip and copy with
+                //  the same code
+                wseqcache.add_span(output_control.current_io_port,
+                                   out_span_start, total_bytes);
+                out_span_start += total_bytes;
+
+                bool done = record_address_consumption(total_bytes, total_bytes);
+
+                did_work = true;
+
+                if (done || work_until.is_expired())
+                    break;
+            }
+
+            rseqcache.flush();
+            wseqcache.flush();
+
+            return did_work;
+        }
+
+        FPGAfillChannel::FPGAfillChannel(FPGADevice *_fpga, BackgroundWorkManager *bgwork)
+            : SingleXDQChannel<FPGAfillChannel, FPGAfillXferDes>(bgwork,
+                                                                 XFER_GPU_IN_FB,
+                                                                 "FPGA fill channel"),
+              fpga(_fpga)
+        {
+            Memory temp_fpga_mem = fpga->fpga_mem->me;
+
+            unsigned bw = 10; // TODO
+            unsigned latency = 0;
+            unsigned frag_overhead = 0;
+            add_path(Memory::NO_MEMORY, temp_fpga_mem,
+                     bw, latency, frag_overhead, XFER_FPGA_IN_DEV);
+
+            xdq.add_to_manager(bgwork);
+        }
+
+        XferDes *FPGAfillChannel::create_xfer_des(uintptr_t dma_op,
+                                                  NodeID launch_node,
+                                                  XferDesID guid,
+                                                  const std::vector<XferDesPortInfo> &inputs_info,
+                                                  const std::vector<XferDesPortInfo> &outputs_info,
+                                                  int priority,
+                                                  XferDesRedopInfo redop_info,
+                                                  const void *fill_data, size_t fill_size)
+        {
+            assert(redop_info.id == 0);
+            return new FPGAfillXferDes(dma_op, this, launch_node, guid,
+                                       inputs_info, outputs_info,
+                                       priority,
+                                       fill_data, fill_size);
+        }
+
+        long FPGAfillChannel::submit(Request **requests, long nr)
+        {
+            // unused
+            assert(0);
+            return 0;
+        }
+
+    }; // namespace FPGA
+};     // namespace Realm
diff --git a/runtime/realm/fpga/fpga_module.h b/runtime/realm/fpga/fpga_module.h
new file mode 100644
index 0000000000..f29c3df99c
--- /dev/null
+++ b/runtime/realm/fpga/fpga_module.h
@@ -0,0 +1,395 @@
+#ifndef REALM_FPGA_H
+#define REALM_FPGA_H
+
+#include "realm/module.h"
+#include "realm/proc_impl.h"
+#include "realm/mem_impl.h"
+#include "realm/runtime_impl.h"
+#include "realm/transfer/channel.h"
+#include "realm/circ_queue.h"
+#include "realm/transfer/ib_memory.h"
+
+// OpenCL utility layer
+#include "xcl2.hpp"
+
+namespace Realm
+{
+    namespace FPGA
+    {
+        class FPGAQueue;
+        class FPGADevice;
+        class FPGADeviceMemcpy;
+        class FPGARequest;
+
+        class FPGACompletionNotification
+        {
+        public:
+            virtual ~FPGACompletionNotification(void) {}
+
+            virtual void request_completed(void) = 0;
+        };
+
+
+        class FPGACompletionEvent : public FPGACompletionNotification
+        {
+        public:
+            void request_completed(void);
+
+            FPGARequest *req;
+        };
+
+        class FPGAWorker : public BackgroundWorkItem
+        {
+        public:
+            FPGAWorker(void);
+            virtual ~FPGAWorker(void);
+            // adds a stream that has work to be done
+            void add_queue(FPGAQueue *queue);
+            // processes work on queues,
+            // optionally sleeping for work to show up
+            // returns true if work remains to be done
+            bool process_queues(bool sleep_on_empty);
+            void start_background_thread(Realm::CoreReservationSet &
+                                             crs,
+                                         size_t stack_size);
+            void shutdown_background_thread(void);
+            bool do_work(TimeLimit work_until);
+
+        public:
+            void thread_main(void);
+
+        protected:
+            Mutex lock;
+            Mutex::CondVar condvar;
+            typedef Realm::CircularQueue<FPGAQueue *, 16> ActiveQueue;
+            ActiveQueue active_queues;
+            // used by the background thread (if any)
+            Realm::CoreReservation *core_rsrv;
+            Realm::Thread *worker_thread;
+            bool thread_sleeping;
+            atomic<bool> worker_shutdown_requested;
+        };
+
+
+        class FPGAWorkFence : public Realm::Operation::AsyncWorkItem
+        {
+        public:
+            FPGAWorkFence(Realm::Operation *op);
+            virtual void request_cancellation(void);
+            void enqueue(FPGAQueue *queue);
+            virtual void print(std::ostream &os) const;
+        };
+
+        class FPGAQueue
+        {
+        public:
+            FPGAQueue(FPGADevice *fpga_device, FPGAWorker *fpga_worker, cl::CommandQueue &command_queue);
+            ~FPGAQueue(void);
+            cl::CommandQueue &get_command_queue() const;
+            void add_fence(FPGAWorkFence *fence);
+            void add_notification(FPGACompletionNotification *notification);
+            bool reap_events(TimeLimit work_until);
+            void add_copy(FPGADeviceMemcpy *copy);
+            bool issue_copies(TimeLimit work_until);
+            void add_event(cl::Event opencl_event,
+                           FPGAWorkFence *fence,
+                           FPGACompletionNotification *n = 0);
+
+        protected:
+            // may only be tested with lock held
+            bool has_work(void) const;
+            FPGADevice *fpga_device;
+            FPGAWorker *fpga_worker;
+            cl::CommandQueue &command_queue;
+            Mutex mutex;
+            struct PendingEvent
+            {
+                cl::Event opencl_event;
+                FPGAWorkFence *fence;
+                FPGACompletionNotification *notification;
+            };
+            std::deque<PendingEvent> pending_events;
+            std::deque<FPGADeviceMemcpy *> pending_copies;
+        };
+
+        class FPGADeviceMemory;
+
+        class FPGADevice
+        {
+        public:
+            std::string name;
+            cl::Device device;
+            cl::Buffer buff;
+            cl::Buffer ib_buff;
+            cl::Context context;
+            cl::CommandQueue command_queue;
+            cl::Program program;
+            FPGADevice(cl::Device &device, std::string name, std::string xclbin, FPGAWorker *fpga_worker, size_t fpga_coprocessor_num_cu, std::string fpga_coprocessor_kernel);
+            ~FPGADevice();
+            void create_fpga_mem(RuntimeImpl *runtime, size_t size);
+            void create_fpga_ib(RuntimeImpl *runtime, size_t size);
+            void create_dma_channels(RuntimeImpl *runtime);
+            void create_fpga_queues();
+            void copy_to_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification);
+            void copy_from_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification);
+            void copy_within_fpga(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification);
+            void copy_to_peer(FPGADevice *dst_dev, void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification);
+            void comp(void *dst, const void *src, off_t dst_offset, off_t src_offset, size_t bytes, FPGACompletionNotification *notification);
+            bool is_in_buff(void *ptr);
+            bool is_in_ib_buff(void *ptr);
+            FPGADeviceMemory *fpga_mem;
+            IBMemory *fpga_ib;
+            MemoryImpl *local_sysmem;
+            IBMemory *local_ibmem;
+            FPGAWorker *fpga_worker;
+            FPGAQueue *fpga_queue;
+            size_t fpga_coprocessor_num_cu;
+            std::string fpga_coprocessor_kernel;
+        };
+
+        enum FPGAMemcpyKind
+        {
+            FPGA_MEMCPY_HOST_TO_DEVICE,
+            FPGA_MEMCPY_DEVICE_TO_HOST,
+            FPGA_MEMCPY_DEVICE_TO_DEVICE,
+            FPGA_MEMCPY_PEER_TO_PEER,
+        };
+
+        // An abstract base class for all FPGA memcpy operations
+        class FPGADeviceMemcpy
+        {
+        public:
+            FPGADeviceMemcpy(FPGADevice *fpga_device, FPGAMemcpyKind kind, FPGACompletionNotification *notification);
+            virtual ~FPGADeviceMemcpy(void) {}
+
+        public:
+            virtual void execute(FPGAQueue *queue) = 0;
+
+        public:
+            FPGADevice *const fpga_device;
+
+        protected:
+            FPGAMemcpyKind kind;
+            FPGACompletionNotification *notification;
+        };
+
+        class FPGADeviceMemcpy1D : public FPGADeviceMemcpy
+        {
+        public:
+            FPGADeviceMemcpy1D(FPGADevice *fpga_device,
+                               void *dst,
+                               const void *src,
+                               size_t bytes,
+                               off_t buff_offset,
+                               FPGAMemcpyKind kind,
+                               FPGACompletionNotification *notification);
+
+            virtual ~FPGADeviceMemcpy1D(void);
+
+        public:
+            virtual void execute(FPGAQueue *q);
+
+        protected:
+            void *dst;
+            const void *src;
+            size_t elmt_size;
+            off_t buff_offset;
+
+        private:
+            void do_span(off_t pos, size_t len);
+            FPGAQueue *fpga_queue;
+        };
+
+        class FPGAProcessor : public LocalTaskProcessor
+        {
+        public:
+            FPGAProcessor(FPGADevice *fpga_device, Processor me, Realm::CoreReservationSet &crs);
+            virtual ~FPGAProcessor(void);
+            static FPGAProcessor *get_current_fpga_proc(void);
+            FPGADevice *fpga_device;
+
+        protected:
+            Realm::CoreReservation *core_rsrv_;
+        };
+
+        class FPGADeviceMemory : public LocalManagedMemory
+        {
+        public:
+            FPGADeviceMemory(Memory memory, FPGADevice *device, void *base_ptr_sys, size_t size);
+            virtual ~FPGADeviceMemory(void);
+            virtual void get_bytes(off_t offset, void *dst, size_t size);
+            virtual void put_bytes(off_t offset, const void *src, size_t size);
+            virtual void *get_direct_ptr(off_t offset, size_t size);
+
+            FPGADevice *get_device() const { return device; };
+            void *get_mem_base_sys() const { return base_ptr_sys; };
+            FPGADevice *device;
+            void *base_ptr_sys;
+        };
+
+        class FPGARequest : public Request
+        {
+        public:
+            const void *src_base;
+            void *dst_base;
+            FPGADevice *dst_fpga;
+            FPGACompletionEvent event;
+        };
+
+        class FPGAChannel;
+
+        class FPGAXferDes : public XferDes
+        {
+        public:
+            FPGAXferDes(uintptr_t _dma_op, Channel *_channel,
+                        NodeID _launch_node, XferDesID _guid,
+                        const std::vector<XferDesPortInfo> &inputs_info,
+                        const std::vector<XferDesPortInfo> &outputs_info,
+                        int _priority, XferDesKind kind);
+
+            ~FPGAXferDes()
+            {
+                while (!available_reqs.empty())
+                {
+                    FPGARequest *fpga_req = (FPGARequest *)available_reqs.front();
+                    available_reqs.pop();
+                    delete fpga_req;
+                }
+            }
+
+            long default_get_requests_tentative(Request **requests, long nr, unsigned flags);
+            long get_requests(Request **requests, long nr);
+            void notify_request_read_done(Request *req);
+            void notify_request_write_done(Request *req);
+            void flush();
+
+            bool progress_xd(FPGAChannel *channel, TimeLimit work_until);
+
+        private:
+            FPGADevice *src_fpga;
+            FPGADevice *dst_fpga;
+        };
+
+        class FPGAChannel : public SingleXDQChannel<FPGAChannel, FPGAXferDes>
+        {
+        public:
+            FPGAChannel(FPGADevice *_src_fpga, XferDesKind _kind,
+                        BackgroundWorkManager *bgwork);
+            ~FPGAChannel();
+
+            // TODO: multiple concurrent copies not ok for now
+            static const bool is_ordered = true;
+
+            virtual XferDes *create_xfer_des(uintptr_t dma_op,
+                                             NodeID launch_node,
+                                             XferDesID guid,
+                                             const std::vector<XferDesPortInfo> &inputs_info,
+                                             const std::vector<XferDesPortInfo> &outputs_info,
+                                             int priority,
+                                             XferDesRedopInfo redop_info,
+                                             const void *fill_data, size_t fill_size);
+
+            long submit(Request **requests, long nr);
+
+        private:
+            FPGADevice *src_fpga;
+        };
+
+        class FPGAfillChannel;
+
+        class FPGAfillXferDes : public XferDes
+        {
+        public:
+            FPGAfillXferDes(uintptr_t _dma_op, Channel *_channel,
+                            NodeID _launch_node, XferDesID _guid,
+                            const std::vector<XferDesPortInfo> &inputs_info,
+                            const std::vector<XferDesPortInfo> &outputs_info,
+                            int _priority,
+                            const void *_fill_data, size_t _fill_size);
+
+            long get_requests(Request **requests, long nr);
+
+            bool progress_xd(FPGAfillChannel *channel, TimeLimit work_until);
+
+        protected:
+            size_t reduced_fill_size;
+        };
+
+        class FPGAfillChannel : public SingleXDQChannel<FPGAfillChannel, FPGAfillXferDes>
+        {
+        public:
+            FPGAfillChannel(FPGADevice *_fpga, BackgroundWorkManager *bgwork);
+
+            // TODO: multiple concurrent fills not ok for now
+            static const bool is_ordered = true;
+
+            virtual XferDes *create_xfer_des(uintptr_t dma_op,
+                                             NodeID launch_node,
+                                             XferDesID guid,
+                                             const std::vector<XferDesPortInfo> &inputs_info,
+                                             const std::vector<XferDesPortInfo> &outputs_info,
+                                             int priority,
+                                             XferDesRedopInfo redop_info,
+                                             const void *fill_data, size_t fill_size);
+
+            long submit(Request **requests, long nr);
+
+        protected:
+            friend class FPGAfillXferDes;
+            FPGADevice *fpga;
+        };
+
+        class FPGAModule : public Module
+        {
+        protected:
+            FPGAModule(void);
+
+        public:
+            virtual ~FPGAModule(void);
+
+            static Module *create_module(RuntimeImpl *runtime, std::vector<std::string> &cmdline);
+
+            // do any general initialization - this is called after all configuration is
+            //  complete
+            virtual void initialize(RuntimeImpl *runtime);
+
+            // create any memories provided by this module (default == do nothing)
+            //  (each new MemoryImpl should use a Memory from RuntimeImpl::next_local_memory_id)
+            virtual void create_memories(RuntimeImpl *runtime);
+
+            // create any processors provided by the module (default == do nothing)
+            //  (each new ProcessorImpl should use a Processor from
+            //   RuntimeImpl::next_local_processor_id)
+            virtual void create_processors(RuntimeImpl *runtime);
+
+            // create any DMA channels provided by the module (default == do nothing)
+            virtual void create_dma_channels(RuntimeImpl *runtime);
+
+            // create any code translators provided by the module (default == do nothing)
+            virtual void create_code_translators(RuntimeImpl *runtime);
+
+            // clean up any common resources created by the module - this will be called
+            //  after all memories/processors/etc. have been shut down and destroyed
+            virtual void cleanup(void);
+
+        public:
+            size_t cfg_num_fpgas;
+            bool cfg_use_worker_threads;
+            bool cfg_use_shared_worker;
+            size_t cfg_fpga_mem_size;
+            size_t cfg_fpga_ib_size;
+            FPGAWorker *shared_worker;
+            std::map<FPGADevice *, FPGAWorker *> dedicated_workers;
+            std::string cfg_fpga_xclbin;
+            std::vector<FPGADevice *> fpga_devices;
+            size_t cfg_fpga_coprocessor_num_cu;
+            std::string cfg_fpga_coprocessor_kernel;
+
+        protected:
+            std::vector<FPGAProcessor *> fpga_procs_;
+        };
+
+    }; // namespace FPGA
+};     // namespace Realm
+
+#endif
diff --git a/runtime/realm/fpga/fpga_utils.cc b/runtime/realm/fpga/fpga_utils.cc
new file mode 100644
index 0000000000..6da890e233
--- /dev/null
+++ b/runtime/realm/fpga/fpga_utils.cc
@@ -0,0 +1,64 @@
+#include "realm/fpga/fpga_module.h"
+#include "realm/fpga/fpga_utils.h"
+#include "realm/logging.h"
+
+namespace Realm
+{
+    namespace FPGA
+    {
+        extern Logger log_fpga;
+    }
+}
+
+extern "C"
+{
+    using namespace Realm;
+    using namespace Realm::FPGA;
+
+    REALM_PUBLIC_API cl::Device FPGAGetCurrentDevice(void)
+    {
+        FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc();
+        cl::Device ret = p->fpga_device->device;
+        log_fpga.info() << "FPGAGetCurrentDevice()";
+        return ret;
+    }
+
+    REALM_PUBLIC_API cl::Context FPGAGetCurrentContext(void)
+    {
+        FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc();
+        cl::Context ret = p->fpga_device->context;
+        log_fpga.info() << "FPGAGetCurrentContext()";
+        return ret;
+    }
+
+    REALM_PUBLIC_API cl::Program FPGAGetCurrentProgram(void)
+    {
+        FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc();
+        cl::Program ret = p->fpga_device->program;
+        log_fpga.info() << "FPGAGetCurrentProgram()";
+        return ret;
+    }
+
+    REALM_PUBLIC_API cl::Buffer FPGAGetCurrentBuffer(void)
+    {
+        FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc();
+        cl::Buffer ret = p->fpga_device->buff;
+        log_fpga.info() << "FPGAGetCurrentBuffer()";
+        return ret;
+    }
+
+    REALM_PUBLIC_API cl::CommandQueue FPGAGetCurrentCommandQueue(void)
+    {
+        FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc();
+        cl::CommandQueue ret = p->fpga_device->command_queue;
+        log_fpga.info() << "FPGAGetCurrentCommandQueue()";
+        return ret;
+    }
+
+    REALM_PUBLIC_API void *FPGAGetBasePtrSys(void)
+    {
+        FPGAProcessor *p = FPGAProcessor::get_current_fpga_proc();
+        void *ret = p->fpga_device->fpga_mem->base_ptr_sys;
+        return ret;
+    }
+}
diff --git a/runtime/realm/fpga/fpga_utils.h b/runtime/realm/fpga/fpga_utils.h
new file mode 100644
index 0000000000..45b53ead53
--- /dev/null
+++ b/runtime/realm/fpga/fpga_utils.h
@@ -0,0 +1,14 @@
+#ifndef FPGA_UTILS_H
+#define FPGA_UTILS_H
+
+#include "xcl2.hpp"
+extern "C"
+{
+    cl::Device FPGAGetCurrentDevice(void);
+    cl::Context FPGAGetCurrentContext(void);
+    cl::Program FPGAGetCurrentProgram(void);
+    cl::Buffer FPGAGetCurrentBuffer(void);
+    cl::CommandQueue FPGAGetCurrentCommandQueue(void);
+    void *FPGAGetBasePtrSys(void);
+}
+#endif // FPGA_UTILS_H
diff --git a/runtime/realm/fpga/xcl2.cc b/runtime/realm/fpga/xcl2.cc
new file mode 100644
index 0000000000..fdc4f8a27b
--- /dev/null
+++ b/runtime/realm/fpga/xcl2.cc
@@ -0,0 +1,157 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+#include "xcl2.hpp"
+#include <climits>
+#include <sys/stat.h>
+#include <string>
+#include <iomanip>
+#include <sstream>
+#if defined(_WINDOWS)
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+namespace xcl {
+std::vector<cl::Device> get_devices(const std::string& vendor_name) {
+    size_t i;
+    cl_int err;
+    std::vector<cl::Platform> platforms;
+    OCL_CHECK(err, err = cl::Platform::get(&platforms));
+    cl::Platform platform;
+    for (i = 0; i < platforms.size(); i++) {
+        platform = platforms[i];
+        OCL_CHECK(err, std::string platformName = platform.getInfo<CL_PLATFORM_NAME>(&err));
+        if (!(platformName.compare(vendor_name))) {
+            std::cout << "Found Platform" << std::endl;
+            std::cout << "Platform Name: " << platformName.c_str() << std::endl;
+            break;
+        }
+    }
+    if (i == platforms.size()) {
+        std::cout << "Error: Failed to find Xilinx platform" << std::endl;
+        std::cout << "Found the following platforms : " << std::endl;
+        for (size_t j = 0; j < platforms.size(); j++) {
+            platform = platforms[j];
+            OCL_CHECK(err, std::string platformName = platform.getInfo<CL_PLATFORM_NAME>(&err));
+            std::cout << "Platform Name: " << platformName.c_str() << std::endl;
+        }
+        exit(EXIT_FAILURE);
+    }
+    // Getting ACCELERATOR Devices and selecting 1st such device
+    std::vector<cl::Device> devices;
+    OCL_CHECK(err, err = platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR, &devices));
+    return devices;
+}
+
+std::vector<cl::Device> get_xil_devices() {
+    return get_devices("Xilinx");
+}
+
+cl::Device find_device_bdf(const std::vector<cl::Device>& devices, const std::string& bdf) {
+    char device_bdf[20];
+    cl_int err;
+    cl::Device device;
+    int cnt = 0;
+    for (uint32_t i = 0; i < devices.size(); i++) {
+        OCL_CHECK(err, err = devices[i].getInfo(CL_DEVICE_PCIE_BDF, &device_bdf));
+        if (bdf == device_bdf) {
+            device = devices[i];
+            cnt++;
+            break;
+        }
+    }
+    if (cnt == 0) {
+        std::cout << "Invalid device bdf. Please check and provide valid bdf\n";
+        exit(EXIT_FAILURE);
+    }
+    return device;
+}
+std::vector<unsigned char> read_binary_file(const std::string& xclbin_file_name) {
+    std::cout << "INFO: Reading " << xclbin_file_name << std::endl;
+    FILE* fp;
+    if ((fp = fopen(xclbin_file_name.c_str(), "r")) == nullptr) {
+        printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str());
+        exit(EXIT_FAILURE);
+    }
+    // Loading XCL Bin into char buffer
+    std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n";
+    std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary);
+    bin_file.seekg(0, bin_file.end);
+    auto nb = bin_file.tellg();
+    bin_file.seekg(0, bin_file.beg);
+    std::vector<unsigned char> buf;
+    buf.resize(nb);
+    bin_file.read(reinterpret_cast<char*>(buf.data()), nb);
+    return buf;
+}
+
+bool is_emulation() {
+    bool ret = false;
+    char* xcl_mode = getenv("XCL_EMULATION_MODE");
+    if (xcl_mode != nullptr) {
+        ret = true;
+    }
+    return ret;
+}
+
+bool is_hw_emulation() {
+    bool ret = false;
+    char* xcl_mode = getenv("XCL_EMULATION_MODE");
+    if ((xcl_mode != nullptr) && !strcmp(xcl_mode, "hw_emu")) {
+        ret = true;
+    }
+    return ret;
+}
+double round_off(double n) {
+    double d = n * 100.0;
+    int i = d + 0.5;
+    d = i / 100.0;
+    return d;
+}
+
+std::string convert_size(size_t size) {
+    static const char* SIZES[] = {"B", "KB", "MB", "GB"};
+    uint32_t div = 0;
+    size_t rem = 0;
+
+    while (size >= 1024 && div < (sizeof SIZES / sizeof *SIZES)) {
+        rem = (size % 1024);
+        div++;
+        size /= 1024;
+    }
+
+    double size_d = (float)size + (float)rem / 1024.0;
+    double size_val = round_off(size_d);
+
+    std::stringstream stream;
+    stream << std::fixed << std::setprecision(2) << size_val;
+    std::string size_str = stream.str();
+    std::string result = size_str + " " + SIZES[div];
+    return result;
+}
+
+bool is_xpr_device(const char* device_name) {
+    const char* output = strstr(device_name, "xpr");
+
+    if (output == nullptr) {
+        return false;
+    } else {
+        return true;
+    }
+}
+}; // namespace xcl
diff --git a/runtime/realm/fpga/xcl2.hpp b/runtime/realm/fpga/xcl2.hpp
new file mode 100644
index 0000000000..aa52839dc4
--- /dev/null
+++ b/runtime/realm/fpga/xcl2.hpp
@@ -0,0 +1,137 @@
+/**
+* Copyright (C) 2019-2021 Xilinx, Inc
+*
+* Licensed under the Apache License, Version 2.0 (the "License"). You may
+* not use this file except in compliance with the License. A copy of the
+* License is located at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+* License for the specific language governing permissions and limitations
+* under the License.
+*/
+
+#pragma once
+
+#define CL_HPP_CL_1_2_DEFAULT_BUILD
+#define CL_HPP_TARGET_OPENCL_VERSION 120
+#define CL_HPP_MINIMUM_OPENCL_VERSION 120
+#define CL_HPP_ENABLE_PROGRAM_CONSTRUCTION_FROM_ARRAY_COMPATIBILITY 1
+#define CL_USE_DEPRECATED_OPENCL_1_2_APIS
+
+// OCL_CHECK doesn't work if call has templatized function call
+#define OCL_CHECK(error, call)                                                                   \
+    call;                                                                                        \
+    if (error != CL_SUCCESS) {                                                                   \
+        printf("%s:%d Error calling " #call ", error code is: %d\n", __FILE__, __LINE__, error); \
+        exit(EXIT_FAILURE);                                                                      \
+    }
+
+#include <CL/cl2.hpp>
+#include <CL/cl_ext_xilinx.h>
+#include <fstream>
+#include <iostream>
+// When creating a buffer with user pointer (CL_MEM_USE_HOST_PTR), under the
+// hood
+// User ptr is used if and only if it is properly aligned (page aligned). When
+// not
+// aligned, runtime has no choice but to create its own host side buffer that
+// backs
+// user ptr. This in turn implies that all operations that move data to and from
+// device incur an extra memcpy to move data to/from runtime's own host buffer
+// from/to user pointer. So it is recommended to use this allocator if user wish
+// to
+// Create Buffer/Memory Object with CL_MEM_USE_HOST_PTR to align user buffer to
+// the
+// page boundary. It will ensure that user buffer will be used when user create
+// Buffer/Mem Object with CL_MEM_USE_HOST_PTR.
+template <typename T>
+struct aligned_allocator {
+    using value_type = T;
+
+    aligned_allocator() {}
+
+    aligned_allocator(const aligned_allocator&) {}
+
+    template <typename U>
+    aligned_allocator(const aligned_allocator<U>&) {}
+
+    T* allocate(std::size_t num) {
+        void* ptr = nullptr;
+
+#if defined(_WINDOWS)
+        {
+            ptr = _aligned_malloc(num * sizeof(T), 4096);
+            if (ptr == nullptr) {
+                std::cout << "Failed to allocate memory" << std::endl;
+                exit(EXIT_FAILURE);
+            }
+        }
+#else
+        {
+            if (posix_memalign(&ptr, 4096, num * sizeof(T))) throw std::bad_alloc();
+        }
+#endif
+        return reinterpret_cast<T*>(ptr);
+    }
+    void deallocate(T* p, std::size_t num) {
+#if defined(_WINDOWS)
+        _aligned_free(p);
+#else
+        free(p);
+#endif
+    }
+};
+
+namespace xcl {
+std::vector<cl::Device> get_xil_devices();
+std::vector<cl::Device> get_devices(const std::string& vendor_name);
+cl::Device find_device_bdf(const std::vector<cl::Device>& devices, const std::string& bdf);
+std::string convert_size(size_t size);
+std::vector<unsigned char> read_binary_file(const std::string& xclbin_file_name);
+bool is_emulation();
+bool is_hw_emulation();
+bool is_xpr_device(const char* device_name);
+class Stream {
+   public:
+    static decltype(&clCreateStream) createStream;
+    static decltype(&clReleaseStream) releaseStream;
+    static decltype(&clReadStream) readStream;
+    static decltype(&clWriteStream) writeStream;
+    static decltype(&clPollStreams) pollStreams;
+    static void init(const cl_platform_id& platform) {
+        void* bar = clGetExtensionFunctionAddressForPlatform(platform, "clCreateStream");
+        createStream = (decltype(&clCreateStream))bar;
+        bar = clGetExtensionFunctionAddressForPlatform(platform, "clReleaseStream");
+        releaseStream = (decltype(&clReleaseStream))bar;
+        bar = clGetExtensionFunctionAddressForPlatform(platform, "clReadStream");
+        readStream = (decltype(&clReadStream))bar;
+        bar = clGetExtensionFunctionAddressForPlatform(platform, "clWriteStream");
+        writeStream = (decltype(&clWriteStream))bar;
+        bar = clGetExtensionFunctionAddressForPlatform(platform, "clPollStreams");
+        pollStreams = (decltype(&clPollStreams))bar;
+    }
+};
+class P2P {
+   public:
+    static decltype(&xclGetMemObjectFd) getMemObjectFd;
+    static decltype(&xclGetMemObjectFromFd) getMemObjectFromFd;
+    static void init(const cl_platform_id& platform) {
+        void* bar = clGetExtensionFunctionAddressForPlatform(platform, "xclGetMemObjectFd");
+        getMemObjectFd = (decltype(&xclGetMemObjectFd))bar;
+        bar = clGetExtensionFunctionAddressForPlatform(platform, "xclGetMemObjectFromFd");
+        getMemObjectFromFd = (decltype(&xclGetMemObjectFromFd))bar;
+    }
+};
+class Ext {
+   public:
+    static decltype(&xclGetComputeUnitInfo) getComputeUnitInfo;
+    static void init(const cl_platform_id& platform) {
+        void* bar = clGetExtensionFunctionAddressForPlatform(platform, "xclGetComputeUnitInfo");
+        getComputeUnitInfo = (decltype(&xclGetComputeUnitInfo))bar;
+    }
+};
+}
diff --git a/runtime/realm/machine_impl.cc b/runtime/realm/machine_impl.cc
index ccb1fdf9da..a2956eca31 100644
--- a/runtime/realm/machine_impl.cc
+++ b/runtime/realm/machine_impl.cc
@@ -409,6 +409,7 @@ namespace Realm {
       // these can be the source of a RemoteWrite, but it's non-ideal
     case Memory::SYSTEM_MEM:
     case Memory::Z_COPY_MEM:
+    case Memory::FPGA_MEM:
       {
 	send_ok = true;
 	recv_ok = true;
diff --git a/runtime/realm/mem_impl.h b/runtime/realm/mem_impl.h
index 1a21fdcca2..149f9170d7 100644
--- a/runtime/realm/mem_impl.h
+++ b/runtime/realm/mem_impl.h
@@ -61,6 +61,9 @@ namespace Realm {
       MKIND_FILE,    // file memory accessible by owner node
 #ifdef REALM_USE_HDF5
       MKIND_HDF,     // HDF memory accessible by owner node
+#endif
+#ifdef REALM_USE_FPGA
+      MKIND_FPGA,    // FPGA device memory
 #endif
     };
 
diff --git a/runtime/realm/module.cc b/runtime/realm/module.cc
index c068693847..f1fdc0362d 100644
--- a/runtime/realm/module.cc
+++ b/runtime/realm/module.cc
@@ -78,6 +78,11 @@ REGISTER_REALM_MODULE_STATIC(Realm::LLVMJit::LLVMJitModule);
 REGISTER_REALM_MODULE_STATIC(Realm::HDF5::HDF5Module);
 #endif
 
+#if defined REALM_USE_FPGA
+#include "realm/fpga/fpga_module.h"
+REGISTER_REALM_MODULE_STATIC(Realm::FPGA::FPGAModule);
+#endif
+
 #ifdef REALM_USE_GASNET1
 #include "realm/gasnet1/gasnet1_module.h"
 REGISTER_REALM_NETWORK_MODULE_STATIC(Realm::GASNet1Module);
diff --git a/runtime/realm/realm_c.h b/runtime/realm/realm_c.h
index 5600b88d01..1e059b88b0 100644
--- a/runtime/realm/realm_c.h
+++ b/runtime/realm/realm_c.h
@@ -51,7 +51,8 @@ typedef unsigned long long realm_barrier_timestamp_t;
   __op__(PROC_GROUP, "Processor group") \
   __op__(PROC_SET, "Set of Processors for OpenMP/Kokkos etc.") \
   __op__(OMP_PROC, "OpenMP (or similar) thread pool") \
-  __op__(PY_PROC, "Python interpreter")
+  __op__(PY_PROC, "Python interpreter") \
+  __op__(FPGA_PROC, "FPGA")
 
 typedef enum realm_processor_kind_t {
 #define C_ENUMS(name, desc) name,
@@ -74,7 +75,8 @@ typedef enum realm_processor_kind_t {
   __op__(LEVEL3_CACHE, "CPU L3 Visible to all processors on the node, better performance to processors on same socket") \
   __op__(LEVEL2_CACHE, "CPU L2 Visible to all processors on the node, better performance to one processor") \
   __op__(LEVEL1_CACHE, "CPU L1 Visible to all processors on the node, better performance to one processor") \
-  __op__(GPU_MANAGED_MEM, "Managed memory that can be cached by either host or GPU")
+  __op__(GPU_MANAGED_MEM, "Managed memory that can be cached by either host or GPU") \
+  __op__(FPGA_MEM, "FPGA device memory")
 
 typedef enum realm_memory_kind_t {
 #define C_ENUMS(name, desc) name,
diff --git a/runtime/realm/transfer/channel.h b/runtime/realm/transfer/channel.h
index 88393d6b28..4642b4a5bc 100644
--- a/runtime/realm/transfer/channel.h
+++ b/runtime/realm/transfer/channel.h
@@ -72,6 +72,11 @@ namespace Realm {
       XFER_FILE_WRITE,
       XFER_ADDR_SPLIT,
       XFER_MEM_FILL,
+      XFER_FPGA_TO_DEV,
+      XFER_FPGA_FROM_DEV,
+      XFER_FPGA_IN_DEV,
+      XFER_FPGA_PEER_DEV,
+      XFER_FPGA_COMP,
     };
 
     class Request {
diff --git a/runtime/runtime.mk b/runtime/runtime.mk
index 78d3eb27eb..7e8cc48167 100644
--- a/runtime/runtime.mk
+++ b/runtime/runtime.mk
@@ -697,6 +697,25 @@ ifeq ($(strip $(USE_HDF)), 1)
   endif
 endif
 
+# Realm doesn't use FPGA by default
+USE_FPGA ?= 0
+ifeq ($(strip $(USE_FPGA)), 1)
+  REALM_CC_FLAGS += -DREALM_USE_FPGA
+  LEGION_CC_FLAGS += -DLEGION_USE_FPGA
+  # provide this for backward-compatibility in applications
+  CC_FLAGS += -DUSE_FPGA -I$(XILINX_VIVADO)/include -I$(XILINX_XRT)/include 
+  FC_FLAGS += -DUSE_FPGA
+  LEGION_LD_FLAGS += -L$(XILINX_XRT)/lib -lOpenCL -pthread
+  ifdef FPGA_ROOT
+       CC_FLAGS += -I$(FPGA_ROOT)/include
+       FC_FLAGS += -I$(FPGA_ROOT)/include
+       LD_FLAGS += -L$(FPGA_ROOT)/lib
+  else
+    CC_FLAGS += -I/usr/include
+    FC_FLAGS += -I/usr/include
+  endif
+endif
+
 SKIP_MACHINES= titan% daint% excalibur% cori%
 # use mpi{cc,cxx,f90} compiler wrappers if USE_MPI=1
 ifeq ($(strip $(USE_MPI)),1)
@@ -935,6 +954,12 @@ REALM_SRC 	+= $(LG_RT_DIR)/realm/hdf5/hdf5_module.cc \
 		   $(LG_RT_DIR)/realm/hdf5/hdf5_internal.cc \
 		   $(LG_RT_DIR)/realm/hdf5/hdf5_access.cc
 endif
+ifeq ($(strip $(USE_FPGA)),1)
+REALM_SRC 	+= $(LG_RT_DIR)/realm/fpga/fpga_module.cc \
+               $(LG_RT_DIR)/realm/fpga/fpga_utils.cc \
+               $(LG_RT_DIR)/realm/fpga/xcl2.cc
+endif
+
 REALM_SRC 	+= $(LG_RT_DIR)/realm/activemsg.cc \
                    $(LG_RT_DIR)/realm/nodeset.cc \
                    $(LG_RT_DIR)/realm/network.cc