Skip to content

Commit

Permalink
Clean up the submit_parfor_kernel for reduction.
Browse files Browse the repository at this point in the history
  • Loading branch information
mingjie-intel authored and diptorupd committed May 15, 2023
1 parent ee90420 commit e289add
Show file tree
Hide file tree
Showing 2 changed files with 130 additions and 71 deletions.
200 changes: 130 additions & 70 deletions numba_dpex/core/passes/parfor_lowering_pass.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,26 +50,16 @@ class ParforLowerImpl:
for a parfor and submits it to a queue.
"""

def _submit_parfor_kernel(
self,
lowerer,
kernel_fn,
loop_ranges,
kernel_type,
reductionHelper=None,
):
"""
Adds the call to the dpex kernel function from the main function.
"""
def _prepare_queue_num_args(self, kernel_fn, lowerer):
keep_alive_kernels.append(kernel_fn.kernel)

# Helper class that generates the LLVM IR values inside the current LLVM
# module that are needed to submit the kernel to a queue.
ir_builder = KernelLaunchIRBuilder(lowerer, kernel_fn.kernel)
self.ir_builder = KernelLaunchIRBuilder(lowerer, kernel_fn.kernel)

# Create a local variable storing a pointer to a DPCTLSyclQueueRef
# pointer.
curr_queue = ir_builder.get_queue(exec_queue=kernel_fn.queue)
self.curr_queue = self.ir_builder.get_queue(exec_queue=kernel_fn.queue)

num_flattened_args = 0

Expand All @@ -86,100 +76,171 @@ def _submit_parfor_kernel(
num_flattened_args += 1

# Create LLVM values for the kernel args list and kernel arg types list
args_list = ir_builder.allocate_kernel_arg_array(num_flattened_args)
args_ty_list = ir_builder.allocate_kernel_arg_ty_array(
self.args_list = self.ir_builder.allocate_kernel_arg_array(
num_flattened_args
)
self.args_ty_list = self.ir_builder.allocate_kernel_arg_ty_array(
num_flattened_args
)

# Populate the args_list and the args_ty_list LLVM arrays
kernel_arg_num = 0
self.kernel_arg_num = 0
for arg_num, arg in enumerate(kernel_fn.kernel_args):
argtype = kernel_fn.kernel_arg_types[arg_num]
llvm_val = _getvar_or_none(lowerer, arg)
if not llvm_val:
raise AssertionError
if isinstance(argtype, DpnpNdArray):
ir_builder.build_array_arg(
self.ir_builder.build_array_arg(
array_val=llvm_val,
array_rank=argtype.ndim,
arg_list=args_list,
args_ty_list=args_ty_list,
arg_num=kernel_arg_num,
arg_list=self.args_list,
args_ty_list=self.args_ty_list,
arg_num=self.kernel_arg_num,
)
# FIXME: Get rid of magic constants
kernel_arg_num += 5 + (2 * argtype.ndim)
self.kernel_arg_num += 5 + (2 * argtype.ndim)
else:
if argtype == types.complex64:
ir_builder.build_complex_arg(
self.ir_builder.build_complex_arg(
llvm_val,
types.float32,
args_list,
args_ty_list,
kernel_arg_num,
self.args_list,
self.args_ty_list,
self.kernel_arg_num,
)
kernel_arg_num += 2
self.kernel_arg_num += 2
elif argtype == types.complex128:
ir_builder.build_complex_arg(
self.ir_builder.build_complex_arg(
llvm_val,
types.float64,
args_list,
args_ty_list,
kernel_arg_num,
self.args_list,
self.args_ty_list,
self.kernel_arg_num,
)
kernel_arg_num += 2
self.kernel_arg_num += 2
else:
ir_builder.build_arg(
self.ir_builder.build_arg(
llvm_val,
argtype,
args_list,
args_ty_list,
kernel_arg_num,
self.args_list,
self.args_ty_list,
self.kernel_arg_num,
)
kernel_arg_num += 1
self.kernel_arg_num += 1

def _submit_parfor_kernel(
    self,
    lowerer,
    kernel_fn,
    loop_ranges,
):
    """Add the call to the dpex kernel function from the main function.

    The queue, the packed kernel-argument arrays and the argument count
    are set up by ``_prepare_queue_num_args`` and read back from ``self``.
    A global range is then built from the ``stop`` value of the leading
    loop ranges, the kernel is submitted synchronously, and the queue
    reference is freed.

    Args:
        lowerer: The lowerer for the function containing the parfor; it is
            forwarded to ``_prepare_queue_num_args`` and ``_load_range``.
        kernel_fn: The kernel object generated for the parfor; it is
            forwarded to ``_prepare_queue_num_args``.
        loop_ranges: Indexable collection of ``(start, stop, step)``
            triples, one per parfor loop dimension.

    Raises:
        UnsupportedParforError: If any of the loop ranges that contribute
            to the global range has a ``step`` other than 1.
    """
    self._prepare_queue_num_args(kernel_fn, lowerer)

    # SYCL ranges can have at most 3 dimensions. If the parfor is of a
    # higher dimension then the indexing for the higher dimensions is done
    # inside the kernel.
    global_range_rank = min(len(loop_ranges), 3)

    # Create a global range over which to submit the kernel based on the
    # loop_ranges of the parfor. Only the stop value is used; the start
    # value is ignored here.
    global_range = []
    for i in range(global_range_rank):
        _, stop, step = loop_ranges[i]
        stop = _load_range(lowerer, stop)
        if step != 1:
            raise UnsupportedParforError(
                "non-unit strides are not yet supported."
            )
        global_range.append(stop)

    # Left empty for a plain parfor; only the reduction main kernel
    # submission supplies a work-group size.
    local_range = []

    # Submit a synchronous kernel
    self.ir_builder.submit_sync_ranged_kernel(
        self.curr_queue,
        self.kernel_arg_num,
        self.args_list,
        self.args_ty_list,
        global_range,
        local_range,
    )

    # At this point we can free the DPCTLSyclQueueRef (curr_queue)
    self.ir_builder.free_queue(sycl_queue_val=self.curr_queue)

def _submit_reduction_main_parfor_kernel(
    self,
    lowerer,
    kernel_fn,
    reductionHelper=None,
):
    """Add the call to the reduction main kernel from the main function.

    The queue, the packed kernel-argument arrays and the argument count
    are set up by ``_prepare_queue_num_args`` and read back from ``self``.
    The kernel is submitted synchronously over a one-dimensional global
    range taken from ``reductionHelper.global_size_var`` with a
    one-dimensional local range taken from
    ``reductionHelper.work_group_size``; afterwards the queue reference
    is freed.

    Args:
        lowerer: The lowerer for the function containing the parfor; it is
            forwarded to ``_prepare_queue_num_args`` and ``_load_range``.
        kernel_fn: The kernel object generated for the reduction; it is
            forwarded to ``_prepare_queue_num_args``.
        reductionHelper: Object exposing ``global_size_var`` and
            ``work_group_size``. The parameter defaults to ``None`` but
            both attributes are read unconditionally, so a helper must
            always be supplied.
    """
    self._prepare_queue_num_args(kernel_fn, lowerer)

    # The global range comes from the reduction helper rather than from
    # the loop ranges of the parfor.
    global_range = [_load_range(lowerer, reductionHelper.global_size_var)]

    # The work-group size requested by the reduction helper.
    local_range = [_load_range(lowerer, reductionHelper.work_group_size)]

    # Submit a synchronous kernel
    self.ir_builder.submit_sync_ranged_kernel(
        self.curr_queue,
        self.kernel_arg_num,
        self.args_list,
        self.args_ty_list,
        global_range,
        local_range,
    )

    # At this point we can free the DPCTLSyclQueueRef (curr_queue)
    self.ir_builder.free_queue(sycl_queue_val=self.curr_queue)

def _submit_reduction_remainder_parfor_kernel(
    self,
    lowerer,
    kernel_fn,
):
    """Add the call to the reduction remainder kernel from the main function.

    ``_prepare_queue_num_args`` sets up the queue, the packed
    kernel-argument arrays and the argument count on ``self``. The kernel
    is then submitted synchronously with a global range holding the single
    value ``_load_range(lowerer, 1)`` and an empty local range, after
    which the queue reference is freed.

    Args:
        lowerer: The lowerer for the function containing the parfor; it is
            forwarded to ``_prepare_queue_num_args`` and ``_load_range``.
        kernel_fn: The kernel object generated for the remainder step; it
            is forwarded to ``_prepare_queue_num_args``.
    """
    self._prepare_queue_num_args(kernel_fn, lowerer)

    launch_builder = self.ir_builder
    queue_ref = self.curr_queue

    # One-dimensional global range built from the constant 1
    # (presumably a single work item -- confirm against _load_range).
    unit_global_range = [_load_range(lowerer, 1)]
    empty_local_range = []

    # Submit a synchronous kernel
    launch_builder.submit_sync_ranged_kernel(
        queue_ref,
        self.kernel_arg_num,
        self.args_list,
        self.args_ty_list,
        unit_global_range,
        empty_local_range,
    )

    # At this point we can free the DPCTLSyclQueueRef (curr_queue)
    launch_builder.free_queue(sycl_queue_val=queue_ref)

def _lower_parfor_as_kernel(self, lowerer, parfor):
"""Lowers a parfor node created by the dpjit compiler to a kernel.
Expand Down Expand Up @@ -314,11 +375,9 @@ def _lower_parfor_as_kernel(self, lowerer, parfor):
parfor_reddict,
)

self._submit_parfor_kernel(
self._submit_reduction_main_parfor_kernel(
lowerer,
psrfor_kernel,
loop_ranges,
2,
reductionHelperList[0],
)

Expand All @@ -332,12 +391,9 @@ def _lower_parfor_as_kernel(self, lowerer, parfor):
reductionHelperList,
)

self._submit_parfor_kernel(
self._submit_reduction_remainder_parfor_kernel(
lowerer,
psrfor_kernel,
loop_ranges,
1,
reductionHelperList[0],
)

reductionKernelVar.copy_final_sum_to_host(psrfor_kernel)
Expand All @@ -360,7 +416,11 @@ def _lower_parfor_as_kernel(self, lowerer, parfor):
raise UnsupportedParforError

# Finally submit the kernel
self._submit_parfor_kernel(lowerer, psrfor_kernel, loop_ranges, 0)
self._submit_parfor_kernel(
lowerer,
psrfor_kernel,
loop_ranges,
)

# TODO: free the kernel at this point

Expand Down
1 change: 0 additions & 1 deletion numba_dpex/dpnp_iface/arrayobj.py
Original file line number Diff line number Diff line change
Expand Up @@ -355,7 +355,6 @@ def ol_dpnp_zeros(
"""

_ndim = _ty_parse_shape(shape)
# _layout = _parse_layout(order)
_dtype = _parse_dtype(dtype)
_layout = _parse_layout(order)
_usm_type = _parse_usm_type(usm_type) if usm_type is not None else "device"
Expand Down

0 comments on commit e289add

Please sign in to comment.