diff --git a/.github/release.yml b/.github/release.yml
new file mode 100644
index 000000000..0a37704fb
--- /dev/null
+++ b/.github/release.yml
@@ -0,0 +1,17 @@
+changelog:
+ exclude:
+ labels:
+ - category:task
+ categories:
+ - title: 🐛 Bug Fixes
+ labels:
+ - category:bug-fix
+ - title: 🚀 New Features
+ labels:
+ - category:new-feature
+ - title: 🛠️ Improvements
+ labels:
+ - category:improvement
+ - title: 📖 Documentation
+ labels:
+ - category:documentation
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index eda13fbda..b9b3bc526 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -6,6 +6,9 @@ on:
pull_request:
branches-ignore:
- gh-pages # deployment target branch (this workflow should not exist on that branch anyway)
+ schedule:
+ # * is a special character in YAML so you have to quote this string
+ - cron: '0 */6 * * *'
env:
COMMIT: ${{ github.event.pull_request.head.sha || github.sha }}
PROJECT: github-cunumeric-ci
@@ -57,7 +60,7 @@ jobs:
if: always()
- name: Upload Build Log
if: always()
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v3
with:
name: build-log
path: ./**/${{ env.COMMIT }}-build.log.gpg
@@ -74,9 +77,10 @@ jobs:
- {name: GPU test, options: --use cuda --gpus 1 --debug, log: gpu}
- {name: 2 GPUs test, options: --use cuda --gpus 2 --debug, log: gpus}
- {name: OpenMP test, options: --use openmp --omps 1 --ompthreads 2 --debug, log: omp}
- - {name: 2 OpenMPs test, options: --use openmp --omps 2 --ompthreads 2 --debug, log: omps}
+ - {name: 2 NUMA OpenMPs test, options: --use openmp --omps 2 --ompthreads 2 --numamem 2048 --debug, log: omps}
- {name: Eager execution test, options: --use eager --debug, log: eager}
- {name: mypy, options: mypy, log: mypy}
+ - {name: documentation, options: docs, log: docs}
name: ${{ matrix.name }}
steps:
- name: Dump GitHub context
@@ -125,7 +129,7 @@ jobs:
cat *artifacts/*/*
- name: Upload Log
if: always()
- uses: actions/upload-artifact@v2
+ uses: actions/upload-artifact@v3
with:
name: test-${{ matrix.log }}-log
path: ./**/${{ env.COMMIT }}-test-${{ matrix.log }}.log.gpg
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 878ef81ac..bc47df8a7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,18 +1,25 @@
repos:
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: 'v0.991'
+ hooks:
+ - id: mypy
+ language: system
+ pass_filenames: false
+ args: ['cunumeric']
- repo: https://github.com/PyCQA/isort
- rev: 5.10.1
+ rev: 5.11.4
hooks:
- id: isort
- repo: https://github.com/psf/black
- rev: 22.8.0
+ rev: 22.12.0
hooks:
- id: black
- repo: https://github.com/PyCQA/flake8
- rev: 5.0.4
+ rev: 6.0.0
hooks:
- id: flake8
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: 'v14.0.6' # Use the sha / tag you want to point at
+ rev: 'v15.0.7' # Use the sha / tag you want to point at
hooks:
- id: clang-format
files: \.(cu|cuh|h|cc|inl)$
@@ -25,5 +32,9 @@ repos:
entry: python scripts/hooks/enforce_pytest_main.py
language: python
pass_filenames: false
+
+ci:
+ skip: [mypy]
+
default_language_version:
python: python3
diff --git a/BUILD.md b/BUILD.md
index cea0ce4d1..e7183437c 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -15,89 +15,36 @@ limitations under the License.
-->
-# Overview
+# Basic build
-The build system is designed to enable two different modes of use:
-1. Simple `pip install` for users
-2. Highly customizable incremental builds for developers
+Users must have a working installation of the
+[Legate Core](https://github.com/nv-legate/legate.core)
+library prior to installing cuNumeric. **Installing cuNumeric by itself will not
+automatically install Legate Core.**
-We review each of these modes with examples.
+As for other dependencies, the Dependencies section on the
+[Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md)
+also covers cuNumeric, so no additional packages are required.
+Once Legate Core is installed, you can simply invoke `./install.py` from the
+cuNumeric top-level directory. The build will automatically pick up the
+configuration used when building Legate Core (e.g. the CUDA Toolkit directory).
-# Building for Users
+# Advanced topics
-## Using install.py
+## Building through pip & cmake
-For releases <= 22.07, the main method for building cuNumeric was the `install.py` script.
-Although the underlying implementation has significantly changed, `install.py` still supports the
-same usage and same set of flags. For a full list of flags, users can run:
+cuNumeric uses the same cmake/scikit-build-based build workflow as Legate Core.
+See the
+[Legate Core build instructions](https://github.com/nv-legate/legate.core/blob/HEAD/BUILD.md)
+for an overview.
-```
-$ ./install.py --help
-```
-
-## Using Conda
-
-cuNumeric can be installed using Conda by pointing to the required channels (`-c`):
-
-```
-conda install -c nvidia -c conda-forge -c legate legate-core
-```
-
-## Using pip
-
-cuNumeric is not yet registered in a standard pip repository. However, users can still use the
-pip installer to build and install cuNumeric. After downloading or cloning the cunumeric source,
-users can run the following in the cunumeric folder:
-
-```
-$ pip install .
-```
-or
-```
-$ python3 -m pip install .
-```
-
-This will install cuNumeric in the standard packages directory for the environment Python.
-Note: This is currently not sufficient for running cuNumeric programs. cuNumeric relies
-on the `legate` launcher from Legate core, which must be installed separately.
-For details on installing Legate, consult the [Legate repository](https://github.com/nv-legate/legate.core).
-
-### Advanced Customization
+There are several examples in the `scripts` folder. We walk through the steps in
+`build-with-legate-separately-no-install.sh` here.
-If users need to customize details of the underlying CMake build, they can pass
-CMake flags through the `SKBUILD_CONFIGURE_OPTIONS` environment variable:
+We assume a pre-existing Legate Core build. For details on building Legate Core,
+consult the [Legate Core repository](https://github.com/nv-legate/legate.core).
-```
-$ SKBUILD_CONFIGURE_OPTIONS="-D Legion_USE_CUDA:BOOL=ON" \
- pip install .
-```
-An alternative syntax using `setup.py` with `scikit-build` is
-```
-$ python setup.py install -- -DLegion_USE_CUDA:BOOL=ON
-```
-
-# Building for Developers
-
-## Overview
-
-pip uses [scikit-build](https://scikit-build.readthedocs.io/en/latest/)
-in `setup.py` to drive the build and installation. A `pip install` will trigger three general actions:
-
-1. CMake build and installation of C++ libraries
-2. CMake generation of configuration files and build-dependent Python files
-3. pip installation of Python files
-
-The CMake build can be configured independently of `pip`, allowing incremental C++ builds directly through CMake.
-This simplifies rebuilding `libcunumeric.so` either via command-line or via IDE.
-After building the C++ libraries, the `pip install` can be done in "editable" mode using the `-e` flag.
-This configures the Python site packages to import the Python source tree directly.
-The Python source can then be edited and used directly for testing without requiring a `pip install`.
-
-## Example
-
-There are several examples in the `scripts` folder. We walk through the steps in the `build-with-legate-separately-no-install.sh` here.
-We assume a pre-existing Legate CUDA build. For details on building Legate, consult the [Legate repository](https://github.com/nv-legate/legate.core).
First, the CMake build needs to be configured:
```
@@ -106,6 +53,7 @@ $ cmake -S . -B build -GNinja -D legate_core_ROOT:STRING=path/to/legate/build
We point cuNumeric to the Legate *build* tree, not an installation.
This generates all build-dependent headers and Python files.
+
Once configured, we can build the C++ libraries:
```
@@ -118,14 +66,12 @@ Once the C++ libraries are available, we can do an editable (development) pip in
```
$ SKBUILD_BUILD_OPTIONS="-D FIND_CUNUMERIC_CPP=ON -D cunumeric_ROOT=$(pwd)/build" \
python3 -m pip install \
- --root / --no-deps --no-build-isolation
+ --root / --no-deps --no-build-isolation \
--editable .
```
-The Python source tree and CMake build tree are now available with the environment Python
-for running cuNumeric programs. The diagram below illustrates the
+The Python source tree and CMake build tree are now available with the environment Python
+for running cuNumeric programs. The diagram below illustrates the
complete workflow for building both Legate core and cuNumeric.
-
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 53bbc0790..417bb9aa4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,7 +47,7 @@ include(rapids-cuda)
include(rapids-export)
include(rapids-find)
-set(cunumeric_version 22.10.00)
+set(cunumeric_version 22.12.00)
# For now we want the optimization flags to match on both normal make and cmake
# builds so we override the cmake defaults here for release, this changes
@@ -79,24 +79,23 @@ endif()
if(CMAKE_GENERATOR STREQUAL "Ninja")
function(add_touch_cunumeric_ninja_build_target)
set(_suf )
- set(_depends )
if(SKBUILD)
set(_suf "_python")
endif()
+ add_custom_target("touch_cunumeric${_suf}_ninja_build" ALL
+ COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja"
+ COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild"
+ VERBATIM
+ )
foreach(_dep IN ITEMS cunumeric cunumeric_python
legion_core legion_core_python
Legion LegionRuntime
Realm RealmRuntime
Regent)
if(TARGET ${_dep})
- list(APPEND _depends ${_dep})
+ add_dependencies("touch_cunumeric${_suf}_ninja_build" ${_dep})
endif()
endforeach()
- add_custom_target("touch_cunumeric${_suf}_ninja_build" ALL
- COMMAND ${CMAKE_COMMAND} -E touch_nocreate "${CMAKE_CURRENT_BINARY_DIR}/build.ninja"
- COMMENT "touch build.ninja so ninja doesn't re-run CMake on rebuild"
- VERBATIM DEPENDS ${_depends}
- )
endfunction()
add_touch_cunumeric_ninja_build_target()
endif()
diff --git a/README.md b/README.md
index 1d85c650d..dbe358373 100644
--- a/README.md
+++ b/README.md
@@ -35,17 +35,6 @@ canonical NumPy implementation.
If you have questions, please contact us at legate(at)nvidia.com.
-1. [Installation](#installation)
-1. [Dependencies](#dependencies)
-1. [Building from Source](#building-from-source)
-1. [Usage and Execution](#usage-and-execution)
-1. [Supported and Planned Features](#supported-and-planned-features)
-1. [Supported Types and Dimensions](#supported-types-and-dimensions)
-1. [Documentation](#documentation)
-1. [Future Directions](#future-directions)
-1. [Contributing](#contributing)
-1. [Known Bugs](#known-bugs)
-
## Installation
cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric):
@@ -53,76 +42,11 @@ cuNumeric is available [on conda](https://anaconda.org/legate/cunumeric):
```
conda install -c nvidia -c conda-forge -c legate cunumeric
```
+
The conda package is compatible with CUDA >= 11.4 (CUDA driver version >= r470),
and Volta or later GPU architectures.
-Docker image build scripts, as well as specialized install scripts for
-supported clusters are available on the
-[quickstart](https://github.com/nv-legate/quickstart) repo.
-
-Read on for general instructions on building cuNumeric from source.
-
-## Dependencies
-
-Users must have a working installation of the
-[Legate Core](https://github.com/nv-legate/legate.core)
-library prior to installing cuNumeric.
-
-cuNumeric requires the following:
-
- - Python >= 3.8
- - [CUDA](https://developer.nvidia.com/cuda-downloads) >= 10.2
- - GNU Make
- - C++17 compatible compiler (g++, clang, or nvc++)
- - Fortran compiler (for building OpenBLAS; not necessary if you provide a pre-built version of OpenBLAS)
- - the Python packages listed in any one of the conda environment files:
- - `conda/environment-test-3.8.yml`
- - `conda/environment-test-3.9.yml`
- - `conda/environment-test-3.10.yml`
-
-See the [corresponding section](https://github.com/nv-legate/legate.core#dependencies)
-on the Legate Core instructions for help on installing the required Python packages
-using conda.
-
-cuNumeric is tested and guaranteed to be compatible with Volta and later GPU
-architectures. You can use cuNumeric with Pascal GPUs as well, but there could
-be issues due to lack of independent thread scheduling. Please report any such
-issues on GitHub.
-
-## Building from Source
-
-Installation can be done the `install.py` script.
-For releases >= 22.10, `pip install` is now available.
-The most common installation command is:
-
-```
-./install.py --with-core
-```
-
-This will build cuNumeric against the Legate Core installation and then
-install cuNumeric into the same location.
-
-If Legate Core has been installed with CUDA support, a working cuTENSOR
-installation must also be provided to the installation command with the
-`--with-cutensor` option:
-```
-./install.py --with-core --with-cutensor
-```
-
-You can also specify an installation of [OpenBLAS](https://www.openblas.net/)
-to use for the build. If you already have an installation of OpenBLAS on your
-machine, you can inform the installation script using the `--with-openblas`
-option:
-
-```
-./install.py --with-openblas
-```
-
-Advanced users can also invoke `install.py --help` to see options for
-configuring cuNumeric by invoking the `install.py` script directly.
-More information on building - including development workflows - can be found
-in the [build instructions](BUILD.md)
-
+See [BUILD.md](BUILD.md) for instructions on building cuNumeric from source.
## Usage and Execution
diff --git a/cmake/generate_install_info_py.cmake b/cmake/generate_install_info_py.cmake
new file mode 100644
index 000000000..2fb14cbcb
--- /dev/null
+++ b/cmake/generate_install_info_py.cmake
@@ -0,0 +1,31 @@
+#=============================================================================
+# Copyright 2022 NVIDIA Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+execute_process(
+ COMMAND ${CMAKE_C_COMPILER}
+ -E -DLEGATE_USE_PYTHON_CFFI
+ -I "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric"
+ -P "${CMAKE_CURRENT_LIST_DIR}/../src/cunumeric/cunumeric_c.h"
+ ECHO_ERROR_VARIABLE
+ OUTPUT_VARIABLE header
+ COMMAND_ERROR_IS_FATAL ANY
+)
+
+set(libpath "")
+configure_file(
+ "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py.in"
+ "${CMAKE_CURRENT_LIST_DIR}/../cunumeric/install_info.py"
+@ONLY)
diff --git a/conda/conda-build/build.sh b/conda/conda-build/build.sh
index b1d79b52b..d0df68008 100644
--- a/conda/conda-build/build.sh
+++ b/conda/conda-build/build.sh
@@ -1,7 +1,5 @@
#!/bin/bash
-
-set -x;
-
+
# Rewrite conda's -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY to
# -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
CMAKE_ARGS="$(echo "$CMAKE_ARGS" | sed -r "s@_INCLUDE=ONLY@_INCLUDE=BOTH@g")"
@@ -32,6 +30,8 @@ export CUDAFLAGS="-UNDEBUG"
export CMAKE_GENERATOR=Ninja
export CUDAHOSTCXX=${CXX}
+echo "Build starting on $(date)"
+
cmake -S . -B build ${CMAKE_ARGS}
cmake --build build -j$CPU_COUNT
cmake --install build
@@ -51,6 +51,8 @@ $PYTHON -m pip install \
--disable-pip-version-check \
. -vv
+echo "Build ending on $(date)"
+
# Legion leaves an egg-info file which will confuse conda trying to pick up the information
# Remove it so the legate-core is the only egg-info file added
rm -rf $SP_DIR/legion*egg-info
diff --git a/conda/conda-build/meta.yaml b/conda/conda-build/meta.yaml
index 4adf59927..cc352ff62 100644
--- a/conda/conda-build/meta.yaml
+++ b/conda/conda-build/meta.yaml
@@ -104,7 +104,10 @@ requirements:
# the nvcc requirement is necessary because it contains crt/host_config.h used by cuda runtime. This is a packaging bug that has been reported.
- cuda-nvcc ={{ cuda_version }}
# libcurand is used both in CPU and GPU builds
- - libcurand-dev
+ # temporarily pin curand until problems are resolved
+ - libcurand-dev =10.3.0.86
+ # the following line is only necessary for pinning curand
+ - libcurand =10.3.0.86
# cudart needed for CPU and GPU builds because of curand
- cuda-cudart-dev ={{ cuda_version }}
- python
@@ -122,7 +125,7 @@ requirements:
- cuda-cudart-dev ={{ cuda_version }}
- cuda-nvtx ={{ cuda_version }}
# - libcutensor-dev >=1.3
- - cutensor >=1.3
+ - cutensor >=1.3 =*_*
- libcublas-dev
- libcusolver-dev
- libcufft-dev
@@ -132,12 +135,15 @@ requirements:
run:
- numpy {{ numpy_version }}
- libopenblas =* =*openmp*
-{% if gpu_enabled_bool %}
+{% if not gpu_enabled_bool %}
+ - legate-core ={{ core_version }} =*_cpu
+{% else %}
+ - legate-core ={{ core_version }}
- cuda-cudart >={{ cuda_version }}
# - libcutensor >=1.3
- - cutensor >=1.3
+ - cutensor >=1.3 =*_*
- libcublas
- - libcusolver
+ - libcusolver =11.4.1.48-0
- libcufft
{% endif %}
- opt_einsum >=3.3
diff --git a/conda/environment-test-3.10.yml b/conda/environment-test-3.10.yml
deleted file mode 100644
index 1066db97e..000000000
--- a/conda/environment-test-3.10.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: cunumeric-test
-channels:
- - conda-forge
-dependencies:
- - python=3.10
-
- # build
- - git
- - nccl
- - make
- - zlib
- - cmake>=3.24
- - ninja
- - openmpi
- - c-compiler
- - cxx-compiler
- - gcc_linux-64 # [linux64]
- - sysroot_linux-64==2.17 # [linux64]
- - setuptools>=60
- - cutensor>=1.3.3
- - scikit-build>=0.13.1
-
- # runtime
- - cffi
- - numpy>=1.22
- - opt_einsum
- - pyarrow>=5
- - scipy
- - typing_extensions
- - llvm-openmp
- - openblas=*=*openmp*
-
- # tests
- - clang>=8
- - clang-tools>=8
- - colorama
- - coverage
- - mock
- - mypy>=0.961
- - pre-commit
- - pynvml
- - pytest
- - pytest-cov
- - pytest-mock
- - pytest-lazy-fixture
- - types-docutils
-
- # pip dependencies
- - pip
- - pip:
- # docs
- - jinja2
- - pydata-sphinx-theme
- - recommonmark
- - markdown<3.4.0
- - sphinx>=4.4.0
- - sphinx-copybutton
- - sphinx-markdown-tables
-
- # examples
- - tifffile
diff --git a/conda/environment-test-3.8.yml b/conda/environment-test-3.8.yml
deleted file mode 100644
index 9049ec0b9..000000000
--- a/conda/environment-test-3.8.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: cunumeric-test
-channels:
- - conda-forge
-dependencies:
- - python=3.8
-
- # build
- - git
- - nccl
- - make
- - zlib
- - cmake>=3.24
- - ninja
- - openmpi
- - c-compiler
- - cxx-compiler
- - gcc_linux-64 # [linux64]
- - sysroot_linux-64==2.17 # [linux64]
- - setuptools>=60
- - cutensor>=1.3.3
- - scikit-build>=0.13.1
-
- # runtime
- - cffi
- - numpy>=1.22
- - opt_einsum
- - pyarrow>=5
- - scipy
- - typing_extensions
- - llvm-openmp
- - openblas=*=*openmp*
-
- # tests
- - clang>=8
- - clang-tools>=8
- - colorama
- - coverage
- - mock
- - mypy>=0.961
- - pre-commit
- - pynvml
- - pytest
- - pytest-cov
- - pytest-mock
- - pytest-lazy-fixture
- - types-docutils
-
- # pip dependencies
- - pip
- - pip:
- # docs
- - jinja2
- - pydata-sphinx-theme
- - recommonmark
- - markdown<3.4.0
- - sphinx>=4.4.0
- - sphinx-copybutton
- - sphinx-markdown-tables
-
- # examples
- - tifffile
diff --git a/conda/environment-test-3.9.yml b/conda/environment-test-3.9.yml
deleted file mode 100644
index 482277bae..000000000
--- a/conda/environment-test-3.9.yml
+++ /dev/null
@@ -1,61 +0,0 @@
-name: cunumeric-test
-channels:
- - conda-forge
-dependencies:
- - python=3.9
-
- # build
- - git
- - nccl
- - make
- - zlib
- - cmake>=3.24
- - ninja
- - openmpi
- - c-compiler
- - cxx-compiler
- - gcc_linux-64 # [linux64]
- - sysroot_linux-64==2.17 # [linux64]
- - setuptools>=60
- - cutensor>=1.3.3
- - scikit-build>=0.13.1
-
- # runtime
- - cffi
- - numpy>=1.22
- - opt_einsum
- - pyarrow>=5
- - scipy
- - typing_extensions
- - llvm-openmp
- - openblas=*=*openmp*
-
- # tests
- - clang>=8
- - clang-tools>=8
- - colorama
- - coverage
- - mock
- - mypy>=0.961
- - pre-commit
- - pynvml
- - pytest
- - pytest-cov
- - pytest-mock
- - pytest-lazy-fixture
- - types-docutils
-
- # pip dependencies
- - pip
- - pip:
- # docs
- - jinja2
- - pydata-sphinx-theme
- - recommonmark
- - markdown<3.4.0
- - sphinx>=4.4.0
- - sphinx-copybutton
- - sphinx-markdown-tables
-
- # examples
- - tifffile
diff --git a/cunumeric/__init__.py b/cunumeric/__init__.py
index b8b028f9f..7c9e122aa 100644
--- a/cunumeric/__init__.py
+++ b/cunumeric/__init__.py
@@ -42,4 +42,4 @@
from . import _version
-__version__ = _version.get_versions()["version"] # type: ignore
+__version__ = _version.get_versions()["version"] # type: ignore [no-untyped-call]
diff --git a/cunumeric/array.py b/cunumeric/array.py
index dd389f995..d163e4ca4 100644
--- a/cunumeric/array.py
+++ b/cunumeric/array.py
@@ -31,11 +31,16 @@
cast,
)
+import legate.core.types as ty
import numpy as np
-import pyarrow # type: ignore
+import pyarrow # type: ignore [import]
from legate.core import Array
-from numpy.core.multiarray import normalize_axis_index # type: ignore
-from numpy.core.numeric import normalize_axis_tuple # type: ignore
+from numpy.core.multiarray import ( # type: ignore [attr-defined]
+ normalize_axis_index,
+)
+from numpy.core.numeric import ( # type: ignore [attr-defined]
+ normalize_axis_tuple,
+)
from typing_extensions import ParamSpec
from .config import (
@@ -94,7 +99,8 @@ def add_boilerplate(
parameter (if present), to cuNumeric ndarrays.
* Convert the special "where" parameter (if present) to a valid predicate.
"""
- keys: Set[str] = set(array_params)
+ keys = set(array_params)
+ assert len(keys) == len(array_params)
def decorator(func: Callable[P, R]) -> Callable[P, R]:
assert not hasattr(
@@ -104,18 +110,18 @@ def decorator(func: Callable[P, R]) -> Callable[P, R]:
# For each parameter specified by name, also consider the case where
# it's passed as a positional parameter.
indices: Set[int] = set()
- all_formals: Set[str] = set()
where_idx: Optional[int] = None
out_idx: Optional[int] = None
- for (idx, param) in enumerate(signature(func).parameters):
- all_formals.add(param)
+ params = signature(func).parameters
+ extra = keys - set(params)
+ assert len(extra) == 0, f"unknown parameter(s): {extra}"
+ for (idx, param) in enumerate(params):
if param == "where":
where_idx = idx
elif param == "out":
out_idx = idx
elif param in keys:
indices.add(idx)
- assert len(keys - all_formals) == 0, "unkonwn parameter(s)"
@wraps(func)
def wrapper(*args: Any, **kwargs: Any) -> R:
@@ -186,6 +192,21 @@ def _convert_all_to_numpy(obj: Any) -> Any:
return obj
+# FIXME: we can't give an accurate return type as mypy thinks
+# the pyarrow import can be ignored, and can't override the check
+# either, because no-any-unimported needs Python >= 3.10. We can
+# fix it once we bump up the Python version
+def convert_numpy_dtype_to_pyarrow(dtype: np.dtype[Any]) -> Any:
+ if dtype.kind != "c":
+ return pyarrow.from_numpy_dtype(dtype)
+ elif dtype == np.complex64:
+ return ty.complex64
+ elif dtype == np.complex128:
+ return ty.complex128
+ else:
+ raise ValueError(f"Unsupported NumPy dtype: {dtype}")
+
+
@clone_np_ndarray
class ndarray:
def __init__(
@@ -268,7 +289,7 @@ def __legate_data_interface__(self) -> dict[str, Any]:
# All of our thunks implement the Legate Store interface
# so we just need to convert our type and stick it in
# a Legate Array
- arrow_type = pyarrow.from_numpy_dtype(self.dtype)
+ arrow_type = convert_numpy_dtype_to_pyarrow(self.dtype)
# If the thunk is an eager array, we need to convert it to a
# deferred array so we can extract a legate store
deferred_thunk = runtime.to_deferred_array(self._thunk)
@@ -520,6 +541,10 @@ def flat(self) -> np.flatiter[npt.NDArray[Any]]:
--------
flatten : Return a copy of the array collapsed into one dimension.
+ Availability
+ --------
+ Single CPU
+
"""
return self.__array__().flat
@@ -919,12 +944,8 @@ def _convert_key(self, key: Any, first: bool = True) -> Any:
key = convert_to_cunumeric_ndarray(key)
if key.dtype != bool and not np.issubdtype(key.dtype, np.integer):
raise TypeError("index arrays should be int or bool type")
- if key.dtype != bool and key.dtype != np.int64:
- runtime.warn(
- "converting index array to int64 type",
- category=RuntimeWarning,
- )
- key = key.astype(np.int64)
+ if key.dtype != bool:
+ key = key._warn_and_convert(np.dtype(np.int64))
return key._thunk
@@ -2099,16 +2120,16 @@ def compress(
"""
a = self
- if condition.ndim != 1:
+ try:
+ if condition.ndim != 1:
+ raise ValueError(
+ "Dimension mismatch: condition must be a 1D array"
+ )
+ except AttributeError:
raise ValueError(
"Dimension mismatch: condition must be a 1D array"
)
- if condition.dtype != bool:
- runtime.warn(
- "converting condition to bool type",
- category=RuntimeWarning,
- )
- condition = condition.astype(bool)
+ condition = condition._warn_and_convert(np.dtype(bool))
if axis is None:
axis = 0
@@ -2475,6 +2496,62 @@ def diagonal(
raise ValueError("Either axis1/axis2 or axes must be supplied")
return self._diag_helper(offset=offset, axes=axes, extract=extract)
+ @add_boilerplate("indices", "values")
+ def put(
+ self, indices: ndarray, values: ndarray, mode: str = "raise"
+ ) -> None:
+ """
+ Replaces specified elements of the array with given values.
+
+ Refer to :func:`cunumeric.put` for full documentation.
+
+ See Also
+ --------
+ cunumeric.put : equivalent function
+
+ Availability
+ --------
+ Multiple GPUs, Multiple CPUs
+
+ """
+
+ if values.size == 0 or indices.size == 0 or self.size == 0:
+ return
+
+ if mode not in ("raise", "wrap", "clip"):
+ raise ValueError(
+ "mode must be one of 'clip', 'raise', or 'wrap' "
+ f"(got {mode})"
+ )
+
+ if mode == "wrap":
+ indices = indices % self.size
+ elif mode == "clip":
+ indices = indices.clip(0, self.size - 1)
+
+ indices = indices._warn_and_convert(np.dtype(np.int64))
+ values = values._warn_and_convert(self.dtype)
+
+ if indices.ndim > 1:
+ indices = indices.ravel()
+
+ if self.shape == ():
+ if mode == "raise":
+ if indices.min() < -1 or indices.max() > 0:
+ raise ValueError("Indices out of bounds")
+ if values.shape == ():
+ v = values
+ else:
+ v = values[0]
+ self._thunk.copy(v._thunk, deep=False)
+ return
+
+ # call _wrap on the values if they need to be wrapped
+ if values.ndim != indices.ndim or values.size != indices.size:
+ values = values._wrap(indices.size)
+
+ self._thunk.put(indices._thunk, values._thunk, mode == "raise")
+
@add_boilerplate()
def trace(
self,
@@ -2572,7 +2649,7 @@ def dump(self, file: Union[str, Path]) -> None:
Availability
--------
- Multiple GPUs, Multiple CPUs
+ Single CPU
"""
self.__array__().dump(file=file)
@@ -3572,7 +3649,7 @@ def tofile(self, fid: Any, sep: str = "", format: str = "%s") -> None:
Availability
--------
- Multiple GPUs, Multiple CPUs
+ Single CPU
"""
return self.__array__().tofile(fid=fid, sep=sep, format=format)
@@ -3744,12 +3821,46 @@ def flip(self, axis: Any = None) -> ndarray:
def view(
self,
dtype: Union[npt.DTypeLike, None] = None,
- type: Union[Any, None] = None,
+ type: Union[type, None] = None,
) -> ndarray:
+ """
+ New view of array with the same data.
+
+ Parameters
+ ----------
+ dtype : data-type or ndarray sub-class, optional
+ Data-type descriptor of the returned view, e.g., float32 or int16.
+ Omitting it results in the view having the same data-type as the
+ input array. This argument can also be specified as an ndarray
+ sub-class, which then specifies the type of the returned object
+ (this is equivalent to setting the ``type`` parameter).
+ type : ndarray sub-class, optional
+ Type of the returned view, e.g., ndarray or matrix. Again, omission
+ of the parameter results in type preservation.
+
+ Notes
+ -----
+ cuNumeric does not currently support type reinterpretation, or
+ conversion to ndarray sub-classes; use :func:`ndarray.__array__()` to
+ convert to `numpy.ndarray`.
+
+ See Also
+ --------
+ numpy.ndarray.view
+
+ Availability
+ --------
+ Multiple GPUs, Multiple CPUs
+ """
if dtype is not None and dtype != self.dtype:
raise NotImplementedError(
"cuNumeric does not currently support type reinterpretation"
)
+ if type is not None:
+ raise NotImplementedError(
+ "cuNumeric does not currently support conversion to ndarray "
+ "sub-classes; use __array__() to convert to numpy.ndarray"
+ )
return ndarray(shape=self.shape, dtype=self.dtype, thunk=self._thunk)
def unique(self) -> ndarray:
@@ -3821,6 +3932,16 @@ def _maybe_convert(self, dtype: np.dtype[Any], hints: Any) -> ndarray:
copy._thunk.convert(self._thunk)
return copy
+ def _warn_and_convert(self, dtype: np.dtype[Any]) -> ndarray:
+ if self.dtype != dtype:
+ runtime.warn(
+ f"converting array to {dtype} type",
+ category=RuntimeWarning,
+ )
+ return self.astype(dtype)
+ else:
+ return self
+
# For performing normal/broadcast unary operations
@classmethod
def _perform_unary_op(
diff --git a/cunumeric/config.py b/cunumeric/config.py
index 88802b911..cad52e77f 100644
--- a/cunumeric/config.py
+++ b/cunumeric/config.py
@@ -15,6 +15,7 @@
from __future__ import annotations
import os
+from abc import abstractmethod
from enum import IntEnum, unique
from typing import TYPE_CHECKING, Any, List, Union, cast
@@ -166,6 +167,7 @@ class _CunumericSharedLib:
CUNUMERIC_NONZERO: int
CUNUMERIC_PACKBITS: int
CUNUMERIC_POTRF: int
+ CUNUMERIC_PUTMASK: int
CUNUMERIC_RAND: int
CUNUMERIC_READ: int
CUNUMERIC_RED_ALL: int
@@ -192,7 +194,6 @@ class _CunumericSharedLib:
CUNUMERIC_TRANSPOSE_COPY_2D: int
CUNUMERIC_TRILU: int
CUNUMERIC_TRSM: int
- CUNUMERIC_TUNABLE_HAS_NUMAMEM: int
CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME: int
CUNUMERIC_TUNABLE_NUM_GPUS: int
CUNUMERIC_TUNABLE_NUM_PROCS: int
@@ -269,6 +270,7 @@ class _CunumericSharedLib:
CUNUMERIC_WRITE: int
CUNUMERIC_ZIP: int
+ @abstractmethod
def cunumeric_has_curand(self) -> int:
...
@@ -356,6 +358,7 @@ class CuNumericOpCode(IntEnum):
NONZERO = _cunumeric.CUNUMERIC_NONZERO
PACKBITS = _cunumeric.CUNUMERIC_PACKBITS
POTRF = _cunumeric.CUNUMERIC_POTRF
+ PUTMASK = _cunumeric.CUNUMERIC_PUTMASK
RAND = _cunumeric.CUNUMERIC_RAND
READ = _cunumeric.CUNUMERIC_READ
REPEAT = _cunumeric.CUNUMERIC_REPEAT
@@ -520,7 +523,6 @@ class CuNumericTunable(IntEnum):
NUM_GPUS = _cunumeric.CUNUMERIC_TUNABLE_NUM_GPUS
NUM_PROCS = _cunumeric.CUNUMERIC_TUNABLE_NUM_PROCS
MAX_EAGER_VOLUME = _cunumeric.CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME
- HAS_NUMAMEM = _cunumeric.CUNUMERIC_TUNABLE_HAS_NUMAMEM
# Match these to CuNumericScanCode in cunumeric_c.h
diff --git a/cunumeric/coverage.py b/cunumeric/coverage.py
index f4d2e0128..3efad0342 100644
--- a/cunumeric/coverage.py
+++ b/cunumeric/coverage.py
@@ -17,7 +17,13 @@
import warnings
from dataclasses import dataclass
from functools import wraps
-from types import FunctionType, MethodDescriptorType, MethodType, ModuleType
+from types import (
+ BuiltinFunctionType,
+ FunctionType,
+ MethodDescriptorType,
+ MethodType,
+ ModuleType,
+)
from typing import Any, Container, Mapping, Optional, cast
import numpy as np
@@ -78,7 +84,7 @@ class CuWrapperMetadata:
class CuWrapped(AnyCallable, Protocol):
_cunumeric: CuWrapperMetadata
- __wrapped__: Any
+ __wrapped__: AnyCallable
__name__: str
__qualname__: str
@@ -194,7 +200,9 @@ def wrapper(*args: Any, **kwargs: Any) -> Any:
def clone_module(
- origin_module: ModuleType, new_globals: dict[str, Any]
+ origin_module: ModuleType,
+ new_globals: dict[str, Any],
+ include_builtin_function_type: bool = False,
) -> None:
"""Copy attributes from one module to another, excluding submodules
@@ -230,7 +238,10 @@ def clone_module(
# Only need to wrap things that are in the origin module to begin with
if attr not in origin_module.__dict__:
continue
- if isinstance(value, (FunctionType, lgufunc)):
+ if isinstance(value, (FunctionType, lgufunc)) or (
+ include_builtin_function_type
+ and isinstance(value, BuiltinFunctionType)
+ ):
wrapped = implemented(
cast(AnyCallable, value), mod_name, attr, reporting=reporting
)
@@ -239,7 +250,10 @@ def clone_module(
from numpy import ufunc as npufunc
for attr, value in missing.items():
- if isinstance(value, (FunctionType, npufunc)):
+ if isinstance(value, (FunctionType, npufunc)) or (
+ include_builtin_function_type
+ and isinstance(value, BuiltinFunctionType)
+ ):
wrapped = unimplemented(value, mod_name, attr, reporting=reporting)
new_globals[attr] = wrapped
else:
diff --git a/cunumeric/deferred.py b/cunumeric/deferred.py
index 04fe6e829..baa2cf50d 100644
--- a/cunumeric/deferred.py
+++ b/cunumeric/deferred.py
@@ -18,13 +18,13 @@
from collections import Counter
from collections.abc import Iterable
from enum import IntEnum, unique
-from functools import reduce
+from functools import reduce, wraps
+from inspect import signature
from itertools import product
from typing import (
TYPE_CHECKING,
Any,
Callable,
- Collection,
Dict,
Optional,
Sequence,
@@ -35,8 +35,10 @@
import legate.core.types as ty
import numpy as np
-from legate.core import Future, ReductionOp, Store
-from numpy.core.numeric import normalize_axis_tuple # type: ignore
+from legate.core import Annotation, Future, ReductionOp, Store
+from numpy.core.numeric import ( # type: ignore [attr-defined]
+ normalize_axis_tuple,
+)
from typing_extensions import ParamSpec
from .config import (
@@ -95,24 +97,39 @@ def _prod(tpl: Sequence[int]) -> int:
def auto_convert(
- indices: Collection[int], keys: Sequence[str] = []
+ *thunk_params: str,
) -> Callable[[Callable[P, R]], Callable[P, R]]:
- indices = set(indices)
+ """
+ Converts all named parameters to DeferredArrays.
+ """
+ keys = set(thunk_params)
+ assert len(keys) == len(thunk_params)
def decorator(func: Callable[P, R]) -> Callable[P, R]:
- def wrapper(*args: Any, **kwargs: Any) -> Any:
+ assert not hasattr(
+ func, "__wrapped__"
+ ), "this decorator must be the innermost"
+
+ # For each parameter specified by name, also consider the case where
+ # it's passed as a positional parameter.
+ params = signature(func).parameters
+ extra = keys - set(params)
+ assert len(extra) == 0, f"unknown parameter(s): {extra}"
+ indices = {idx for (idx, param) in enumerate(params) if param in keys}
+
+ @wraps(func)
+ def wrapper(*args: Any, **kwargs: Any) -> R:
+ # Convert relevant arguments to DeferredArrays
self = args[0]
-
args = tuple(
- self.runtime.to_deferred_array(arg) if idx in indices else arg
+ self.runtime.to_deferred_array(arg)
+ if idx in indices and arg is not None
+ else arg
for (idx, arg) in enumerate(args)
)
- for key in keys:
- v = kwargs.get(key, None)
- if v is None:
- continue
- v = self.runtime.to_deferred_array(v)
- kwargs[key] = v
+ for (k, v) in kwargs.items():
+ if k in keys and v is not None:
+ kwargs[k] = self.runtime.to_deferred_array(v)
return func(*args, **kwargs)
@@ -350,7 +367,7 @@ def conj(self) -> NumPyThunk:
return result
# Copy source array to the destination array
- @auto_convert([1])
+ @auto_convert("rhs")
def copy(self, rhs: Any, deep: bool = False) -> None:
if self.scalar and rhs.scalar:
self.base.set_storage(rhs.base.storage)
@@ -517,57 +534,152 @@ def _slice_store(k: slice, store: Store, dim: int) -> tuple[slice, Store]:
return k, store
- def _create_indexing_array(
- self, key: Any, is_set: bool = False
- ) -> tuple[bool, Any, Any, Any]:
- store = self.base
- rhs = self
- # the index where the first index_array is passed to the [] operator
- start_index = -1
+ def _has_single_boolean_array(
+ self, key: Any, is_set: bool
+ ) -> tuple[bool, DeferredArray, Any]:
if isinstance(key, NumPyThunk) and key.dtype == bool:
- if not isinstance(key, DeferredArray):
- key = self.runtime.to_deferred_array(key)
-
- # in case when boolean array is passed as an index, shape for all
- # its dimensions should be the same as the shape of
- # corresponding dimensions of the input array
- for i in range(key.ndim):
- if key.shape[i] != rhs.shape[i]:
- raise ValueError(
- "shape of the index array for "
- f"dimension {i} doesn't match to the shape of the"
- f"index array which is {rhs.shape[i]}"
- )
-
- # if key or rhs are empty, return an empty array with correct shape
- if key.size == 0 or rhs.size == 0:
- if rhs.size == 0 and key.size != 0:
- # we need to calculate shape of the 0 dim of output region
- # even though the size of it is 0
- # this can potentially be replaced with COUNT_NONZERO
- s = key.nonzero()[0].size
- else:
- s = 0
+ return True, self, key
+ else:
+ # key is a single array of indices
+ if isinstance(key, NumPyThunk):
+ return False, self, key
+
+ assert isinstance(key, tuple)
+
+ key = self._unpack_ellipsis(key, self.ndim)
+
+ # loop through all the keys to check if there
+ # is a single NumPyThunk entry
+ num_arrays = 0
+ transpose_index = 0
+ for dim, k in enumerate(key):
+ if isinstance(k, NumPyThunk):
+ num_arrays += 1
+ transpose_index = dim
+
+ # this is the case when there is a single boolean array passed
+            # in this case we transpose the original array so that the index
+            # to which the boolean array is passed goes first
+ # doing this we can avoid going through Realm Copy which should
+ # improve performance
+ if (
+ num_arrays == 1
+ and key[transpose_index].dtype == bool
+ and is_set
+ ):
+ lhs = self
+ key_dim = key[transpose_index].ndim
+ transpose_indices = tuple(
+ (transpose_index + i) for i in range(0, key_dim)
+ )
+ transpose_indices += tuple(
+ i for i in range(0, transpose_index)
+ )
+ transpose_indices += tuple(
+ i for i in range(transpose_index + key_dim, lhs.ndim)
+ )
- out_shape = (s,) + tuple(
- rhs.shape[i] for i in range(key.ndim, rhs.ndim)
+ new_key = tuple(key[i] for i in range(0, transpose_index))
+ new_key += tuple(
+ key[i] for i in range(transpose_index + 1, len(key))
)
- out = cast(
- DeferredArray,
- self.runtime.create_empty_thunk(
- out_shape,
- rhs.dtype,
- inputs=[rhs],
- ),
+ lhs = lhs.transpose(transpose_indices)
+
+ # transform original array for all other keys in the tuple
+ if len(new_key) > 0:
+ shift = 0
+ store = lhs.base
+ for dim, k in enumerate(new_key):
+ if np.isscalar(k):
+ if k < 0: # type: ignore [operator]
+ k += store.shape[dim + key_dim + shift]
+ store = store.project(dim + key_dim + shift, k)
+ shift -= 1
+ elif k is np.newaxis:
+ store = store.promote(dim + key_dim + shift, 1)
+ elif isinstance(k, slice):
+ k, store = self._slice_store(
+ k, store, dim + key_dim + shift
+ )
+ else:
+ raise TypeError(
+ "Unsupported entry type passed to advanced ",
+ "indexing operation",
+ )
+ lhs = DeferredArray(self.runtime, store, self.dtype)
+
+ return True, lhs, key[transpose_index]
+
+ # this is a general advanced indexing case
+ else:
+ return False, self, key
+
+ def _advanced_indexing_with_boolean_array(
+ self,
+ key: Any,
+ is_set: bool = False,
+ set_value: Optional[Any] = None,
+ ) -> tuple[bool, Any, Any, Any]:
+ rhs = self
+ if not isinstance(key, DeferredArray):
+ key = self.runtime.to_deferred_array(key)
+
+ # in case when boolean array is passed as an index, shape for all
+ # its dimensions should be the same as the shape of
+ # corresponding dimensions of the input array
+ for i in range(key.ndim):
+ if key.shape[i] != rhs.shape[i]:
+ raise ValueError(
+ "shape of the index array for "
+ f"dimension {i} doesn't match to the shape of the"
+ f"index array which is {rhs.shape[i]}"
)
- out.fill(np.zeros((), dtype=out.dtype))
- return False, rhs, out, self
- key_store = key.base
- # bring key to the same shape as rhs
- for i in range(key_store.ndim, rhs.ndim):
- key_store = key_store.promote(i, rhs.shape[i])
+ # if key or rhs are empty, return an empty array with correct shape
+ if key.size == 0 or rhs.size == 0:
+ if rhs.size == 0 and key.size != 0:
+ # we need to calculate shape of the 0 dim of output region
+ # even though the size of it is 0
+ # this can potentially be replaced with COUNT_NONZERO
+ s = key.nonzero()[0].size
+ else:
+ s = 0
+
+ out_shape = (s,) + tuple(
+ rhs.shape[i] for i in range(key.ndim, rhs.ndim)
+ )
+
+ out = cast(
+ DeferredArray,
+ self.runtime.create_empty_thunk(
+ out_shape,
+ rhs.dtype,
+ inputs=[rhs],
+ ),
+ )
+ out.fill(np.zeros((), dtype=out.dtype))
+ return False, rhs, out, self
+ key_store = key.base
+ # bring key to the same shape as rhs
+ for i in range(key_store.ndim, rhs.ndim):
+ key_store = key_store.promote(i, rhs.shape[i])
+
+ # has_set_value && set_value.size==1 corresponds to the case
+ # when a[bool_indices]=scalar
+ # then we can call "putmask" to modify input array
+ # and avoid calling Copy
+ has_set_value = set_value is not None and set_value.size == 1
+ if has_set_value:
+
+ mask = DeferredArray(
+ self.runtime,
+ base=key_store,
+ dtype=self.dtype,
+ )
+ rhs.putmask(mask, set_value)
+ return False, rhs, rhs, self
+ else:
out_dtype = rhs.dtype
# in the case this operation is called for the set_item, we
# return Point type field that is later used for
@@ -621,17 +733,39 @@ def _create_indexing_array(
out_tmp = out_tmp.project(rhs.ndim - dim - 1, 0)
out = out._copy_store(out_tmp)
+ return is_set, rhs, out, self
- return False, rhs, out, self
+ def _create_indexing_array(
+ self,
+ key: Any,
+ is_set: bool = False,
+ set_value: Optional[Any] = None,
+ ) -> tuple[bool, Any, Any, Any]:
+
+ is_bool_array, lhs, bool_key = self._has_single_boolean_array(
+ key, is_set
+ )
+        # the case when a single boolean array is passed to the advanced
+        # indexing operation
+ if is_bool_array:
+ return lhs._advanced_indexing_with_boolean_array(
+ bool_key, is_set, set_value
+ )
+ # general advanced indexing case
+
+ store = self.base
+ rhs = self
if isinstance(key, NumPyThunk):
key = (key,)
-
assert isinstance(key, tuple)
key = self._unpack_ellipsis(key, self.ndim)
+
+ # the index where the first index_array is passed to the [] operator
+ start_index = -1
shift = 0
last_index = self.ndim
- # in case when index arrays are passed in the scaterred way,
+ # in case when index arrays are passed in the scattered way,
# we need to transpose original array so all index arrays
# are close to each other
transpose_needed = False
@@ -674,7 +808,7 @@ def _create_indexing_array(
shift = 0
for dim, k in enumerate(key):
if np.isscalar(k):
- if k < 0: # type: ignore
+ if k < 0: # type: ignore [operator]
k += store.shape[dim + shift]
store = store.project(dim + shift, k)
shift -= 1
@@ -692,8 +826,8 @@ def _create_indexing_array(
"shape of boolean index did not match "
"indexed array "
)
- # in case of the mixed indises we all nonzero
- # for the bool array
+                        # in case of the mixed indices we call nonzero
+ # for the boolean array
k = k.nonzero()
shift += len(k) - 1
tuple_of_arrays += k
@@ -751,7 +885,7 @@ def _get_view(self, key: Any) -> DeferredArray:
elif isinstance(k, slice):
k, store = self._slice_store(k, store, dim + shift)
elif np.isscalar(k):
- if k < 0: # type: ignore
+ if k < 0: # type: ignore [operator]
k += store.shape[dim + shift]
store = store.project(dim + shift, k)
shift -= 1
@@ -781,10 +915,16 @@ def _broadcast(self, shape: NdShape) -> Any:
return result
- def _convert_future_to_regionfield(self) -> DeferredArray:
+ def _convert_future_to_regionfield(
+ self, change_shape: bool = False
+ ) -> DeferredArray:
+ if change_shape and self.shape == ():
+ shape: NdShape = (1,)
+ else:
+ shape = self.shape
store = self.context.create_store(
self.dtype,
- shape=self.shape,
+ shape=shape,
optimize_scalar=False,
)
thunk_copy = DeferredArray(
@@ -858,7 +998,7 @@ def get_item(self, key: Any) -> NumPyThunk:
return result
- @auto_convert([2])
+ @auto_convert("rhs")
def set_item(self, key: Any, rhs: Any) -> None:
assert self.dtype == rhs.dtype
# Check to see if this is advanced indexing or not
@@ -869,7 +1009,10 @@ def set_item(self, key: Any, rhs: Any) -> None:
lhs,
index_array,
self,
- ) = self._create_indexing_array(key, True)
+ ) = self._create_indexing_array(key, True, rhs)
+
+ if not copy_needed:
+ return
if rhs.shape != index_array.shape:
rhs_tmp = rhs._broadcast(index_array.base.shape)
@@ -896,6 +1039,8 @@ def set_item(self, key: Any, rhs: Any) -> None:
index_array = index_array._convert_future_to_regionfield()
if lhs.base.kind == Future:
lhs = lhs._convert_future_to_regionfield()
+ if lhs.base.transformed:
+ lhs = lhs._copy_store(lhs.base)
if index_array.size != 0:
copy = self.context.create_copy()
@@ -1178,7 +1323,7 @@ def swapaxes(self, axis1: int, axis2: int) -> DeferredArray:
return result
# Convert the source array to the destination array
- @auto_convert([1])
+ @auto_convert("rhs")
def convert(
self,
rhs: Any,
@@ -1214,7 +1359,7 @@ def convert(
if temporary:
lhs.set_linear()
- @auto_convert([1, 2])
+ @auto_convert("v", "lhs")
def convolve(self, v: Any, lhs: Any, mode: ConvolveMode) -> None:
input = self.base
filter = v.base
@@ -1249,7 +1394,7 @@ def convolve(self, v: Any, lhs: Any, mode: ConvolveMode) -> None:
task.execute()
- @auto_convert([1])
+ @auto_convert("rhs")
def fft(
self,
rhs: Any,
@@ -1327,7 +1472,7 @@ def fill(self, numpy_array: Any) -> None:
)
self._fill(store)
- @auto_convert([2, 4])
+ @auto_convert("rhs1_thunk", "rhs2_thunk")
def contract(
self,
lhs_modes: list[str],
@@ -1595,7 +1740,7 @@ def choose(self, rhs: Any, *args: Any) -> None:
task.execute()
# Create or extract a diagonal from a matrix
- @auto_convert([1])
+ @auto_convert("rhs")
def _diag_helper(
self,
rhs: Any,
@@ -1664,6 +1809,78 @@ def _diag_helper(
task.execute()
+ @auto_convert("indices", "values")
+ def put(self, indices: Any, values: Any, check_bounds: bool) -> None:
+
+ if indices.base.kind == Future or indices.base.transformed:
+ change_shape = indices.base.kind == Future
+ indices = indices._convert_future_to_regionfield(change_shape)
+ if values.base.kind == Future or values.base.transformed:
+ change_shape = values.base.kind == Future
+ values = values._convert_future_to_regionfield(change_shape)
+
+ if self.base.kind == Future or self.base.transformed:
+ change_shape = self.base.kind == Future
+ self_tmp = self._convert_future_to_regionfield(change_shape)
+ else:
+ self_tmp = self
+
+ assert indices.size == values.size
+
+        # first, we create an indirect array with PointN type that has
+ # (indices.size,) shape and is used to copy data from values
+ # to the target ND array (self)
+ N = self_tmp.ndim
+ pointN_dtype = self.runtime.get_point_type(N)
+ indirect = cast(
+ DeferredArray,
+ self.runtime.create_empty_thunk(
+ shape=indices.shape,
+ dtype=pointN_dtype,
+ inputs=[indices],
+ ),
+ )
+
+ shape = self_tmp.shape
+ task = self.context.create_task(CuNumericOpCode.WRAP)
+ task.add_output(indirect.base)
+ task.add_scalar_arg(shape, (ty.int64,))
+ task.add_scalar_arg(True, bool) # has_input
+ task.add_scalar_arg(check_bounds, bool)
+ task.add_input(indices.base)
+ task.add_alignment(indices.base, indirect.base)
+ task.throws_exception(IndexError)
+ task.execute()
+ if indirect.base.kind == Future:
+ indirect = indirect._convert_future_to_regionfield()
+
+ copy = self.context.create_copy()
+ copy.set_target_indirect_out_of_range(False)
+ copy.add_input(values.base)
+ copy.add_target_indirect(indirect.base)
+ copy.add_output(self_tmp.base)
+ copy.execute()
+
+ if self_tmp is not self:
+ self.copy(self_tmp, deep=True)
+
+ @auto_convert("mask", "values")
+ def putmask(self, mask: Any, values: Any) -> None:
+ assert self.shape == mask.shape
+
+ if values.shape != self.shape:
+ values_new = values._broadcast(self.shape)
+ else:
+ values_new = values.base
+ task = self.context.create_task(CuNumericOpCode.PUTMASK)
+ task.add_input(self.base)
+ task.add_input(mask.base)
+ task.add_input(values_new)
+ task.add_output(self.base)
+ task.add_alignment(self.base, mask.base)
+ task.add_alignment(self.base, values_new)
+ task.execute()
+
# Create an identity array with the ones offset from the diagonal by k
def eye(self, k: int) -> None:
assert self.ndim == 2 # Only 2-D arrays should be here
@@ -1712,7 +1929,7 @@ def create_scalar(value: Any, dtype: np.dtype[Any]) -> Any:
task.execute()
# Tile the src array onto the destination array
- @auto_convert([1])
+ @auto_convert("rhs")
def tile(self, rhs: Any, reps: Union[Any, Sequence[int]]) -> None:
src_array = rhs
dst_array = self
@@ -1739,7 +1956,7 @@ def transpose(
result = DeferredArray(self.runtime, result, self.dtype)
return result
- @auto_convert([1])
+ @auto_convert("rhs")
def trilu(self, rhs: Any, k: int, lower: bool) -> None:
lhs = self.base
rhs = rhs._broadcast(lhs.shape)
@@ -1780,7 +1997,7 @@ def repeat(
task.execute()
return out
- @auto_convert([1])
+ @auto_convert("rhs")
def flip(self, rhs: Any, axes: Union[None, int, tuple[int, ...]]) -> None:
input = rhs.base
output = self.base
@@ -1801,7 +2018,7 @@ def flip(self, rhs: Any, axes: Union[None, int, tuple[int, ...]]) -> None:
task.execute()
# Perform a bin count operation on the array
- @auto_convert([1], ["weights"])
+ @auto_convert("rhs", "weights")
def bincount(self, rhs: Any, weights: Optional[NumPyThunk] = None) -> None:
weight_array = weights
src_array = rhs
@@ -2872,7 +3089,7 @@ def random_integer(
self.random(RandGenCode.INTEGER, [low, high])
# Perform the unary operation and put the result in the array
- @auto_convert([2])
+ @auto_convert("src")
def unary_op(
self,
op: UnaryOpCode,
@@ -2881,27 +3098,29 @@ def unary_op(
args: Any,
multiout: Optional[Any] = None,
) -> None:
+
lhs = self.base
rhs = src._broadcast(lhs.shape)
- task = self.context.create_auto_task(CuNumericOpCode.UNARY_OP)
- task.add_output(lhs)
- task.add_input(rhs)
- task.add_scalar_arg(op.value, ty.int32)
- self.add_arguments(task, args)
+ with Annotation(self.context, {"OpCode": op.name}):
+ task = self.context.create_auto_task(CuNumericOpCode.UNARY_OP)
+ task.add_output(lhs)
+ task.add_input(rhs)
+ task.add_scalar_arg(op.value, ty.int32)
+ self.add_arguments(task, args)
- task.add_alignment(lhs, rhs)
+ task.add_alignment(lhs, rhs)
- if multiout is not None:
- for out in multiout:
- task.add_output(out.base)
- task.add_alignment(out.base, rhs)
+ if multiout is not None:
+ for out in multiout:
+ task.add_output(out.base)
+ task.add_alignment(out.base, rhs)
- task.execute()
+ task.execute()
# Perform a unary reduction operation from one set of dimensions down to
# fewer
- @auto_convert([2])
+ @auto_convert("src")
def unary_reduction(
self,
op: UnaryRedCode,
@@ -2913,7 +3132,7 @@ def unary_reduction(
args: Any,
initial: Any,
) -> None:
- lhs_array = self
+ lhs_array: Union[NumPyThunk, DeferredArray] = self
rhs_array = src
assert lhs_array.ndim <= rhs_array.ndim
@@ -2921,7 +3140,7 @@ def unary_reduction(
if argred:
argred_dtype = self.runtime.get_arg_dtype(rhs_array.dtype)
- lhs_array = self.runtime.create_empty_thunk( # type: ignore
+ lhs_array = self.runtime.create_empty_thunk(
lhs_array.shape,
dtype=argred_dtype,
inputs=[self],
@@ -2933,10 +3152,6 @@ def unary_reduction(
0 if keepdims else lhs_array.ndim
)
- task = self.context.create_auto_task(
- CuNumericOpCode.SCALAR_UNARY_RED
- )
-
if initial is not None:
assert not argred
fill_value = initial
@@ -2945,18 +3160,25 @@ def unary_reduction(
lhs_array.fill(np.array(fill_value, dtype=lhs_array.dtype))
- lhs = lhs_array.base
+ lhs = lhs_array.base # type: ignore
while lhs.ndim > 1:
lhs = lhs.project(0, 0)
- task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op])
- task.add_input(rhs_array.base)
- task.add_scalar_arg(op, ty.int32)
- task.add_scalar_arg(rhs_array.shape, (ty.int64,))
+ with Annotation(
+ self.context, {"OpCode": op.name, "ArgRed?": str(argred)}
+ ):
+ task = self.context.create_auto_task(
+ CuNumericOpCode.SCALAR_UNARY_RED
+ )
- self.add_arguments(task, args)
+ task.add_reduction(lhs, _UNARY_RED_TO_REDUCTION_OPS[op])
+ task.add_input(rhs_array.base)
+ task.add_scalar_arg(op, ty.int32)
+ task.add_scalar_arg(rhs_array.shape, (ty.int64,))
- task.execute()
+ self.add_arguments(task, args)
+
+ task.execute()
else:
# Before we perform region reduction, make sure to have the lhs
@@ -2972,7 +3194,7 @@ def unary_reduction(
# If output dims is not 0, then we must have axes
assert axes is not None
# Reduction to a smaller array
- result = lhs_array.base
+ result = lhs_array.base # type: ignore
if keepdims:
for axis in axes:
result = result.project(axis, 0)
@@ -2985,18 +3207,21 @@ def unary_reduction(
"Need support for reducing multiple dimensions"
)
- task = self.context.create_auto_task(CuNumericOpCode.UNARY_RED)
+ with Annotation(
+ self.context, {"OpCode": op.name, "ArgRed?": str(argred)}
+ ):
+ task = self.context.create_auto_task(CuNumericOpCode.UNARY_RED)
- task.add_input(rhs_array.base)
- task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op])
- task.add_scalar_arg(axis, ty.int32)
- task.add_scalar_arg(op, ty.int32)
+ task.add_input(rhs_array.base)
+ task.add_reduction(result, _UNARY_RED_TO_REDUCTION_OPS[op])
+ task.add_scalar_arg(axis, ty.int32)
+ task.add_scalar_arg(op, ty.int32)
- self.add_arguments(task, args)
+ self.add_arguments(task, args)
- task.add_alignment(result, rhs_array.base)
+ task.add_alignment(result, rhs_array.base)
- task.execute()
+ task.execute()
if argred:
self.unary_op(
@@ -3017,7 +3242,7 @@ def isclose(
self.binary_op(BinaryOpCode.ISCLOSE, rhs1, rhs2, True, args)
# Perform the binary operation and put the result in the lhs array
- @auto_convert([2, 3])
+ @auto_convert("src1", "src2")
def binary_op(
self,
op_code: BinaryOpCode,
@@ -3030,20 +3255,21 @@ def binary_op(
rhs1 = src1._broadcast(lhs.shape)
rhs2 = src2._broadcast(lhs.shape)
- # Populate the Legate launcher
- task = self.context.create_auto_task(CuNumericOpCode.BINARY_OP)
- task.add_output(lhs)
- task.add_input(rhs1)
- task.add_input(rhs2)
- task.add_scalar_arg(op_code.value, ty.int32)
- self.add_arguments(task, args)
+ with Annotation(self.context, {"OpCode": op_code.name}):
+ # Populate the Legate launcher
+ task = self.context.create_auto_task(CuNumericOpCode.BINARY_OP)
+ task.add_output(lhs)
+ task.add_input(rhs1)
+ task.add_input(rhs2)
+ task.add_scalar_arg(op_code.value, ty.int32)
+ self.add_arguments(task, args)
- task.add_alignment(lhs, rhs1)
- task.add_alignment(lhs, rhs2)
+ task.add_alignment(lhs, rhs1)
+ task.add_alignment(lhs, rhs2)
- task.execute()
+ task.execute()
- @auto_convert([2, 3])
+ @auto_convert("src1", "src2")
def binary_reduction(
self,
op: BinaryOpCode,
@@ -3079,7 +3305,7 @@ def binary_reduction(
task.execute()
- @auto_convert([1, 2, 3])
+ @auto_convert("src1", "src2", "src3")
def where(self, src1: Any, src2: Any, src3: Any) -> None:
lhs = self.base
rhs1 = src1._broadcast(lhs.shape)
@@ -3138,15 +3364,15 @@ def compute_strides(shape: NdShape) -> tuple[int, ...]:
stride *= dim
return result
- @auto_convert([1])
+ @auto_convert("src")
def cholesky(self, src: Any, no_tril: bool = False) -> None:
cholesky(self, src, no_tril)
- @auto_convert([1, 2])
+ @auto_convert("a", "b")
def solve(self, a: Any, b: Any) -> None:
solve(self, a, b)
- @auto_convert([2])
+ @auto_convert("rhs")
def scan(
self,
op: int,
@@ -3223,7 +3449,7 @@ def unique(self) -> NumPyThunk:
return result
- @auto_convert([1, 2])
+ @auto_convert("rhs", "v")
def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None:
task = self.context.create_task(CuNumericOpCode.SEARCHSORTED)
@@ -3249,7 +3475,7 @@ def searchsorted(self, rhs: Any, v: Any, side: SortSide = "left") -> None:
task.add_scalar_arg(rhs.size, ty.int64)
task.execute()
- @auto_convert([1])
+ @auto_convert("rhs")
def sort(
self,
rhs: Any,
@@ -3274,7 +3500,7 @@ def sort(
sort(self, rhs, argsort, axis, stable)
- @auto_convert([1])
+ @auto_convert("rhs")
def partition(
self,
rhs: Any,
@@ -3305,7 +3531,7 @@ def create_window(self, op_code: WindowOpCode, M: int, *args: Any) -> None:
task.add_scalar_arg(arg, ty.float64)
task.execute()
- @auto_convert([1])
+ @auto_convert("src")
def packbits(
self, src: Any, axis: Union[int, None], bitorder: BitOrder
) -> None:
@@ -3321,7 +3547,7 @@ def packbits(
task.add_constraint(p_in <= p_out * scale) # type: ignore
task.execute()
- @auto_convert([1])
+ @auto_convert("src")
def unpackbits(
self, src: Any, axis: Union[int, None], bitorder: BitOrder
) -> None:
@@ -3337,10 +3563,11 @@ def unpackbits(
task.add_constraint(p_out <= p_in * scale) # type: ignore
task.execute()
- @auto_convert([1])
+ @auto_convert("src")
def _wrap(self, src: Any, new_len: int) -> None:
if src.base.kind == Future or src.base.transformed:
- src = src._convert_future_to_regionfield()
+ change_shape = src.base.kind == Future
+ src = src._convert_future_to_regionfield(change_shape)
# first, we create indirect array with PointN type that
# (len,) shape and is used to copy data from original array
@@ -3359,6 +3586,8 @@ def _wrap(self, src: Any, new_len: int) -> None:
task = self.context.create_task(CuNumericOpCode.WRAP)
task.add_output(indirect.base)
task.add_scalar_arg(src.shape, (ty.int64,))
+ task.add_scalar_arg(False, bool) # has_input
+ task.add_scalar_arg(False, bool) # check bounds
task.execute()
copy = self.context.create_copy()
diff --git a/cunumeric/eager.py b/cunumeric/eager.py
index fdb8f7989..61e8f5d37 100644
--- a/cunumeric/eager.py
+++ b/cunumeric/eager.py
@@ -215,14 +215,17 @@ def __init__(
self.key: Optional[tuple[Any, ...]] = key
#: if this ever becomes set (to a DeferredArray), we forward all
#: operations to it
- self.deferred: Optional[DeferredArray] = None
+ self.deferred: Optional[Union[DeferredArray, NumPyThunk]] = None
self.escaped = False
@property
def storage(self) -> Union[Future, tuple[Region, FieldID]]:
if self.deferred is None:
self.to_deferred_array()
- return self.deferred.storage # type: ignore
+
+ assert self.deferred is not None
+
+ return self.deferred.storage
@property
def shape(self) -> NdShape:
@@ -265,10 +268,9 @@ def _convert_children(self) -> None:
assert self.runtime.is_deferred_array(self.deferred)
for child in self.children:
if child.deferred is None:
- # mypy can't deduce that children nodes will always have
- # their .key attribute set.
- func = getattr(self.deferred, child.key[0]) # type: ignore
- args = child.key[1:] # type: ignore
+ assert child.key is not None
+ func = getattr(self.deferred, child.key[0])
+ args = child.key[1:]
child.deferred = func(*args)
# After we've made all the deferred views for each child then
# we can traverse down. Do it this way so we can get partition
@@ -298,7 +300,7 @@ def to_deferred_array(self) -> DeferredArray:
shape=self.shape,
)
else:
- self.deferred = self.runtime.find_or_create_array_thunk( # type: ignore # noqa E501
+ self.deferred = self.runtime.find_or_create_array_thunk(
self.array,
share=self.escaped,
defer=True,
@@ -334,7 +336,7 @@ def convolve(self, v: Any, out: Any, mode: ConvolveMode) -> None:
if self.ndim == 1:
out.array = np.convolve(self.array, v.array, mode)
else:
- from scipy.signal import convolve # type: ignore
+ from scipy.signal import convolve # type: ignore [import]
out.array = convolve(self.array, v.array, mode)
@@ -502,7 +504,7 @@ def convert(
elif nan_op is ConvertCode.PROD and np.isnan(rhs.array.item()):
self.array.fill(1)
else:
- self.array.fill(rhs.array.item())
+ self.array.fill(rhs.array.astype(self.array.dtype).item())
else:
if nan_op is ConvertCode.SUM:
self.array[:] = np.where(np.isnan(rhs.array), 0, rhs.array)
@@ -620,6 +622,20 @@ def _diag_helper(
axes = tuple(range(ndims - naxes, ndims))
self.array = diagonal_reference(rhs.array, axes)
+ def put(self, indices: Any, values: Any, check_bounds: bool) -> None:
+ self.check_eager_args(indices, values)
+ if self.deferred is not None:
+ self.deferred.put(indices, values, check_bounds)
+ else:
+ np.put(self.array, indices.array, values.array)
+
+ def putmask(self, mask: Any, values: Any) -> None:
+ self.check_eager_args(mask, values)
+ if self.deferred is not None:
+ self.deferred.putmask(mask, values)
+ else:
+ np.putmask(self.array, mask.array, values.array)
+
def eye(self, k: int) -> None:
if self.deferred is not None:
self.deferred.eye(k)
@@ -1454,10 +1470,9 @@ def unary_reduction(
return
if op in _UNARY_RED_OPS:
fn = _UNARY_RED_OPS[op]
- if initial is None:
- # NumPy starts using this predefined constant, instead of None,
- # to mean no value was given by the caller
- initial = np._NoValue # type: ignore
+ # Need to be more careful here, Numpy does not use None to mean
+ # "was not passed in" in this instance
+ kws = {"initial": initial} if initial is not None else {}
fn(
rhs.array,
out=self.array,
@@ -1466,6 +1481,7 @@ def unary_reduction(
where=where
if not isinstance(where, EagerArray)
else where.array,
+ **kws,
)
elif op == UnaryRedCode.ARGMAX:
np.argmax(
diff --git a/cunumeric/linalg/linalg.py b/cunumeric/linalg/linalg.py
index 88e457194..18ecfa140 100644
--- a/cunumeric/linalg/linalg.py
+++ b/cunumeric/linalg/linalg.py
@@ -17,8 +17,12 @@
from typing import TYPE_CHECKING, Sequence, Union
import numpy as np
-from numpy.core.multiarray import normalize_axis_index # type: ignore
-from numpy.core.numeric import normalize_axis_tuple # type: ignore
+from numpy.core.multiarray import ( # type: ignore [attr-defined]
+ normalize_axis_index,
+)
+from numpy.core.numeric import ( # type: ignore [attr-defined]
+ normalize_axis_tuple,
+)
from cunumeric._ufunc.math import add, sqrt as _sqrt
from cunumeric.array import add_boilerplate, convert_to_cunumeric_ndarray
@@ -192,9 +196,9 @@ def matrix_power(a: ndarray, n: int) -> ndarray:
"""
# Process inputs
if a.ndim < 2:
- raise ValueError(f"Expected at least 2d array, but got {a.ndim}d")
+ raise LinAlgError(f"Expected at least 2d array, but got {a.ndim}d")
if a.shape[-2] != a.shape[-1]:
- raise ValueError("Last 2 dimensions of the array must be square")
+ raise LinAlgError("Last 2 dimensions of the array must be square")
if not isinstance(n, int):
raise TypeError("exponent must be an integer")
@@ -521,7 +525,7 @@ def norm(
# Zero norm
return (
(x != 0)
- .astype(np.int64)
+ .astype(x.dtype)
.sum(axis=computed_axis, keepdims=keepdims)
)
elif ord == 1:
diff --git a/cunumeric/logic.py b/cunumeric/logic.py
index 5cafffdc5..667ae1d13 100644
--- a/cunumeric/logic.py
+++ b/cunumeric/logic.py
@@ -176,7 +176,7 @@ def iscomplexobj(x: Union[ndarray, npt.NDArray[Any]]) -> bool:
Availability
--------
- Single CPU
+ Multiple GPUs, Multiple CPUs
"""
if isinstance(x, ndarray):
return x.dtype.kind == "c"
@@ -244,7 +244,7 @@ def isrealobj(x: ndarray) -> bool:
Availability
--------
- Single CPU
+ Multiple GPUs, Multiple CPUs
"""
return not iscomplexobj(x)
@@ -275,7 +275,7 @@ def isscalar(x: Union[ndarray, npt.NDArray[Any]]) -> bool:
Availability
--------
- Single CPU
+ Multiple GPUs, Multiple CPUs
"""
# Since the input can be any value, we can't just convert it to cunumeric
diff --git a/cunumeric/module.py b/cunumeric/module.py
index 7a3024e55..a2a972087 100644
--- a/cunumeric/module.py
+++ b/cunumeric/module.py
@@ -23,7 +23,9 @@
import numpy as np
import opt_einsum as oe # type: ignore [import]
-from numpy.core.multiarray import normalize_axis_index # type: ignore
+from numpy.core.multiarray import ( # type: ignore [attr-defined]
+ normalize_axis_index,
+)
from numpy.core.numeric import ( # type: ignore [attr-defined]
normalize_axis_tuple,
)
@@ -2327,17 +2329,44 @@ def repeat(a: ndarray, repeats: Any, axis: Optional[int] = None) -> ndarray:
Multiple GPUs, Multiple CPUs
"""
+ if repeats is None:
+ raise TypeError(
+ "int() argument must be a string, a bytes-like object or a number,"
+ " not 'NoneType'"
+ )
+
+ if np.ndim(repeats) > 1:
+ raise ValueError("`repeats` should be scalar or 1D array")
+
+ # axes should be integer type
+ if axis is not None and not isinstance(axis, int):
+ raise TypeError("Axis should be integer type")
+
# when array is a scalar
if np.ndim(a) == 0:
+ if axis is not None and axis != 0:
+ raise np.AxisError("axis is out of bounds for array of dimension")
if np.ndim(repeats) == 0:
+ if not isinstance(repeats, int):
+ runtime.warn(
+ "converting repeats to an integer type",
+ category=UserWarning,
+ )
+ repeats = np.int64(repeats)
return full((repeats,), cast(Union[int, float], a))
+ elif np.ndim(repeats) == 1 and len(repeats) == 1:
+ if not isinstance(repeats, int):
+ runtime.warn(
+ "converting repeats to an integer type",
+ category=UserWarning,
+ )
+ repeats = np.int64(repeats)
+ return full((repeats[0],), cast(Union[int, float], a))
else:
raise ValueError(
"`repeat` with a scalar parameter `a` is only "
"implemented for scalar values of the parameter `repeats`."
)
- if np.ndim(repeats) > 1:
- raise ValueError("`repeats` should be scalar or 1D array")
# array is an array
array = convert_to_cunumeric_ndarray(a)
@@ -2349,9 +2378,6 @@ def repeat(a: ndarray, repeats: Any, axis: Optional[int] = None) -> ndarray:
array = array.ravel()
axis = 0
- # axes should be integer type
- if not isinstance(axis, int):
- raise TypeError("Axis should be integer type")
axis_int = np.int32(axis)
if axis_int >= array.ndim:
@@ -2386,12 +2412,7 @@ def repeat(a: ndarray, repeats: Any, axis: Optional[int] = None) -> ndarray:
# repeats is an array
else:
# repeats should be integer type
- if repeats.dtype != np.int64:
- runtime.warn(
- "converting repeats to an integer type",
- category=RuntimeWarning,
- )
- repeats = repeats.astype(np.int64)
+ repeats = repeats._warn_and_convert(np.int64)
if repeats.shape[0] != array.shape[axis]:
raise ValueError("incorrect shape of repeats array")
result = array._thunk.repeat(
@@ -3192,7 +3213,7 @@ def put_along_axis(
Parameters
----------
- arr : ndarray (Ni..., M, Nk...)
+ a : ndarray (Ni..., M, Nk...)
Destination array.
indices : ndarray (Ni..., J, Nk...)
Indices to change along each 1d slice of `arr`. This must match the
@@ -3220,6 +3241,10 @@ def put_along_axis(
Multiple GPUs, Multiple CPUs
"""
+
+ if a.size == 0:
+ return
+
if not np.issubdtype(indices.dtype, np.integer):
raise TypeError("`indices` must be an integer array")
@@ -3230,6 +3255,10 @@ def put_along_axis(
if a.ndim > 1:
# TODO call a=a.flat when flat is implemented
raise ValueError("a.ndim>1 case is not supported when axis=None")
+ if (indices.size == 0) or (values.size == 0):
+ return
+ if values.shape != indices.shape:
+ values = values._wrap(indices.size)
else:
computed_axis = normalize_axis_index(axis, a.ndim)
@@ -3449,6 +3478,88 @@ def diagonal(
)
+@add_boilerplate("a", "indices", "values")
+def put(
+ a: ndarray, indices: ndarray, values: ndarray, mode: str = "raise"
+) -> None:
+ """
+ Replaces specified elements of an array with given values.
+ The indexing works as if the target array is first flattened.
+
+ Parameters
+ ----------
+ a : array_like
+ Array to put data into
+ indices : array_like
+ Target indices, interpreted as integers.
+ WARNING: In case there are repeated entries in the
+ indices array, Legate doesn't guarantee the order in
+ which values are updated.
+
+ values : array_like
+ Values to place in `a` at target indices. If values array is shorter
+ than indices, it will be repeated as necessary.
+ mode : {'raise', 'wrap', 'clip'}, optional
+ Specifies how out-of-bounds indices will behave.
+ 'raise' : raise an error.
+ 'wrap' : wrap around.
+ 'clip' : clip to the range.
+
+ See Also
+ --------
+ numpy.put
+
+ Availability
+ ------------
+ Multiple GPUs, Multiple CPUs
+ """
+ a.put(indices=indices, values=values, mode=mode)
+
+
+@add_boilerplate("a", "mask", "values")
+def putmask(a: ndarray, mask: ndarray, values: ndarray) -> None:
+ """
+ putmask(a, mask, values)
+ Changes elements of an array based on conditional and input values.
+ Sets ``a.flat[n] = values[n]`` for each n where ``mask.flat[n]==True``.
+ If `values` is not the same size as `a` and `mask` then it will repeat.
+ This gives behavior different from ``a[mask] = values``.
+
+ Parameters
+ ----------
+ a : ndarray
+ Target array.
+ mask : array_like
+ Boolean mask array. It has to be the same shape as `a`.
+ values : array_like
+ Values to put into `a` where `mask` is True. If `values` is smaller
+ than `a` it will be repeated.
+
+ See Also
+ --------
+ numpy.putmask
+
+ Availability
+ ------------
+ Multiple GPUs, Multiple CPUs
+ """
+ if not a.shape == mask.shape:
+ raise ValueError("mask and data must be the same size")
+
+ mask = mask._warn_and_convert(np.dtype(bool))
+
+ if a.dtype != values.dtype:
+ values = values._warn_and_convert(a.dtype)
+
+ try:
+ np.broadcast_shapes(values.shape, a.shape)
+ except ValueError:
+ values = values._wrap(a.size)
+ values = values.reshape(a.shape)
+
+ a._thunk.putmask(mask._thunk, values._thunk)
+
+
@add_boilerplate("a", "val")
def fill_diagonal(a: ndarray, val: ndarray, wrap: bool = False) -> None:
"""
@@ -3903,9 +4014,13 @@ def tensordot(
# Trivial multi-tensor contraction strategy: contract in input order
-class NullOptimizer(oe.paths.PathOptimizer): # type: ignore
- def __call__( # type: ignore [no-untyped-def]
- self, inputs, output, size_dict, memory_limit=None
+class NullOptimizer(oe.paths.PathOptimizer): # type: ignore [misc,no-any-unimported] # noqa
+ def __call__(
+ self,
+ inputs: list[set[str]],
+ outputs: set[str],
+ size_dict: dict[str, int],
+ memory_limit: Union[int, None] = None,
) -> list[tuple[int, int]]:
return [(0, 1)] + [(0, -1)] * (len(inputs) - 2)
@@ -3956,7 +4071,8 @@ def _contract(
raise ValueError("Unknown mode labels on output")
# Handle types
- if dtype is not None:
+ makes_view = b is None and len(a_modes) == len(out_modes)
+ if dtype is not None and not makes_view:
c_dtype = dtype
elif out is not None:
c_dtype = out.dtype
@@ -5758,8 +5874,12 @@ def sort_complex(a: ndarray) -> ndarray:
# force complex result upon return
if np.issubdtype(result.dtype, np.complexfloating):
return result
- else:
+ elif (
+ np.issubdtype(result.dtype, np.integer) and result.dtype.itemsize <= 2
+ ):
return result.astype(np.complex64, copy=True)
+ else:
+ return result.astype(np.complex128, copy=True)
# partition
@@ -6132,6 +6252,8 @@ def bincount(
--------
Multiple GPUs, Multiple CPUs
"""
+ if x.ndim != 1:
+ raise ValueError("the input array must be 1-dimensional")
if weights is not None:
if weights.shape != x.shape:
raise ValueError("weights array must be same shape for bincount")
@@ -6139,11 +6261,16 @@ def bincount(
raise ValueError("weights must be convertible to float64")
# Make sure the weights are float64
weights = weights.astype(np.float64)
- if x.dtype.kind != "i" and x.dtype.kind != "u":
+ if x.dtype.kind != "i":
raise TypeError("input array for bincount must be integer type")
if minlength < 0:
raise ValueError("'minlength' must not be negative")
- minlength = _builtin_max(minlength, int(amax(x)) + 1)
+ # Note that the following are non-blocking operations,
+ # though passing their results to `int` is blocking
+ max_val, min_val = amax(x), amin(x)
+ if int(min_val) < 0:
+ raise ValueError("the input array must have no negative elements")
+ minlength = _builtin_max(minlength, int(max_val) + 1)
if x.size == 1:
# Handle the special case of 0-D array
if weights is None:
diff --git a/cunumeric/random/__init__.py b/cunumeric/random/__init__.py
index a9730d063..2f8a98460 100644
--- a/cunumeric/random/__init__.py
+++ b/cunumeric/random/__init__.py
@@ -25,7 +25,7 @@
else:
from cunumeric.random.legacy import *
-clone_module(_nprandom, globals())
+clone_module(_nprandom, globals(), include_builtin_function_type=True)
del clone_module
del _nprandom
diff --git a/cunumeric/random/bitgenerator.py b/cunumeric/random/bitgenerator.py
index 2c5dfc577..1bd0aaa03 100644
--- a/cunumeric/random/bitgenerator.py
+++ b/cunumeric/random/bitgenerator.py
@@ -15,6 +15,7 @@
from __future__ import annotations
import time
+from abc import abstractproperty
from typing import TYPE_CHECKING, Union
import numpy as np
@@ -66,7 +67,7 @@ def __init__(
self.generatorType, seed, self.flags, forceBuild
)
- @property
+ @abstractproperty
def generatorType(self) -> BitGeneratorType:
...
diff --git a/cunumeric/random/random.py b/cunumeric/random/random.py
index 7a036a86e..7f37e5651 100644
--- a/cunumeric/random/random.py
+++ b/cunumeric/random/random.py
@@ -37,6 +37,10 @@ def seed(init: Union[int, None] = None) -> None:
This function is effective only when cuRAND is NOT used in the build
and is a no-op otherwise.
+
+ Availability
+ ------------
+ Multiple GPUs, Multiple CPUs
"""
if init is None:
init = 0
diff --git a/cunumeric/runtime.py b/cunumeric/runtime.py
index 49e36abb8..603a69a47 100644
--- a/cunumeric/runtime.py
+++ b/cunumeric/runtime.py
@@ -23,7 +23,7 @@
import numpy as np
from legate.core import LEGATE_MAX_DIM, Rect, get_legate_runtime, legion
from legate.core.context import Context as LegateContext
-from legate.rc import ArgSpec, Argument, parse_command_args
+from legate.util.args import ArgSpec, Argument, parse_library_command_args
from typing_extensions import TypeGuard
from .config import (
@@ -39,32 +39,19 @@
from .eager import EagerArray
from .thunk import NumPyThunk
from .types import NdShape
-from .utils import calculate_volume, find_last_user_stacklevel, get_arg_dtype
+from .utils import (
+ SUPPORTED_DTYPES,
+ calculate_volume,
+ find_last_user_stacklevel,
+ get_arg_dtype,
+)
if TYPE_CHECKING:
import numpy.typing as npt
from legate.core._legion.future import Future
from legate.core.operation import AutoTask, ManualTask
-_supported_dtypes = {
- np.bool_: ty.bool_,
- np.int8: ty.int8,
- np.int16: ty.int16,
- np.int32: ty.int32,
- int: ty.int64,
- np.int64: ty.int64,
- np.uint8: ty.uint8,
- np.uint16: ty.uint16,
- np.uint32: ty.uint32,
- np.uint: ty.uint64,
- np.uint64: ty.uint64,
- np.float16: ty.float16,
- np.float32: ty.float32,
- float: ty.float64,
- np.float64: ty.float64,
- np.complex64: ty.complex64,
- np.complex128: ty.complex128,
-}
+ from .array import ndarray
ARGS = [
Argument(
@@ -162,7 +149,7 @@ def __init__(self, legate_context: LegateContext) -> None:
self.has_curand = cunumeric_lib.shared_object.cunumeric_has_curand()
self._register_dtypes()
- self.args = parse_command_args("cunumeric", ARGS)
+ self.args = parse_library_command_args("cunumeric", ARGS)
self.args.warning = self.args.warning or self.args.test_mode
if self.num_gpus > 0 and self.args.preload_cudalibs:
@@ -170,7 +157,7 @@ def __init__(self, legate_context: LegateContext) -> None:
def _register_dtypes(self) -> None:
type_system = self.legate_context.type_system
- for numpy_type, core_type in _supported_dtypes.items():
+ for numpy_type, core_type in SUPPORTED_DTYPES.items():
type_system.make_alias(np.dtype(numpy_type), core_type)
for dtype in _CUNUMERIC_DTYPES:
@@ -366,7 +353,7 @@ def is_supported_type(self, dtype: Union[str, np.dtype[Any]]) -> bool:
def get_numpy_thunk(
self,
- obj: Any,
+ obj: Union[ndarray, npt.NDArray[Any]],
share: bool = False,
dtype: Optional[np.dtype[Any]] = None,
) -> NumPyThunk:
@@ -418,11 +405,12 @@ def compute_parent_child_mapping(
# slice object that was used to generate a child array from
# a parent array so we can build the same mapping from a
# logical region to a subregion
- parent_ptr = int(array.base.ctypes.data) # type: ignore
+ assert array.base is not None
+ parent_ptr = int(array.base.ctypes.data)
child_ptr = int(array.ctypes.data)
assert child_ptr >= parent_ptr
ptr_diff = child_ptr - parent_ptr
- parent_shape = array.base.shape # type: ignore
+ parent_shape = array.base.shape
div = (
reduce(lambda x, y: x * y, parent_shape)
if len(parent_shape) > 1
@@ -440,8 +428,8 @@ def compute_parent_child_mapping(
key: tuple[Union[slice, None], ...] = ()
child_idx = 0
child_strides = tuple(array.strides)
- parent_strides = tuple(array.base.strides) # type: ignore
- for idx in range(array.base.ndim): # type: ignore
+ parent_strides = tuple(array.base.strides)
+ for idx in range(array.base.ndim):
# Handle the adding and removing dimension cases
if parent_strides[idx] == 0:
# This was an added dimension in the parent
diff --git a/cunumeric/sort.py b/cunumeric/sort.py
index 86fa1177e..fbca9146a 100644
--- a/cunumeric/sort.py
+++ b/cunumeric/sort.py
@@ -17,7 +17,9 @@
from typing import TYPE_CHECKING, Union, cast
from legate.core import types as ty
-from numpy.core.multiarray import normalize_axis_index # type: ignore
+from numpy.core.multiarray import ( # type: ignore [attr-defined]
+ normalize_axis_index,
+)
from .config import CuNumericOpCode
diff --git a/cunumeric/thunk.py b/cunumeric/thunk.py
index bdc773aeb..e80941d4e 100644
--- a/cunumeric/thunk.py
+++ b/cunumeric/thunk.py
@@ -197,6 +197,14 @@ def _diag_helper(
) -> None:
...
+ @abstractmethod
+ def put(self, indices: Any, values: Any, check_bounds: bool) -> None:
+ ...
+
+ @abstractmethod
+ def putmask(self, mask: Any, values: Any) -> None:
+ ...
+
@abstractmethod
def eye(self, k: int) -> None:
...
diff --git a/cunumeric/utils.py b/cunumeric/utils.py
index 5bfd0b54e..25f0f19f1 100644
--- a/cunumeric/utils.py
+++ b/cunumeric/utils.py
@@ -20,25 +20,30 @@
from types import FrameType
from typing import Any, List, Sequence, Tuple, Union, cast
+import legate.core.types as ty
import numpy as np
from .types import NdShape
-_SUPPORTED_DTYPES = [
- np.float16,
- np.float32,
- np.float64,
- float,
- np.int16,
- np.int32,
- np.int64,
- int,
- np.uint16,
- np.uint32,
- np.uint64,
- np.bool_,
- bool,
-]
+SUPPORTED_DTYPES = {
+ bool: ty.bool_,
+ np.bool_: ty.bool_,
+ np.int8: ty.int8,
+ np.int16: ty.int16,
+ np.int32: ty.int32,
+ int: ty.int64, # np.int is int
+ np.int64: ty.int64,
+ np.uint8: ty.uint8,
+ np.uint16: ty.uint16,
+ np.uint32: ty.uint32,
+ np.uint64: ty.uint64, # np.uint is np.uint64
+ np.float16: ty.float16,
+ np.float32: ty.float32,
+ float: ty.float64,
+ np.float64: ty.float64,
+ np.complex64: ty.complex64,
+ np.complex128: ty.complex128,
+}
def is_advanced_indexing(key: Any) -> bool:
@@ -91,7 +96,7 @@ def find_last_user_frames(top_only: bool = True) -> str:
def is_supported_dtype(dtype: Any) -> bool:
if not isinstance(dtype, np.dtype):
raise TypeError("expected a NumPy dtype")
- return dtype.type in _SUPPORTED_DTYPES
+ return dtype.type in SUPPORTED_DTYPES
def calculate_volume(shape: NdShape) -> int:
@@ -109,7 +114,7 @@ def get_arg_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]:
def get_arg_value_dtype(dtype: np.dtype[Any]) -> np.dtype[Any]:
dt = dtype.fields["arg_value"][0].type # type: ignore [index]
- return cast(Any, dt)
+ return cast(np.dtype[Any], dt)
Modes = Tuple[List[str], List[str], List[str]]
diff --git a/cunumeric_cpp.cmake b/cunumeric_cpp.cmake
index a47038a3b..7034bb600 100644
--- a/cunumeric_cpp.cmake
+++ b/cunumeric_cpp.cmake
@@ -131,6 +131,7 @@ list(APPEND cunumeric_SOURCES
src/cunumeric/index/repeat.cc
src/cunumeric/index/wrap.cc
src/cunumeric/index/zip.cc
+ src/cunumeric/index/putmask.cc
src/cunumeric/item/read.cc
src/cunumeric/item/write.cc
src/cunumeric/matrix/contract.cc
@@ -180,6 +181,7 @@ if(Legion_USE_OpenMP)
src/cunumeric/nullary/window_omp.cc
src/cunumeric/index/advanced_indexing_omp.cc
src/cunumeric/index/choose_omp.cc
+ src/cunumeric/index/putmask_omp.cc
src/cunumeric/index/repeat_omp.cc
src/cunumeric/index/wrap_omp.cc
src/cunumeric/index/zip_omp.cc
@@ -229,6 +231,7 @@ if(Legion_USE_CUDA)
src/cunumeric/index/repeat.cu
src/cunumeric/index/wrap.cu
src/cunumeric/index/zip.cu
+ src/cunumeric/index/putmask.cu
src/cunumeric/item/read.cu
src/cunumeric/item/write.cu
src/cunumeric/matrix/contract.cu
@@ -328,7 +331,7 @@ list(APPEND cunumeric_SOURCES
src/cunumeric/cunumeric.cc
)
-if(NOT CMAKE_BUILD_TYPE STREQUAL "Release")
+if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND cunumeric_CXX_DEFS DEBUG_CUNUMERIC)
list(APPEND cunumeric_CUDA_DEFS DEBUG_CUNUMERIC)
endif()
diff --git a/cunumeric_python.cmake b/cunumeric_python.cmake
index 3430b5828..c1ca06015 100644
--- a/cunumeric_python.cmake
+++ b/cunumeric_python.cmake
@@ -43,22 +43,14 @@ if(NOT cunumeric_FOUND)
set(SKBUILD ON)
endif()
-execute_process(
- COMMAND ${CMAKE_C_COMPILER}
- -E -DLEGATE_USE_PYTHON_CFFI
- -I "${CMAKE_CURRENT_SOURCE_DIR}/src/cunumeric"
- -P "${CMAKE_CURRENT_SOURCE_DIR}/src/cunumeric/cunumeric_c.h"
- ECHO_ERROR_VARIABLE
- OUTPUT_VARIABLE header
- COMMAND_ERROR_IS_FATAL ANY
+add_custom_target("generate_install_info_py" ALL
+ COMMAND ${CMAKE_COMMAND}
+ -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+ -P "${CMAKE_CURRENT_SOURCE_DIR}/cmake/generate_install_info_py.cmake"
+ COMMENT "Generate install_info.py"
+ VERBATIM
)
-set(libpath "")
-configure_file(
- "${CMAKE_CURRENT_SOURCE_DIR}/cunumeric/install_info.py.in"
- "${CMAKE_CURRENT_SOURCE_DIR}/cunumeric/install_info.py"
-@ONLY)
-
add_library(cunumeric_python INTERFACE)
add_library(cunumeric::cunumeric_python ALIAS cunumeric_python)
target_link_libraries(cunumeric_python INTERFACE legate::core)
diff --git a/docs/cunumeric/source/api/_ndarray.rst b/docs/cunumeric/source/api/_ndarray.rst
index 317772104..3320f0857 100644
--- a/docs/cunumeric/source/api/_ndarray.rst
+++ b/docs/cunumeric/source/api/_ndarray.rst
@@ -43,6 +43,7 @@ cunumeric.ndarray
~ndarray.nonzero
~ndarray.partition
~ndarray.prod
+ ~ndarray.put
~ndarray.ravel
~ndarray.reshape
~ndarray.searchsorted
diff --git a/docs/cunumeric/source/api/indexing.rst b/docs/cunumeric/source/api/indexing.rst
index 1023ed1d4..ab02bbcc4 100644
--- a/docs/cunumeric/source/api/indexing.rst
+++ b/docs/cunumeric/source/api/indexing.rst
@@ -43,5 +43,7 @@ Inserting data into arrays
:toctree: generated/
fill_diagonal
+ put
+ putmask
put_along_axis
place
diff --git a/docs/cunumeric/source/api/ndarray.rst b/docs/cunumeric/source/api/ndarray.rst
index 1b2c2107c..afdd1406f 100644
--- a/docs/cunumeric/source/api/ndarray.rst
+++ b/docs/cunumeric/source/api/ndarray.rst
@@ -124,7 +124,7 @@ Item selection and manipulation
:toctree: generated/
ndarray.take
- .. ndarray.put
+ ndarray.put
.. ndarray.repeat
ndarray.choose
ndarray.sort
diff --git a/docs/cunumeric/source/conf.py b/docs/cunumeric/source/conf.py
index 17fd408c1..5d3ce4881 100644
--- a/docs/cunumeric/source/conf.py
+++ b/docs/cunumeric/source/conf.py
@@ -37,8 +37,7 @@
"sphinx.ext.mathjax",
"sphinx.ext.napoleon",
"sphinx_copybutton",
- "sphinx_markdown_tables",
- "recommonmark",
+ "myst_parser",
"cunumeric._sphinxext.comparison_table",
"cunumeric._sphinxext.implemented_index",
"cunumeric._sphinxext.missing_refs",
diff --git a/docs/cunumeric/source/versions.rst b/docs/cunumeric/source/versions.rst
index c7c1e0ca6..ef6b7a83d 100644
--- a/docs/cunumeric/source/versions.rst
+++ b/docs/cunumeric/source/versions.rst
@@ -10,3 +10,5 @@ Versions
22.05
22.08
22.10
+ 23.01
+
diff --git a/examples/benchmark.py b/examples/benchmark.py
index ec107f24f..1d0944e3b 100644
--- a/examples/benchmark.py
+++ b/examples/benchmark.py
@@ -18,6 +18,122 @@
import math
from functools import reduce
+from typing_extensions import Protocol
+
+
+class Timer(Protocol):
+ def start(self):
+ ...
+
+ def stop(self):
+ """
+ Blocks execution until everything before it has completed. Returns the
+ duration since the last call to start(), in milliseconds.
+ """
+ ...
+
+
+class CuNumericTimer(Timer):
+ def __init__(self):
+ self._start_future = None
+
+ def start(self):
+ from legate.timing import time
+
+ self._start_future = time()
+
+ def stop(self):
+ from legate.timing import time
+
+ end_future = time()
+ return (end_future - self._start_future) / 1000.0
+
+
+class CuPyTimer(Timer):
+ def __init__(self):
+ self._start_event = None
+
+ def start(self):
+ from cupy import cuda
+
+ self._start_event = cuda.Event()
+ self._start_event.record()
+
+ def stop(self):
+ from cupy import cuda
+
+ end_event = cuda.Event()
+ end_event.record()
+ end_event.synchronize()
+ return cuda.get_elapsed_time(self._start_event, end_event)
+
+
+class NumPyTimer(Timer):
+ def __init__(self):
+ self._start_time = None
+
+ def start(self):
+ from time import perf_counter_ns
+
+ self._start_time = perf_counter_ns() / 1000.0
+
+ def stop(self):
+ from time import perf_counter_ns
+
+ end_time = perf_counter_ns() / 1000.0
+ return (end_time - self._start_time) / 1000.0
+
+
+# Add common arguments and parse
+def parse_args(parser):
+ parser.add_argument(
+ "-b",
+ "--benchmark",
+ type=int,
+ default=1,
+ dest="benchmark",
+ help="number of times to benchmark this application (default 1 - "
+ "normal execution)",
+ )
+ parser.add_argument(
+ "--package",
+ dest="package",
+ choices=["legate", "numpy", "cupy"],
+ type=str,
+ default="legate",
+ help="NumPy package to use",
+ )
+ parser.add_argument(
+ "--cupy-allocator",
+ dest="cupy_allocator",
+ choices=["default", "off", "managed"],
+ type=str,
+ default="default",
+ help="cupy allocator to use",
+ )
+ args, _ = parser.parse_known_args()
+ if args.package == "legate":
+ import cunumeric as np
+
+ timer = CuNumericTimer()
+ elif args.package == "cupy":
+ import cupy as np
+
+ if args.cupy_allocator == "off":
+ np.cuda.set_allocator(None)
+ print("Turning off memory pool")
+ elif args.cupy_allocator == "managed":
+ np.cuda.set_allocator(
+ np.cuda.MemoryPool(np.cuda.malloc_managed).malloc
+ )
+ print("Using managed memory pool")
+ timer = CuPyTimer()
+ elif args.package == "numpy":
+ import numpy as np
+
+ timer = NumPyTimer()
+ return args, np, timer
+
# A helper method for benchmarking applications
def run_benchmark(f, samples, name, args):
diff --git a/examples/black_scholes.py b/examples/black_scholes.py
index aadcef456..55374ea09 100644
--- a/examples/black_scholes.py
+++ b/examples/black_scholes.py
@@ -16,12 +16,8 @@
#
import argparse
-import datetime
-import math
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def generate_random(N, min, max, D):
@@ -75,16 +71,10 @@ def black_scholes(S, X, T, R, V):
def run_black_scholes(N, D):
print("Running black scholes on %dK options..." % N)
N *= 1000
- start = datetime.datetime.now()
+ timer.start()
S, X, T, R, V = initialize(N, D)
- call, put = black_scholes(S, X, T, R, V)
- # Check the result for NaNs to synchronize before stopping timing
- call_sum = np.sum(call)
- put_sum = np.sum(put)
- assert not math.isnan(call_sum) and not math.isnan(put_sum)
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ _, _ = black_scholes(S, X, T, R, V)
+ total = timer.stop()
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -107,16 +97,9 @@ def run_black_scholes(N, D):
dest="P",
help="precision of the computation in bits",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
if args.P == 16:
run_benchmark(
run_black_scholes,
diff --git a/examples/cg.py b/examples/cg.py
index 50d7d1964..a0399778e 100644
--- a/examples/cg.py
+++ b/examples/cg.py
@@ -17,15 +17,7 @@
import argparse
-from benchmark import run_benchmark
-
-try:
- from legate.timing import time
-except (ImportError, RuntimeError):
- from time import perf_counter_ns
-
- def time():
- return perf_counter_ns() / 1000.0
+from benchmark import parse_args, run_benchmark
# This is technically dead code right now, but we'll keep it around in
@@ -75,7 +67,28 @@ def generate_2D(N, corners):
return A, b
-def solve(A, b, conv_iters, max_iters, conv_threshold, verbose):
+def check(A, x, b):
+ print("Checking result...")
+ if np.allclose(A.dot(x), b):
+ print("PASS!")
+ else:
+ print("FAIL!")
+
+
+def run_cg(
+ N,
+ corners,
+ conv_iters,
+ max_iters,
+ warmup,
+ conv_threshold,
+ perform_check,
+ timing,
+ verbose,
+):
+ # A, b = generate_random(N)
+ A, b = generate_2D(N, corners)
+
print("Solving system...")
x = np.zeros(A.shape[1])
r = b - A.dot(x)
@@ -86,7 +99,11 @@ def solve(A, b, conv_iters, max_iters, conv_threshold, verbose):
max_iters = (
min(max_iters, b.shape[0]) if max_iters is not None else b.shape[0]
)
- for i in range(max_iters):
+
+ timer.start()
+ for i in range(-warmup, max_iters):
+ if i == 0:
+ timer.start()
Ap = A.dot(p)
alpha = rsold / (p.dot(Ap))
x = x + alpha * p
@@ -94,9 +111,11 @@ def solve(A, b, conv_iters, max_iters, conv_threshold, verbose):
rsnew = r.dot(r)
# We only do the convergence test every conv_iters or on the last
# iteration
- if (i % conv_iters == 0 or i == (max_iters - 1)) and np.sqrt(
- rsnew
- ) < conv_threshold:
+ if (
+ i >= 0
+ and (i % conv_iters == 0 or i == (max_iters - 1))
+ and np.sqrt(rsnew) < conv_threshold
+ ):
converged = i
break
if verbose:
@@ -104,11 +123,18 @@ def solve(A, b, conv_iters, max_iters, conv_threshold, verbose):
beta = rsnew / rsold
p = r + beta * p
rsold = rsnew
+ total = timer.stop()
+
if converged < 0:
print("Convergence FAILURE!")
else:
print("Converged in %d iterations" % (converged))
- return x
+ if perform_check:
+ check(A, x, b)
+
+ if timing:
+ print(f"Elapsed Time: {total} ms")
+ return total
def precondition(A, N, corners):
@@ -120,10 +146,22 @@ def precondition(A, N, corners):
return M
-def preconditioned_solve(
- A, M, b, conv_iters, max_iters, conv_threshold, verbose
+def run_preconditioned_cg(
+ N,
+ corners,
+ conv_iters,
+ max_iters,
+ warmup,
+ conv_threshold,
+ perform_check,
+ timing,
+ verbose,
):
print("Solving system with preconditioner...")
+ # A, b = generate_random(N)
+ A, b = generate_2D(N, corners)
+ M = precondition(A, N, corners)
+
x = np.zeros(A.shape[1])
r = b - A.dot(x)
z = M.dot(r)
@@ -134,7 +172,11 @@ def preconditioned_solve(
max_iters = (
min(max_iters, b.shape[0]) if max_iters is not None else b.shape[0]
)
- for i in range(max_iters):
+
+ timer.start()
+ for i in range(-warmup, max_iters):
+ if i == 0:
+ timer.start()
Ap = A.dot(p)
alpha = rzold / (p.dot(Ap))
x = x + alpha * p
@@ -142,9 +184,11 @@ def preconditioned_solve(
rznew = r.dot(r)
# We only do the convergence test every conv_iters or on the
# last iteration
- if (i % conv_iters == 0 or i == (max_iters - 1)) and np.sqrt(
- rznew
- ) < conv_threshold:
+ if (
+ i >= 0
+ and (i % conv_iters == 0 or i == (max_iters - 1))
+ and np.sqrt(rznew) < conv_threshold
+ ):
converged = i
break
if verbose:
@@ -154,46 +198,15 @@ def preconditioned_solve(
beta = rznew / rzold
p = z + beta * p
rzold = rznew
+ total = timer.stop()
+
if converged < 0:
print("Convergence FAILURE!")
else:
print("Converged in %d iterations" % (converged))
- return x
-
-
-def check(A, x, b):
- print("Checking result...")
- if np.allclose(A.dot(x), b):
- print("PASS!")
- else:
- print("FAIL!")
-
-
-def run_cg(
- N,
- corners,
- preconditioner,
- conv_iters,
- max_iters,
- conv_threshold,
- perform_check,
- timing,
- verbose,
-):
- # A, b = generate_random(N)
- A, b = generate_2D(N, corners)
- start = time()
- if preconditioner:
- M = precondition(A, N, corners)
- x = preconditioned_solve(
- A, M, b, conv_iters, max_iters, conv_threshold, verbose
- )
- else:
- x = solve(A, b, conv_iters, max_iters, conv_threshold, verbose)
if perform_check:
check(A, x, b)
- stop = time()
- total = (stop - start) / 1000.0
+
if timing:
print(f"Elapsed Time: {total} ms")
return total
@@ -237,6 +250,14 @@ def run_cg(
dest="max_iters",
help="bound the maximum number of iterations",
)
+ parser.add_argument(
+ "-w",
+ "--warmup",
+ type=int,
+ default=5,
+ dest="warmup",
+ help="warm-up iterations",
+ )
parser.add_argument(
"-n",
"--num",
@@ -259,15 +280,6 @@ def run_cg(
action="store_true",
help="print verbose output",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
parser.add_argument(
"--threshold",
type=float,
@@ -275,51 +287,19 @@ def run_cg(
dest="conv_threshold",
help="convergence check threshold",
)
- parser.add_argument(
- "--package",
- dest="package",
- choices=["legate", "numpy", "cupy"],
- type=str,
- default="legate",
- help="NumPy package to use (legate, numpy, or cupy)",
- )
- parser.add_argument(
- "--cupy-allocator",
- dest="cupy_allocator",
- choices=["default", "off", "managed"],
- type=str,
- default="default",
- help="cupy allocator to use (default, off, or managed)",
- )
-
- args, _ = parser.parse_known_args()
-
- if args.package == "legate":
- import cunumeric as np
- elif args.package == "cupy":
- import cupy as np
- if args.cupy_allocator == "off":
- np.cuda.set_allocator(None)
- print("Turning off memory pool")
- elif args.cupy_allocator == "managed":
- np.cuda.set_allocator(
- np.cuda.MemoryPool(np.cuda.malloc_managed).malloc
- )
- print("Using managed memory pool")
- elif args.package == "numpy":
- import numpy as np
+ args, np, timer = parse_args(parser)
run_benchmark(
- run_cg,
+ run_preconditioned_cg if args.precondition else run_cg,
args.benchmark,
"PreCG" if args.precondition else "CG",
(
args.N,
args.corners,
- args.precondition,
args.conv_iters,
args.max_iters,
+ args.warmup,
args.conv_threshold,
args.check,
args.timing,
diff --git a/examples/einsum.py b/examples/einsum.py
index 9990c46d7..090e3385f 100644
--- a/examples/einsum.py
+++ b/examples/einsum.py
@@ -18,18 +18,10 @@
import argparse
import re
-from benchmark import run_benchmark
+from benchmark import parse_args, run_benchmark
-try:
- from legate.timing import time
-except (ImportError, RuntimeError):
- from time import perf_counter_ns
- def time():
- return perf_counter_ns() / 1000.0
-
-
-def run_einsum(expr, N, iters, dtype, cupy_compatibility):
+def run_einsum(expr, N, iters, warmup, dtype, cupy_compatibility):
# Parse contraction expression
m = re.match(r"([a-zA-Z]*),([a-zA-Z]*)->([a-zA-Z]*)", expr)
assert m is not None
@@ -90,8 +82,10 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility):
C = np.zeros((N,) * len(c_modes), dtype=dtype)
# Run contraction
- start = time()
- for _ in range(iters):
+ timer.start()
+ for idx in range(iters + warmup):
+ if idx == warmup:
+ timer.start()
if cupy_compatibility:
C = np.einsum(expr, A, B)
else:
@@ -108,10 +102,9 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility):
A, C = C, A
else:
B, C = C, B
- stop = time()
+ total = timer.stop()
# Print statistics
- total = (stop - start) / 1000.0
average = total / iters
print(f"Elapsed Time: {total:.3f} ms")
print(f"Average Iteration: {average:.3f} ms")
@@ -144,6 +137,14 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility):
dest="iters",
help="number of iterations to run",
)
+ parser.add_argument(
+ "-w",
+ "--warmup",
+ type=int,
+ default=5,
+ dest="warmup",
+ help="warm-up iterations",
+ )
parser.add_argument(
"-t",
"--dtype",
@@ -152,31 +153,6 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility):
dest="dtype",
help="dtype for array elements",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- parser.add_argument(
- "--package",
- dest="package",
- choices=["legate", "numpy", "cupy"],
- type=str,
- default="legate",
- help="NumPy package to use (legate, numpy, or cupy)",
- )
- parser.add_argument(
- "--cupy-allocator",
- dest="cupy_allocator",
- choices=["default", "off", "managed"],
- type=str,
- default="default",
- help="cupy allocator to use (default, off, or managed)",
- )
parser.add_argument(
"--cupy-compatibility",
action="store_true",
@@ -185,25 +161,9 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility):
else, use einsum(expr, A, B, out=C)""",
)
- args, _ = parser.parse_known_args()
-
- cupy_compatibility = args.cupy_compatibility
- if args.package == "legate":
- import cunumeric as np
- elif args.package == "cupy":
- import cupy as np
-
- if args.cupy_allocator == "off":
- np.cuda.set_allocator(None)
- print("Turning off memory pool")
- elif args.cupy_allocator == "managed":
- np.cuda.set_allocator(
- np.cuda.MemoryPool(np.cuda.malloc_managed).malloc
- )
- print("Using managed memory pool")
- cupy_compatibility = True
- elif args.package == "numpy":
- import numpy as np
+ args, np, timer = parse_args(parser)
+
+ cupy_compatibility = args.cupy_compatibility or args.package == "cupy"
if cupy_compatibility:
print("Use C = np.einsum(expr, A, B) for cupy compatibility")
@@ -222,6 +182,7 @@ def run_einsum(expr, N, iters, dtype, cupy_compatibility):
args.expr,
args.N,
args.iters,
+ args.warmup,
dtypes[args.dtype],
cupy_compatibility,
),
diff --git a/examples/gemm.py b/examples/gemm.py
index 409d43ece..c70a666c1 100644
--- a/examples/gemm.py
+++ b/examples/gemm.py
@@ -16,12 +16,8 @@
#
import argparse
-import datetime
-import math
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def initialize(M, N, K, ft):
@@ -39,7 +35,7 @@ def total_space(M, N, K, ft):
return (M * N + M * K + K * N) * np.dtype(ft).itemsize
-def run_gemm(N, I, ft): # noqa: E741
+def run_gemm(N, I, warmup, ft): # noqa: E741
print("Problem Size: M=" + str(N) + " N=" + str(N) + " K=" + str(N))
print("Total Iterations: " + str(I))
flops = total_flops(N, N, N)
@@ -47,25 +43,20 @@ def run_gemm(N, I, ft): # noqa: E741
space = total_space(N, N, N, ft)
print("Total Size: " + str(space / 1e6) + " MB")
A, B, C = initialize(N, N, N, ft)
- # Compute some sums and check for NaNs to force synchronization
- # before we start the timing
- assert not math.isnan(np.sum(A))
- assert not math.isnan(np.sum(B))
- assert not math.isnan(np.sum(C))
- start = datetime.datetime.now()
+
+ timer.start()
# Run for as many iterations as was requested
- for idx in range(I):
+ for idx in range(I + warmup):
+ if idx == warmup:
+ timer.start()
np.dot(A, B, out=C)
# We need to rotate the matrices to keep Legate honest
# about moving data so it can't just duplicate A and B
# on the first iteration and reuse them, this means
# that A, B, C all need to be square
A, B, C = B, C, A
- # Do another sum to synchronize for timings, B is last output
- assert not math.isnan(np.sum(B))
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ total = timer.stop()
+
print("Elapsed Time: " + str(total) + " ms")
average = total / I
print("Average GEMM: " + str(average) + " ms")
@@ -83,6 +74,14 @@ def run_gemm(N, I, ft): # noqa: E741
dest="I",
help="number of iterations to run",
)
+ parser.add_argument(
+ "-w",
+ "--warmup",
+ type=int,
+ default=5,
+ dest="warmup",
+ help="warm-up iterations",
+ )
parser.add_argument(
"-n",
"--num",
@@ -100,27 +99,29 @@ def run_gemm(N, I, ft): # noqa: E741
help="number of bits of precision to use for the gemm computation "
"(16,32,64)",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
if args.P == 16:
run_benchmark(
- run_gemm, args.benchmark, "HGEMM", (args.N, args.I, np.float16)
+ run_gemm,
+ args.benchmark,
+ "HGEMM",
+ (args.N, args.I, args.warmup, np.float16),
)
elif args.P == 32:
run_benchmark(
- run_gemm, args.benchmark, "SGEMM", (args.N, args.I, np.float32)
+ run_gemm,
+ args.benchmark,
+ "SGEMM",
+ (args.N, args.I, args.warmup, np.float32),
)
elif args.P == 64:
run_benchmark(
- run_gemm, args.benchmark, "DGEMM", (args.N, args.I, np.float64)
+ run_gemm,
+ args.benchmark,
+ "DGEMM",
+ (args.N, args.I, args.warmup, np.float64),
)
else:
raise TypeError("Precision must be one of 16, 32, or 64")
diff --git a/examples/indexing_routines.py b/examples/indexing_routines.py
index 3d275e49f..a0f15e120 100644
--- a/examples/indexing_routines.py
+++ b/examples/indexing_routines.py
@@ -15,16 +15,11 @@
# limitations under the License.
#
-from __future__ import print_function
-
import argparse
import gc
import math
-from benchmark import run_benchmark
-from legate.timing import time
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def compute_diagonal(steps, N, timing, warmup):
@@ -32,11 +27,10 @@ def compute_diagonal(steps, N, timing, warmup):
print("measuring diagonal")
for step in range(steps + warmup):
if step == warmup:
- start = time()
+ timer.start()
A2 = np.diag(A1)
A1 = np.diag(A2)
- stop = time()
- total = (stop - start) / 1000.0
+ total = timer.stop()
if timing:
space = (N * N + N) * np.dtype(int).itemsize / 1073741824
print("Total Size: " + str(space) + " GB")
@@ -57,10 +51,9 @@ def compute_choose(steps, N, timing, warmup):
C1 = np.arange(N, dtype=int) % 10
for step in range(steps + warmup):
if step == warmup:
- start = time()
+ timer.start()
C1 = np.choose(C1, A, mode="wrap")
- stop = time()
- total = (stop - start) / 1000.0
+ total = timer.stop()
if timing:
space = N * np.dtype(int).itemsize / 1073741824
print("Total Size: " + str(space) + " GB")
@@ -87,10 +80,9 @@ def compute_repeat(steps, N, timing, warmup):
print("measuring repeat")
for step in range(steps + warmup):
if step == warmup:
- start = time()
+ timer.start()
A2 = np.repeat(A2, R, axis=1)
- stop = time()
- total = (stop - start) / 1000.0
+ total = timer.stop()
if timing:
space = (N * N) * np.dtype(int).itemsize / 1073741824
print("Total Size: " + str(space) + " GB")
@@ -113,11 +105,10 @@ def compute_advanced_indexing_1d(steps, N, timing, warmup):
indx_bool = (B % 2).astype(bool)
for step in range(steps + warmup):
if step == warmup:
- start = time()
+ timer.start()
A1[indx] = 10 # 1 copy
A1[indx_bool] = 12 # 1 AI and 1 copy
- stop = time()
- total = (stop - start) / 1000.0
+ total = timer.stop()
if timing:
space = (3 * N) * np.dtype(int).itemsize / 1073741824
print("Total Size: " + str(space) + " GB")
@@ -141,12 +132,11 @@ def compute_advanced_indexing_2d(steps, N, timing, warmup):
indx2d_bool = (A2 % 2).astype(bool)
for step in range(steps + warmup):
if step == warmup:
- start = time()
+ timer.start()
A2[indx_bool, indx_bool] = 11 # one ZIP and 1 copy = N+N*N
A2[:, indx] = 12 # one ZIP and 3 copies = N+3*N*N
A2[indx2d_bool] = 13 # 1 copy and one AI task = 2* N*N
- stop = time()
- total = (stop - start) / 1000.0
+ total = timer.stop()
if timing:
space = (6 * N * N + 2 * N) * np.dtype(int).itemsize / 1073741824
print("Total Size: " + str(space) + " GB")
@@ -176,11 +166,10 @@ def compute_advanced_indexing_3d(steps, N, timing, warmup):
indx3d_bool = (A3 % 2).astype(bool)
for step in range(steps + warmup):
if step == warmup:
- start = time()
+ timer.start()
A3[indx, :, indx] = 15 # 1 ZIP and 3 copy = N+3N*N
A3[indx3d_bool] = 16 # 1 copy and 1 AI task = 2*N*N
- stop = time()
- total = (stop - start) / 1000.0
+ total = timer.stop()
if timing:
space = (5 * N * N + N) * np.dtype(int).itemsize / 1073741824
print("Total Size: " + str(space) + " GB")
@@ -264,15 +253,6 @@ def run_indexing_routines(
action="store_true",
help="print verbose output",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
parser.add_argument(
"-r",
"--routine",
@@ -281,8 +261,9 @@ def run_indexing_routines(
choices=["diagonal", "choose", "repeat", "ai1", "ai2", "ai3", "all"],
help="name of the index routine to test",
)
- args, unknown = parser.parse_known_args()
- print("Warning, unrecognized arguments: ", unknown)
+
+ args, np, timer = parse_args(parser)
+
run_benchmark(
run_indexing_routines,
args.benchmark,
diff --git a/examples/jacobi.py b/examples/jacobi.py
index 56cf8aa90..6b9e46968 100644
--- a/examples/jacobi.py
+++ b/examples/jacobi.py
@@ -18,15 +18,7 @@
import argparse
import math
-from benchmark import run_benchmark
-
-try:
- from legate.timing import time
-except (ImportError, RuntimeError):
- from time import perf_counter_ns
-
- def time():
- return perf_counter_ns() / 1000.0
+from benchmark import parse_args, run_benchmark
def generate_random(N):
@@ -40,35 +32,33 @@ def generate_random(N):
return A, b
-def solve(A, b, iters, verbose):
+def check(A, x, b):
+ print("Checking result...")
+ return np.allclose(A.dot(x), b)
+
+
+def run_jacobi(N, iters, warmup, perform_check, timing, verbose):
+ A, b = generate_random(N)
+
print("Solving system...")
x = np.zeros(A.shape[1])
d = np.diag(A)
R = A - np.diag(d)
- for i in range(iters):
- x = (b - np.dot(R, x)) / d
- return x
-
-
-def check(A, x, b):
- print("Checking result...")
- if np.allclose(A.dot(x), b):
- print("PASS!")
- else:
- print("FAIL!")
+ timer.start()
+ for i in range(iters + warmup):
+ if i == warmup:
+ timer.start()
+ x = (b - np.dot(R, x)) / d
+ total = timer.stop()
-def run_jacobi(N, iters, perform_check, timing, verbose):
- A, b = generate_random(N)
- start = time()
- x = solve(A, b, iters, verbose)
if perform_check:
- check(A, x, b)
+ assert check(A, x, b)
else:
- # Need a synchronization here for timing
- assert not math.isnan(np.sum(x))
- stop = time()
- total = (stop - start) / 1000.0
+ assert not math.isnan(
+ np.sum(x)
+ ), f"{np.count_nonzero(~np.isnan(x))} NaNs in x"
+
if timing:
print(f"Elapsed Time: {total} ms")
return total
@@ -90,6 +80,14 @@ def run_jacobi(N, iters, perform_check, timing, verbose):
dest="iters",
help="number of iterations to run",
)
+ parser.add_argument(
+ "-w",
+ "--warmup",
+ type=int,
+ default=5,
+ dest="warmup",
+ help="warm-up iterations",
+ )
parser.add_argument(
"-n",
"--num",
@@ -112,53 +110,19 @@ def run_jacobi(N, iters, perform_check, timing, verbose):
action="store_true",
help="print verbose output",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- parser.add_argument(
- "--package",
- dest="package",
- choices=["legate", "numpy", "cupy"],
- type=str,
- default="legate",
- help="NumPy package to use (legate, numpy, or cupy)",
- )
- parser.add_argument(
- "--cupy-allocator",
- dest="cupy_allocator",
- choices=["default", "off", "managed"],
- type=str,
- default="default",
- help="cupy allocator to use (default, off, or managed)",
- )
-
- args, _ = parser.parse_known_args()
-
- if args.package == "legate":
- import cunumeric as np
- elif args.package == "cupy":
- import cupy as np
- if args.cupy_allocator == "off":
- np.cuda.set_allocator(None)
- print("Turning off memory pool")
- elif args.cupy_allocator == "managed":
- np.cuda.set_allocator(
- np.cuda.MemoryPool(np.cuda.malloc_managed).malloc
- )
- print("Using managed memory pool")
- elif args.package == "numpy":
- import numpy as np
+ args, np, timer = parse_args(parser)
run_benchmark(
run_jacobi,
args.benchmark,
"Jacobi",
- (args.N, args.iters, args.check, args.timing, args.verbose),
+ (
+ args.N,
+ args.iters,
+ args.warmup,
+ args.check,
+ args.timing,
+ args.verbose,
+ ),
)
diff --git a/examples/kmeans.py b/examples/kmeans.py
index 736b7af58..a12723d94 100644
--- a/examples/kmeans.py
+++ b/examples/kmeans.py
@@ -18,11 +18,8 @@
# Derived from https://github.com/bryancatanzaro/kmeans
import argparse
-import datetime
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def initialize(N, D, C, T):
@@ -80,7 +77,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
print("Number of dimensions: " + str(D))
print("Number of centroids: " + str(C))
print("Max iterations: " + str(I))
- start = datetime.datetime.now()
+ timer.start()
data, centroids = initialize(N, D, C, T)
data_dots = np.square(np.linalg.norm(data, ord=2, axis=1))
@@ -128,9 +125,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
+ ": "
+ str(prior_distance_sum)
)
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ total = timer.stop()
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -138,7 +133,6 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- "-c",
"--centers",
type=int,
default=10,
@@ -185,16 +179,9 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
dest="S",
help="number of iterations between sampling the log likelihood",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application"
- " (default 1 - normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
if args.P == 16:
run_benchmark(
run_kmeans,
diff --git a/examples/kmeans_slow.py b/examples/kmeans_slow.py
index 8727fa7d2..a4d4c7009 100644
--- a/examples/kmeans_slow.py
+++ b/examples/kmeans_slow.py
@@ -18,11 +18,8 @@
# Derived from https://github.com/bryancatanzaro/kmeans
import argparse
-import datetime
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def initialize(N, D, C, T):
@@ -81,7 +78,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
print("Number of dimensions: " + str(D))
print("Number of centroids: " + str(C))
print("Max iterations: " + str(I))
- start = datetime.datetime.now()
+ timer.start()
data, centroids = initialize(N, D, C, T)
data_dots = np.square(np.linalg.norm(data, ord=2, axis=1))
@@ -129,9 +126,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
+ ": "
+ str(prior_distance_sum)
)
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ total = timer.stop()
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -139,7 +134,6 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- "-c",
"--centers",
type=int,
default=10,
@@ -186,16 +180,9 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
dest="S",
help="number of iterations between sampling the log likelihood",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
if args.P == 16:
run_benchmark(
run_kmeans,
diff --git a/examples/kmeans_sort.py b/examples/kmeans_sort.py
index b848b54e0..ae84ca6da 100644
--- a/examples/kmeans_sort.py
+++ b/examples/kmeans_sort.py
@@ -18,16 +18,8 @@
# Derived from https://github.com/bryancatanzaro/kmeans
import argparse
-import datetime
-from benchmark import run_benchmark
-
-import cunumeric as np
-
-try:
- xrange
-except NameError:
- xrange = range
+from benchmark import parse_args, run_benchmark
def initialize(N, D, C, T):
@@ -68,7 +60,7 @@ def find_centroids(data, labels, C, D):
# sum across them to create the centroids
centroids = np.empty((C, D), dtype=data.dtype)
ragged_arrays = np.split(sorted_points, indexes)
- for idx in xrange(C):
+ for idx in range(C):
centroids[idx, :] = np.sum(ragged_arrays[idx], axis=0)
# To avoid introducing divide by zero errors
# If a centroid has no weight, we'll do no normalization
@@ -83,7 +75,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
print("Number of dimensions: " + str(D))
print("Number of centroids: " + str(C))
print("Max iterations: " + str(I))
- start = datetime.datetime.now()
+ timer.start()
data, centroids = initialize(N, D, C, T)
data_dots = np.square(np.linalg.norm(data, ord=2, axis=1))
@@ -130,9 +122,7 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
+ ": "
+ str(prior_distance_sum)
)
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ total = timer.stop()
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -187,16 +177,9 @@ def run_kmeans(C, D, T, I, N, S, benchmarking): # noqa: E741
dest="S",
help="number of iterations between sampling the log likelihood",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
if args.P == 16:
run_benchmark(
run_kmeans,
diff --git a/examples/linreg.py b/examples/linreg.py
index bce2fff58..7ec3d11ba 100644
--- a/examples/linreg.py
+++ b/examples/linreg.py
@@ -16,12 +16,8 @@
#
import argparse
-import datetime
-import math
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def initialize(N, F, T):
@@ -32,45 +28,36 @@ def initialize(N, F, T):
return x, y
-def linear_regression(
- T, features, target, steps, learning_rate, sample, add_intercept=False
-):
- if add_intercept:
+def run_linear_regression(N, F, T, I, warmup, S, B): # noqa: E741
+ print("Running linear regression...")
+ print("Number of data points: " + str(N) + "K")
+ print("Number of features: " + str(F))
+ print("Number of iterations: " + str(I))
+
+ learning_rate = 1e-5
+ features, target = initialize(N * 1000, F, T)
+ if B:
intercept = np.ones((features.shape[0], 1), dtype=T)
features = np.hstack((intercept, features))
-
weights = np.zeros(features.shape[1], dtype=T)
- for step in range(steps):
+ timer.start()
+ for step in range(-warmup, I):
+ if step == 0:
+ timer.start()
scores = np.dot(features, weights)
error = scores - target
gradient = -(1.0 / len(features)) * error.dot(features)
weights += learning_rate * gradient
-
- if step % sample == 0:
+ if step >= 0 and step % S == 0:
print(
"Error of step "
+ str(step)
+ ": "
+ str(np.sum(np.power(error, 2)))
)
+ total = timer.stop()
- return weights
-
-
-def run_linear_regression(N, F, T, I, S, B): # noqa: E741
- print("Running linear regression...")
- print("Number of data points: " + str(N) + "K")
- print("Number of features: " + str(F))
- print("Number of iterations: " + str(I))
- start = datetime.datetime.now()
- features, target = initialize(N * 1000, F, T)
- weights = linear_regression(T, features, target, I, 1e-5, S, B)
- # Check the weights for NaNs to synchronize before stopping timing
- assert not math.isnan(np.sum(weights))
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -78,7 +65,7 @@ def run_linear_regression(N, F, T, I, S, B): # noqa: E741
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- "-b",
+ "-B",
"--intercept",
dest="B",
action="store_true",
@@ -100,6 +87,14 @@ def run_linear_regression(N, F, T, I, S, B): # noqa: E741
dest="I",
help="number of iterations to run the algorithm for",
)
+ parser.add_argument(
+ "-w",
+ "--warmup",
+ type=int,
+ default=5,
+ dest="warmup",
+ help="warm-up iterations",
+ )
parser.add_argument(
"-n",
"--num",
@@ -124,35 +119,29 @@ def run_linear_regression(N, F, T, I, S, B): # noqa: E741
dest="S",
help="number of iterations between sampling the log likelihood",
)
- parser.add_argument(
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
if args.P == 16:
run_benchmark(
run_linear_regression,
args.benchmark,
"LINREG(H)",
- (args.N, args.F, np.float16, args.I, args.S, args.B),
+ (args.N, args.F, np.float16, args.I, args.warmup, args.S, args.B),
)
elif args.P == 32:
run_benchmark(
run_linear_regression,
args.benchmark,
"LINREG(S)",
- (args.N, args.F, np.float32, args.I, args.S, args.B),
+ (args.N, args.F, np.float32, args.I, args.warmup, args.S, args.B),
)
elif args.P == 64:
run_benchmark(
run_linear_regression,
args.benchmark,
"LINREG(D)",
- (args.N, args.F, np.float64, args.I, args.S, args.B),
+ (args.N, args.F, np.float64, args.I, args.warmup, args.S, args.B),
)
else:
raise TypeError("Precision must be one of 16, 32, or 64")
diff --git a/examples/logreg.py b/examples/logreg.py
index 4e1abb209..d502e35f3 100644
--- a/examples/logreg.py
+++ b/examples/logreg.py
@@ -18,15 +18,7 @@
import argparse
import math
-from benchmark import run_benchmark
-
-try:
- from legate.timing import time
-except (ImportError, RuntimeError):
- from time import perf_counter_ns
-
- def time():
- return perf_counter_ns() / 1000.0
+from benchmark import parse_args, run_benchmark
def initialize(N, F, T):
@@ -47,46 +39,41 @@ def log_likelihood(features, target, weights):
return np.sum(target * scores - np.log(1.0 + np.exp(scores)))
-def logistic_regression(
- T, features, target, steps, learning_rate, sample, add_intercept=False
-):
- if add_intercept:
+def run_logistic_regression(N, F, T, I, warmup, S, B): # noqa: E741
+ print("Running logistic regression...")
+ print("Number of data points: " + str(N) + "K")
+ print("Number of features: " + str(F))
+ print("Number of iterations: " + str(I))
+
+ learning_rate = 1e-5
+ features, target = initialize(N * 1000, F, T)
+ if B:
intercept = np.ones((features.shape[0], 1), dtype=T)
features = np.hstack((intercept, features))
-
weights = np.zeros(features.shape[1], dtype=T)
- for step in range(steps):
+ timer.start()
+ for step in range(-warmup, I):
+ if step == 0:
+ timer.start()
scores = np.dot(features, weights)
predictions = sigmoid(scores)
-
error = target - predictions
gradient = np.dot(error, features)
weights += learning_rate * gradient
-
- if step % sample == 0:
+ if step >= 0 and step % S == 0:
print(
"Log Likelihood of step "
+ str(step)
+ ": "
+ str(log_likelihood(features, target, weights))
)
+ total = timer.stop()
- return weights
-
+ assert not math.isnan(
+ np.sum(weights)
+ ), f"{np.count_nonzero(~np.isnan(weights))} NaNs in weights"
-def run_logistic_regression(N, F, T, I, S, B): # noqa: E741
- print("Running logistic regression...")
- print("Number of data points: " + str(N) + "K")
- print("Number of features: " + str(F))
- print("Number of iterations: " + str(I))
- features, target = initialize(N * 1000, F, T)
- start = time()
- weights = logistic_regression(T, features, target, I, 1e-5, S, B)
- stop = time()
- # Check the weights for NaNs
- assert not math.isnan(np.sum(weights))
- total = (stop - start) / 1000.0
print(f"Elapsed Time: {total} ms")
return total
@@ -94,7 +81,7 @@ def run_logistic_regression(N, F, T, I, S, B): # noqa: E741
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- "-b",
+ "-B",
"--intercept",
dest="B",
action="store_true",
@@ -116,6 +103,14 @@ def run_logistic_regression(N, F, T, I, S, B): # noqa: E741
dest="I",
help="number of iterations to run the algorithm for",
)
+ parser.add_argument(
+ "-w",
+ "--warmup",
+ type=int,
+ default=5,
+ dest="warmup",
+ help="warm-up iterations",
+ )
parser.add_argument(
"-n",
"--num",
@@ -140,69 +135,29 @@ def run_logistic_regression(N, F, T, I, S, B): # noqa: E741
dest="S",
help="number of iterations between sampling the log likelihood",
)
- parser.add_argument(
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- parser.add_argument(
- "--package",
- dest="package",
- choices=["legate", "numpy", "cupy"],
- type=str,
- default="legate",
- help="NumPy package to use (legate, numpy, or cupy)",
- )
- parser.add_argument(
- "--cupy-allocator",
- dest="cupy_allocator",
- choices=["default", "off", "managed"],
- type=str,
- default="default",
- help="cupy allocator to use (default, off, or managed)",
- )
- args, _ = parser.parse_known_args()
-
- if args.package == "legate":
- import cunumeric as np
- elif args.package == "cupy":
- import cupy as np
-
- if args.cupy_allocator == "off":
- np.cuda.set_allocator(None)
- print("Turning off memory pool")
- elif args.cupy_allocator == "managed":
- np.cuda.set_allocator(
- np.cuda.MemoryPool(np.cuda.malloc_managed).malloc
- )
- print("Using managed memory pool")
- elif args.package == "numpy":
- import numpy as np
+ args, np, timer = parse_args(parser)
if args.P == 16:
run_benchmark(
run_logistic_regression,
args.benchmark,
"LOGREG(H)",
- (args.N, args.F, np.float16, args.I, args.S, args.B),
+ (args.N, args.F, np.float16, args.I, args.warmup, args.S, args.B),
)
elif args.P == 32:
run_benchmark(
run_logistic_regression,
args.benchmark,
"LOGREG(S)",
- (args.N, args.F, np.float32, args.I, args.S, args.B),
+ (args.N, args.F, np.float32, args.I, args.warmup, args.S, args.B),
)
elif args.P == 64:
run_benchmark(
run_logistic_regression,
args.benchmark,
"LOGREG(D)",
- (args.N, args.F, np.float64, args.I, args.S, args.B),
+ (args.N, args.F, np.float64, args.I, args.warmup, args.S, args.B),
)
else:
raise TypeError("Precision must be one of 16, 32, or 64")
diff --git a/examples/lstm_backward.py b/examples/lstm_backward.py
index 554dd49e8..2de702700 100644
--- a/examples/lstm_backward.py
+++ b/examples/lstm_backward.py
@@ -16,16 +16,12 @@
#
import argparse
-import datetime
-import math
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
- start = datetime.datetime.now()
+ timer.start()
WLSTM = np.random.randn(
word_size + hidden_size, 4 * hidden_size
@@ -77,13 +73,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
else:
dh0[0] += np.sum(dHin[t, :, word_size:], 0)
- # Do a little sum to synchronize and check for NaNs
- total = np.sum(dh0)
- assert not math.isnan(total)
-
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ total = timer.stop()
if timing:
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -92,7 +82,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- "-b", "--batch", type=int, default=32, dest="batch", help="batch size"
+ "-B", "--batch", type=int, default=32, dest="batch", help="batch size"
)
parser.add_argument(
"--hidden", type=int, default=10, dest="hidden", help="hidden size"
@@ -115,15 +105,9 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
action="store_true",
help="perform timing",
)
- parser.add_argument(
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
run_benchmark(
run_lstm,
args.benchmark,
diff --git a/examples/lstm_forward.py b/examples/lstm_forward.py
index dde2e7c76..097218eaf 100644
--- a/examples/lstm_forward.py
+++ b/examples/lstm_forward.py
@@ -16,16 +16,12 @@
#
import argparse
-import datetime
-import math
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
- start = datetime.datetime.now()
+ timer.start()
X = np.random.randn(sentence_length, batch_size, hidden_size)
h0 = np.random.randn(1, hidden_size)
@@ -67,13 +63,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
Ct[t] = np.tanh(C[t])
Hout[t] = IFOGf[t, :, 2 * d : 3 * d] * Ct[t]
- # Do a little sum of the outputs to synchronize and check for NaNs
- total = np.sum(Hout)
- assert not math.isnan(total)
-
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ total = timer.stop()
if timing:
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -82,7 +72,7 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
- "-b", "--batch", type=int, default=32, dest="batch", help="batch size"
+ "-B", "--batch", type=int, default=32, dest="batch", help="batch size"
)
parser.add_argument(
"--hidden", type=int, default=10, dest="hidden", help="hidden size"
@@ -105,15 +95,9 @@ def run_lstm(batch_size, hidden_size, sentence_length, word_size, timing):
action="store_true",
help="perform timing",
)
- parser.add_argument(
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
run_benchmark(
run_lstm,
args.benchmark,
diff --git a/examples/lstm_full.py b/examples/lstm_full.py
index 0a56400a1..7bab6c9c7 100644
--- a/examples/lstm_full.py
+++ b/examples/lstm_full.py
@@ -16,11 +16,8 @@
#
import argparse
-import datetime
-from benchmark import run_benchmark
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
class Param:
@@ -293,7 +290,7 @@ def run_lstm(
pointer = 0
- start = datetime.datetime.now()
+ timer.start()
for iteration in range(max_iters):
# Reset
@@ -328,9 +325,7 @@ def run_lstm(
pointer += T_steps
update_status(max_iters, smooth_loss)
- stop = datetime.datetime.now()
- delta = stop - start
- total = delta.total_seconds() * 1000.0
+ total = timer.stop()
if timing:
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -400,16 +395,9 @@ def run_lstm(
dest="weight",
help="standard deviation of weights for initialization",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
run_benchmark(
run_lstm,
args.benchmark,
diff --git a/examples/richardson_lucy.py b/examples/richardson_lucy.py
index db8a06a75..5ffcdcad8 100644
--- a/examples/richardson_lucy.py
+++ b/examples/richardson_lucy.py
@@ -15,10 +15,7 @@
import argparse
-from benchmark import run_benchmark
-from legate.timing import time
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
float_type = "float32"
@@ -31,17 +28,16 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing):
im_deconv = np.full(image.shape, 0.5, dtype=float_type)
psf_mirror = np.flip(psf)
- start = time()
+ timer.start()
for idx in range(num_iter + warmup):
if idx == warmup:
- start = time()
+ timer.start()
conv = np.convolve(im_deconv, psf, mode="same")
relative_blur = image / conv
im_deconv *= np.convolve(relative_blur, psf_mirror, mode="same")
- stop = time()
- total = (stop - start) / 1000.0
+ total = timer.stop()
if timing:
print("Elapsed Time: " + str(total) + " ms")
@@ -113,16 +109,9 @@ def run_richardson_lucy(shape, filter_shape, num_iter, warmup, timing):
action="store_true",
help="perform timing",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 "
- "- normal execution)",
- )
- args = parser.parse_args()
+
+ args, np, timer = parse_args(parser)
+
run_benchmark(
run_richardson_lucy,
args.benchmark,
diff --git a/examples/scan.py b/examples/scan.py
index 07b3621fd..d4737e54b 100644
--- a/examples/scan.py
+++ b/examples/scan.py
@@ -18,8 +18,7 @@
import argparse
import numpy as np
-from benchmark import run_benchmark
-from legate.timing import time
+from benchmark import parse_args, run_benchmark
def initialize(shape, dt, axis):
@@ -75,14 +74,12 @@ def run_scan(OP, shape, dt, ax, check):
print(f"Axis: axis={ax}")
print(f"Data type: dtype={dt}32")
A, B = initialize(shape=shape, dt=dt, axis=ax)
- start = time()
+ timer.start()
# op handling
getattr(num, OP)(A, out=B, axis=ax)
- stop = time()
- delta = stop - start
- total = delta / 1000.0
+ total = timer.stop()
print(f"Elapsed Time: {total}ms")
# error checking
if check:
@@ -131,49 +128,8 @@ def run_scan(OP, shape, dt, ax, check):
action="store_true",
help="check the result of the solve",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- parser.add_argument(
- "--package",
- dest="package",
- choices=["legate", "numpy", "cupy"],
- type=str,
- default="legate",
- help="NumPy package to use (legate, numpy, or cupy)",
- )
- parser.add_argument(
- "--cupy-allocator",
- dest="cupy_allocator",
- choices=["default", "off", "managed"],
- type=str,
- default="default",
- help="cupy allocator to use (default, off, or managed)",
- )
- args, _ = parser.parse_known_args()
-
- if args.package == "legate":
- import cunumeric as num
- elif args.package == "cupy":
- import cupy as num
-
- if args.cupy_allocator == "off":
- num.cuda.set_allocator(None)
- print("Turning off memory pool")
- elif args.cupy_allocator == "managed":
- num.cuda.set_allocator(
- num.cuda.MemoryPool(num.cuda.malloc_managed).malloc
- )
- print("Using managed memory pool")
- elif args.package == "numpy":
- import numpy as num
+ args, num, timer = parse_args(parser)
run_benchmark(
run_scan,
diff --git a/examples/solve.py b/examples/solve.py
index 5d5082dd4..91f92c6dd 100644
--- a/examples/solve.py
+++ b/examples/solve.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
# Copyright 2022 NVIDIA Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,20 +17,17 @@
import argparse
-from legate.timing import time
-
-import cunumeric as np
+from benchmark import parse_args, run_benchmark
def solve(m, n, nrhs, dtype):
a = np.random.rand(m, n).astype(dtype=dtype)
b = np.random.rand(n, nrhs).astype(dtype=dtype)
- start = time()
+ timer.start()
np.linalg.solve(a, b)
- stop = time()
+ total = timer.stop()
- total = (stop - start) / 1000.0
print(f"Elapsed Time: {total} ms")
@@ -66,5 +65,11 @@ def solve(m, n, nrhs, dtype):
dest="dtype",
help="data type",
)
- args = parser.parse_args()
- solve(args.m, args.n, args.nrhs, args.dtype)
+ args, np, timer = parse_args(parser)
+
+ run_benchmark(
+ solve,
+ args.benchmark,
+ "Solve",
+ (args.m, args.n, args.nrhs, args.dtype),
+ )
diff --git a/examples/sort.py b/examples/sort.py
index fb92d3dfb..5982f91ea 100644
--- a/examples/sort.py
+++ b/examples/sort.py
@@ -18,15 +18,7 @@
import argparse
import numpy as np
-from benchmark import run_benchmark
-
-try:
- from legate.timing import time
-except (ImportError, RuntimeError):
- from time import perf_counter_ns
-
- def time():
- return perf_counter_ns() / 1000.0
+from benchmark import parse_args, run_benchmark
def check_sorted(a, a_sorted, package, axis=-1):
@@ -81,19 +73,18 @@ def run_sort(
print("UNKNOWN type " + str(newtype))
assert False
- start = time()
+ timer.start()
if argsort:
a_sorted = num.argsort(a, axis)
else:
a_sorted = num.sort(a, axis)
- stop = time()
+ total = timer.stop()
if perform_check and not argsort:
check_sorted(a, a_sorted, package, axis)
else:
# do we need to synchronize?
assert True
- total = (stop - start) * 1e-3
if timing:
print("Elapsed Time: " + str(total) + " ms")
return total
@@ -162,49 +153,8 @@ def run_sort(
action="store_true",
help="use argsort",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 - "
- "normal execution)",
- )
- parser.add_argument(
- "--package",
- dest="package",
- choices=["legate", "numpy", "cupy"],
- type=str,
- default="legate",
- help="NumPy package to use (legate, numpy, or cupy)",
- )
- parser.add_argument(
- "--cupy-allocator",
- dest="cupy_allocator",
- choices=["default", "off", "managed"],
- type=str,
- default="default",
- help="cupy allocator to use (default, off, or managed)",
- )
-
- args, _ = parser.parse_known_args()
-
- if args.package == "legate":
- import cunumeric as num
- elif args.package == "cupy":
- import cupy as num
- if args.cupy_allocator == "off":
- num.cuda.set_allocator(None)
- print("Turning off memory pool")
- elif args.cupy_allocator == "managed":
- num.cuda.set_allocator(
- num.cuda.MemoryPool(num.cuda.malloc_managed).malloc
- )
- print("Using managed memory pool")
- elif args.package == "numpy":
- import numpy as num
+ args, num, timer = parse_args(parser)
run_benchmark(
run_sort,
diff --git a/examples/stencil.py b/examples/stencil.py
index 460cb7bde..c0d33c90b 100644
--- a/examples/stencil.py
+++ b/examples/stencil.py
@@ -16,17 +16,8 @@
#
import argparse
-import math
-from benchmark import run_benchmark
-
-try:
- from legate.timing import time
-except (ImportError, RuntimeError):
- from time import perf_counter_ns
-
- def time():
- return perf_counter_ns() / 1000.0
+from benchmark import parse_args, run_benchmark
def initialize(N):
@@ -39,30 +30,25 @@ def initialize(N):
return grid
-def run(grid, I, N): # noqa: E741
+def run_stencil(N, I, warmup, timing): # noqa: E741
+ grid = initialize(N)
+
print("Running Jacobi stencil...")
center = grid[1:-1, 1:-1]
north = grid[0:-2, 1:-1]
east = grid[1:-1, 2:]
west = grid[1:-1, 0:-2]
south = grid[2:, 1:-1]
- for i in range(I):
+
+ timer.start()
+ for i in range(I + warmup):
+ if i == warmup:
+ timer.start()
average = center + north + east + west + south
work = 0.2 * average
- # delta = np.sum(np.absolute(work - center))
center[:] = work
- total = np.sum(center)
- return total / (N**2)
+ total = timer.stop()
-
-def run_stencil(N, I, timing): # noqa: E741
- grid = initialize(N)
- start = time()
- average = run(grid, I, N)
- stop = time()
- print("Average energy is %.8g" % average)
- total = (stop - start) / 1000.0
- assert not math.isnan(average)
if timing:
print(f"Elapsed Time: {total} ms")
return total
@@ -78,6 +64,14 @@ def run_stencil(N, I, timing): # noqa: E741
dest="I",
help="number of iterations to run",
)
+ parser.add_argument(
+ "-w",
+ "--warmup",
+ type=int,
+ default=5,
+ dest="warmup",
+ help="warm-up iterations",
+ )
parser.add_argument(
"-n",
"--num",
@@ -93,50 +87,12 @@ def run_stencil(N, I, timing): # noqa: E741
action="store_true",
help="perform timing",
)
- parser.add_argument(
- "-b",
- "--benchmark",
- type=int,
- default=1,
- dest="benchmark",
- help="number of times to benchmark this application (default 1 "
- "- normal execution)",
- )
- parser.add_argument(
- "--package",
- dest="package",
- choices=["legate", "numpy", "cupy"],
- type=str,
- default="legate",
- help="NumPy package to use (legate, numpy, or cupy)",
- )
- parser.add_argument(
- "--cupy-allocator",
- dest="cupy_allocator",
- choices=["default", "off", "managed"],
- type=str,
- default="default",
- help="cupy allocator to use (default, off, or managed)",
- )
-
- args, _ = parser.parse_known_args()
-
- if args.package == "legate":
- import cunumeric as np
- elif args.package == "cupy":
- import cupy as np
- if args.cupy_allocator == "off":
- np.cuda.set_allocator(None)
- print("Turning off memory pool")
- elif args.cupy_allocator == "managed":
- np.cuda.set_allocator(
- np.cuda.MemoryPool(np.cuda.malloc_managed).malloc
- )
- print("Using managed memory pool")
- elif args.package == "numpy":
- import numpy as np
+ args, np, timer = parse_args(parser)
run_benchmark(
- run_stencil, args.benchmark, "Stencil", (args.N, args.I, args.timing)
+ run_stencil,
+ args.benchmark,
+ "Stencil",
+ (args.N, args.I, args.warmup, args.timing),
)
diff --git a/examples/wgrad.py b/examples/wgrad.py
index d95c00297..f4767f2b0 100644
--- a/examples/wgrad.py
+++ b/examples/wgrad.py
@@ -16,8 +16,8 @@
#
import argparse
-import datetime
-import math
+
+from legate.timing import time
import cunumeric as np
@@ -45,17 +45,14 @@ def cross_correlate(x, y, C, K, R, S, B, H, W):
def run_wgrad(H=256, W=256, B=32, C=256, K=32, R=5, S=5, timing=False):
- if timing:
- start = datetime.datetime.now()
+ start = time()
x, y = initialize(C, K, B, H, W)
- dw = cross_correlate(x, y, C, K, R, S, B, H, W)
- # Do a little sum over dw to sync the results
- total = np.sum(dw)
- assert not math.isnan(total)
+ _ = cross_correlate(x, y, C, K, R, S, B, H, W)
+ stop = time()
+ total = (stop - start) / 1000.0
if timing:
- stop = datetime.datetime.now()
- delta = stop - start
- print("Elapsed Time: " + str(delta.total_seconds() * 1000.0) + " ms")
+ print("Elapsed Time: " + str(total) + " ms")
+ return total
if __name__ == "__main__":
@@ -104,7 +101,7 @@ def run_wgrad(H=256, W=256, B=32, C=256, K=32, R=5, S=5, timing=False):
dest="W",
help="width of images in pixels",
)
- args = parser.parse_args()
+ args = parser.parse_args()
run_wgrad(
args.H, args.W, args.B, args.C, args.K, args.R, args.R, args.timing
)
diff --git a/install.py b/install.py
index 11d838b32..96dc13242 100755
--- a/install.py
+++ b/install.py
@@ -76,10 +76,13 @@ def __call__(self, parser, namespace, values, option_string):
setattr(namespace, self.dest, not option_string.startswith("--no"))
-def execute_command(args, verbose, **kwargs):
+def execute_command(args, verbose, ignore_errors=False, **kwargs):
if verbose:
print('Executing: "', " ".join(args), '" with ', kwargs)
- subprocess.check_call(args, **kwargs)
+ if ignore_errors:
+ subprocess.call(args, **kwargs)
+ else:
+ subprocess.check_call(args, **kwargs)
def scikit_build_cmake_build_dir(skbuild_dir):
@@ -139,10 +142,6 @@ def install_cunumeric(
gasnet_dir,
networks,
hdf,
- install_dir,
- legate_branch,
- legate_dir,
- legate_url,
llvm,
march,
maxdim,
@@ -187,10 +186,6 @@ def install_cunumeric(
print("gasnet_dir: ", gasnet_dir)
print("networks: ", networks)
print("hdf: ", hdf)
- print("install_dir: ", install_dir)
- print("legate_branch: ", legate_branch)
- print("legate_dir: ", legate_dir)
- print("legate_url: ", legate_url)
print("llvm: ", llvm)
print("march: ", march)
print("maxdim: ", maxdim)
@@ -226,20 +221,21 @@ def validate_path(path):
cuda_dir = validate_path(cuda_dir)
nccl_dir = validate_path(nccl_dir)
tblis_dir = validate_path(tblis_dir)
- legate_dir = validate_path(legate_dir)
thrust_dir = validate_path(thrust_dir)
curand_dir = validate_path(curand_dir)
gasnet_dir = validate_path(gasnet_dir)
cutensor_dir = validate_path(cutensor_dir)
openblas_dir = validate_path(openblas_dir)
- if legate_dir is None:
- try:
- import legate.install_info as lg_install_info
+ try:
+ import legate.install_info as lg_install_info
+ except ImportError:
+ raise RuntimeError(
+ "Cannot determine Legate install directory. Please make sure "
+ "legate.core is installed in the current Python environment."
+ )
- legate_dir = dirname(lg_install_info.libpath)
- except Exception:
- pass
+ legate_dir = dirname(lg_install_info.libpath)
if verbose:
print("cuda_dir: ", cuda_dir)
@@ -261,6 +257,29 @@ def validate_path(path):
print("Performing a clean build to accommodate build isolation.")
clean_first = True
+ cmd_env = dict(os.environ.items())
+
+ # Explicitly uninstall cunumeric if doing a clean/isolated build.
+ #
+ # A prior installation may have built and installed cunumeric C++
+ # dependencies (like BLAS or tblis).
+ #
+ # CMake will find and use them for the current build, which would normally
+ # be correct, but pip uninstalls files from any existing installation as
+ # the last step of the install process, including the libraries found by
+ # CMake during the current build.
+ #
+ # Therefore this uninstall step must occur *before* CMake attempts to find
+ # these dependencies, triggering CMake to build and install them again.
+ if clean_first or (build_isolation and not editable):
+ execute_command(
+ [sys.executable, "-m", "pip", "uninstall", "-y", "cunumeric"],
+ verbose,
+ ignore_errors=True,
+ cwd=cunumeric_dir,
+ env=cmd_env,
+ )
+
if clean_first:
shutil.rmtree(skbuild_dir, ignore_errors=True)
shutil.rmtree(join(cunumeric_dir, "dist"), ignore_errors=True)
@@ -272,7 +291,8 @@ def validate_path(path):
# Configure and build cuNumeric via setup.py
pip_install_cmd = [sys.executable, "-m", "pip", "install"]
- cmd_env = dict(os.environ.items())
+
+ install_dir = None
if unknown is not None:
try:
@@ -301,14 +321,15 @@ def validate_path(path):
pip_install_cmd += ["--no-deps", "--no-build-isolation"]
pip_install_cmd += ["--upgrade"]
+ if unknown is not None:
+ pip_install_cmd += unknown
+
pip_install_cmd += ["."]
if verbose:
pip_install_cmd += ["-vv"]
- cmake_flags = []
-
- if cmake_generator:
- cmake_flags += [f"-G{cmake_generator}"]
+ # Also use preexisting CMAKE_ARGS from conda if set
+ cmake_flags = cmd_env.get("CMAKE_ARGS", "").split(" ")
if debug or verbose:
cmake_flags += ["--log-level=%s" % ("DEBUG" if debug else "VERBOSE")]
@@ -350,18 +371,22 @@ def validate_path(path):
# A custom path to cuRAND is ignored when CUDA support is available
if cuda and curand_dir is not None:
cmake_flags += ["-Dcunumeric_cuRAND_INCLUDE_DIR=%s" % curand_dir]
- if legate_dir:
- cmake_flags += ["-Dlegate_core_ROOT=%s" % legate_dir]
- if legate_url:
- cmake_flags += ["-Dcunumeric_LEGATE_CORE_REPOSITORY=%s" % legate_url]
- if legate_branch:
- cmake_flags += ["-Dcunumeric_LEGATE_CORE_BRANCH=%s" % legate_branch]
+
+ cmake_flags += ["-Dlegate_core_ROOT=%s" % legate_dir]
cmake_flags += extra_flags
+ build_flags = [f"-j{str(thread_count)}"]
+ if verbose:
+ if cmake_generator == "Unix Makefiles":
+ build_flags += ["VERBOSE=1"]
+ else:
+ build_flags += ["--verbose"]
+
cmd_env.update(
{
- "SKBUILD_BUILD_OPTIONS": f"-j{str(thread_count)}",
- "SKBUILD_CONFIGURE_OPTIONS": "\n".join(cmake_flags),
+ "CMAKE_ARGS": " ".join(cmake_flags),
+ "CMAKE_GENERATOR": cmake_generator,
+ "SKBUILD_BUILD_OPTIONS": " ".join(build_flags),
}
)
@@ -370,14 +395,6 @@ def validate_path(path):
def driver():
parser = argparse.ArgumentParser(description="Install cuNumeric.")
- parser.add_argument(
- "--install-dir",
- dest="install_dir",
- metavar="DIR",
- required=False,
- default=None,
- help="Path to install cuNumeric software",
- )
parser.add_argument(
"--debug",
dest="debug",
@@ -434,28 +451,6 @@ def driver():
default=os.environ.get("GASNET"),
help="Path to GASNet installation directory.",
)
- parser.add_argument(
- "--with-core",
- dest="legate_dir",
- metavar="DIR",
- required=False,
- default=os.environ.get("LEGATE_DIR"),
- help="Path to Legate Core installation directory.",
- )
- parser.add_argument(
- "--legate-url",
- dest="legate_url",
- required=False,
- default="https://github.com/nv-legate/legate.core.git",
- help="Legate git URL to build cuNumeric with.",
- )
- parser.add_argument(
- "--legate-branch",
- dest="legate_branch",
- required=False,
- default="branch-22.10",
- help="Legate branch to build cuNumeric with.",
- )
parser.add_argument(
"--with-openblas",
dest="openblas_dir",
@@ -520,8 +515,11 @@ def driver():
"--cmake-generator",
dest="cmake_generator",
required=False,
- default="Ninja",
- choices=["Ninja", "Unix Makefiles"],
+ default=os.environ.get(
+ "CMAKE_GENERATOR",
+ "Unix Makefiles" if shutil.which("ninja") is None else "Ninja",
+ ),
+ choices=["Ninja", "Unix Makefiles", None],
help="The CMake makefiles generator",
)
parser.add_argument(
@@ -556,7 +554,7 @@ def driver():
"--march",
dest="march",
required=False,
- default="native",
+ default=("haswell" if platform.machine() == "x86_64" else "native"),
help="Specify the target CPU architecture.",
)
parser.add_argument(
diff --git a/pyproject.toml b/pyproject.toml
index 22727eb03..73ebc13c8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -18,7 +18,7 @@ requires = [
"ninja",
"setuptools",
"scikit-build>=0.13.1",
- "cmake>=3.22.1,!=3.23.0",
+ "cmake>=3.22.1,!=3.23.0,!=3.25.0",
]
[tool.pytest.ini_options]
@@ -75,7 +75,6 @@ warn_no_return = true
warn_return_any = false
warn_unreachable = true
-show_none_errors = true
ignore_errors = false
allow_untyped_globals = false
diff --git a/scripts/build-install.sh b/scripts/build-install.sh
index 4d9bdbfc8..af0f8429d 100755
--- a/scripts/build-install.sh
+++ b/scripts/build-install.sh
@@ -13,10 +13,10 @@ source ./scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh
rm -rf ./{build,_skbuild,dist,cunumeric.egg-info}
# Define CMake configuration arguments
-cmake_args=
+cmake_args="${CMAKE_ARGS:-}"
# Use ninja-build if installed
-if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi
+if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi
# Add other build options here as desired
cmake_args+="
@@ -29,7 +29,7 @@ ninja_args="-j$(nproc --ignore=2)"
# Build cunumeric + cunumeric_python and install into the current Python environment
SKBUILD_BUILD_OPTIONS="$ninja_args" \
-SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \
+CMAKE_ARGS="$cmake_args" \
python -m pip install \
--root / --prefix "$CONDA_PREFIX" \
--no-deps --no-build-isolation \
diff --git a/scripts/build-no-install.sh b/scripts/build-no-install.sh
index 623ca788d..1237d1a5a 100755
--- a/scripts/build-no-install.sh
+++ b/scripts/build-no-install.sh
@@ -11,10 +11,10 @@ source ./scripts/util/compiler-flags.sh
rm -rf ./{build,_skbuild,dist,cunumeric.egg-info}
# Define CMake configuration arguments
-cmake_args=
+cmake_args="${CMAKE_ARGS:-}"
# Use ninja-build if installed
-if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi
+if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi
# Add other build options here as desired
cmake_args+="
@@ -27,7 +27,7 @@ ninja_args="-j$(nproc --ignore=2)"
# Build legion_core + legion_core_python and perform an "editable" install
SKBUILD_BUILD_OPTIONS="$ninja_args" \
-SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \
+CMAKE_ARGS="$cmake_args" \
SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \
python -m pip install \
--root / --prefix "$CONDA_PREFIX" \
diff --git a/scripts/build-separately-no-install.sh b/scripts/build-separately-no-install.sh
index b9de045b4..be31507ee 100644
--- a/scripts/build-separately-no-install.sh
+++ b/scripts/build-separately-no-install.sh
@@ -11,10 +11,10 @@ source ./scripts/util/compiler-flags.sh
rm -rf ./{build,_skbuild,dist,cunumeric.egg-info}
# Define CMake configuration arguments
-cmake_args=
+cmake_args="${CMAKE_ARGS:-}"
# Use ninja-build if installed
-if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi
+if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi
# Add other build options here as desired
cmake_args+="
@@ -44,7 +44,7 @@ cmake_args+="
# Build legion_core_python and perform an "editable" install
SKBUILD_BUILD_OPTIONS="$ninja_args" \
-SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \
+CMAKE_ARGS="$cmake_args" \
SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \
python -m pip install \
--root / --prefix "$CONDA_PREFIX" \
diff --git a/scripts/build-with-legate-no-install.sh b/scripts/build-with-legate-no-install.sh
index ad1da812a..9d83010b7 100644
--- a/scripts/build-with-legate-no-install.sh
+++ b/scripts/build-with-legate-no-install.sh
@@ -13,10 +13,10 @@ source ./scripts/util/read-legate-core-root.sh "$0"
rm -rf ./{build,_skbuild,dist,cunumeric.egg-info}
# Define CMake configuration arguments
-cmake_args=
+cmake_args="${CMAKE_ARGS:-}"
# Use ninja-build if installed
-if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi
+if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi
# Add other build options here as desired
cmake_args+="
@@ -28,7 +28,7 @@ ninja_args="-j$(nproc --ignore=2)"
# Build legion_core + legion_core_python and perform an "editable" install
SKBUILD_BUILD_OPTIONS="$ninja_args" \
-SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \
+CMAKE_ARGS="$cmake_args" \
SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \
python -m pip install \
--root / --prefix "$CONDA_PREFIX" \
diff --git a/scripts/build-with-legate-separately-no-install.sh b/scripts/build-with-legate-separately-no-install.sh
index c04e7f9ed..74cc277a0 100755
--- a/scripts/build-with-legate-separately-no-install.sh
+++ b/scripts/build-with-legate-separately-no-install.sh
@@ -13,10 +13,10 @@ source ./scripts/util/read-legate-core-root.sh "$0"
rm -rf ./{build,_skbuild,dist,cunumeric.egg-info}
# Define CMake configuration arguments
-cmake_args=
+cmake_args="${CMAKE_ARGS:-}"
# Use ninja-build if installed
-if [[ -n "$(which ninja)" ]]; then cmake_args+="-GNinja"; fi
+if [[ -n "$(which ninja)" ]]; then cmake_args+=" -GNinja"; fi
# Add other build options here as desired
cmake_args+="
@@ -45,7 +45,7 @@ cmake_args+="
# Build legion_core_python and perform an "editable" install
SKBUILD_BUILD_OPTIONS="$ninja_args" \
-SKBUILD_CONFIGURE_OPTIONS="$cmake_args" \
+CMAKE_ARGS="$cmake_args" \
SETUPTOOLS_ENABLE_FEATURES="legacy-editable" \
python -m pip install \
--root / --prefix "$CONDA_PREFIX" \
diff --git a/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh b/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh
index a759dd37f..4f37467bb 100755
--- a/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh
+++ b/scripts/util/uninstall-global-legion-legate-core-and-cunumeric.sh
@@ -1,10 +1,10 @@
#! /usr/bin/env bash
-rm -rf $(find "$CONDA_PREFIX/lib" -type d -name '*cunumeric*') \
- $(find "$CONDA_PREFIX/lib" -type f -name 'libcunumeric*') \
- $(find "$CONDA_PREFIX/lib" -type f -name 'cunumeric.egg-link') \
- $(find "$CONDA_PREFIX/include" -type f -name 'tci.h') \
- $(find "$CONDA_PREFIX/include" -type d -name 'tci') \
- $(find "$CONDA_PREFIX/include" -type d -name 'tblis') \
- $(find "$CONDA_PREFIX/include" -type d -name 'cunumeric') \
+rm -rf $(find "$CONDA_PREFIX/lib" -mindepth 1 -type d -name '*cunumeric*') \
+ $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'libcunumeric*') \
+ $(find "$CONDA_PREFIX/lib" -mindepth 1 -type f -name 'cunumeric.egg-link') \
+ $(find "$CONDA_PREFIX/include" -mindepth 1 -type f -name 'tci.h') \
+ $(find "$CONDA_PREFIX/include" -mindepth 1 -type d -name 'tci') \
+ $(find "$CONDA_PREFIX/include" -mindepth 1 -type d -name 'tblis') \
+ $(find "$CONDA_PREFIX/include" -mindepth 1 -type d -name 'cunumeric') \
;
diff --git a/src/Makefile b/src/Makefile
deleted file mode 100644
index 76ecd56d8..000000000
--- a/src/Makefile
+++ /dev/null
@@ -1,87 +0,0 @@
-# Copyright 2021-2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-ifndef LEGATE_DIR
-$(error LEGATE_DIR variable is not defined, aborting build)
-endif
-ifndef OPENBLAS_PATH
-$(error OPENBLAS_PATH variable is not defined, aborting build)
-endif
-ifndef OPENBLAS_LIBNAME
-$(error OPENBLAS_PATH variable is not defined, aborting build)
-endif
-ifndef TBLIS_PATH
-$(error TBLIS_PATH variable is not defined, aborting build)
-endif
-ifeq ($(strip $(USE_CUDA)),1)
-ifndef CUTENSOR_PATH
-$(error CUTENSOR_PATH variable is not defined, aborting build)
-endif
-ifndef NCCL_PATH
-$(error NCCL_PATH variable is not defined, aborting build)
-endif
-endif # ifeq ($(strip $(USE_CUDA)),1)
-ifndef THRUST_PATH
-$(error THRUST_PATH variable is not defined, aborting build)
-endif
-
-include $(LEGATE_DIR)/share/legate/config.mk
-
-LIBNAME = libcunumeric
-
-CURAND_PATH ?=
-
-CC_FLAGS ?=
-CC_FLAGS += -I. -I$(OPENBLAS_PATH)/include -I$(TBLIS_PATH)/include -I$(THRUST_PATH)
-CC_FLAGS += -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_OMP
-
-ifdef CURAND_PATH
-BUILD_CURAND_TASKS = 1
-CC_FLAGS += -I$(CURAND_PATH)/include -DCUNUMERIC_CURAND_FOR_CPU_BUILD
-else
-ifeq ($(strip $(USE_CUDA)),1)
-BUILD_CURAND_TASKS = 1
-else
-BUILD_CURAND_TASKS = 0
-endif
-endif
-
-LD_FLAGS ?=
-LD_FLAGS += -L$(OPENBLAS_PATH)/lib -l$(OPENBLAS_LIBNAME) -Wl,-rpath,$(OPENBLAS_PATH)/lib
-LD_FLAGS += -L$(TBLIS_PATH)/lib -ltblis -Wl,-rpath,$(TBLIS_PATH)/lib
-ifeq ($(strip $(USE_CUDA)),1)
-LD_FLAGS += -lcublas -lcusolver -lcufft
-LD_FLAGS += -L$(CUTENSOR_PATH)/lib -lcutensor -Wl,-rpath,$(CUTENSOR_PATH)/lib
-LD_FLAGS += -L$(NCCL_PATH)/lib -lnccl -Wl,-rpath,$(NCCL_PATH)/lib
-endif
-NVCC_FLAGS ?=
-NVCC_FLAGS += -I. -I$(THRUST_PATH) -I$(CUTENSOR_PATH)/include -I$(NCCL_PATH)/include -Wno-deprecated-declarations
-
-ifeq ($(strip $(DEBUG)),1)
-CC_FLAGS += -DDEBUG_CUNUMERIC
-NVCC_FLAGS += -DDEBUG_CUNUMERIC
-endif
-
-CHECK_BOUNDS ?= 0
-ifeq ($(strip $(CHECK_BOUNDS)),1)
-CC_FLAGS += -DBOUNDS_CHECKS
-endif
-
-GEN_CPU_SRC =
-GEN_GPU_SRC =
-
-include cunumeric.mk
-
-include $(LEGATE_DIR)/share/legate/legate.mk
diff --git a/src/cunumeric.mk b/src/cunumeric.mk
deleted file mode 100644
index 1b7f17080..000000000
--- a/src/cunumeric.mk
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright 2021-2022 NVIDIA Corporation
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-# List all the application source files that need OpenMP separately
-# since we have to add the -fopenmp flag to CC_FLAGS for them
-GEN_CPU_SRC += cunumeric/ternary/where.cc \
- cunumeric/scan/scan_global.cc \
- cunumeric/scan/scan_local.cc \
- cunumeric/binary/binary_op.cc \
- cunumeric/binary/binary_red.cc \
- cunumeric/bits/packbits.cc \
- cunumeric/bits/unpackbits.cc \
- cunumeric/unary/scalar_unary_red.cc \
- cunumeric/unary/unary_op.cc \
- cunumeric/unary/unary_red.cc \
- cunumeric/unary/convert.cc \
- cunumeric/nullary/arange.cc \
- cunumeric/nullary/eye.cc \
- cunumeric/nullary/fill.cc \
- cunumeric/nullary/window.cc \
- cunumeric/index/advanced_indexing.cc \
- cunumeric/index/choose.cc \
- cunumeric/index/repeat.cc \
- cunumeric/index/wrap.cc \
- cunumeric/index/zip.cc \
- cunumeric/item/read.cc \
- cunumeric/item/write.cc \
- cunumeric/matrix/contract.cc \
- cunumeric/matrix/diag.cc \
- cunumeric/matrix/gemm.cc \
- cunumeric/matrix/matmul.cc \
- cunumeric/matrix/matvecmul.cc \
- cunumeric/matrix/dot.cc \
- cunumeric/matrix/potrf.cc \
- cunumeric/matrix/solve.cc \
- cunumeric/matrix/syrk.cc \
- cunumeric/matrix/tile.cc \
- cunumeric/matrix/transpose.cc \
- cunumeric/matrix/trilu.cc \
- cunumeric/matrix/trsm.cc \
- cunumeric/matrix/util.cc \
- cunumeric/random/rand.cc \
- cunumeric/search/argwhere.cc \
- cunumeric/search/nonzero.cc \
- cunumeric/set/unique.cc \
- cunumeric/set/unique_reduce.cc \
- cunumeric/stat/bincount.cc \
- cunumeric/convolution/convolve.cc \
- cunumeric/transform/flip.cc \
- cunumeric/arg.cc \
- cunumeric/mapper.cc
-
-GEN_CPU_SRC += cunumeric/cephes/chbevl.cc \
- cunumeric/cephes/i0.cc
-
-ifeq ($(strip $(USE_OPENMP)),1)
-GEN_CPU_SRC += cunumeric/ternary/where_omp.cc \
- cunumeric/scan/scan_global_omp.cc \
- cunumeric/scan/scan_local_omp.cc \
- cunumeric/binary/binary_op_omp.cc \
- cunumeric/binary/binary_red_omp.cc \
- cunumeric/bits/packbits_omp.cc \
- cunumeric/bits/unpackbits_omp.cc \
- cunumeric/unary/unary_op_omp.cc \
- cunumeric/unary/scalar_unary_red_omp.cc \
- cunumeric/unary/unary_red_omp.cc \
- cunumeric/unary/convert_omp.cc \
- cunumeric/nullary/arange_omp.cc \
- cunumeric/nullary/eye_omp.cc \
- cunumeric/nullary/fill_omp.cc \
- cunumeric/nullary/window_omp.cc \
- cunumeric/index/advanced_indexing_omp.cc\
- cunumeric/index/choose_omp.cc \
- cunumeric/index/repeat_omp.cc \
- cunumeric/index/wrap_omp.cc \
- cunumeric/index/zip_omp.cc \
- cunumeric/matrix/contract_omp.cc \
- cunumeric/matrix/diag_omp.cc \
- cunumeric/matrix/gemm_omp.cc \
- cunumeric/matrix/matmul_omp.cc \
- cunumeric/matrix/matvecmul_omp.cc \
- cunumeric/matrix/dot_omp.cc \
- cunumeric/matrix/potrf_omp.cc \
- cunumeric/matrix/solve_omp.cc \
- cunumeric/matrix/syrk_omp.cc \
- cunumeric/matrix/tile_omp.cc \
- cunumeric/matrix/transpose_omp.cc \
- cunumeric/matrix/trilu_omp.cc \
- cunumeric/matrix/trsm_omp.cc \
- cunumeric/matrix/util_omp.cc \
- cunumeric/random/rand_omp.cc \
- cunumeric/search/argwhere_omp.cc \
- cunumeric/search/nonzero_omp.cc \
- cunumeric/set/unique_omp.cc \
- cunumeric/stat/bincount_omp.cc \
- cunumeric/convolution/convolve_omp.cc \
- cunumeric/transform/flip_omp.cc
-endif
-
-GEN_GPU_SRC += cunumeric/ternary/where.cu \
- cunumeric/scan/scan_global.cu \
- cunumeric/scan/scan_local.cu \
- cunumeric/binary/binary_op.cu \
- cunumeric/binary/binary_red.cu \
- cunumeric/bits/packbits.cu \
- cunumeric/bits/unpackbits.cu \
- cunumeric/unary/scalar_unary_red.cu \
- cunumeric/unary/unary_red.cu \
- cunumeric/unary/unary_op.cu \
- cunumeric/unary/convert.cu \
- cunumeric/nullary/arange.cu \
- cunumeric/nullary/eye.cu \
- cunumeric/nullary/fill.cu \
- cunumeric/nullary/window.cu \
- cunumeric/index/advanced_indexing.cu \
- cunumeric/index/choose.cu \
- cunumeric/index/repeat.cu \
- cunumeric/index/wrap.cu \
- cunumeric/index/zip.cu \
- cunumeric/item/read.cu \
- cunumeric/item/write.cu \
- cunumeric/matrix/contract.cu \
- cunumeric/matrix/diag.cu \
- cunumeric/matrix/gemm.cu \
- cunumeric/matrix/matmul.cu \
- cunumeric/matrix/matvecmul.cu \
- cunumeric/matrix/dot.cu \
- cunumeric/matrix/potrf.cu \
- cunumeric/matrix/solve.cu \
- cunumeric/matrix/syrk.cu \
- cunumeric/matrix/tile.cu \
- cunumeric/matrix/transpose.cu \
- cunumeric/matrix/trilu.cu \
- cunumeric/matrix/trsm.cu \
- cunumeric/random/rand.cu \
- cunumeric/search/argwhere.cu \
- cunumeric/search/nonzero.cu \
- cunumeric/set/unique.cu \
- cunumeric/stat/bincount.cu \
- cunumeric/convolution/convolve.cu \
- cunumeric/fft/fft.cu \
- cunumeric/transform/flip.cu \
- cunumeric/cudalibs.cu \
- cunumeric/cunumeric.cu
-
-include cunumeric/sort/sort.mk
-
-ifeq ($(strip $(BUILD_CURAND_TASKS)),1)
-include cunumeric/random/random.mk
-endif
-
-GEN_CPU_SRC += cunumeric/cunumeric.cc # This must always be the last file!
- # It guarantees we do our registration callback
- # only after all task variants are recorded
diff --git a/src/cunumeric/binary/binary_op_util.h b/src/cunumeric/binary/binary_op_util.h
index 6d1375e13..a4c1538ec 100644
--- a/src/cunumeric/binary/binary_op_util.h
+++ b/src/cunumeric/binary/binary_op_util.h
@@ -311,7 +311,7 @@ template
struct BinaryOp {
using T = legate::legate_type_of;
static constexpr bool valid =
- not(CODE == legate::LegateTypeCode::BOOL_LT or legate::is_complex::value);
+ not(CODE == legate::LegateTypeCode::BOOL_LT or legate::is_complex::value);
BinaryOp(const std::vector& args) {}
template ::value>* = nullptr>
@@ -459,7 +459,7 @@ struct BinaryOp {
atol_ = args[1].scalar();
}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const T& a, const T& b) const
{
using std::fabs;
@@ -469,7 +469,7 @@ struct BinaryOp {
atol_ + rtol_ * static_cast(fabs(b));
}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const T& a, const T& b) const
{
return static_cast(abs(a - b)) <= atol_ + rtol_ * static_cast(abs(b));
@@ -606,13 +606,13 @@ struct BinaryOp {
static constexpr bool valid = true;
BinaryOp(const std::vector& args) {}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const _T& a, const _T& b) const
{
return static_cast(a.real()) && static_cast(b.real());
}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const _T& a, const _T& b) const
{
return static_cast(a) && static_cast(b);
@@ -626,13 +626,13 @@ struct BinaryOp {
BinaryOp(const std::vector& args) {}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const _T& a, const _T& b) const
{
return static_cast(a.real()) || static_cast(b.real());
}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const _T& a, const _T& b) const
{
return static_cast(a) || static_cast(b);
@@ -645,13 +645,13 @@ struct BinaryOp {
static constexpr bool valid = true;
BinaryOp(const std::vector& args) {}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const _T& a, const _T& b) const
{
return static_cast(a.real()) != static_cast(b.real());
}
- template ::value>* = nullptr>
+ template ::value>* = nullptr>
constexpr bool operator()(const _T& a, const _T& b) const
{
return static_cast(a) != static_cast(b);
diff --git a/src/cunumeric/cunumeric.cc b/src/cunumeric/cunumeric.cc
index bf1ef7657..e8f87bbf6 100644
--- a/src/cunumeric/cunumeric.cc
+++ b/src/cunumeric/cunumeric.cc
@@ -25,9 +25,6 @@ namespace cunumeric {
static const char* const cunumeric_library_name = "cunumeric";
-/*static*/ bool CuNumeric::has_numamem = false;
-/*static*/ MapperID CuNumeric::mapper_id = -1;
-
/*static*/ LegateTaskRegistrar& CuNumeric::get_registrar()
{
static LegateTaskRegistrar registrar;
@@ -60,7 +57,6 @@ void registration_callback(Machine machine,
#endif
// Now we can register our mapper with the runtime
- CuNumeric::mapper_id = context.get_mapper_id(0);
context.register_mapper(new CuNumericMapper(runtime, machine, context), 0);
}
@@ -74,12 +70,6 @@ void cunumeric_perform_registration(void)
// in before the runtime starts and make it global so that we know
// that this call back is invoked everywhere across all nodes
Runtime::perform_registration_callback(cunumeric::registration_callback, true /*global*/);
-
- Runtime* runtime = Runtime::get_runtime();
- Context ctx = Runtime::get_context();
- Future fut = runtime->select_tunable_value(
- ctx, CUNUMERIC_TUNABLE_HAS_NUMAMEM, cunumeric::CuNumeric::mapper_id);
- if (fut.get_result() != 0) cunumeric::CuNumeric::has_numamem = true;
}
bool cunumeric_has_curand()
diff --git a/src/cunumeric/cunumeric.h b/src/cunumeric/cunumeric.h
index 32af7e6b7..11c4cd990 100644
--- a/src/cunumeric/cunumeric.h
+++ b/src/cunumeric/cunumeric.h
@@ -37,10 +37,6 @@ struct CuNumeric {
get_registrar().record_variant(std::forward<Args>(args)...);
}
static legate::LegateTaskRegistrar& get_registrar();
-
- public:
- static bool has_numamem;
- static Legion::MapperID mapper_id;
};
template
diff --git a/src/cunumeric/cunumeric_c.h b/src/cunumeric/cunumeric_c.h
index 60d6e108d..724db0013 100644
--- a/src/cunumeric/cunumeric_c.h
+++ b/src/cunumeric/cunumeric_c.h
@@ -52,6 +52,7 @@ enum CuNumericOpCode {
CUNUMERIC_NONZERO,
CUNUMERIC_PACKBITS,
CUNUMERIC_POTRF,
+ CUNUMERIC_PUTMASK,
CUNUMERIC_RAND,
CUNUMERIC_READ,
CUNUMERIC_REPEAT,
@@ -205,7 +206,6 @@ enum CuNumericTunable {
CUNUMERIC_TUNABLE_NUM_GPUS = 1,
CUNUMERIC_TUNABLE_NUM_PROCS = 2,
CUNUMERIC_TUNABLE_MAX_EAGER_VOLUME = 3,
- CUNUMERIC_TUNABLE_HAS_NUMAMEM = 4,
};
enum CuNumericBounds {
diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop.cuh b/src/cunumeric/execution_policy/indexing/parallel_loop.cuh
new file mode 100644
index 000000000..81788908f
--- /dev/null
+++ b/src/cunumeric/execution_policy/indexing/parallel_loop.cuh
@@ -0,0 +1,51 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "cunumeric/cunumeric.h"
+#include "cunumeric/execution_policy/indexing/parallel_loop.h"
+#include "cunumeric/cuda_help.h"
+
+namespace cunumeric {
+
+template <typename KERNEL, typename Tag>
+static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+ parallel_loop_kernel(const size_t volume, KERNEL kernel, Tag tag)
+{
+ const size_t idx = global_tid_1d();
+ if (idx >= volume) return;
+ kernel(idx, tag);
+}
+
+template <typename Tag>
+struct ParallelLoopPolicy<VariantKind::GPU, Tag> {
+  template <typename RECT, typename KERNEL>
+ void operator()(const RECT& rect, KERNEL&& kernel)
+ {
+ const size_t volume = rect.volume();
+ if (0 == volume) return;
+ auto stream = get_cached_stream();
+ const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+    parallel_loop_kernel<<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
+      volume, std::forward<KERNEL>(kernel), Tag{});
+
+ CHECK_CUDA_STREAM(stream);
+ }
+};
+
+} // namespace cunumeric
diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop.h b/src/cunumeric/execution_policy/indexing/parallel_loop.h
new file mode 100644
index 000000000..609ed04ca
--- /dev/null
+++ b/src/cunumeric/execution_policy/indexing/parallel_loop.h
@@ -0,0 +1,36 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "cunumeric/cunumeric.h"
+
+namespace cunumeric {
+
+template <VariantKind KIND, typename Tag>
+struct ParallelLoopPolicy {};
+
+template <typename Tag>
+struct ParallelLoopPolicy<VariantKind::CPU, Tag> {
+  template <typename RECT, typename KERNEL>
+ void operator()(const RECT& rect, KERNEL&& kernel)
+ {
+ const size_t volume = rect.volume();
+ for (size_t idx = 0; idx < volume; ++idx) { kernel(idx, Tag{}); }
+ }
+};
+
+} // namespace cunumeric
diff --git a/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h b/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h
new file mode 100644
index 000000000..a89702fe3
--- /dev/null
+++ b/src/cunumeric/execution_policy/indexing/parallel_loop_omp.h
@@ -0,0 +1,38 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "cunumeric/cunumeric.h"
+#include "cunumeric/execution_policy/indexing/parallel_loop.h"
+#include "cunumeric/omp_help.h"
+
+#include <omp.h>
+
+namespace cunumeric {
+
+template <typename Tag>
+struct ParallelLoopPolicy<VariantKind::OMP, Tag> {
+  template <typename RECT, typename KERNEL>
+ void operator()(const RECT& rect, KERNEL&& kernel)
+ {
+ const size_t volume = rect.volume();
+#pragma omp for schedule(static)
+ for (size_t idx = 0; idx < volume; ++idx) { kernel(idx, Tag{}); }
+ }
+};
+
+} // namespace cunumeric
diff --git a/src/cunumeric/index/advanced_indexing.cu b/src/cunumeric/index/advanced_indexing.cu
index fde5590fd..a7d3f2f94 100644
--- a/src/cunumeric/index/advanced_indexing.cu
+++ b/src/cunumeric/index/advanced_indexing.cu
@@ -94,7 +94,7 @@ struct AdvancedIndexingImplBody {
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
- size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t);
+ size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t);
if (blocks >= MAX_REDUCTION_CTAS) {
const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
diff --git a/src/cunumeric/index/putmask.cc b/src/cunumeric/index/putmask.cc
new file mode 100644
index 000000000..595329f13
--- /dev/null
+++ b/src/cunumeric/index/putmask.cc
@@ -0,0 +1,32 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/index/putmask.h"
+#include "cunumeric/index/putmask_template.inl"
+
+namespace cunumeric {
+
+/*static*/ void PutmaskTask::cpu_variant(TaskContext& context)
+{
+  putmask_template<VariantKind::CPU>(context);
+}
+
+namespace // unnamed
+{
+static void __attribute__((constructor)) register_tasks(void) { PutmaskTask::register_variants(); }
+} // namespace
+
+} // namespace cunumeric
diff --git a/src/cunumeric/index/putmask.cu b/src/cunumeric/index/putmask.cu
new file mode 100644
index 000000000..abe94d82f
--- /dev/null
+++ b/src/cunumeric/index/putmask.cu
@@ -0,0 +1,28 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/execution_policy/indexing/parallel_loop.cuh"
+#include "cunumeric/index/putmask.h"
+#include "cunumeric/index/putmask_template.inl"
+
+namespace cunumeric {
+
+/*static*/ void PutmaskTask::gpu_variant(TaskContext& context)
+{
+  putmask_template<VariantKind::GPU>(context);
+}
+
+} // namespace cunumeric
diff --git a/src/cunumeric/index/putmask.h b/src/cunumeric/index/putmask.h
new file mode 100644
index 000000000..07a418d19
--- /dev/null
+++ b/src/cunumeric/index/putmask.h
@@ -0,0 +1,43 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "cunumeric/cunumeric.h"
+
+namespace cunumeric {
+
+struct PutmaskArgs {
+ const Array& input;
+ const Array& mask;
+ const Array& values;
+};
+
+class PutmaskTask : public CuNumericTask<PutmaskTask> {
+ public:
+ static const int TASK_ID = CUNUMERIC_PUTMASK;
+
+ public:
+ static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+ static void omp_variant(legate::TaskContext& context);
+#endif
+#ifdef LEGATE_USE_CUDA
+ static void gpu_variant(legate::TaskContext& context);
+#endif
+};
+
+} // namespace cunumeric
diff --git a/src/cunumeric/index/putmask_omp.cc b/src/cunumeric/index/putmask_omp.cc
new file mode 100644
index 000000000..8550b41cd
--- /dev/null
+++ b/src/cunumeric/index/putmask_omp.cc
@@ -0,0 +1,28 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/execution_policy/indexing/parallel_loop_omp.h"
+#include "cunumeric/index/putmask.h"
+#include "cunumeric/index/putmask_template.inl"
+
+namespace cunumeric {
+
+/*static*/ void PutmaskTask::omp_variant(TaskContext& context)
+{
+  putmask_template<VariantKind::OMP>(context);
+}
+
+} // namespace cunumeric
diff --git a/src/cunumeric/index/putmask_template.inl b/src/cunumeric/index/putmask_template.inl
new file mode 100644
index 000000000..f522198b3
--- /dev/null
+++ b/src/cunumeric/index/putmask_template.inl
@@ -0,0 +1,113 @@
+/* Copyright 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+// Useful for IDEs
+#include
+#include "cunumeric/index/putmask.h"
+#include "cunumeric/pitches.h"
+#include "cunumeric/execution_policy/indexing/parallel_loop.h"
+
+namespace cunumeric {
+
+using namespace Legion;
+using namespace legate;
+
+template <VariantKind KIND, LegateTypeCode CODE, int DIM>
+struct Putmask {
+  using T      = legate_type_of<CODE>;
+  using IN     = AccessorRW<T, DIM>;
+  using MASK   = AccessorRO<bool, DIM>;
+  using VALUES = AccessorRO<T, DIM>;
+
+ IN input;
+ T* inputptr;
+ MASK mask;
+ const bool* maskptr;
+ VALUES values;
+ const T* valptr;
+  Pitches<DIM - 1> pitches;
+  Rect<DIM> rect;
+ bool dense;
+ size_t volume;
+
+ struct DenseTag {};
+ struct SparseTag {};
+
+ // constructor:
+ Putmask(PutmaskArgs& args) : dense(false)
+ {
+    rect = args.input.shape<DIM>();
+
+    input  = args.input.read_write_accessor<T, DIM>(rect);
+    mask   = args.mask.read_accessor<bool, DIM>(rect);
+    values = args.values.read_accessor<T, DIM>(rect);
+ volume = pitches.flatten(rect);
+ if (volume == 0) return;
+#ifndef LEGION_BOUNDS_CHECKS
+ dense = input.accessor.is_dense_row_major(rect) && mask.accessor.is_dense_row_major(rect);
+ dense = dense && values.accessor.is_dense_row_major(rect);
+ if (dense) {
+ inputptr = input.ptr(rect);
+ maskptr = mask.ptr(rect);
+ valptr = values.ptr(rect);
+ }
+#endif
+ } // constructor
+
+ __CUDA_HD__ void operator()(const size_t idx, DenseTag) const noexcept
+ {
+ if (maskptr[idx]) inputptr[idx] = valptr[idx];
+ }
+
+ __CUDA_HD__ void operator()(const size_t idx, SparseTag) const noexcept
+ {
+ auto p = pitches.unflatten(idx, rect.lo);
+ if (mask[p]) input[p] = values[p];
+ }
+
+ void execute() const noexcept
+ {
+#ifndef LEGION_BOUNDS_CHECKS
+    if (dense) { return ParallelLoopPolicy<KIND, DenseTag>()(rect, *this); }
+#endif
+    return ParallelLoopPolicy<KIND, SparseTag>()(rect, *this);
+ }
+};
+
+using namespace Legion;
+using namespace legate;
+
+template <VariantKind KIND>
+struct PutmaskImpl {
+  template <LegateTypeCode CODE, int DIM>
+  void operator()(PutmaskArgs& args) const
+  {
+    Putmask<KIND, CODE, DIM> putmask(args);
+    putmask.execute();
+  }
+};
+
+template <VariantKind KIND>
+static void putmask_template(TaskContext& context)
+{
+ auto& inputs = context.inputs();
+ PutmaskArgs args{context.outputs()[0], inputs[1], inputs[2]};
+ double_dispatch(args.input.dim(), args.input.code(), PutmaskImpl{}, args);
+}
+
+} // namespace cunumeric
diff --git a/src/cunumeric/index/repeat.cu b/src/cunumeric/index/repeat.cu
index 30f0c2aff..1b658874a 100644
--- a/src/cunumeric/index/repeat.cu
+++ b/src/cunumeric/index/repeat.cu
@@ -139,7 +139,7 @@ struct RepeatImplBody {
DeviceScalarReductionBuffer> sum(stream);
const size_t blocks_count = (extent + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
- const size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(int64_t);
+ const size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(uint64_t);
if (blocks_count > MAX_REDUCTION_CTAS) {
const size_t iters = (blocks_count + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
diff --git a/src/cunumeric/index/repeat_omp.cc b/src/cunumeric/index/repeat_omp.cc
index 823a1a16a..9344452d1 100644
--- a/src/cunumeric/index/repeat_omp.cc
+++ b/src/cunumeric/index/repeat_omp.cc
@@ -62,9 +62,8 @@ struct RepeatImplBody {
const int32_t axis,
const Rect& in_rect) const
{
- auto kind = CuNumeric::has_numamem ? Memory::Kind::SOCKET_MEM : Memory::Kind::SYSTEM_MEM;
int64_t axis_extent = in_rect.hi[axis] - in_rect.lo[axis] + 1;
- auto offsets = create_buffer(axis_extent, kind);
+ auto offsets = create_buffer(axis_extent);
const auto max_threads = omp_get_max_threads();
ThreadLocalStorage local_sums(max_threads);
diff --git a/src/cunumeric/index/repeat_template.inl b/src/cunumeric/index/repeat_template.inl
index c47603916..30b3249cf 100644
--- a/src/cunumeric/index/repeat_template.inl
+++ b/src/cunumeric/index/repeat_template.inl
@@ -38,9 +38,7 @@ struct RepeatImpl {
auto input_arr = args.input.read_accessor(input_rect);
if (input_rect.empty()) {
- auto extents = Point::ZEROES();
- auto buffer = create_buffer(extents);
- args.output.return_data(buffer, extents);
+ args.output.make_empty();
return;
}
diff --git a/src/cunumeric/index/wrap.cc b/src/cunumeric/index/wrap.cc
index 33dfcfe4b..9d8fef331 100644
--- a/src/cunumeric/index/wrap.cc
+++ b/src/cunumeric/index/wrap.cc
@@ -24,29 +24,32 @@ using namespace legate;
template
struct WrapImplBody {
+ template
void operator()(const AccessorWO, 1>& out,
const Pitches<0>& pitches_out,
- const Rect<1>& out_rect,
- const Pitches& pitches_in,
- const Rect& in_rect,
- const bool dense) const
+ const Rect<1>& rect_out,
+ const Pitches& pitches_base,
+ const Rect& rect_base,
+ const bool dense,
+ const bool check_bounds,
+ const IND& indices) const
{
- const int64_t start = out_rect.lo[0];
- const int64_t end = out_rect.hi[0];
- const auto in_volume = in_rect.volume();
+ const int64_t start = rect_out.lo[0];
+ const int64_t end = rect_out.hi[0];
+ const auto volume_base = rect_base.volume();
if (dense) {
- int64_t out_idx = 0;
- auto outptr = out.ptr(out_rect);
+ auto outptr = out.ptr(rect_out);
for (int64_t i = start; i <= end; i++) {
- const int64_t input_idx = i % in_volume;
- auto point = pitches_in.unflatten(input_idx, in_rect.lo);
- outptr[out_idx] = point;
- out_idx++;
+ if (check_bounds) check_idx(i, volume_base, indices);
+ const int64_t input_idx = compute_idx(i, volume_base, indices);
+ auto point = pitches_base.unflatten(input_idx, rect_base.lo);
+ outptr[i - start] = point;
}
} else {
for (int64_t i = start; i <= end; i++) {
- const int64_t input_idx = i % in_volume;
- auto point = pitches_in.unflatten(input_idx, in_rect.lo);
+ if (check_bounds) check_idx(i, volume_base, indices);
+ const int64_t input_idx = compute_idx(i, volume_base, indices);
+ auto point = pitches_base.unflatten(input_idx, rect_base.lo);
out[i] = point;
}
} // else
diff --git a/src/cunumeric/index/wrap.cu b/src/cunumeric/index/wrap.cu
index 0f118eadf..cc82418a0 100644
--- a/src/cunumeric/index/wrap.cu
+++ b/src/cunumeric/index/wrap.cu
@@ -23,62 +23,131 @@ namespace cunumeric {
using namespace Legion;
using namespace legate;
-template
+template
+__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+ check_kernel(Output out,
+ const AccessorRO indices,
+ const int64_t start,
+ const int64_t volume,
+ const int64_t volume_base,
+ const int64_t iters)
+{
+ bool value = false;
+ for (size_t i = 0; i < iters; i++) {
+ const auto idx = (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+ if (idx >= volume) break;
+ auto index_tmp = indices[idx + start];
+ int64_t index = index_tmp < 0 ? index_tmp + volume_base : index_tmp;
+ bool val = (index < 0 || index >= volume_base);
+ SumReduction::fold(value, val);
+ }
+ reduce_output(out, value);
+}
+
+template
__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
wrap_kernel(const AccessorWO, 1> out,
const int64_t start,
const int64_t volume,
const Pitches<0> pitches_out,
const Point<1> out_lo,
- const Pitches pitches_in,
- const Point in_lo,
- const size_t in_volume)
+ const Pitches pitches_base,
+ const Point base_lo,
+ const size_t volume_base,
+ const IND indices)
{
const auto idx = global_tid_1d();
if (idx >= volume) return;
- const int64_t input_idx = (idx + start) % in_volume;
+ const int64_t input_idx = compute_idx((idx + start), volume_base, indices);
auto out_p = pitches_out.unflatten(idx, out_lo);
- auto p = pitches_in.unflatten(input_idx, in_lo);
+ auto p = pitches_base.unflatten(input_idx, base_lo);
out[out_p] = p;
}
-template
+template
__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
wrap_kernel_dense(Point* out,
const int64_t start,
const int64_t volume,
- const Pitches pitches_in,
- const Point in_lo,
- const size_t in_volume)
+ const Pitches pitches_base,
+ const Point base_lo,
+ const size_t volume_base,
+ const IND indices)
{
const auto idx = global_tid_1d();
if (idx >= volume) return;
- const int64_t input_idx = (idx + start) % in_volume;
- auto p = pitches_in.unflatten(input_idx, in_lo);
+ const int64_t input_idx = compute_idx((idx + start), volume_base, indices);
+ auto p = pitches_base.unflatten(input_idx, base_lo);
out[idx] = p;
}
+// don't do anything when indices is a boolean
+void check_out_of_bounds(const bool& indices,
+ const int64_t start,
+ const int64_t volume,
+ const int64_t volume_base,
+ cudaStream_t stream)
+{
+}
+
+void check_out_of_bounds(const AccessorRO& indices,
+ const int64_t start,
+ const int64_t volume,
+ const int64_t volume_base,
+ cudaStream_t stream)
+{
+ const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+ size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool);
+ DeviceScalarReductionBuffer> out_of_bounds(stream);
+
+ if (blocks >= MAX_REDUCTION_CTAS) {
+ const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
+ check_kernel<<>>(
+ out_of_bounds, indices, start, volume, volume_base, iters);
+ } else {
+ check_kernel<<>>(
+ out_of_bounds, indices, start, volume, volume_base, 1);
+ }
+ CHECK_CUDA_STREAM(stream);
+
+ bool res = out_of_bounds.read(stream);
+ if (res) throw legate::TaskException("index is out of bounds in index array");
+}
+
template
struct WrapImplBody {
+ template
void operator()(const AccessorWO, 1>& out,
const Pitches<0>& pitches_out,
- const Rect<1>& out_rect,
- const Pitches& pitches_in,
- const Rect& in_rect,
- const bool dense) const
+ const Rect<1>& rect_out,
+ const Pitches& pitches_base,
+ const Rect& rect_base,
+ const bool dense,
+ const bool check_bounds,
+ const IND& indices) const
{
- auto stream = get_cached_stream();
- const int64_t start = out_rect.lo[0];
- const int64_t volume = out_rect.volume();
- const auto in_volume = in_rect.volume();
- const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+ auto stream = get_cached_stream();
+ const int64_t start = rect_out.lo[0];
+ const int64_t volume = rect_out.volume();
+ const auto volume_base = rect_base.volume();
+ const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+ if (check_bounds) check_out_of_bounds(indices, start, volume, volume_base, stream);
+
if (dense) {
- auto outptr = out.ptr(out_rect);
- wrap_kernel_dense<<>>(
- outptr, start, volume, pitches_in, in_rect.lo, in_volume);
+ auto outptr = out.ptr(rect_out);
+ wrap_kernel_dense<<>>(
+ outptr, start, volume, pitches_base, rect_base.lo, volume_base, indices);
} else {
- wrap_kernel<<>>(
- out, start, volume, pitches_out, out_rect.lo, pitches_in, in_rect.lo, in_volume);
+ wrap_kernel<<>>(out,
+ start,
+ volume,
+ pitches_out,
+ rect_out.lo,
+ pitches_base,
+ rect_base.lo,
+ volume_base,
+ indices);
}
CHECK_CUDA_STREAM(stream);
}
diff --git a/src/cunumeric/index/wrap.h b/src/cunumeric/index/wrap.h
index 91c3f2326..8c4167983 100644
--- a/src/cunumeric/index/wrap.h
+++ b/src/cunumeric/index/wrap.h
@@ -25,6 +25,9 @@ struct WrapArgs {
// copy information from original array to the
// `wrapped` one
const Legion::DomainPoint shape; // shape of the original array
+ const bool has_input;
+ const bool check_bounds;
+ const Array& in = Array();
};
class WrapTask : public CuNumericTask {
@@ -41,4 +44,42 @@ class WrapTask : public CuNumericTask {
#endif
};
+__CUDA_HD__ static int64_t compute_idx(const int64_t i, const int64_t volume, const bool&)
+{
+ return i % volume;
+}
+
+__CUDA_HD__ static int64_t compute_idx(const int64_t i,
+ const int64_t volume,
+ const legate::AccessorRO& indices)
+{
+ int64_t idx = indices[i];
+ int64_t index = idx < 0 ? idx + volume : idx;
+ return index;
+}
+
+static void check_idx(const int64_t i,
+ const int64_t volume,
+ const legate::AccessorRO& indices)
+{
+ int64_t idx = indices[i];
+ int64_t index = idx < 0 ? idx + volume : idx;
+ if (index < 0 || index >= volume)
+ throw legate::TaskException("index is out of bounds in index array");
+}
+static void check_idx(const int64_t i, const int64_t volume, const bool&)
+{
+ // don't do anything when wrapping indices
+}
+
+static bool check_idx_omp(const int64_t i,
+ const int64_t volume,
+ const legate::AccessorRO& indices)
+{
+ int64_t idx = indices[i];
+ int64_t index = idx < 0 ? idx + volume : idx;
+ return (index < 0 || index >= volume);
+}
+static bool check_idx_omp(const int64_t i, const int64_t volume, const bool&) { return false; }
+
} // namespace cunumeric
diff --git a/src/cunumeric/index/wrap_omp.cc b/src/cunumeric/index/wrap_omp.cc
index f95e9123c..9387e2e3b 100644
--- a/src/cunumeric/index/wrap_omp.cc
+++ b/src/cunumeric/index/wrap_omp.cc
@@ -24,32 +24,42 @@ using namespace legate;
template
struct WrapImplBody {
+ template
void operator()(const AccessorWO, 1>& out,
const Pitches<0>& pitches_out,
- const Rect<1>& out_rect,
- const Pitches& pitches_in,
- const Rect& in_rect,
- const bool dense) const
+ const Rect<1>& rect_out,
+ const Pitches& pitches_base,
+ const Rect& rect_base,
+ const bool dense,
+ const bool check_bounds,
+ const IND& indices) const
{
- const int64_t start = out_rect.lo[0];
- const int64_t end = out_rect.hi[0];
- const auto in_volume = in_rect.volume();
+ const int64_t start = rect_out.lo[0];
+ const int64_t end = rect_out.hi[0];
+ const auto volume_base = rect_base.volume();
+ std::atomic is_out_of_bounds = false;
if (dense) {
- auto outptr = out.ptr(out_rect);
+ auto outptr = out.ptr(rect_out);
#pragma omp parallel for schedule(static)
for (int64_t i = start; i <= end; i++) {
- const int64_t input_idx = i % in_volume;
- auto point = pitches_in.unflatten(input_idx, in_rect.lo);
+ if (check_bounds)
+ if (check_idx_omp(i, volume_base, indices)) is_out_of_bounds = true;
+ const int64_t input_idx = compute_idx(i, volume_base, indices);
+ auto point = pitches_base.unflatten(input_idx, rect_base.lo);
outptr[i - start] = point;
}
} else {
#pragma omp parallel for schedule(static)
for (int64_t i = start; i <= end; i++) {
- const int64_t input_idx = i % in_volume;
- auto point = pitches_in.unflatten(input_idx, in_rect.lo);
+ if (check_bounds)
+ if (check_idx_omp(i, volume_base, indices)) is_out_of_bounds = true;
+ const int64_t input_idx = compute_idx(i, volume_base, indices);
+ auto point = pitches_base.unflatten(input_idx, rect_base.lo);
out[i] = point;
}
} // else
+
+ if (is_out_of_bounds) throw legate::TaskException("index is out of bounds in index array");
}
};
diff --git a/src/cunumeric/index/wrap_template.inl b/src/cunumeric/index/wrap_template.inl
index 46885f24e..9a9fc3b28 100644
--- a/src/cunumeric/index/wrap_template.inl
+++ b/src/cunumeric/index/wrap_template.inl
@@ -34,15 +34,15 @@ struct WrapImpl {
void operator()(WrapArgs& args) const
{
using VAL = Point;
- auto out_rect = args.out.shape<1>(); // output array is always 1D
- auto out = args.out.write_accessor, 1>(out_rect);
+ auto rect_out = args.out.shape<1>(); // output array is always 1D
+ auto out = args.out.write_accessor, 1>(rect_out);
Pitches<0> pitches_out;
- size_t volume_out = pitches_out.flatten(out_rect);
+ size_t volume_out = pitches_out.flatten(rect_out);
if (volume_out == 0) return;
#ifndef LEGION_BOUNDS_CHECKS
- bool dense = out.accessor.is_dense_row_major(out_rect);
+ bool dense = out.accessor.is_dense_row_major(rect_out);
#else
bool dense = false;
#endif
@@ -52,24 +52,44 @@ struct WrapImpl {
point_lo[dim] = 0;
point_hi[dim] = args.shape[dim] - 1;
}
- Rect input_rect(point_lo, point_hi);
+ Rect rect_base(point_lo, point_hi);
- Pitches pitches_in;
- size_t volume_in = pitches_in.flatten(input_rect);
+ Pitches pitches_base;
+ size_t volume_base = pitches_base.flatten(rect_base);
#ifdef DEBUG_CUNUMERIC
- assert(volume_in != 0);
+ assert(volume_base != 0);
#endif
- WrapImplBody()(out, pitches_out, out_rect, pitches_in, input_rect, dense);
+ if (args.has_input) {
+ auto rect_in = args.in.shape<1>();
+ auto in = args.in.read_accessor(rect_in); // input should be always integer type
+#ifdef DEBUG_CUNUMERIC
+ assert(rect_in == rect_out);
+#endif
+ WrapImplBody()(
+ out, pitches_out, rect_out, pitches_base, rect_base, dense, args.check_bounds, in);
+
+ } else {
+ bool tmp = false;
+ WrapImplBody()(
+ out, pitches_out, rect_out, pitches_base, rect_base, dense, args.check_bounds, tmp);
+ } // else
}
};
template
static void wrap_template(TaskContext& context)
{
- auto shape = context.scalars()[0].value();
- int dim = shape.dim;
- WrapArgs args{context.outputs()[0], shape};
+ auto shape = context.scalars()[0].value();
+ int dim = shape.dim;
+ bool has_input = context.scalars()[1].value();
+ bool check_bounds = context.scalars()[2].value();
+ Array tmp_array = Array();
+ WrapArgs args{context.outputs()[0],
+ shape,
+ has_input,
+ check_bounds,
+ has_input ? context.inputs()[0] : tmp_array};
dim_dispatch(dim, WrapImpl{}, args);
}
diff --git a/src/cunumeric/index/zip.cu b/src/cunumeric/index/zip.cu
index 8bdfcd3f0..82d162126 100644
--- a/src/cunumeric/index/zip.cu
+++ b/src/cunumeric/index/zip.cu
@@ -28,15 +28,15 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
const Buffer, 1> index_arrays,
const Rect rect,
const Pitches pitches,
- size_t volume,
- DomainPoint shape,
+ const size_t volume,
+ const DomainPoint shape,
std::index_sequence)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
auto p = pitches.unflatten(idx, rect.lo);
Legion::Point new_point;
- for (size_t i = 0; i < N; i++) { new_point[i] = compute_idx(index_arrays[i][p], shape[i]); }
+ for (size_t i = 0; i < N; i++) { new_point[i] = compute_idx_cuda(index_arrays[i][p], shape[i]); }
out[p] = new_point;
}
@@ -45,14 +45,16 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
zip_kernel_dense(Point* out,
const Buffer index_arrays,
const Rect rect,
- size_t volume,
- DomainPoint shape,
+ const size_t volume,
+ const DomainPoint shape,
std::index_sequence)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
Legion::Point new_point;
- for (size_t i = 0; i < N; i++) { new_point[i] = compute_idx(index_arrays[i][idx], shape[i]); }
+ for (size_t i = 0; i < N; i++) {
+ new_point[i] = compute_idx_cuda(index_arrays[i][idx], shape[i]);
+ }
out[idx] = new_point;
}
@@ -62,11 +64,11 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
const Buffer, 1> index_arrays,
const Rect rect,
const Pitches pitches,
- int narrays,
- size_t volume,
- int64_t key_dim,
- int64_t start_index,
- DomainPoint shape)
+ const int64_t narrays,
+ const size_t volume,
+ const int64_t key_dim,
+ const int64_t start_index,
+ const DomainPoint shape)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
@@ -74,7 +76,7 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
Legion::Point new_point;
for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; }
for (size_t i = 0; i < narrays; i++) {
- new_point[start_index + i] = compute_idx(index_arrays[i][p], shape[start_index + i]);
+ new_point[start_index + i] = compute_idx_cuda(index_arrays[i][p], shape[start_index + i]);
}
for (size_t i = (start_index + narrays); i < N; i++) {
int64_t j = key_dim + i - narrays;
@@ -83,10 +85,63 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
out[p] = new_point;
}
+template
+__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
+ check_kernel(Output out,
+ const Buffer, 1> index_arrays,
+ const int64_t volume,
+ const int64_t iters,
+ const Rect rect,
+ const Pitches pitches,
+ const int64_t narrays,
+ const int64_t start_index,
+ const DomainPoint shape)
+{
+ bool value = false;
+ for (size_t i = 0; i < iters; i++) {
+ const auto idx = (i * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;
+ if (idx >= volume) break;
+ auto p = pitches.unflatten(idx, rect.lo);
+ for (size_t n = 0; n < narrays; n++) {
+ const int64_t extent = shape[start_index + n];
+ coord_t index = index_arrays[n][p] < 0 ? index_arrays[n][p] + extent : index_arrays[n][p];
+ bool val = (index < 0 || index >= extent);
+ SumReduction::fold(value, val);
+ } // for n
+ }
+ reduce_output(out, value);
+}
+
template
struct ZipImplBody {
using VAL = int64_t;
+ void check_out_of_bounds(const Buffer, 1>& index_arrays,
+ const int64_t volume,
+ const Rect& rect,
+ const Pitches& pitches,
+ const int64_t narrays,
+ const int64_t start_index,
+ const DomainPoint& shape,
+ cudaStream_t stream) const
+ {
+ const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+ size_t shmem_size = THREADS_PER_BLOCK / 32 * sizeof(bool);
+ DeviceScalarReductionBuffer> out_of_bounds(stream);
+ if (blocks >= MAX_REDUCTION_CTAS) {
+ const size_t iters = (blocks + MAX_REDUCTION_CTAS - 1) / MAX_REDUCTION_CTAS;
+ check_kernel<<>>(
+ out_of_bounds, index_arrays, volume, iters, rect, pitches, narrays, start_index, shape);
+ } else {
+ check_kernel<<>>(
+ out_of_bounds, index_arrays, volume, 1, rect, pitches, narrays, start_index, shape);
+ }
+ CHECK_CUDA_STREAM(stream);
+
+ bool res = out_of_bounds.read(stream);
+ if (res) throw legate::TaskException("index is out of bounds in index array");
+ }
+
template
void operator()(const AccessorWO, DIM>& out,
const std::vector>& index_arrays,
@@ -101,19 +156,23 @@ struct ZipImplBody {
auto stream = get_cached_stream();
const size_t volume = rect.volume();
const size_t blocks = (volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;
+
+ auto index_buf =
+ create_buffer, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
+ for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
+ check_out_of_bounds(
+ index_buf, volume, rect, pitches, index_arrays.size(), start_index, shape, stream);
+
if (index_arrays.size() == N) {
if (dense) {
- auto index_buf =
+ auto index_buf_dense =
create_buffer(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) {
- index_buf[idx] = index_arrays[idx].ptr(rect);
+ index_buf_dense[idx] = index_arrays[idx].ptr(rect);
}
zip_kernel_dense<<>>(
- out.ptr(rect), index_buf, rect, volume, shape, std::make_index_sequence());
+ out.ptr(rect), index_buf_dense, rect, volume, shape, std::make_index_sequence());
} else {
- auto index_buf =
- create_buffer, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
- for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
zip_kernel<<>>(
out, index_buf, rect, pitches, volume, shape, std::make_index_sequence());
}
@@ -121,9 +180,6 @@ struct ZipImplBody {
#ifdef DEBUG_CUNUMERIC
assert(index_arrays.size() < N);
#endif
- auto index_buf =
- create_buffer, 1>(index_arrays.size(), Memory::Kind::Z_COPY_MEM);
- for (uint32_t idx = 0; idx < index_arrays.size(); ++idx) index_buf[idx] = index_arrays[idx];
int num_arrays = index_arrays.size();
zip_kernel<<>>(
out, index_buf, rect, pitches, num_arrays, volume, key_dim, start_index, shape);
diff --git a/src/cunumeric/index/zip.h b/src/cunumeric/index/zip.h
index 61a87104c..e3c7af8a7 100644
--- a/src/cunumeric/index/zip.h
+++ b/src/cunumeric/index/zip.h
@@ -51,4 +51,17 @@ constexpr coord_t compute_idx(coord_t index, coord_t extent)
return new_index;
}
+constexpr std::pair compute_idx_omp(coord_t index, coord_t extent)
+{
+ coord_t new_index = index < 0 ? index + extent : index;
+ bool out_of_bounds = (new_index < 0 || new_index >= extent);
+ return {new_index, out_of_bounds};
+}
+
+constexpr coord_t compute_idx_cuda(coord_t index, coord_t extent)
+{
+ coord_t new_index = index < 0 ? index + extent : index;
+ return new_index;
+}
+
} // namespace cunumeric
diff --git a/src/cunumeric/index/zip_omp.cc b/src/cunumeric/index/zip_omp.cc
index 14a3c4b25..aa014547e 100644
--- a/src/cunumeric/index/zip_omp.cc
+++ b/src/cunumeric/index/zip_omp.cc
@@ -37,7 +37,8 @@ struct ZipImplBody {
const DomainPoint& shape,
std::index_sequence) const
{
- const size_t volume = rect.volume();
+ const size_t volume = rect.volume();
+ std::atomic is_out_of_bounds = false;
if (index_arrays.size() == N) {
if (dense) {
std::vector indx_ptrs = {index_arrays[Is].ptr(rect)...};
@@ -46,7 +47,9 @@ struct ZipImplBody {
for (size_t idx = 0; idx < volume; ++idx) {
Legion::Point new_point;
for (size_t i = 0; i < N; i++) {
- new_point[i] = compute_idx(indx_ptrs[i][idx], shape[i]);
+ auto pair = compute_idx_omp(indx_ptrs[i][idx], shape[i]);
+ new_point[i] = pair.first;
+ if (pair.second) is_out_of_bounds = true;
}
outptr[idx] = new_point;
}
@@ -56,7 +59,9 @@ struct ZipImplBody {
auto p = pitches.unflatten(idx, rect.lo);
Legion::Point new_point;
for (size_t i = 0; i < N; i++) {
- new_point[i] = compute_idx(index_arrays[i][p], shape[i]);
+ auto pair = compute_idx_omp(index_arrays[i][p], shape[i]);
+ new_point[i] = pair.first;
+ if (pair.second) is_out_of_bounds = true;
}
out[p] = new_point;
}
@@ -71,7 +76,9 @@ struct ZipImplBody {
Legion::Point new_point;
for (size_t i = 0; i < start_index; i++) { new_point[i] = p[i]; }
for (size_t i = 0; i < index_arrays.size(); i++) {
- new_point[start_index + i] = compute_idx(index_arrays[i][p], shape[start_index + i]);
+ auto pair = compute_idx_omp(index_arrays[i][p], shape[start_index + i]);
+ new_point[start_index + i] = pair.first;
+ if (pair.second) is_out_of_bounds = true;
}
for (size_t i = (start_index + index_arrays.size()); i < N; i++) {
int64_t j = key_dim + i - index_arrays.size();
@@ -80,6 +87,7 @@ struct ZipImplBody {
out[p] = new_point;
}
}
+ if (is_out_of_bounds) throw legate::TaskException("index is out of bounds in index array");
}
};
diff --git a/src/cunumeric/item/read.h b/src/cunumeric/item/read.h
index d3bb90774..0606d82e4 100644
--- a/src/cunumeric/item/read.h
+++ b/src/cunumeric/item/read.h
@@ -26,6 +26,9 @@ class ReadTask : public CuNumericTask {
public:
static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+ static void omp_variant(legate::TaskContext& context) { ReadTask::cpu_variant(context); }
+#endif
#ifdef LEGATE_USE_CUDA
static void gpu_variant(legate::TaskContext& context);
#endif
diff --git a/src/cunumeric/item/write.h b/src/cunumeric/item/write.h
index c3455b0e0..725918139 100644
--- a/src/cunumeric/item/write.h
+++ b/src/cunumeric/item/write.h
@@ -26,6 +26,9 @@ class WriteTask : public CuNumericTask {
public:
static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+ static void omp_variant(legate::TaskContext& context) { WriteTask::cpu_variant(context); }
+#endif
#ifdef LEGATE_USE_CUDA
static void gpu_variant(legate::TaskContext& context);
#endif
diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc
index 855121cd2..51797acfe 100644
--- a/src/cunumeric/mapper.cc
+++ b/src/cunumeric/mapper.cc
@@ -65,15 +65,6 @@ Scalar CuNumericMapper::tunable_value(TunableID tunable_id)
}
return Scalar(eager_volume);
}
- case CUNUMERIC_TUNABLE_HAS_NUMAMEM: {
- // TODO: This assumes that either all OpenMP processors across the machine have a NUMA
- // memory or none does.
- Legion::Machine::MemoryQuery query(machine);
- query.local_address_space();
- query.only_kind(Legion::Memory::SOCKET_MEM);
- int32_t has_numamem = query.count() > 0;
- return Scalar(has_numamem);
- }
default: break;
}
LEGATE_ABORT; // unknown tunable value
@@ -119,14 +110,14 @@ std::vector CuNumericMapper::store_mappings(
// TODO: Our actual requirements are a little less strict than this; we require each array or
// vector to have a stride of 1 on at least one dimension.
std::vector mappings;
- auto& inputs = task.inputs();
- auto& outputs = task.outputs();
+ auto& inputs = task.inputs();
+ auto& reductions = task.reductions();
for (auto& input : inputs) {
mappings.push_back(StoreMapping::default_mapping(input, options.front()));
mappings.back().policy.exact = true;
}
- for (auto& output : outputs) {
- mappings.push_back(StoreMapping::default_mapping(output, options.front()));
+ for (auto& reduction : reductions) {
+ mappings.push_back(StoreMapping::default_mapping(reduction, options.front()));
mappings.back().policy.exact = true;
}
return std::move(mappings);
diff --git a/src/cunumeric/matrix/contract.cu b/src/cunumeric/matrix/contract.cu
index 722916043..7a66e9ba8 100644
--- a/src/cunumeric/matrix/contract.cu
+++ b/src/cunumeric/matrix/contract.cu
@@ -26,8 +26,7 @@ using namespace Legion;
namespace { // anonymous
template
-struct contract_helper {
-};
+struct contract_helper {};
template <>
struct contract_helper<__half> {
diff --git a/src/cunumeric/matrix/contract_omp.cc b/src/cunumeric/matrix/contract_omp.cc
index 4a1dd27b2..659db3f0a 100644
--- a/src/cunumeric/matrix/contract_omp.cc
+++ b/src/cunumeric/matrix/contract_omp.cc
@@ -112,17 +112,17 @@ struct ContractImplBody {
std::vector lhs_copy_strides(lhs_ndim);
int64_t lhs_size = calculate_volume(lhs_ndim, lhs_shape, lhs_copy_strides.data());
- float* lhs_copy_data = allocate_buffer_omp(lhs_size);
+ float* lhs_copy_data = allocate_buffer(lhs_size);
half_tensor_to_float_omp(lhs_copy_data, lhs_data, lhs_ndim, lhs_shape, lhs_strides);
std::vector rhs1_copy_strides(rhs1_ndim);
int64_t rhs1_size = calculate_volume(rhs1_ndim, rhs1_shape, rhs1_copy_strides.data());
- float* rhs1_copy_data = allocate_buffer_omp(rhs1_size);
+ float* rhs1_copy_data = allocate_buffer(rhs1_size);
half_tensor_to_float_omp(rhs1_copy_data, rhs1_data, rhs1_ndim, rhs1_shape, rhs1_strides);
std::vector rhs2_copy_strides(rhs2_ndim);
int64_t rhs2_size = calculate_volume(rhs2_ndim, rhs2_shape, rhs2_copy_strides.data());
- float* rhs2_copy_data = allocate_buffer_omp(rhs2_size);
+ float* rhs2_copy_data = allocate_buffer(rhs2_size);
half_tensor_to_float_omp(rhs2_copy_data, rhs2_data, rhs2_ndim, rhs2_shape, rhs2_strides);
ContractImplBody{}(lhs_copy_data,
diff --git a/src/cunumeric/matrix/contract_template.inl b/src/cunumeric/matrix/contract_template.inl
index 6bd375e5e..d067cafd3 100644
--- a/src/cunumeric/matrix/contract_template.inl
+++ b/src/cunumeric/matrix/contract_template.inl
@@ -33,23 +33,17 @@ template
struct ContractImplBody;
template
-struct support_contract : std::false_type {
-};
+struct support_contract : std::false_type {};
template <>
-struct support_contract