diff --git a/docs/images/binder-python.jpg b/docs/images/binder-python.jpg new file mode 100755 index 00000000..121f2c89 Binary files /dev/null and b/docs/images/binder-python.jpg differ diff --git a/docs/overview/python-package.rst b/docs/overview/python-package.rst index 004f16a5..c034e624 100644 --- a/docs/overview/python-package.rst +++ b/docs/overview/python-package.rst @@ -2,13 +2,28 @@ Python Package Overview ======== -This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the `Python Class Reference Section `_. +This section provides an overview of the Python Package from a functionality perspective. If you wish to see all the classes and their respective functions you can find that in the `Python Class Reference Section `_. Below is a diagram that provides insights on the relationship between Vulkan Kompute objects and Vulkan resources, which primarily encompass ownership of either CPU and/or GPU memory. .. image:: ../images/kompute-architecture.jpg :width: 70% +Package Installation +^^^^^^^^^^^^^^^^^^^^ + +Once you set up the package dependencies, you can install Kompute from ``PyPI`` using ``pip`` by running: + +.. code-block:: bash + + pip install kp + +You can also install from the master branch using: + +.. code-block:: bash + + pip install git+git://github.com/EthicalML/vulkan-kompute.git@master + Core Python Components ^^^^^^^^ @@ -272,28 +287,17 @@ Similar to the logistic regression implementation in the C++ examples section, b print(tensor_b_in.data()) -Package Installation -^^^^^^^^^ - -The package can be installed through the top level `setup.py` by running: - -``` -pip install kp -``` - -You can also install from master branch using: - -``` -pip install git+git://github.com/EthicalML/vulkan-kompute.git@master -``` +Log Level Configuration +^^^^^^^^^^^^^^^^^^^^^^^ You can configure log level with the function `kp.log_level` as outlined below. 
The values are TRACE=0, DEBUG=1, INFO=2, WARN=3, ERROR=4. Kompute defaults to INFO. -``` -import kp -kp.log_level(1) -``` +.. code-block:: python + :linenos: + + import kp + kp.log_level(1) diff --git a/examples/python/README.md b/examples/python/README.md new file mode 100644 index 00000000..1e2977c7 --- /dev/null +++ b/examples/python/README.md @@ -0,0 +1,17 @@ +# Kompute Python Example + +This folder contains the accompanying code for the article "High Performance Python for GPU Accelerated Machine Learning in Cross-Vendor GPUs". + +The easiest way to try this example is by using the [Google Colab Notebook](https://colab.research.google.com/drive/15uQ7qMZuOyk8JcXF-3SB2R5yNFW21I4P), which will allow you to use a GPU for free and runs without much setup. + + + + + +Alternatively, if you want to test the example yourself locally, you can get set up and started through the following links: + +1. Install the [Kompute Python Package](https://kompute.cc/overview/python-package.html#package-installation) +2. Run the [Array Multiplication Code](https://github.com/EthicalML/vulkan-kompute/blob/python_extensions/python/test/test_array_multiplication.py) +3. 
Run the [Logistic Regression Code](https://github.com/EthicalML/vulkan-kompute/blob/python_extensions/python/test/test_logistic_regression.py) + + diff --git a/python/src/main.cpp b/python/src/main.cpp index 58009144..6f343166 100644 --- a/python/src/main.cpp +++ b/python/src/main.cpp @@ -39,12 +39,33 @@ PYBIND11_MODULE(kp, m) { return std::unique_ptr<kp::Tensor>(new kp::Tensor(data, tensorTypes)); }), "Initialiser with list of data components and tensor GPU memory type.") .def("data", &kp::Tensor::data, DOC(kp, Tensor, data)) - .def("get", [](kp::Tensor &self, uint32_t index) -> float { return self.data()[index]; }, + .def("__getitem__", [](kp::Tensor &self, size_t index) -> float { return self.data().at(index); /* at() throws std::out_of_range, which pybind11 raises as IndexError */ }, "When only an index is necessary") - .def("set", [](kp::Tensor &self, uint32_t index, float value) { + .def("__setitem__", [](kp::Tensor &self, size_t index, float value) { - self.data()[index] = value; }) + self.data().at(index) = value; }) - .def("set", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.") + .def("set_data", &kp::Tensor::setData, "Overrides the data in the local Tensor memory.") + .def("__iter__", [](kp::Tensor &self) { + return py::make_iterator(self.data().begin(), self.data().end()); + }, py::keep_alive<0, 1>(), // Required to keep alive iterator while exists + "Iterator to enable looping within data structure as required.") + .def("__contains__", [](kp::Tensor &self, float v) { + for (size_t i = 0; i < self.data().size(); ++i) { + if (v == self.data()[i]) { + return true; + } + } + return false; + }) + .def("__reversed__", [](kp::Tensor &self) { + size_t size = self.data().size(); + std::vector<float> reversed(size); + for (size_t i = 0; i < size; i++) { + reversed[size - i - 1] = self.data()[i]; + } + return reversed; + }) .def("size", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.") + .def("__len__", &kp::Tensor::size, "Retrieves the size of the Tensor data as per the local Tensor memory.") .def("tensor_type", 
&kp::Tensor::tensorType, "Retreves the memory type of the tensor.") .def("is_init", &kp::Tensor::isInit, "Checks whether the tensor GPU memory has been initialised.") .def("map_data_from_host", &kp::Tensor::mapDataFromHostMemory, "Maps data into GPU memory from tensor local data.") diff --git a/python/test/test_array_multiplication.py b/python/test/test_array_multiplication.py new file mode 100644 index 00000000..3ef3c02c --- /dev/null +++ b/python/test/test_array_multiplication.py @@ -0,0 +1,35 @@ +import pyshader as ps +import kp + + +def test_array_multiplication(): + + # 1. Create Kompute Manager (selects device 0 by default) + mgr = kp.Manager() + + # 2. Create Kompute Tensors to hold data + tensor_in_a = kp.Tensor([2, 2, 2]) + tensor_in_b = kp.Tensor([1, 2, 3]) + tensor_out = kp.Tensor([0, 0, 0]) + + # 3. Initialise the Kompute Tensors in the GPU + mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out]) + + # 4. Define the multiplication shader code to run on the GPU + @ps.python2shader + def compute_shader_multiply(index=("input", "GlobalInvocationId", ps.ivec3), + data1=("buffer", 0, ps.Array(ps.f32)), + data2=("buffer", 1, ps.Array(ps.f32)), + data3=("buffer", 2, ps.Array(ps.f32))): + i = index.x + data3[i] = data1[i] * data2[i] + + # 5. Run shader code against our previously defined tensors + mgr.eval_algo_data_def( + [tensor_in_a, tensor_in_b, tensor_out], + compute_shader_multiply.to_spirv()) + + # 6. 
Sync tensor data from GPU back to local + mgr.eval_tensor_sync_local_def([tensor_out]) + + assert tensor_out.data() == [2.0, 4.0, 6.0] diff --git a/python/test/test_kompute.py b/python/test/test_kompute.py index 930b8d69..b9b145f2 100644 --- a/python/test/test_kompute.py +++ b/python/test/test_kompute.py @@ -1,9 +1,6 @@ import os -from pyshader import python2shader, f32, ivec3, Array -from pyshader.stdlib import exp, log - -from kp import Tensor, Manager, Sequence +import kp DIRNAME = os.path.dirname(os.path.abspath(__file__)) @@ -12,11 +9,11 @@ def test_opmult(): Test basic OpMult operation """ - tensor_in_a = Tensor([2, 2, 2]) - tensor_in_b = Tensor([1, 2, 3]) - tensor_out = Tensor([0, 0, 0]) + tensor_in_a = kp.Tensor([2, 2, 2]) + tensor_in_b = kp.Tensor([1, 2, 3]) + tensor_out = kp.Tensor([0, 0, 0]) - mgr = Manager() + mgr = kp.Manager() mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out]) @@ -31,11 +28,11 @@ def test_opalgobase_data(): Test basic OpAlgoBase operation """ - tensor_in_a = Tensor([2, 2, 2]) - tensor_in_b = Tensor([1, 2, 3]) - tensor_out = Tensor([0, 0, 0]) + tensor_in_a = kp.Tensor([2, 2, 2]) + tensor_in_b = kp.Tensor([1, 2, 3]) + tensor_out = kp.Tensor([0, 0, 0]) - mgr = Manager() + mgr = kp.Manager() shaderData = """ #version 450 @@ -67,11 +64,11 @@ def test_opalgobase_file(): Test basic OpAlgoBase operation """ - tensor_in_a = Tensor([2, 2, 2]) - tensor_in_b = Tensor([1, 2, 3]) - tensor_out = Tensor([0, 0, 0]) + tensor_in_a = kp.Tensor([2, 2, 2]) + tensor_in_b = kp.Tensor([1, 2, 3]) + tensor_out = kp.Tensor([0, 0, 0]) - mgr = Manager() + mgr = kp.Manager() shaderFilePath = os.path.join(DIRNAME, "../../shaders/glsl/opmult.comp") @@ -87,11 +84,11 @@ def test_sequence(): """ Test basic OpAlgoBase operation """ - mgr = Manager(0, [2]) + mgr = kp.Manager(0, [2]) - tensor_in_a = Tensor([2, 2, 2]) - tensor_in_b = Tensor([1, 2, 3]) - tensor_out = Tensor([0, 0, 0]) + tensor_in_a = kp.Tensor([2, 2, 2]) + tensor_in_b = kp.Tensor([1, 2, 3]) + 
tensor_out = kp.Tensor([0, 0, 0]) mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out]) @@ -109,118 +106,3 @@ def test_sequence(): seq.eval() assert tensor_out.data() == [2.0, 4.0, 6.0] - -def test_pyshader_pyshader(): - - @python2shader - def compute_shader_multiply(index=("input", "GlobalInvocationId", ivec3), - data1=("buffer", 0, Array(f32)), - data2=("buffer", 1, Array(f32)), - data3=("buffer", 2, Array(f32))): - i = index.x - data3[i] = data1[i] * data2[i] - - tensor_in_a = Tensor([2, 2, 2]) - tensor_in_b = Tensor([1, 2, 3]) - tensor_out = Tensor([0, 0, 0]) - - mgr = Manager() - - mgr.eval_tensor_create_def([tensor_in_a, tensor_in_b, tensor_out]) - mgr.eval_algo_data_def([tensor_in_a, tensor_in_b, tensor_out], compute_shader_multiply.to_spirv()) - mgr.eval_tensor_sync_local_def([tensor_out]) - - assert tensor_out.data() == [2.0, 4.0, 6.0] - -def test_logistic_regression_pyshader(): - @python2shader - def compute_shader( - index = ("input", "GlobalInvocationId", ivec3), - x_i = ("buffer", 0, Array(f32)), - x_j = ("buffer", 1, Array(f32)), - y = ("buffer", 2, Array(f32)), - w_in = ("buffer", 3, Array(f32)), - w_out_i = ("buffer", 4, Array(f32)), - w_out_j = ("buffer", 5, Array(f32)), - b_in = ("buffer", 6, Array(f32)), - b_out = ("buffer", 7, Array(f32)), - l_out = ("buffer", 8, Array(f32)), - M = ("buffer", 9, Array(f32))): - - i = index.x - - m = M[0] - - w_curr = vec2(w_in[0], w_in[1]) - b_curr = b_in[0] - - x_curr = vec2(x_i[i], x_j[i]) - y_curr = y[i] - - z_dot = w_curr @ x_curr - z = z_dot + b_curr - y_hat = 1.0 / (1.0 + exp(-z)) - - d_z = y_hat - y_curr - d_w = (1.0 / m) * x_curr * d_z - d_b = (1.0 / m) * d_z - - loss = -((y_curr * log(y_hat)) + ((1.0 + y_curr) * log(1.0 - y_hat))) - - w_out_i[i] = d_w.x - w_out_j[i] = d_w.y - b_out[i] = d_b - l_out[i] = loss - - - # First we create input and ouput tensors for shader - tensor_x_i = Tensor([0.0, 1.0, 1.0, 1.0, 1.0]) - tensor_x_j = Tensor([0.0, 0.0, 0.0, 1.0, 1.0]) - - tensor_y = Tensor([0.0, 
0.0, 0.0, 1.0, 1.0]) - - tensor_w_in = Tensor([0.001, 0.001]) - tensor_w_out_i = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - tensor_w_out_j = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - - tensor_b_in = Tensor([0.0]) - tensor_b_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - - tensor_l_out = Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) - - tensor_m = Tensor([ 5.0 ]) - - # We store them in an array for easier interaction - params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i, - tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m] - - mgr = Manager() - - mgr.eval_tensor_create_def(params) - - # Record commands for efficient evaluation - sq = mgr.create_sequence() - sq.begin() - sq.record_tensor_sync_device([tensor_w_in, tensor_b_in]) - sq.record_algo_data(params, compute_shader.to_spirv()) - sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]) - sq.end() - - ITERATIONS = 100 - learning_rate = 0.1 - - # Perform machine learning training and inference across all input X and Y - for i_iter in range(ITERATIONS): - sq.eval() - - # Calculate the parameters based on the respective derivatives calculated - for j_iter in range(tensor_b_out.size()): - tensor_w_in.set(0, tensor_w_in.get(0) - learning_rate * tensor_w_out_i.data()[j_iter]) - tensor_w_in.set(1, tensor_w_in.get(1) - learning_rate * tensor_w_out_j.data()[j_iter]) - tensor_b_in.set(0, tensor_b_in.get(0) - learning_rate * tensor_b_out.data()[j_iter]) - - assert tensor_w_in.data()[0] < 0.01 - assert tensor_w_in.data()[0] > 0.0 - assert tensor_w_in.data()[1] > 1.5 - assert tensor_b_in.data()[0] < 0.7 - diff --git a/python/test/test_logistic_regression.py b/python/test/test_logistic_regression.py new file mode 100644 index 00000000..f8737588 --- /dev/null +++ b/python/test/test_logistic_regression.py @@ -0,0 +1,108 @@ +import pyshader as ps +import kp + +def test_logistic_regression(): + + @ps.python2shader + def compute_shader( + index = ("input", "GlobalInvocationId", ps.ivec3), + 
x_i = ("buffer", 0, ps.Array(ps.f32)), + x_j = ("buffer", 1, ps.Array(ps.f32)), + y = ("buffer", 2, ps.Array(ps.f32)), + w_in = ("buffer", 3, ps.Array(ps.f32)), + w_out_i = ("buffer", 4, ps.Array(ps.f32)), + w_out_j = ("buffer", 5, ps.Array(ps.f32)), + b_in = ("buffer", 6, ps.Array(ps.f32)), + b_out = ("buffer", 7, ps.Array(ps.f32)), + l_out = ("buffer", 8, ps.Array(ps.f32)), + M = ("buffer", 9, ps.Array(ps.f32))): + + i = index.x + + m = M[0] + + w_curr = vec2(w_in[0], w_in[1]) + b_curr = b_in[0] + + x_curr = vec2(x_i[i], x_j[i]) + y_curr = y[i] + + z_dot = w_curr @ x_curr + z = z_dot + b_curr + y_hat = 1.0 / (1.0 + exp(-z)) + + d_z = y_hat - y_curr + d_w = (1.0 / m) * x_curr * d_z + d_b = (1.0 / m) * d_z + + loss = -((y_curr * log(y_hat)) + ((1.0 - y_curr) * log(1.0 - y_hat))) + + w_out_i[i] = d_w.x + w_out_j[i] = d_w.y + b_out[i] = d_b + l_out[i] = loss + + + mgr = kp.Manager(0) + + # First we create input and output tensors for shader + tensor_x_i = kp.Tensor([0.0, 1.0, 1.0, 1.0, 1.0]) + tensor_x_j = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0]) + + tensor_y = kp.Tensor([0.0, 0.0, 0.0, 1.0, 1.0]) + + tensor_w_in = kp.Tensor([0.001, 0.001]) + tensor_w_out_i = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + tensor_w_out_j = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + + tensor_b_in = kp.Tensor([0.0]) + tensor_b_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + + tensor_l_out = kp.Tensor([0.0, 0.0, 0.0, 0.0, 0.0]) + + tensor_m = kp.Tensor([ tensor_y.size() ]) + + # We store them in an array for easier interaction + params = [tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i, + tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m] + + mgr.eval_tensor_create_def(params) + + # Create a managed sequence + sq = mgr.create_sequence() + + # Clear previous operations and begin recording for new operations + sq.begin() + + # Record operation to sync memory from local to GPU memory + sq.record_tensor_sync_device([tensor_w_in, tensor_b_in]) + + # Record operation to execute GPU 
shader against all our parameters + sq.record_algo_data(params, compute_shader.to_spirv()) + + # Record operation to sync memory from GPU to local memory + sq.record_tensor_sync_local([tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]) + + # Stop recording operations + sq.end() + + ITERATIONS = 100 + learning_rate = 0.1 + + # Perform machine learning training and inference across all input X and Y + for i_iter in range(ITERATIONS): + + # Execute an iteration of the algorithm + sq.eval() + + # Calculate the parameters based on the respective derivatives calculated + for j_iter in range(tensor_b_out.size()): + tensor_w_in[0] -= learning_rate * tensor_w_out_i.data()[j_iter] + tensor_w_in[1] -= learning_rate * tensor_w_out_j.data()[j_iter] + tensor_b_in[0] -= learning_rate * tensor_b_out.data()[j_iter] + + assert tensor_w_in.data()[0] < 0.01 + assert tensor_w_in.data()[0] > 0.0 + assert tensor_w_in.data()[1] > 1.5 + assert tensor_b_in.data()[0] < 0.7 +