Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenMP #273

Open
wants to merge 38 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
337b2db
Format change.
isazi Aug 21, 2024
852f198
First very draft attempt at feature parity OpenACC/OpenMP.
isazi Aug 21, 2024
a2cee37
Added OpenMP tests.
isazi Aug 21, 2024
0067fd5
Fixed two failing tests.
isazi Aug 21, 2024
debcb64
A fix was necessary in the Compiler backend to have OpenMP working.
isazi Aug 21, 2024
bb95d7b
Fixed test.
isazi Aug 21, 2024
ba02b31
Fixing the examples.
isazi Aug 21, 2024
828d4c6
Matrix multiply example for OpenMP.
isazi Aug 21, 2024
24857cc
Adding correctness check.
isazi Aug 22, 2024
3d32c47
Refactor function.
isazi Aug 22, 2024
21e56af
Bug fixed.
isazi Aug 22, 2024
1f815e2
Adding missing parameter.
isazi Aug 22, 2024
ddf501f
Another bug fixed.
isazi Aug 22, 2024
1698a9b
Updated the OpenMP matrix multiply.
isazi Aug 22, 2024
2fffcf0
Update vector add OpenMP code.
isazi Sep 5, 2024
c0d240a
Using "restrict" is too compiler specific.
isazi Sep 5, 2024
62809fa
Reorder parameters.
isazi Sep 17, 2024
fbdadd7
Draft example of a histogram.
isazi Sep 17, 2024
d844e97
Bound the values inside the array.
isazi Sep 17, 2024
ab9d6b5
Typo.
isazi Sep 17, 2024
e7fd411
Fixing a bug in the correctness check.
isazi Sep 17, 2024
939f0c3
Use the cleaning observer.
isazi Sep 17, 2024
06c6074
Fixing what is (probably) a long standing bug, observers were ignored…
isazi Sep 17, 2024
9dc47a4
Fixing allocation bugs in the Compiler Backend.
isazi Sep 18, 2024
c7bc0c8
Move this trailing comment on the previous empty line.
isazi Sep 19, 2024
ee7432a
Fixing, for good, a bug that prevented cleaning up the output memory …
isazi Sep 19, 2024
e2c6a09
The example should now work.
isazi Sep 19, 2024
fb9033a
Fix the test to use the new method.
isazi Sep 19, 2024
f9808e1
Some refactoring necessary for everything to work. The Compiler backe…
isazi Sep 19, 2024
2333916
Remove old tests.
isazi Sep 19, 2024
aa3cadb
Added test for the compiler memory refresh.
isazi Sep 19, 2024
4587539
Update the test.
isazi Sep 19, 2024
4c77414
Although semantically there is no dtoh copy in the compiler backend, …
isazi Sep 19, 2024
da2fe05
Test fixed.
isazi Sep 19, 2024
47898d9
Added tunable parameters to the example.
isazi Sep 19, 2024
760462c
OpenMP version of the histogram example.
isazi Sep 19, 2024
b207e06
Merge branch 'master' into directives
isazi Nov 28, 2024
69ec5ac
Merge branch 'master' into directives
isazi Dec 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 3 additions & 4 deletions examples/c/vector_add.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
}
"""

size = 72*1024*1024
size = 72 * 1024 * 1024

a = numpy.random.randn(size).astype(numpy.float32)
b = numpy.random.randn(size).astype(numpy.float32)
Expand All @@ -39,7 +39,6 @@
tune_params["nthreads"] = [1, 2, 3, 4, 8, 12, 16, 24, 32]
tune_params["vecsize"] = [1, 2, 4, 8, 16]

answer = [a+b, None, None, None]
answer = [a + b, None, None, None]

tune_kernel("vector_add", kernel_string, size, args, tune_params,
answer=answer, compiler_options=['-O3'])
tune_kernel("vector_add", kernel_string, size, args, tune_params, answer=answer, compiler_options=["-fopenmp", "-O3"])
72 changes: 72 additions & 0 deletions examples/directives/histogram_c_openacc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenACC code with the kernel tuner"""
import numpy as np

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives


# Naive Python histogram implementation
def histogram(vector, hist):
    """Accumulate a histogram of *vector* into *hist* in place and return it.

    Serves as the CPU reference implementation for the tuned kernel below.
    """
    for value in vector:
        hist[value] += 1
    return hist


# OpenACC histogram kernel. The parallelization strategy is tunable: either a
# per-gang reduction over the whole histogram or per-update atomics, selected
# at compile time through the preprocessor symbols below.
code = """
#include <stdlib.h>

#define HIST_SIZE 256
#define VECTOR_SIZE 1000000

#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
#if enable_reduction == 1
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads) reduction(+:hist[:HIST_SIZE])
#else
#pragma acc parallel num_gangs(ngangs) vector_length(nthreads)
#endif
#pragma acc loop independent
for ( int i = 0; i < VECTOR_SIZE; i++ ) {
#if enable_atomic == 1
#pragma acc atomic update
#endif
hist[vector[i]] += 1;
}
#pragma tuner stop
"""

# Split the annotated source into a tunable kernel and its argument list.
app = Code(OpenACC(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

# Tunable parameters: gang count, vector length, and the update strategy.
tune_params = {
    "ngangs": [2**power for power in range(1, 11)],
    "nthreads": [32 * multiple for multiple in range(1, 33)],
    "enable_reduction": [0, 1],
    "enable_atomic": [0, 1],
}
# Exactly one of the two update strategies must be active per configuration.
constraints = ["enable_reduction != enable_atomic"]


def _gb_per_second(record):
    """Effective bandwidth: two histogram accesses plus one read of the vector."""
    vector_bytes = 4 * len(kernel_args["histogram"][0])
    return (2 * vector_bytes + vector_bytes) / (record["time"] / 10**3) / 10**9


metrics = {"GB/s": _gb_per_second}

# Random input values in [0, 256) and a zeroed histogram buffer.
kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)

# CPU reference result; only the histogram output is verified.
reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
answer = [None, reference_hist]

tune_kernel(
    "histogram",
    kernel_string["histogram"],
    0,
    kernel_args["histogram"],
    tune_params,
    restrictions=constraints,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-acc=gpu"],
)
71 changes: 71 additions & 0 deletions examples/directives/histogram_c_openmp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""
import numpy as np

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives


# Naive Python histogram implementation
def histogram(vector, hist):
    """Accumulate a histogram of *vector* into *hist* in place and return it.

    CPU reference implementation for the tuned OpenMP kernel below.

    :param vector: sequence of non-negative ints, each a valid index into hist
    :param hist: mutable bin counts (numpy array or list); updated in place
    :returns: hist, after accumulation
    """
    if isinstance(hist, np.ndarray):
        # Vectorized path: np.add.at performs unbuffered accumulation, so
        # repeated indices are counted correctly (unlike `hist[vector] += 1`),
        # and the million-element reference no longer needs a Python-level loop.
        np.add.at(hist, vector, 1)
    else:
        # Fallback keeps the original behavior for plain Python sequences.
        for value in vector:
            hist[value] += 1
    return hist


# OpenMP target-offload histogram kernel. As in the OpenACC variant, the
# update strategy is tunable at compile time: a reduction over the histogram
# or per-update atomics.
code = """
#include <stdlib.h>

#define HIST_SIZE 256
#define VECTOR_SIZE 1000000

#pragma tuner start histogram vector(int*:VECTOR_SIZE) hist(int*:HIST_SIZE)
#if enable_reduction == 1
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads) reduction(+:hist[:HIST_SIZE])
#else
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
#endif
for ( int i = 0; i < VECTOR_SIZE; i++ ) {
#if enable_atomic == 1
#pragma omp atomic update
#endif
hist[vector[i]] += 1;
}
#pragma tuner stop
"""

# Turn the annotated source into a tunable kernel plus its argument list.
app = Code(OpenMP(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

# Search space: team count, threads per team, and the update strategy.
tune_params = {
    "nteams": [2**exponent for exponent in range(1, 11)],
    "nthreads": [32 * step for step in range(1, 33)],
    "enable_reduction": [0, 1],
    "enable_atomic": [0, 1],
}
# Reduction and atomics are mutually exclusive strategies.
constraints = ["enable_reduction != enable_atomic"]


def _effective_bandwidth(record):
    """GB/s: read+write of the histogram plus one read of the input vector."""
    input_bytes = 4 * len(kernel_args["histogram"][0])
    return (2 * input_bytes + input_bytes) / (record["time"] / 10**3) / 10**9


metrics = {"GB/s": _effective_bandwidth}

# Fill the input with random bin indices in [0, 256); clear the output bins.
kernel_args["histogram"][0] = np.random.randint(0, 256, len(kernel_args["histogram"][0]), dtype=np.int32)
kernel_args["histogram"][1] = np.zeros(len(kernel_args["histogram"][1])).astype(np.int32)

# Reference computed on the CPU; only the histogram output is checked.
reference_hist = np.zeros_like(kernel_args["histogram"][1]).astype(np.int32)
reference_hist = histogram(kernel_args["histogram"][0], reference_hist)
answer = [None, reference_hist]

tune_kernel(
    "histogram",
    kernel_string["histogram"],
    0,
    kernel_args["histogram"],
    tune_params,
    restrictions=constraints,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-mp=gpu"],
)
18 changes: 10 additions & 8 deletions examples/directives/matrix_multiply_c_openacc.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,8 @@
#!/usr/bin/env python
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import (
Code,
OpenACC,
Cxx,
process_directives
)
from kernel_tuner import tune_kernel, run_kernel
from kernel_tuner.utils.directives import Code, OpenACC, Cxx, process_directives

N = 4096

Expand Down Expand Up @@ -45,13 +40,20 @@
metrics["GB/s"] = lambda x: ((N**3 * 2 * 4) + (N**2 * 4)) / x["time_s"] / 10**9
metrics["GFLOP/s"] = lambda x: (N**3 * 3) / x["time_s"] / 10**9

# compute reference solution from CPU
results = run_kernel(
"mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
)
answer = [None, None, results[2]]

tune_kernel(
"mm",
kernel_string["mm"],
0,
kernel_args["mm"],
tune_params,
metrics=metrics,
compiler_options=["-fast", "-acc=gpu"],
answer=answer,
compiler="nvc++",
compiler_options=["-fast", "-acc=gpu"],
)
59 changes: 59 additions & 0 deletions examples/directives/matrix_multiply_c_openmp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#!/usr/bin/env python
"""This is an example tuning a naive matrix multiplication using the simplified directives interface"""

from kernel_tuner import tune_kernel, run_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives

N = 4096  # matrix order; must agree with the #define in the kernel source below

# NOTE(review): in the C kernel, temp_sum is declared once outside the
# collapsed i/j loops. Whether it is safely privatized under
# `teams distribute collapse(2)` depends on OpenMP data-sharing rules —
# confirm there is no race between iterations mapped to the same team.
code = """
#define N 4096

void matrix_multiply(float *A, float *B, float *C) {
#pragma tuner start mm A(float*:NN) B(float*:NN) C(float*:NN)
float temp_sum = 0.0f;
#pragma omp target
#pragma omp teams distribute collapse(2)
for ( int i = 0; i < N; i++) {
for ( int j = 0; j < N; j++ ) {
temp_sum = 0.0f;
#pragma omp parallel for num_threads(nthreads) reduction(+:temp_sum)
for ( int k = 0; k < N; k++ ) {
temp_sum += A[(i * N) + k] * B[(k * N) + j];
}
C[(i * N) + j] = temp_sum;
}
}
#pragma tuner stop
}
"""

# Extract the tunable kernel; NN sizes the flattened N x N arrays.
app = Code(OpenMP(), Cxx())
dims = {"NN": N**2}
kernel_string, kernel_args = process_directives(app, code, user_dimensions=dims)

# Only the per-team thread count is tuned for this kernel.
tune_params = {"nthreads": [32 * step for step in range(1, 33)]}

# Derived metrics; "GB/s" and "GFLOP/s" read the "time_s" entry, so the
# insertion order of this dict matters.
metrics = {
    "time_s": lambda record: record["time"] / 10**3,
    "GB/s": lambda record: ((N**3 * 2 * 4) + (N**2 * 4)) / record["time_s"] / 10**9,
    "GFLOP/s": lambda record: (N**3 * 3) / record["time_s"] / 10**9,
}

# Reference solution produced on the CPU with a single thread.
cpu_results = run_kernel(
    "mm", kernel_string["mm"], 0, kernel_args["mm"], {"nthreads": 1}, compiler="nvc++", compiler_options=["-fast"]
)
answer = [None, None, cpu_results[2]]

tune_kernel(
    "mm",
    kernel_string["mm"],
    0,
    kernel_args["mm"],
    tune_params,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-mp=gpu"],
)
2 changes: 1 addition & 1 deletion examples/directives/vector_add_c_openacc.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,6 @@
tune_params,
metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvc++",
compiler_options=["-fast", "-acc=gpu"],
)
57 changes: 57 additions & 0 deletions examples/directives/vector_add_c_openmp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#!/usr/bin/env python
"""This is a simple example for tuning C++ OpenMP code with the kernel tuner"""

from kernel_tuner import tune_kernel
from kernel_tuner.utils.directives import Code, OpenMP, Cxx, process_directives

# OpenMP target-offload vector addition; team count and threads per team are
# the tunable parameters.
code = """
#include <stdlib.h>

#define VECTOR_SIZE 1000000

int main(void) {
int size = VECTOR_SIZE;
float * a = (float *) malloc(VECTOR_SIZE * sizeof(float));
float * b = (float *) malloc(VECTOR_SIZE * sizeof(float));
float * c = (float *) malloc(VECTOR_SIZE * sizeof(float));

#pragma tuner start vector_add a(float*:VECTOR_SIZE) b(float*:VECTOR_SIZE) c(float*:VECTOR_SIZE) size(int:VECTOR_SIZE)
#pragma omp target teams distribute parallel for num_teams(nteams) num_threads(nthreads)
for ( int i = 0; i < size; i++ ) {
c[i] = a[i] + b[i];
}
#pragma tuner stop

free(a);
free(b);
free(c);
}
"""

# Split the annotated source into a tunable kernel and its argument list.
app = Code(OpenMP(), Cxx())
kernel_string, kernel_args = process_directives(app, code)

# Search space: number of teams and threads per team.
tune_params = {
    "nteams": [2**exponent for exponent in range(1, 11)],
    "nthreads": [32 * step for step in range(1, 33)],
}


def _streamed_gb_s(record):
    """GB/s: two input streams read plus one output stream written."""
    element_bytes = 4 * len(kernel_args["vector_add"][0])
    return (2 * element_bytes + element_bytes) / (record["time"] / 10**3) / 10**9


metrics = {"GB/s": _streamed_gb_s}

# Expected output: elementwise sum of the two inputs; other args unchecked.
input_a = kernel_args["vector_add"][0]
input_b = kernel_args["vector_add"][1]
answer = [None, None, input_a + input_b, None]

tune_kernel(
    "vector_add",
    kernel_string["vector_add"],
    0,
    kernel_args["vector_add"],
    tune_params,
    metrics=metrics,
    answer=answer,
    compiler="nvc++",
    compiler_options=["-fast", "-mp=gpu"],
)
2 changes: 1 addition & 1 deletion examples/directives/vector_add_fortran_openacc.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,6 @@
tune_params,
metrics=metrics,
answer=answer,
compiler_options=["-fast", "-acc=gpu"],
compiler="nvfortran",
compiler_options=["-fast", "-acc=gpu"],
)
42 changes: 0 additions & 42 deletions examples/fortran/test_fortran_vector_add.py

This file was deleted.

4 changes: 2 additions & 2 deletions examples/fortran/vector_add.F90
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ subroutine vector_add(C, A, B, n)
real (c_float), intent(in), dimension(N) :: A, B
integer (c_int), intent(in) :: n

!$OMP parallel do
!$omp parallel do
do i = 1, N
C(i) = A(i) + B(i)
end do
!$OMP end parallel do
!$omp end parallel do

end subroutine vector_add

Expand Down
Loading
Loading