Skip to content

Commit

Permalink
Add Benchmarking CI (JuliaGPU#420)
Browse files Browse the repository at this point in the history
  • Loading branch information
christiangnrd authored Sep 26, 2024
1 parent 5b983a5 commit 8652754
Show file tree
Hide file tree
Showing 12 changed files with 796 additions and 4 deletions.
62 changes: 58 additions & 4 deletions .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,12 @@ steps:
queue: "juliaecosystem"
os: "macos"
arch: "aarch64"
if: build.message !~ /\[skip tests\]/
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only julia\]/ ||
build.message !~ /\[only/ &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip julia\]/
timeout_in_minutes: 60
matrix:
setup:
Expand All @@ -46,7 +51,12 @@ steps:
queue: "juliaecosystem"
os: "macos"
arch: "aarch64"
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only special\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip special\]/
timeout_in_minutes: 60
matrix:
setup:
Expand Down Expand Up @@ -75,7 +85,12 @@ steps:
queue: "juliaecosystem"
os: "macos"
arch: "aarch64"
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only special\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip special\]/
timeout_in_minutes: 60
- label: "Opaque pointers"
plugins:
Expand All @@ -95,5 +110,44 @@ steps:
queue: "juliaecosystem"
os: "macos"
arch: "aarch64"
if: build.message !~ /\[skip tests\]/ && !build.pull_request.draft
if: |
build.message =~ /\[only tests\]/ ||
build.message =~ /\[only special\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip tests\]/ &&
build.message !~ /\[skip special\]/
timeout_in_minutes: 60

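# we want to benchmark every commit on the main branch, even if it failed CI
# (not yet the case: `continue_on_failure` below is still commented out, so
# the benchmarks only run once the preceding steps have passed)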
- wait: ~
# continue_on_failure: true

# Benchmark group: a single step that runs perf/runbenchmarks.jl and uploads
# benchmarkresults.json as a build artifact (the GitHub "Benchmarks" workflow
# looks for a file with that name among the downloaded Buildkite artifacts).
- group: ":racehorse: Benchmarks"
steps:
- label: "Benchmarks"
plugins:
# Julia 1.10 provided by the JuliaCI/julia Buildkite plugin
- JuliaCI/julia#v1:
version: "1.10"
# The command activates the `perf` project, develops the package from the
# checkout (pwd), instantiates it, and then includes the benchmark driver.
command: |
julia --project=perf -e '
using Pkg
println("--- :julia: Instantiating project")
Pkg.develop([PackageSpec(path=pwd())])
Pkg.instantiate()
push!(LOAD_PATH, @__DIR__)
println("+++ :julia: Benchmarking")
include("perf/runbenchmarks.jl")'
# result file written by the benchmark run — presumably by runbenchmarks.jl, which is not shown here
artifact_paths:
- "benchmarkresults.json"
# pinned to a macOS 15.0 / aarch64 agent on the juliaecosystem queue
agents:
queue: "juliaecosystem"
os: "macos"
arch: "aarch64"
macos_version: "15.0"
# Intended gating: run when the commit message contains "[only benchmarks]";
# otherwise run only if there is no "[only ...]" marker, the PR is not a draft,
# and the message does not contain "[skip benchmarks]".
# NOTE(review): relies on `&&` binding tighter than `||` — confirm against the
# Buildkite conditionals documentation.
if: |
build.message =~ /\[only benchmarks\]/ ||
build.message !~ /\[only/ && !build.pull_request.draft &&
build.message !~ /\[skip benchmarks\]/
timeout_in_minutes: 30
63 changes: 63 additions & 0 deletions .github/workflows/Benchmark.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
name: Benchmarks
permissions:
contents: write # contents permission to update benchmark contents in gh-pages branch
statuses: read
deployments: write # deployments permission to deploy GitHub pages website
pull-requests: write

# Triggers: pull requests against `main` and pushes to `main`, restricted to
# changes that can affect benchmark results. Both events must watch the same
# paths; the benchmark sources live in `perf/` (the Buildkite pipeline runs
# `perf/runbenchmarks.jl`), so that is the directory to filter on.
on:
  pull_request:
    branches:
      - main
    paths:
      - "src/**/*"
      - "ext/**/*"
      - "perf/**/*"
      - ".buildkite/**/*"
      - "Project.toml"
      - ".github/workflows/Benchmark.yml"
  push:
    branches:
      - main
    paths:
      - "src/**/*"
      - "ext/**/*"
      # was "benchmarks/**/*", a directory this repository does not use, so
      # pushes touching only the benchmark code never triggered the workflow
      - "perf/**/*"
      - ".buildkite/**/*"
      - "Project.toml"
      - ".github/workflows/Benchmark.yml"

jobs:
  # Fetches the benchmark results produced by the Buildkite build of this
  # commit and publishes them with github-action-benchmark.
  benchmark:
    # Run for "[only benchmarks]"; otherwise skip when another "[only ...]"
    # marker or "[skip benchmarks]" is present, or when the PR is a draft.
    # NOTE(review): `github.event.head_commit` is part of the `push` payload;
    # on `pull_request` events it is presumably empty, in which case the
    # commit-message markers are ignored there and only the draft check
    # applies — confirm whether that is intended.
    if: ${{ contains(github.event.head_commit.message, '[only benchmarks]') || !contains(github.event.head_commit.message, '[only') && !contains(github.event.head_commit.message, '[skip benchmarks]') && github.event.pull_request.draft == false }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Download Buildkite Artifacts
        id: download
        uses: EnricoMi/download-buildkite-artifact-action@v1
        with:
          buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
          ignore_build_states: blocked,canceled,skipped,not_run,failed
          ignore_job_states: timed_out,failed
          output_path: artifacts

      # Expose the path of the downloaded results file as `steps.locate.outputs.path`.
      # `-print -quit` stops at the first match: several matches would yield a
      # multi-line value, which is not a valid `name=value` line in $GITHUB_OUTPUT.
      - name: Locate Benchmarks Artifact
        id: locate
        if: ${{ steps.download.outputs.download-state == 'success' }}
        run: echo "path=$(find artifacts -type f -name benchmarkresults.json -print -quit 2>/dev/null)" >> "$GITHUB_OUTPUT"

      # Skipped when no results file was found (empty or unset `path`).
      - name: Upload Benchmark Results
        if: ${{ steps.locate.outputs.path != '' }}
        uses: benchmark-action/github-action-benchmark@v1
        with:
          name: Metal Benchmarks
          tool: "julia"
          output-file-path: ${{ steps.locate.outputs.path }}
          benchmark-data-dir-path: ""
          github-token: ${{ secrets.GITHUB_TOKEN }}
          comment-always: true
          summary-always: true
          alert-threshold: "150%"
          fail-on-alert: false
          # only persist results for non-PR events (i.e. pushes to main)
          auto-push: ${{ github.event_name != 'pull_request' }}
2 changes: 2 additions & 0 deletions perf/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
results.json
reference.json
7 changes: 7 additions & 0 deletions perf/Project.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[deps]
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
110 changes: 110 additions & 0 deletions perf/array.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
# Array benchmarks: construction, copies, indexing/search, broadcast, scans,
# reductions, random number generation and permutedims on MtlArrays, all
# registered in the "array" group of SUITE.
# NOTE(review): `SUITE`, `rng` and `@async_benchmarkable` are not defined in this
# file; presumably the including script (perf/runbenchmarks.jl) provides them — confirm.
group = addgroup!(SUITE, "array")

# problem size: every matrix below is m x n
const m = 512
const n = 1000

# generate some arrays
cpu_mat = rand(rng, Float32, m, n)
# allocated with `undef`: contents are unspecified until a benchmark writes to it
gpu_mat = MtlArray{Float32}(undef, size(cpu_mat))
# 1d/3d/4d reshapes of gpu_mat (40*25 == 10*10*10 == n, so the element counts match)
gpu_vec = reshape(gpu_mat, length(gpu_mat))
gpu_arr_3d = reshape(gpu_mat, (m, 40, 25))
gpu_arr_4d = reshape(gpu_mat, (m, 10, 10, 10))
# integer and boolean inputs, uploaded from random host data
gpu_mat_ints = MtlArray(rand(rng, Int, m, n))
gpu_vec_ints = reshape(gpu_mat_ints, length(gpu_mat_ints))
gpu_mat_bools = MtlArray(rand(rng, Bool, m, n))
gpu_vec_bools = reshape(gpu_mat_bools, length(gpu_mat_bools))

# allocation of a one-element device array
group["construct"] = @benchmarkable MtlArray{Int}(undef, 1)

group["copy"] = @async_benchmarkable copy($gpu_mat)

# second device matrix, used as the destination of the gpu_to_gpu copy
gpu_mat2 = copy(gpu_mat)
let group = addgroup!(group, "copyto!")
group["cpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat, $cpu_mat)
group["gpu_to_cpu"] = @async_benchmarkable copyto!($cpu_mat, $gpu_mat)
group["gpu_to_gpu"] = @async_benchmarkable copyto!($gpu_mat2, $gpu_mat)
end

let group = addgroup!(group, "iteration")
# element-by-element access from the host, for the first 10 elements only
group["scalar"] = @benchmarkable Metal.@allowscalar [$gpu_vec[i] for i in 1:10]

# indexing with a boolean mask
group["logical"] = @benchmarkable $gpu_vec[$gpu_vec_bools]

let group = addgroup!(group, "findall")
group["bool"] = @benchmarkable findall($gpu_vec_bools)
group["int"] = @benchmarkable findall(isodd, $gpu_vec_ints)
end

let group = addgroup!(group, "findfirst")
group["bool"] = @benchmarkable findfirst($gpu_vec_bools)
group["int"] = @benchmarkable findfirst(isodd, $gpu_vec_ints)
end

let group = addgroup!(group, "findmin") # findmax
group["1d"] = @async_benchmarkable findmin($gpu_vec)
group["2d"] = @async_benchmarkable findmin($gpu_mat; dims=1)
end
end

# NOTE(review): the `reverse` benchmarks are disabled; the reason is not
# recorded here — confirm whether they can be enabled.
# let group = addgroup!(group, "reverse")
# group["1d"] = @async_benchmarkable reverse($gpu_vec)
# group["2d"] = @async_benchmarkable reverse($gpu_mat; dims=1)
# group["1d_inplace"] = @async_benchmarkable reverse!($gpu_vec)
# group["2d_inplace"] = @async_benchmarkable reverse!($gpu_mat; dims=1)
# end

# in-place fill of the whole matrix
group["broadcast"] = @async_benchmarkable $gpu_mat .= 0f0

# no need to test inplace version, which performs the same operation (but with an alloc)
let group = addgroup!(group, "accumulate")
group["1d"] = @async_benchmarkable accumulate(+, $gpu_vec)
group["2d"] = @async_benchmarkable accumulate(+, $gpu_mat; dims=1)
end

let group = addgroup!(group, "reductions")
let group = addgroup!(group, "reduce")
group["1d"] = @async_benchmarkable reduce(+, $gpu_vec)
group["2d"] = @async_benchmarkable reduce(+, $gpu_mat; dims=1)
end

let group = addgroup!(group, "mapreduce")
group["1d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_vec)
group["2d"] = @async_benchmarkable mapreduce(x->x+1, +, $gpu_mat; dims=1)
end

# used by sum, prod, minimum, maximum, all, any, count
end

let group = addgroup!(group, "random")
# out-of-place generation of m*n values
let group = addgroup!(group, "rand")
group["Float32"] = @async_benchmarkable Metal.rand(Float32, m*n)
group["Int64"] = @async_benchmarkable Metal.rand(Int64, m*n)
end

# in-place generation into the existing device vectors
let group = addgroup!(group, "rand!")
group["Float32"] = @async_benchmarkable Metal.rand!($gpu_vec)
group["Int64"] = @async_benchmarkable Metal.rand!($gpu_vec_ints)
end

let group = addgroup!(group, "randn")
group["Float32"] = @async_benchmarkable Metal.randn(Float32, m*n)
# group["Int64"] = @async_benchmarkable Metal.randn(Int64, m*n)
end

let group = addgroup!(group, "randn!")
group["Float32"] = @async_benchmarkable Metal.randn!($gpu_vec)
# group["Int64"] = @async_benchmarkable Metal.randn!($gpu_vec_ints)
end
end

# NOTE(review): the `sort` benchmarks are disabled; the reason is not
# recorded here — confirm whether they can be enabled.
# let group = addgroup!(group, "sorting")
# group["1d"] = @async_benchmarkable sort($gpu_vec)
# group["2d"] = @async_benchmarkable sort($gpu_mat; dims=1)
# group["by"] = @async_benchmarkable sort($gpu_vec; by=sin)
# end

let group = addgroup!(group, "permutedims")
group["2d"] = @async_benchmarkable permutedims($gpu_mat, (2,1))
group["3d"] = @async_benchmarkable permutedims($gpu_arr_3d, (3,1,2))
group["4d"] = @async_benchmarkable permutedims($gpu_arr_4d, (2,1,4,3))
end
78 changes: 78 additions & 0 deletions perf/byval.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
module ByVal

using Metal, BenchmarkTools, Random

# threads per threadgroup used by every launch in this module
const threads = 256

# Element-wise addition of two matrices, addressed through their linear index:
# y[i] = x1[i] + x2[i] for every i <= n.
function kernel_add_mat(n, x1, x2, y)
    idx = thread_position_in_grid_1d()
    # threads past the end of the data have nothing to do
    idx > n && return
    @inbounds y[idx] = x1[idx] + x2[idx]
    return
end

# Select the (x1, x2, y) triple that belongs to slice `indx_y` from a flattened
# argument list holding 1, 2 or 3 slices (all x1's first, then x2's, then y's).
@inline get_inputs3(indx_y, a, b, c) = (a, b, c)
@inline function get_inputs3(indx_y, a1, a2, b1, b2, c1, c2)
    if indx_y == 1
        return (a1, b1, c1)
    else
        return (a2, b2, c2)
    end
end
@inline function get_inputs3(indx_y, a1, a2, a3, b1, b2, b3, c1, c2, c3)
    if indx_y == 1
        return (a1, b1, c1)
    elseif indx_y == 2
        return (a2, b2, c2)
    else
        return (a3, b3, c3)
    end
end

# Same addition as `kernel_add_mat`, but over several matrices at once: the
# y coordinate of the threadgroup picks which slice this thread works on.
function kernel_add_mat_z_slices(n, vararg...)
    slice = threadgroup_position_in_grid_2d().y
    x1, x2, y = get_inputs3(slice, vararg...)
    idx = thread_position_in_grid_1d()
    idx > n && return
    @inbounds y[idx] = x1[idx] + x2[idx]
    return
end

# Add the matrices in `x1` and `x2` slice by slice into `y` with one launch.
function add_z_slices!(y, x1, x2)
    rows, cols = size(x1[1]) # every slice is assumed to have the size of the first
    nelem = rows * cols
    ngroups = cld(nelem, threads)
    # one row of threadgroups per slice
    @metal groups = (ngroups, length(x1)) threads = threads kernel_add_mat_z_slices(nelem, x1..., x2..., y...)
end

# Add two matrices into `y` with the plain kernel.
function add!(y, x1, x2)
    rows, cols = size(x1)
    nelem = rows * cols
    ngroups = cld(nelem, threads)
    @metal groups = (ngroups, 1) threads = threads kernel_add_mat(nelem, x1, x2, y)
end

# Benchmark the plain kernel against the slice-based one for 1..3 slices and
# return the collected results as a BenchmarkGroup.
function main()
    results = BenchmarkGroup()

    num_z_slices = 3
    Random.seed!(1)

    # alternative problem sizes:
    #m, n = 7, 5 # tiny to measure overhead
    #m, n = 521, 111
    #m, n = 1521, 1111
    #m, n = 3001, 1511 # prime numbers to test memory access correctness
    m, n = 3072, 1536 # 256 multiplier
    #m, n = 6007, 3001 # prime numbers to test memory access correctness

    # random m x n device matrix, shifted by 0.5
    make_input() = mtl(randn(Float32, (m, n)) .+ 0.5f0)

    inputs_a = [make_input() for _ in 1:num_z_slices]
    inputs_b = [make_input() for _ in 1:num_z_slices]
    outputs = [similar(inputs_a[1]) for _ in 1:num_z_slices]

    # baseline: bare-bones addition of a single pair of matrices on the GPU
    results["reference"] = @benchmark Metal.@sync add!($outputs[1], $inputs_a[1], $inputs_b[1])

    # the same addition, passing arrays of matrices to the kernel
    for slices = 1:num_z_slices
        results["slices=$slices"] = @benchmark Metal.@sync add_z_slices!($outputs[1:$slices], $inputs_a[1:$slices], $inputs_b[1:$slices])
    end

    # BenchmarkTools captures inputs, JuliaCI/BenchmarkTools.jl#127, so forcibly free them
    for arrays in (inputs_a, inputs_b, outputs), arr in arrays
        Metal.unsafe_free!(arr)
    end

    return results
end

end

ByVal.main()
35 changes: 35 additions & 0 deletions perf/kernel.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# using GPUArrays

# Kernel benchmarks, registered in the "kernel" group of SUITE.
group = addgroup!(SUITE, "kernel")

# launch of a kernel that does no work
group["launch"] = @benchmarkable @metal identity(nothing)

# group["occupancy"] = @benchmarkable begin
# kernel = @metal launch=false identity(nothing)
# GPUArrays.launch_heuristic(Metal.mtlArrayBackend(), kernel.f; elements=1, elements_per_thread=1)
# return
# end

# source and destination buffers shared by the indexing benchmarks
src = Metal.rand(Float32, 512, 1000)
dest = similar(src)

# Copy one element per thread, with bounds checks disabled.
function indexing_kernel(dest, src)
    idx = thread_position_in_grid_1d()
    @inbounds dest[idx] = src[idx]
    return nothing
end

# Same copy, but with the default (checked) indexing.
function checked_indexing_kernel(dest, src)
    idx = thread_position_in_grid_1d()
    dest[idx] = src[idx]
    return nothing
end

# one threadgroup per column, one thread per row of `src`
group["indexing"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $indexing_kernel($dest, $src)
group["indexing_checked"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $checked_indexing_kernel($dest, $src)

## DELETE
# function rand_kernel(dest::AbstractArray{T}) where {T}
# i = thread_position_in_grid_1d()
# dest[i] = Metal.rand(T)
# return
# end
# group["rand"] = @async_benchmarkable @metal threads=size(src,1) groups=size(src,2) $rand_kernel($dest)
Loading

0 comments on commit 8652754

Please sign in to comment.