"""
A simple example to profile a compute pass using ComputePassTimestampWrites.
"""
import wgpu
"""
Define the number of elements and the local and global work sizes.
Change these to see how they affect performance.
"""
n = 512 * 512
local_size = [32, 1, 1]
global_size = [n // local_size[0], 1, 1]
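# Hypothetical alternative configuration (uncomment to compare timings). Keep n
# divisible by local_size[0] so that global_size covers every element.
# local_size = [64, 1, 1]
# global_size = [n // local_size[0], 1, 1]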
shader_source = f"""
@group(0) @binding(0)
var<storage,read> data1: array<i32>;
@group(0) @binding(1)
var<storage,read> data2: array<i32>;
@group(0) @binding(2)
var<storage,read_write> data3: array<i32>;
@compute
@workgroup_size({','.join(map(str, local_size))})
fn main(@builtin(global_invocation_id) index: vec3<u32>) {{
    let i: u32 = index.x;
    data3[i] = data1[i] + data2[i];
}}
"""
# Define two arrays
data1 = memoryview(bytearray(n * 4)).cast("i")
data2 = memoryview(bytearray(n * 4)).cast("i")
# Initialize the arrays
for i in range(n):
    data1[i] = i
for i in range(n):
    data2[i] = i * 2
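# A hedged alternative for the setup above: the same arrays can be built with
# NumPy (assumes numpy is installed; create_buffer_with_data accepts any object
# supporting the buffer protocol):
#
#   import numpy as np
#   data1 = np.arange(n, dtype=np.int32)
#   data2 = np.arange(n, dtype=np.int32) * 2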
adapter = wgpu.gpu.request_adapter_sync(power_preference="high-performance")
# Request a device with the timestamp_query feature, so we can profile our computation
device = adapter.request_device_sync(
    required_features=[wgpu.FeatureName.timestamp_query]
)
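# A hedged sketch of a defensive variant: check that the adapter advertises the
# feature before requesting it, for a clearer error message. This assumes
# adapter.features behaves like a set of feature names, as in the WebGPU spec;
# adjust if your wgpu-py version differs.
#
#   if wgpu.FeatureName.timestamp_query not in adapter.features:
#       raise RuntimeError("This adapter does not support timestamp queries")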
cshader = device.create_shader_module(code=shader_source)
# Create buffer objects: two input buffers initialized with our data, and an
# empty output buffer that can be copied from so we can read back the result.
buffer1 = device.create_buffer_with_data(data=data1, usage=wgpu.BufferUsage.STORAGE)
buffer2 = device.create_buffer_with_data(data=data2, usage=wgpu.BufferUsage.STORAGE)
buffer3 = device.create_buffer(
    size=data1.nbytes, usage=wgpu.BufferUsage.STORAGE | wgpu.BufferUsage.COPY_SRC
)
# Setup layout and bindings
binding_layouts = [
    {
        "binding": 0,
        "visibility": wgpu.ShaderStage.COMPUTE,
        "buffer": {
            "type": wgpu.BufferBindingType.read_only_storage,
        },
    },
    {
        "binding": 1,
        "visibility": wgpu.ShaderStage.COMPUTE,
        "buffer": {
            "type": wgpu.BufferBindingType.read_only_storage,
        },
    },
    {
        "binding": 2,
        "visibility": wgpu.ShaderStage.COMPUTE,
        "buffer": {
            "type": wgpu.BufferBindingType.storage,
        },
    },
]
bindings = [
    {
        "binding": 0,
        "resource": {"buffer": buffer1, "offset": 0, "size": buffer1.size},
    },
    {
        "binding": 1,
        "resource": {"buffer": buffer2, "offset": 0, "size": buffer2.size},
    },
    {
        "binding": 2,
        "resource": {"buffer": buffer3, "offset": 0, "size": buffer3.size},
    },
]
# Put everything together
bind_group_layout = device.create_bind_group_layout(entries=binding_layouts)
pipeline_layout = device.create_pipeline_layout(bind_group_layouts=[bind_group_layout])
bind_group = device.create_bind_group(layout=bind_group_layout, entries=bindings)
# Create and run the pipeline
compute_pipeline = device.create_compute_pipeline(
    layout=pipeline_layout,
    compute={"module": cshader, "entry_point": "main"},
)
"""
Create a QuerySet to store the 'beginning_of_pass' and 'end_of_pass' timestamps.
Set the 'count' parameter to 2, as this set will contain 2 timestamps.
"""
query_set = device.create_query_set(type=wgpu.QueryType.timestamp, count=2)
command_encoder = device.create_command_encoder()
# Pass our QuerySet and the indices into it at which the two timestamps will be written.
compute_pass = command_encoder.begin_compute_pass(
    timestamp_writes={
        "query_set": query_set,
        "beginning_of_pass_write_index": 0,
        "end_of_pass_write_index": 1,
    }
)
"""
Create the buffer to store our query results.
Each timestamp is 8 bytes. We mark the buffer usage to be QUERY_RESOLVE,
as we will use this buffer in a resolve_query_set call later.
"""
query_buf = device.create_buffer(
    size=8 * query_set.count,
    usage=wgpu.BufferUsage.QUERY_RESOLVE | wgpu.BufferUsage.COPY_SRC,
)
compute_pass.set_pipeline(compute_pipeline)
compute_pass.set_bind_group(0, bind_group)
compute_pass.dispatch_workgroups(*global_size) # x y z
compute_pass.end()
# Resolve our queries, and store the results in the destination buffer we created above.
command_encoder.resolve_query_set(
    query_set=query_set,
    first_query=0,
    query_count=2,
    destination=query_buf,
    destination_offset=0,
)
device.queue.submit([command_encoder.finish()])
"""
Read the query buffer to get the timestamps.
Index 0: beginning timestamp
Index 1: end timestamp
"""
timestamps = device.queue.read_buffer(query_buf).cast("Q").tolist()
# Timestamps are in nanoseconds; divide by 1000 to report microseconds.
print(f"Adding two arrays of {n} elements took {(timestamps[1] - timestamps[0]) / 1000:.2f} us")
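# As an illustrative extra (a sketch, not part of the original example): turn
# the elapsed time into an effective bandwidth estimate. Each element reads two
# i32 values and writes one, i.e. 12 bytes of traffic per element.
elapsed_s = (timestamps[1] - timestamps[0]) / 1e9
if elapsed_s > 0:
    print(f"Effective bandwidth: {n * 12 / elapsed_s / 1e9:.2f} GB/s")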
# Read result
out = device.queue.read_buffer(buffer3).cast("i")
result = out.tolist()
# Calculate the result on the CPU for comparison
result_cpu = [a + b for a, b in zip(data1, data2)]
# Ensure results are the same
assert result == result_cpu