Skip to content

Commit

Permalink
fix: add compat entries
Browse files Browse the repository at this point in the history
  • Loading branch information
avik-pal committed Jan 21, 2025
1 parent 91e8a0f commit 1053879
Showing 1 changed file with 10 additions and 1 deletion.
11 changes: 10 additions & 1 deletion examples/RealNVP/Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,13 @@ Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
-Lux = "1"
+CairoMakie = "0.13.1"
+ConcreteStructs = "0.2.3"
+Enzyme = "0.13.28"
+Lux = "1.5"
+MLUtils = "0.4.5"
+Optimisers = "0.4.4"
+Printf = "1.10"
+Random = "1.10"
+Reactant = "0.2.20"
+Statistics = "1.10"

1 comment on commit 1053879

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lux Benchmarks

Benchmark suite Current: 1053879 Previous: 46a012d Ratio
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3875 ns 3791 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 4292 ns 4500 ns 0.95
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4958 ns 4875 ns 1.02
layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3708 ns 3666 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10750 ns 10167 ns 1.06
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10416 ns 10458 ns 1.00
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10833 ns 10750 ns 1.01
layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10500 ns 10625 ns 0.99
bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s) 1250 ns 1062.5 ns 1.18
bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s) 1042 ns 1167 ns 0.89
bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s) 1417 ns 1500 ns 0.94
bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s) 1208 ns 1125 ns 1.07
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 4125 ns 4083 ns 1.01
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 3792 ns 4042 ns 0.94
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 4208 ns 4208 ns 1
bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 4166 ns 3958 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 57458 ns 57542 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 46709 ns 46416 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38291.5 ns 47125 ns 0.81
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 82166 ns 80875 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2036084 ns 2035395.5 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2088000 ns 2078396 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2101833.5 ns 2078708 ns 1.01
batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1996395.5 ns 1998584 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 171187 ns 144250 ns 1.19
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 141166 ns 144166.5 ns 0.98
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 145416.5 ns 145125 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 143604 ns 153104.5 ns 0.94
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1123959 ns 1120291.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1117541.5 ns 1113167 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1153479.5 ns 832708.5 ns 1.39
layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1120542 ns 1117084 ns 1.00
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3250 ns 3375 ns 0.96
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3542 ns 3542 ns 1
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4083 ns 4166 ns 0.98
layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3042 ns 3125 ns 0.97
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9145.5 ns 9042 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8833 ns 8750 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10333 ns 10208 ns 1.01
layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 9292 ns 8833 ns 1.05
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 15250 ns 17041 ns 0.89
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17354.5 ns 15834 ns 1.10
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 16208 ns 16604.5 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 15187.5 ns 16791 ns 0.90
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 216750 ns 213750 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 211208 ns 214875 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 212166.5 ns 215667 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 227042 ns 226125 ns 1.00
bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s) 667 ns 542 ns 1.23
bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s) 583 ns 708 ns 0.82
bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s) 770.5 ns 709 ns 1.09
bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s) 500 ns 541 ns 0.92
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 1459 ns 1375 ns 1.06
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 1417 ns 1375 ns 1.03
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 1417 ns 1500 ns 0.94
bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 1458 ns 1458 ns 1
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7166 ns 7000 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5875 ns 5750 ns 1.02
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5250 ns 6042 ns 0.87
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10041 ns 9750 ns 1.03
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 221000 ns 222021 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 227229.5 ns 228542 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 228708 ns 229292 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 213792 ns 213937.5 ns 1.00
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s) 3834 ns 3875 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s) 3917 ns 3959 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s) 3875 ns 3917 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16750 ns 16917 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16708 ns 16792 ns 0.99
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16542 ns 17250 ns 0.96
dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 17042 ns 16750 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 580104.5 ns 568792 ns 1.02
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 575958 ns 578645.5 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 579375 ns 578083 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 580708 ns 575625 ns 1.01
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1416791 ns 1422625 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1424167 ns 1420000 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1423042 ns 1422375 ns 1.00
dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 1425000 ns 1426708 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/2 thread(s) 1079063 ns 1077687.5 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/4 thread(s) 963917 ns 960917 ns 1.00
lenet(28, 28, 1, 64)/forward/CPU/8 thread(s) 1334458 ns 1353229.5 ns 0.99
lenet(28, 28, 1, 64)/forward/CPU/1 thread(s) 1297667 ns 1315312 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/2 thread(s) 5943395.5 ns 5961958 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/4 thread(s) 4600125 ns 4633250 ns 0.99
lenet(28, 28, 1, 64)/zygote/CPU/8 thread(s) 4951395.5 ns 4975188 ns 1.00
lenet(28, 28, 1, 64)/zygote/CPU/1 thread(s) 5560500 ns 5557125 ns 1.00
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s) 500 ns 583 ns 0.86
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s) 500 ns 542 ns 0.92
dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s) 542 ns 542 ns 1
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2166 ns 2208 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2042 ns 2250 ns 0.91
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2125 ns 2167 ns 0.98
dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 2208 ns 2125 ns 1.04
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 3687.5 ns 4125 ns 0.89
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 3791 ns 4375 ns 0.87
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 4792 ns 5167 ns 0.93
layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 3667 ns 4250 ns 0.86
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10875 ns 11875 ns 0.92
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 11084 ns 11000 ns 1.01
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 11500 ns 11917 ns 0.97
layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 11250 ns 11500 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6125 ns 7000 ns 0.88
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6834 ns 6958 ns 0.98
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7542 ns 8250 ns 0.91
groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6250 ns 6125 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 17625 ns 18708.5 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 17542 ns 18625 ns 0.94
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 18834 ns 18375 ns 1.02
groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 17416 ns 16708 ns 1.04
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 542 ns 625 ns 0.87
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 666 ns 708 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 625 ns 584 ns 1.07
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 8500 ns 8834 ns 0.96
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8750 ns 8875 ns 0.99
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9125 ns 9334 ns 0.98
batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9208 ns 8354.5 ns 1.10
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s) 64375 ns 64459 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s) 64542 ns 64750 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s) 64667 ns 64916 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s) 64500 ns 64625 ns 1.00
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 277667 ns 279250 ns 0.99
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 287083 ns 282167 ns 1.02
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 291375 ns 284125 ns 1.03
dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 284145.5 ns 278708 ns 1.02
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/2 thread(s) 3306333 ns 3278417 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/4 thread(s) 3031917 ns 3081000 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/8 thread(s) 2796833 ns 3021792 ns 0.93
mlp7layer_bn(gelu)(32 x 256)/forward/CPU/1 thread(s) 3935125 ns 4040979.5 ns 0.97
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/2 thread(s) 7260770.5 ns 7620208 ns 0.95
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/4 thread(s) 7411416 ns 7449187.5 ns 0.99
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/8 thread(s) 7367271 ns 7493708.5 ns 0.98
mlp7layer_bn(gelu)(32 x 256)/zygote/CPU/1 thread(s) 8191583.5 ns 8208791 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/2 thread(s) 17581104 ns 18366417 ns 0.96
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/4 thread(s) 17521584 ns 17522312.5 ns 1.00
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/8 thread(s) 17682146 ns 17580834 ns 1.01
mlp7layer_bn(gelu)(32 x 256)/enzyme/CPU/1 thread(s) 14123875 ns 14093354.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23725208 ns 23631333 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 34375583 ns 33504604 ns 1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 40913375 ns 37034667 ns 1.10
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34801458 ns 34967583.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 189578375 ns 189693000 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 164456312.5 ns 165014875 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 155623541 ns 152416688 ns 1.02
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 434187396 ns 434850958 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 289496083 ns 289105312.5 ns 1.00
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 262462166 ns 250867083 ns 1.05
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 305828042 ns 296775875 ns 1.03
Conv((3, 3), 32 => 32, relu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 474493916.5 ns 473537562.5 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23604 ns 22083 ns 1.07
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 24250 ns 22459 ns 1.08
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23979 ns 25375 ns 0.94
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 21291 ns 24083 ns 0.88
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 104687.5 ns 103083 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 104875 ns 103250 ns 1.02
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 104125 ns 104542 ns 1.00
layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 103292 ns 103041 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6749.5 ns 5917 ns 1.14
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5416 ns 5958 ns 0.91
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7000 ns 6708 ns 1.04
layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5333 ns 5791.5 ns 0.92
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14833 ns 14792 ns 1.00
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 14709 ns 15000 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 16166 ns 16542 ns 0.98
layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14770.5 ns 14875 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3018000 ns 3002625 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2066604.5 ns 2079375 ns 0.99
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2280541.5 ns 2272333 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4577917 ns 4882708 ns 0.94
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 23533375 ns 23536000 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18022709 ns 18038562.5 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 17334750 ns 16972167 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 34837750 ns 34545146 ns 1.01
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 33300333 ns 33221458 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 27629000 ns 27561792 ns 1.00
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 27822584 ns 27327000 ns 1.02
Conv((3, 3), 4 => 4, identity)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41187708 ns 42034750 ns 0.98
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 74520.5 ns 71417 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 74875 ns 71854.5 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 82167 ns 75708 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 74583 ns 74708 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 308437.5 ns 205250.5 ns 1.50
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 225749.5 ns 206750 ns 1.09
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 320208.5 ns 208958 ns 1.53
layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 218542 ns 217416 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 11583 ns 11875 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 11583 ns 11416 ns 1.01
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13208 ns 12958 ns 1.02
layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 11458 ns 11708 ns 0.98
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 28167 ns 25667 ns 1.10
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 28375 ns 26541.5 ns 1.07
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 29709 ns 27729.5 ns 1.07
layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 28917 ns 26667 ns 1.08
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 12000 ns 12812.5 ns 0.94
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 12292 ns 12209 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 13958 ns 14208 ns 0.98
groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 12333 ns 12291.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 25666 ns 25625 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 25959 ns 25916.5 ns 1.00
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 26500 ns 26250 ns 1.01
groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 26459 ns 26604 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 180521 ns 178792 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 179354.5 ns 180750 ns 0.99
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 183458 ns 181917 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 180375 ns 179166 ns 1.01
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 590375 ns 593333 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 594250 ns 582708 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 594916 ns 583667 ns 1.02
groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 583541 ns 584542 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 6084 ns 6167 ns 0.99
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5854.5 ns 5875 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 7104.5 ns 6875 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 5917 ns 5708.5 ns 1.04
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 14208 ns 13791 ns 1.03
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13500 ns 13917 ns 0.97
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 15625 ns 15667 ns 1.00
layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 13834 ns 14458 ns 0.96
batchedmm(512, Bsize=4)/forward/CPU/2 thread(s) 1217312.5 ns 1225312.5 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/4 thread(s) 1268500 ns 1241959 ns 1.02
batchedmm(512, Bsize=4)/forward/CPU/8 thread(s) 1281209 ns 1289958.5 ns 0.99
batchedmm(512, Bsize=4)/forward/CPU/1 thread(s) 998541.5 ns 1011625 ns 0.99
batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s) 4105042 ns 4103042 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s) 4410083.5 ns 4403333 ns 1.00
batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s) 4905208.5 ns 4523854.5 ns 1.08
batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s) 3703875 ns 3709771 ns 1.00
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1792 ns 1875 ns 0.96
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1791 ns 1916 ns 0.93
dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 4833 ns 4958 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 4833 ns 5000 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 4833 ns 4958 ns 0.97
dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 4875 ns 4875 ns 1
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5375 ns 5833 ns 0.92
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 5958 ns 5917 ns 1.01
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7166.5 ns 6667 ns 1.07
groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5333.5 ns 5209 ns 1.02
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 10500 ns 11125 ns 0.94
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11042 ns 11500 ns 0.96
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11125 ns 11458 ns 0.97
groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 11542 ns 10500 ns 1.10
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s) 292 ns 375 ns 0.78
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s) 333 ns 375 ns 0.89
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s) 291 ns 333 ns 0.87
dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s) 333 ns 375 ns 0.89
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s) 2750 ns 2792 ns 0.98
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s) 2708 ns 2833 ns 0.96
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s) 2750 ns 3083 ns 0.89
dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s) 3083 ns 2750 ns 1.12
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 10875 ns 11459 ns 0.95
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 11125 ns 11625 ns 0.96
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 12958.5 ns 12875 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 11229.5 ns 10958 ns 1.02
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24604.5 ns 25020.5 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24834 ns 25292 ns 0.98
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 25333 ns 25125 ns 1.01
groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 25333 ns 24875 ns 1.02
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s) 4166 ns 4250 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s) 4167 ns 4250 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s) 4208 ns 4250 ns 0.99
dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s) 4208 ns 4208 ns 1
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s) 16375 ns 16333 ns 1.00
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s) 16500 ns 16375 ns 1.01
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s) 16167 ns 16520.5 ns 0.98
dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s) 16291 ns 16208 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5834 ns 5833 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5834 ns 5833 ns 1.00
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5792 ns 6042 ns 0.96
batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5875 ns 5833 ns 1.01
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 20792 ns 21000 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 21000 ns 21000 ns 1
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 21166 ns 21417 ns 0.99
batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 21167 ns 20709 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/2 thread(s) 423895.5 ns 422124.5 ns 1.00
batchedmm(16, Bsize=512)/forward/CPU/4 thread(s) 380479 ns 387791 ns 0.98
batchedmm(16, Bsize=512)/forward/CPU/8 thread(s) 485125 ns 477333 ns 1.02
batchedmm(16, Bsize=512)/forward/CPU/1 thread(s) 106958 ns 103125 ns 1.04
batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s) 937833 ns 921333 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s) 963250 ns 974250 ns 0.99
batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s) 1216083 ns 1186458 ns 1.02
batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s) 428542 ns 457479.5 ns 0.94
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 80291.5 ns 80542 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 79458 ns 80709 ns 0.98
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 87042 ns 84896 ns 1.03
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80375 ns 79833 ns 1.01
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1917916.5 ns 1919250 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1918437.5 ns 1876583 ns 1.02
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1950812.5 ns 1946041 ns 1.00
groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1915188 ns 1921396 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s) 291 ns 292 ns 1.00
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s) 333 ns 333 ns 1
dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 292 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 1792 ns 1917 ns 0.93
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 1834 ns 1917 ns 0.96
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 1875 ns 1875 ns 1
dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 1875 ns 1792 ns 1.05
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 6000 ns 6417 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 6167 ns 6666 ns 0.93
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7834 ns 7771 ns 1.01
groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 6125 ns 6145.5 ns 1.00
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 9041 ns 9604.5 ns 0.94
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 9125 ns 9459 ns 0.96
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 9333 ns 9500 ns 0.98
groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 9625 ns 9041 ns 1.06
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 120446062.5 ns 120459792 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174298416.5 ns 173682208 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 155622396 ns 147804000 ns 1.05
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 104910437 ns 105720875 ns 0.99
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 613470583 ns 610206729.5 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 555889999.5 ns 555562500 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 467916666 ns 452099291.5 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 629979541 ns 626409896 ns 1.01
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 717129562 ns 657253583 ns 1.09
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 665448791 ns 665008062.5 ns 1.00
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 597201792 ns 581676208.5 ns 1.03
Conv((3, 3), 64 => 64, relu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 855951979.5 ns 857648458 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58542 ns 57875 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 48208 ns 47791 ns 1.01
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 39083 ns 47500 ns 0.82
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80167 ns 83395.5 ns 0.96
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1918312.5 ns 1915500 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1976771 ns 1932792 ns 1.02
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1793729 ns 1995084 ns 0.90
batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1888625 ns 1890500 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 268666.5 ns 267854.5 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 268458 ns 267708 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 269271 ns 269750 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 265875 ns 268166 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 676000 ns 594417 ns 1.14
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 587417 ns 681291 ns 0.86
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 601499.5 ns 604895.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 700333 ns 689917 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 2212542 ns 2176375 ns 1.02
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 2211416 ns 2222812.5 ns 0.99
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 2103833 ns 2205042 ns 0.95
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 2216500 ns 2093562.5 ns 1.06
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5504541 ns 5514416 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5488625 ns 5508500 ns 1.00
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5582375 ns 5535958 ns 1.01
layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5490917 ns 5491750 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 647417 ns 638167 ns 1.01
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 641916.5 ns 647708 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 650125 ns 659416 ns 0.99
dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 642917 ns 643750 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 1821291 ns 1822167 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 1717958 ns 1723042 ns 1.00
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 1666375 ns 1727833 ns 0.96
dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 2103666.5 ns 2106333 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58292 ns 58458 ns 1.00
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47209 ns 46917 ns 1.01
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 37250 ns 47292 ns 0.79
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80791 ns 84125 ns 0.96
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2017916.5 ns 2030041 ns 0.99
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2086583 ns 2004250 ns 1.04
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1901083 ns 2122125 ns 0.90
batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1990750 ns 1985979.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 13371875 ns 13357770.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 12426458 ns 12440000 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 12666062 ns 12492250 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 15204979 ns 15108458 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 47257417 ns 47178791.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 41744209 ns 41760334 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 41179062.5 ns 40950875 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 58639833 ns 58205437.5 ns 1.01
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 73940917 ns 97014458.5 ns 0.76
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 90904041 ns 91152834 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 91001000 ns 90701604.5 ns 1.00
Conv((3, 3), 4 => 4, gelu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 98448625 ns 98541521.5 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 58833 ns 58959 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 47958 ns 47375 ns 1.01
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 38542 ns 47750 ns 0.81
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 84292 ns 79958 ns 1.05
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1904750 ns 1918645.5 ns 0.99
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1969542 ns 1971000 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1800875 ns 1997667 ns 0.90
batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1895917 ns 1889750 ns 1.00
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 292 ns 416 ns 0.70
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 416 ns 375 ns 1.11
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 375 ns 375 ns 1
batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6145.5 ns 6292 ns 0.98
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6458 ns 6542 ns 0.99
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6375 ns 6834 ns 0.93
batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6625 ns 6125 ns 1.08
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s) 250 ns 292 ns 0.86
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s) 292 ns 292 ns 1
dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s) 292 ns 333 ns 0.88
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s) 2666 ns 2833 ns 0.94
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s) 2875 ns 2917 ns 0.99
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s) 2833 ns 2917 ns 0.97
dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s) 2875 ns 2708 ns 1.06
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 284556437.5 ns 289426812.5 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 340224270.5 ns 339624334 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 320916166 ns 315284104.5 ns 1.02
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 270718833 ns 274668667 ns 0.99
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 998965333.5 ns 1014634416 ns 0.98
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 956359521 ns 953687125 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 868085334 ns 857733312.5 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 1210263479.5 ns 1265357333 ns 0.96
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 1439494000 ns 1675373667 ns 0.86
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 1675455020.5 ns 1668941291 ns 1.00
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 1623450375 ns 1606744000 ns 1.01
Conv((3, 3), 64 => 64, gelu)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 1781275542 ns 1787636084 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1402500 ns 1409499.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1406416 ns 1413833 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1410125 ns 1419895.5 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1406875 ns 1458541.5 ns 0.96
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5015125 ns 5016749.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5021375 ns 4651917 ns 1.08
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5065333 ns 5058791 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5030104.5 ns 5012792 ns 1.00
vgg16(32, 32, 3, 32)/forward/CPU/2 thread(s) 178918125 ns 171852250 ns 1.04
vgg16(32, 32, 3, 32)/forward/CPU/4 thread(s) 137633791 ns 129831062.5 ns 1.06
vgg16(32, 32, 3, 32)/forward/CPU/8 thread(s) 137284041 ns 115995771 ns 1.18
vgg16(32, 32, 3, 32)/forward/CPU/1 thread(s) 169122750 ns 168839667 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/2 thread(s) 824093375 ns 629070333 ns 1.31
vgg16(32, 32, 3, 32)/zygote/CPU/4 thread(s) 493391208 ns 493488792 ns 1.00
vgg16(32, 32, 3, 32)/zygote/CPU/8 thread(s) 544904625 ns 456364583 ns 1.19
vgg16(32, 32, 3, 32)/zygote/CPU/1 thread(s) 646424584 ns 675660292 ns 0.96
batchedmm(512, Bsize=32)/forward/CPU/2 thread(s) 8944417 ns 8950646 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/4 thread(s) 8930333 ns 8924625 ns 1.00
batchedmm(512, Bsize=32)/forward/CPU/8 thread(s) 8002583 ns 7865125 ns 1.02
batchedmm(512, Bsize=32)/forward/CPU/1 thread(s) 9740458 ns 9701750 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s) 37148750 ns 36024125 ns 1.03
batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s) 36964208 ns 37000208.5 ns 1.00
batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s) 34465958 ns 33425875 ns 1.03
batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s) 38308250 ns 37661542 ns 1.02
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s) 47458 ns 47562.5 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s) 47334 ns 47416 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s) 47542 ns 47666 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s) 47584 ns 47375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s) 50542 ns 50542 ns 1
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s) 50542 ns 50375 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s) 50625 ns 50584 ns 1.00
bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s) 50500 ns 50583 ns 1.00
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 6292 ns 6958.5 ns 0.90
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6625 ns 6500 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 8479 ns 8042 ns 1.05
groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 6792 ns 6542 ns 1.04
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 9584 ns 10042 ns 0.95
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 10625 ns 10437.5 ns 1.02
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 10375 ns 10500 ns 0.99
groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 10458 ns 10375 ns 1.01
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5250 ns 5666 ns 0.93
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5917 ns 5958 ns 0.99
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7917 ns 7417 ns 1.07
groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5750 ns 5458 ns 1.05
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 18291.5 ns 13125 ns 1.39
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 15958 ns 13250 ns 1.20
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 16500 ns 13375 ns 1.23
groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 16583 ns 13208 ns 1.26
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 1083 ns 1083 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1084 ns 1.00
batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 1084 ns 1084 ns 1
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 8104.5 ns 8000 ns 1.01
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 8084 ns 8292 ns 0.97
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 8125 ns 8500 ns 0.96
batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 8458 ns 8125 ns 1.04
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 23125 ns 23354.5 ns 0.99
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 23167 ns 23250 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 23167 ns 23542 ns 0.98
bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 23541 ns 23125 ns 1.02
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 52500 ns 52667 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 52417 ns 52584 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 52645.5 ns 52750 ns 1.00
bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 52458 ns 52417 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1405062.5 ns 1398084 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1402583.5 ns 1402791 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1406875 ns 1401792 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1403729.5 ns 1402875 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5007708 ns 5010813 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5013292 ns 5016584 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 5046271 ns 5062708 ns 1.00
groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 5005125 ns 5013500 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/2 thread(s) 3074708 ns 3040417 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/4 thread(s) 2091499.5 ns 2105083 ns 0.99
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/8 thread(s) 2290083.5 ns 2280208 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/forward/CPU/1 thread(s) 4915708.5 ns 4865521 ns 1.01
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/2 thread(s) 24422083 ns 24414604.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/4 thread(s) 18926750 ns 18876208.5 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/8 thread(s) 18059792 ns 17652979 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/zygote/CPU/1 thread(s) 35835500.5 ns 35825688 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/2 thread(s) 34039292 ns 34006188 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/4 thread(s) 28325625 ns 28283750 ns 1.00
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/8 thread(s) 28468583 ns 27926083.5 ns 1.02
Conv((3, 3), 4 => 4, relu)(64 x 64 x 4 x 128)/enzyme/CPU/1 thread(s) 41461250 ns 41742416.5 ns 0.99
batchedmm(512, Bsize=512)/forward/CPU/2 thread(s) 144570938 ns 144750166 ns 1.00
batchedmm(512, Bsize=512)/forward/CPU/4 thread(s) 147768250 ns 146949375 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/8 thread(s) 127812375 ns 126208208.5 ns 1.01
batchedmm(512, Bsize=512)/forward/CPU/1 thread(s) 173201708 ns 173205292 ns 1.00
batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s) 952803959 ns 1847080125 ns 0.52
batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s) 1880403417 ns 809911709 ns 2.32
batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s) 721103250 ns 755677291 ns 0.95
batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s) 665759084 ns 667449084 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 77270.5 ns 76791 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 72541 ns 76042 ns 0.95
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 76166 ns 76417 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 72646 ns 72541 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 291833 ns 277229 ns 1.05
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 193625 ns 193583 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 275146 ns 205417 ns 1.34
layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 289604.5 ns 303083.5 ns 0.96
batchedmm(512, Bsize=128)/forward/CPU/2 thread(s) 35435979 ns 35472875 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/4 thread(s) 36430959 ns 36379896 ns 1.00
batchedmm(512, Bsize=128)/forward/CPU/8 thread(s) 32728396 ns 32315333.5 ns 1.01
batchedmm(512, Bsize=128)/forward/CPU/1 thread(s) 40524416 ns 40618416.5 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s) 148443209 ns 146765250 ns 1.01
batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s) 153839875 ns 153200125 ns 1.00
batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s) 142207500 ns 137307792 ns 1.04
batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s) 286559208 ns 285301125 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/2 thread(s) 121670542 ns 120518062.5 ns 1.01
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/4 thread(s) 174360666.5 ns 174031666 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/8 thread(s) 155087062.5 ns 148283312.5 ns 1.05
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/forward/CPU/1 thread(s) 106968083 ns 106552271 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/2 thread(s) 468237229 ns 469918416 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/4 thread(s) 467305229 ns 466837917 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/8 thread(s) 457270500 ns 437920916.5 ns 1.04
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/zygote/CPU/1 thread(s) 742197000 ns 739774042 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/2 thread(s) 775778042 ns 711087896 ns 1.09
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/4 thread(s) 639059458 ns 640897313 ns 1.00
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/8 thread(s) 642570667 ns 630411896 ns 1.02
Conv((3, 3), 64 => 64, identity)(64 x 64 x 64 x 128)/enzyme/CPU/1 thread(s) 849532312.5 ns 849787625 ns 1.00
mlp7layer_bn(relu)(32 x 256)/forward/CPU/2 thread(s) 1345916 ns 1302125 ns 1.03
mlp7layer_bn(relu)(32 x 256)/forward/CPU/4 thread(s) 984292 ns 905958 ns 1.09
mlp7layer_bn(relu)(32 x 256)/forward/CPU/8 thread(s) 764770.5 ns 938334 ns 0.82
mlp7layer_bn(relu)(32 x 256)/forward/CPU/1 thread(s) 2095229.5 ns 1987437 ns 1.05
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/2 thread(s) 2954875 ns 2951687.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/4 thread(s) 2619000 ns 2611020.5 ns 1.00
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/8 thread(s) 2499292 ns 2639896 ns 0.95
mlp7layer_bn(relu)(32 x 256)/zygote/CPU/1 thread(s) 3688708.5 ns 3702396 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/2 thread(s) 5790208 ns 5801417 ns 1.00
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/4 thread(s) 5791792 ns 5727666.5 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/8 thread(s) 5888041 ns 5818916 ns 1.01
mlp7layer_bn(relu)(32 x 256)/enzyme/CPU/1 thread(s) 2887459 ns 2913834 ns 0.99
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7208 ns 7417 ns 0.97
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5833 ns 6166 ns 0.95
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5250 ns 6209 ns 0.85
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10083 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223354 ns 212792 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 232209 ns 220834 ns 1.05
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 220729.5 ns 221166 ns 1.00
batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 219292 ns 215459 ns 1.02
vgg16(32, 32, 3, 64)/forward/CPU/2 thread(s) 303148916.5 ns 300445333 ns 1.01
vgg16(32, 32, 3, 64)/forward/CPU/4 thread(s) 220759541.5 ns 214002042 ns 1.03
vgg16(32, 32, 3, 64)/forward/CPU/8 thread(s) 221905479 ns 196386541 ns 1.13
vgg16(32, 32, 3, 64)/forward/CPU/1 thread(s) 309164583 ns 307720792 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/2 thread(s) 1233285583 ns 1232629833 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/4 thread(s) 899326000 ns 899311645.5 ns 1.00
vgg16(32, 32, 3, 64)/zygote/CPU/8 thread(s) 858911520.5 ns 825300584 ns 1.04
vgg16(32, 32, 3, 64)/zygote/CPU/1 thread(s) 1144926250 ns 1150330250 ns 1.00
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 4959 ns 5458 ns 0.91
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5209 ns 5416 ns 0.96
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 6875 ns 6750.5 ns 1.02
groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5125 ns 5084 ns 1.01
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 10333 ns 7667 ns 1.35
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 10209 ns 7333 ns 1.39
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 10375 ns 7500 ns 1.38
groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 10583 ns 7250 ns 1.46
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 500 ns 583 ns 0.86
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 625 ns 1
batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 625 ns 542 ns 1.15
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 9125 ns 9542 ns 0.96
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9208 ns 9833 ns 0.94
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 9209 ns 9667 ns 0.95
batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 9417 ns 9041 ns 1.04
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s) 352041 ns 352562.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s) 352167 ns 351833 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s) 352833 ns 353416.5 ns 1.00
bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s) 352250 ns 366166 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s) 810042 ns 826208 ns 0.98
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s) 832334 ns 775333.5 ns 1.07
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s) 777896 ns 808520.5 ns 0.96
bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s) 833959 ns 828833 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/2 thread(s) 339375 ns 340917 ns 1.00
batchedmm(16, Bsize=32)/forward/CPU/4 thread(s) 345208.5 ns 342729.5 ns 1.01
batchedmm(16, Bsize=32)/forward/CPU/8 thread(s) 443583 ns 453708 ns 0.98
batchedmm(16, Bsize=32)/forward/CPU/1 thread(s) 10500 ns 10687.5 ns 0.98
batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s) 720437.5 ns 709875 ns 1.01
batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s) 730000 ns 728042 ns 1.00
batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s) 1036000 ns 1005792 ns 1.03
batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s) 26584 ns 26667 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/2 thread(s) 378750 ns 380187.5 ns 1.00
batchedmm(16, Bsize=128)/forward/CPU/4 thread(s) 347042 ns 355542 ns 0.98
batchedmm(16, Bsize=128)/forward/CPU/8 thread(s) 446167 ns 442146 ns 1.01
batchedmm(16, Bsize=128)/forward/CPU/1 thread(s) 30208 ns 30959 ns 0.98
batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s) 736541 ns 726667 ns 1.01
batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s) 781270.5 ns 778791.5 ns 1.00
batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s) 1066792 ns 1034042 ns 1.03
batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s) 104812.5 ns 105042 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s) 3375 ns 3583 ns 0.94
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s) 3458 ns 3542 ns 0.98
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s) 3709 ns 3708 ns 1.00
bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s) 3625 ns 3542 ns 1.02
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s) 4167 ns 4583 ns 0.91
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s) 4208 ns 4333 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s) 4250 ns 4375 ns 0.97
bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s) 4291 ns 4167 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 3625 ns 3833 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 3375 ns 3542 ns 0.95
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 4437.5 ns 4292 ns 1.03
layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 3708 ns 3500 ns 1.06
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8375 ns 8334 ns 1.00
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8208 ns 8334 ns 0.98
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8583 ns 8708 ns 0.99
layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8542 ns 8625 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 205167 ns 203709 ns 1.01
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209208 ns 209833 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 208833 ns 213750 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199083 ns 200750 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 606958 ns 611979.5 ns 0.99
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 671708 ns 623084 ns 1.08
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 624000 ns 633542 ns 0.98
batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 633208 ns 630833 ns 1.00
batchedmm(128, Bsize=128)/forward/CPU/2 thread(s) 996958.5 ns 991250 ns 1.01
batchedmm(128, Bsize=128)/forward/CPU/4 thread(s) 1038063 ns 1017458.5 ns 1.02
batchedmm(128, Bsize=128)/forward/CPU/8 thread(s) 970916.5 ns 954833 ns 1.02
batchedmm(128, Bsize=128)/forward/CPU/1 thread(s) 870270.5 ns 864916.5 ns 1.01
batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s) 4514312 ns 4517208 ns 1.00
batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s) 4740687.5 ns 4768041 ns 0.99
batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s) 4626625 ns 4459667 ns 1.04
batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s) 4278333 ns 4281312 ns 1.00
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3083 ns 3625 ns 0.85
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 3209 ns 3291 ns 0.98
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4417 ns 4250 ns 1.04
layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 3458 ns 3166 ns 1.09
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7250 ns 7500 ns 0.97
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7167 ns 7458 ns 0.96
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7333 ns 7687.5 ns 0.95
layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7541 ns 7084 ns 1.06
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1650062.5 ns 1644333 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1162479.5 ns 1183209 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1343562.5 ns 1370292 ns 0.98
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2474584 ns 2475167 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12306500 ns 12346958.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9576334 ns 9593646 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9347167 ns 9292209 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18004520.5 ns 17963583.5 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17357042 ns 17361375 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14404458 ns 14393542 ns 1.00
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14505083.5 ns 14339750 ns 1.01
Conv((3, 3), 2 => 2, identity)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21117625 ns 21095083 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 88584 ns 88167 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 89416.5 ns 88875 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 91000 ns 91875 ns 0.99
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 116312.5 ns 134020.5 ns 0.87
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2027750 ns 2027813 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2156354 ns 2027000.5 ns 1.06
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1755083 ns 2054000 ns 0.85
groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2022583 ns 2028125 ns 1.00
batchedmm(2, Bsize=4)/forward/CPU/2 thread(s) 3416 ns 2792 ns 1.22
batchedmm(2, Bsize=4)/forward/CPU/4 thread(s) 2792 ns 2583 ns 1.08
batchedmm(2, Bsize=4)/forward/CPU/8 thread(s) 2021 ns 3458 ns 0.58
batchedmm(2, Bsize=4)/forward/CPU/1 thread(s) 3459 ns 1917 ns 1.80
batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s) 2750 ns 2709 ns 1.02
batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s) 3042 ns 2792 ns 1.09
batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s) 3083 ns 2792 ns 1.10
batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s) 3084 ns 2833.5 ns 1.09
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7209 ns 7375 ns 0.98
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 6041 ns 6041 ns 1
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5333 ns 6167 ns 0.86
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10083 ns 10125 ns 1.00
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 214125 ns 242958 ns 0.88
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229084 ns 220917 ns 1.04
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 223791.5 ns 220417 ns 1.02
batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 221708 ns 240375 ns 0.92
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3708 ns 3709 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3792 ns 3791 ns 1.00
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3791 ns 3750 ns 1.01
dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3708 ns 3708 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 14584 ns 14584 ns 1
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 14458 ns 14542 ns 0.99
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 14292 ns 14584 ns 0.98
dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 14583 ns 14417 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 96000 ns 92125 ns 1.04
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 91334 ns 92458 ns 0.99
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 94166.5 ns 98562.5 ns 0.96
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 137583 ns 118229 ns 1.16
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1927479 ns 1913333 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1933333 ns 1909771 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1671542 ns 1956333 ns 0.85
groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1929000 ns 1924333 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/2 thread(s) 880583 ns 879000 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/4 thread(s) 820750 ns 818395.5 ns 1.00
lenet(28, 28, 1, 32)/forward/CPU/8 thread(s) 1161125 ns 1219520.5 ns 0.95
lenet(28, 28, 1, 32)/forward/CPU/1 thread(s) 964042 ns 966459 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/2 thread(s) 2817062.5 ns 2822917 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/4 thread(s) 2505978.5 ns 2496917 ns 1.00
lenet(28, 28, 1, 32)/zygote/CPU/8 thread(s) 3333708 ns 3359000 ns 0.99
lenet(28, 28, 1, 32)/zygote/CPU/1 thread(s) 3424937.5 ns 3411333 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 17166 ns 17000 ns 1.01
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 15292 ns 15458.5 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 16937.5 ns 19041 ns 0.89
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 16792 ns 16875 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 227729.5 ns 258834 ns 0.88
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 260125 ns 215125 ns 1.21
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 216458 ns 215792 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 259708 ns 227875 ns 1.14
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 221208.5 ns 219062.5 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 221937 ns 221375 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 221042 ns 222875 ns 0.99
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 221958.5 ns 220791 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 495666 ns 497625 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 561062.5 ns 535916 ns 1.05
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 501250 ns 499208 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 572917 ns 511125 ns 1.12
batchedmm(16, Bsize=4)/forward/CPU/2 thread(s) 4167 ns 3833.5 ns 1.09
batchedmm(16, Bsize=4)/forward/CPU/4 thread(s) 3625 ns 4250 ns 0.85
batchedmm(16, Bsize=4)/forward/CPU/8 thread(s) 5417 ns 5166.5 ns 1.05
batchedmm(16, Bsize=4)/forward/CPU/1 thread(s) 3750 ns 3792 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s) 7500 ns 7542 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s) 7458 ns 7167 ns 1.04
batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s) 7458 ns 7542 ns 0.99
batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s) 7917 ns 7667 ns 1.03
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18625 ns 18667 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17500 ns 16708 ns 1.05
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19375 ns 20584 ns 0.94
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18292 ns 18084 ns 1.01
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 223917 ns 224209 ns 1.00
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 229208.5 ns 212687 ns 1.08
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 218333 ns 213167 ns 1.02
groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 228667 ns 222979.5 ns 1.03
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 4166 ns 4250 ns 0.98
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 4166 ns 4333.5 ns 0.96
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 5375 ns 5125 ns 1.05
layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 4416 ns 3875 ns 1.14
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 10042 ns 10542 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 9750 ns 10791 ns 0.90
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 10417 ns 10959 ns 0.95
layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 10334 ns 10333 ns 1.00
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 3375 ns 3375 ns 1
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 2833 ns 3333 ns 0.85
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 4375 ns 4042 ns 1.08
layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 2792 ns 2958 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7083 ns 7500 ns 0.94
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7333 ns 7750 ns 0.95
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7417 ns 7625 ns 0.97
layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7375 ns 7208 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 23307041.5 ns 23498333.5 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 33839458 ns 34789375 ns 0.97
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 40745646 ns 37689958 ns 1.08
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 34862708 ns 34909542 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 184254354 ns 184647292 ns 1.00
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 169428437.5 ns 163834583 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 150235166.5 ns 146363541.5 ns 1.03
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 273092750 ns 274565083 ns 0.99
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 284314042 ns 278243563 ns 1.02
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 259222834 ns 245760791.5 ns 1.05
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 233454625 ns 231789354 ns 1.01
Conv((3, 3), 32 => 32, identity)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 323194834 ns 324000854.5 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 183354.5 ns 182625 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 182083 ns 184458 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 185375 ns 186250 ns 1.00
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 183166.5 ns 181875 ns 1.01
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 598042 ns 628291.5 ns 0.95
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 638604 ns 608229.5 ns 1.05
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 590042 ns 598250 ns 0.99
groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 639625 ns 637791 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/2 thread(s) 3814396 ns 3874375 ns 0.98
batchedmm(128, Bsize=512)/forward/CPU/4 thread(s) 3917959 ns 3917042 ns 1.00
batchedmm(128, Bsize=512)/forward/CPU/8 thread(s) 3558667 ns 3534687.5 ns 1.01
batchedmm(128, Bsize=512)/forward/CPU/1 thread(s) 4558792 ns 4554291 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s) 17242875 ns 17461354.5 ns 0.99
batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s) 17847895.5 ns 17833459 ns 1.00
batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s) 16851208 ns 16559937.5 ns 1.02
batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s) 19971167 ns 19938750 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 500 ns 625 ns 0.80
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 625 ns 500 ns 1.25
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 542 ns 666 ns 0.81
batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 667 ns 583 ns 1.14
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 9333 ns 9292 ns 1.00
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 8917 ns 9458 ns 0.94
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 9792 ns 9375 ns 1.04
batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 9750 ns 9187.5 ns 1.06
vgg16(32, 32, 3, 128)/forward/CPU/2 thread(s) 652733938 ns 651812167 ns 1.00
vgg16(32, 32, 3, 128)/forward/CPU/4 thread(s) 393383500 ns 390086667 ns 1.01
vgg16(32, 32, 3, 128)/forward/CPU/8 thread(s) 395122417 ns 327502625 ns 1.21
vgg16(32, 32, 3, 128)/forward/CPU/1 thread(s) 624702084 ns 747314333 ns 0.84
vgg16(32, 32, 3, 128)/zygote/CPU/2 thread(s) 1882307625 ns 1879705041.5 ns 1.00
vgg16(32, 32, 3, 128)/zygote/CPU/4 thread(s) 1638716333.5 ns 1650371917 ns 0.99
vgg16(32, 32, 3, 128)/zygote/CPU/8 thread(s) 1551357292 ns 1514378771 ns 1.02
vgg16(32, 32, 3, 128)/zygote/CPU/1 thread(s) 2292499417 ns 2204966313 ns 1.04
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 1649417 ns 1651458 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 1198625 ns 1196083 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 1369208 ns 1387103.5 ns 0.99
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 2494208 ns 2353958 ns 1.06
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 12699979.5 ns 12704667 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 9947354 ns 9935187.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 9680125.5 ns 9671333.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 18361875 ns 18432334 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 17714687.5 ns 17670625 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 14723938 ns 14743791.5 ns 1.00
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 14690791 ns 14593292 ns 1.01
Conv((3, 3), 2 => 2, relu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 21421188 ns 21437146 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 26250 ns 26250 ns 1
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 26209 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 26209 ns 26333 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 26250 ns 26292 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 67292 ns 67166 ns 1.00
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 67625 ns 67208 ns 1.01
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 67000 ns 67917 ns 0.99
dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 67167 ns 66958 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 204208 ns 202875 ns 1.01
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 209583 ns 210375 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 209542 ns 209916 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 199166 ns 198750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 602458 ns 645354 ns 0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 626542 ns 637500.5 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 624687.5 ns 634542 ns 0.98
batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 632958 ns 634250 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 656125 ns 672209 ns 0.98
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 646104 ns 637917 ns 1.01
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 546958 ns 665042 ns 0.82
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 679042 ns 664917 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2259375 ns 2224563 ns 1.02
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2247416.5 ns 2248771 ns 1.00
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2013146 ns 2241125 ns 0.90
layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2262166.5 ns 2237000 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 18354.5 ns 17417 ns 1.05
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 17375 ns 17333 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 19625 ns 19500 ns 1.01
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 18542 ns 16875 ns 1.10
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 259959 ns 260770.5 ns 1.00
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 263500 ns 219458.5 ns 1.20
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 221375 ns 229000 ns 0.97
groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 261334 ns 263334 ns 0.99
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 584 ns 625 ns 0.93
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 625 ns 666 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 625 ns 667 ns 0.94
batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 708 ns 584 ns 1.21
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 10125 ns 10000 ns 1.01
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 9709 ns 9750 ns 1.00
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 10458 ns 10125 ns 1.03
batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 10250 ns 9750 ns 1.05
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s) 5500 ns 5375 ns 1.02
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s) 5375 ns 5625 ns 0.96
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s) 7041.5 ns 6604.5 ns 1.07
groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s) 5167 ns 5000 ns 1.03
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 7875 ns 7875 ns 1
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 7750 ns 7292 ns 1.06
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 7542 ns 7687.5 ns 0.98
groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 7791 ns 7334 ns 1.06
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 2041 ns 2041 ns 1
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1958 ns 2250 ns 0.87
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 2209 ns 2458 ns 0.90
bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 2167 ns 2084 ns 1.04
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 6333 ns 6542 ns 0.97
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 6542 ns 6458 ns 1.01
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 6416 ns 6708 ns 0.96
bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 6666 ns 6541 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s) 749417 ns 747125 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s) 746625 ns 749958.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s) 749166.5 ns 747167 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s) 772625 ns 771333.5 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s) 792667 ns 791000 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s) 792625 ns 780041.5 ns 1.02
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s) 775750 ns 775416 ns 1.00
bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s) 808562.5 ns 794812.5 ns 1.02
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 7334 ns 6959 ns 1.05
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 5959 ns 6000 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 5333 ns 6125 ns 0.87
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 10125 ns 10167 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 220166 ns 259750 ns 0.85
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 239292 ns 238854 ns 1.00
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 229167 ns 231104 ns 0.99
batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 254959 ns 250208 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s) 9792 ns 10125 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s) 10000 ns 10312.5 ns 0.97
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s) 11166 ns 10875 ns 1.03
layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s) 9750 ns 10167 ns 0.96
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 24541 ns 24167 ns 1.02
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 24291 ns 24583 ns 0.99
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 24917 ns 25333 ns 0.98
layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 24625 ns 24584 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/2 thread(s) 105924583 ns 106104729.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/4 thread(s) 116546459 ns 117502187.5 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/8 thread(s) 124211854 ns 120758625 ns 1.03
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/forward/CPU/1 thread(s) 117471395.5 ns 117423500 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/2 thread(s) 393647209 ns 392280708 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/4 thread(s) 356631062.5 ns 358697709 ns 0.99
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/8 thread(s) 357758708 ns 357440917 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/zygote/CPU/1 thread(s) 619205000 ns 540821208.5 ns 1.14
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/2 thread(s) 612150166 ns 781416292 ns 0.78
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/4 thread(s) 766180166.5 ns 760831458 ns 1.01
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/8 thread(s) 749713459 ns 750885583.5 ns 1.00
Conv((3, 3), 32 => 32, gelu)(64 x 64 x 32 x 128)/enzyme/CPU/1 thread(s) 785793916 ns 784554021 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s) 7000 ns 7583 ns 0.92
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s) 6875 ns 6875 ns 1
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s) 8625 ns 8208 ns 1.05
groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6542 ns 7917 ns 0.83
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 13500 ns 14542 ns 0.93
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 13625 ns 13667 ns 1.00
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 14375 ns 14125 ns 1.02
groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 14584 ns 14375 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s) 5917 ns 5750 ns 1.03
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s) 5770.5 ns 6125 ns 0.94
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s) 7875 ns 7500 ns 1.05
groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s) 5583 ns 5500 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s) 13000 ns 12875 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s) 12625 ns 12417 ns 1.02
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s) 12834 ns 12687.5 ns 1.01
groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s) 12895.5 ns 13042 ns 0.99
batchedmm(2, Bsize=128)/forward/CPU/2 thread(s) 5895.5 ns 5250 ns 1.12
batchedmm(2, Bsize=128)/forward/CPU/4 thread(s) 5292 ns 5709 ns 0.93
batchedmm(2, Bsize=128)/forward/CPU/8 thread(s) 5916 ns 6542 ns 0.90
batchedmm(2, Bsize=128)/forward/CPU/1 thread(s) 5417 ns 5375 ns 1.01
batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s) 15667 ns 15750 ns 0.99
batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s) 15895.5 ns 15375 ns 1.03
batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s) 15916 ns 15584 ns 1.02
batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s) 16041 ns 15916 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 292 ns 375 ns 0.78
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 417 ns 334 ns 1.25
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6292 ns 6583 ns 0.96
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6667 ns 6625 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 6625 ns 1.01
batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6666 ns 6375 ns 1.05
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s) 5916 ns 5958 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s) 5875 ns 6041 ns 0.97
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s) 5917 ns 5959 ns 0.99
batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s) 6041 ns 5875 ns 1.03
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s) 21667 ns 21520.5 ns 1.01
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s) 21208 ns 21209 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s) 21750 ns 21667 ns 1.00
batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s) 21875 ns 21334 ns 1.03
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 144583 ns 144062.5 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 162416 ns 143042 ns 1.14
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 146625 ns 146334 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 187542 ns 188146 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 1319875 ns 1317583 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 1320770.5 ns 1321709 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 957604 ns 1365791.5 ns 0.70
layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 1324833 ns 1318666 ns 1.00
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s) 23125 ns 24708 ns 0.94
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s) 22437.5 ns 24375 ns 0.92
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s) 23854.5 ns 24375 ns 0.98
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s) 24396 ns 22374.5 ns 1.09
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s) 129875 ns 134750 ns 0.96
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s) 138125 ns 181250 ns 0.76
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s) 118937.5 ns 130000 ns 0.91
layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s) 176083 ns 130958 ns 1.34
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 375 ns 375 ns 1
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 375 ns 417 ns 0.90
batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 375 ns 333 ns 1.13
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 6833.5 ns 6625 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 6708 ns 6500 ns 1.03
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 6708 ns 0.99
batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 6917 ns 6792 ns 1.02
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 4333.5 ns 4625 ns 0.94
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 4292 ns 4541.5 ns 0.95
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 5292 ns 5333 ns 0.99
layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 4042 ns 4583 ns 0.88
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 11542 ns 9875 ns 1.17
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 11958 ns 9916.5 ns 1.21
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 11708 ns 10417 ns 1.12
layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 12625 ns 10375 ns 1.22
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s) 1584 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s) 1583 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s) 1583 ns 1625 ns 0.97
dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s) 1667 ns 1667 ns 1
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s) 5667 ns 5750 ns 0.99
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s) 5625 ns 5750 ns 0.98
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s) 5791 ns 6083 ns 0.95
dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s) 5791 ns 5709 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/2 thread(s) 6893499.5 ns 6814041 ns 1.01
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/4 thread(s) 6374750 ns 6367459 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/8 thread(s) 6500541.5 ns 6578812.5 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/forward/CPU/1 thread(s) 7628458 ns 7695958 ns 0.99
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/2 thread(s) 24057854 ns 24052709 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/4 thread(s) 21255853.5 ns 21310875 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/8 thread(s) 21045937.5 ns 21123834 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/zygote/CPU/1 thread(s) 29752958 ns 29855166.5 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/2 thread(s) 37194104 ns 48838979.5 ns 0.76
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/4 thread(s) 45565937.5 ns 45549667 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/8 thread(s) 45856833 ns 45706771 ns 1.00
Conv((3, 3), 2 => 2, gelu)(64 x 64 x 2 x 128)/enzyme/CPU/1 thread(s) 49410209 ns 49408500 ns 1.00
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s) 5729.5 ns 5875 ns 0.98
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s) 6041 ns 5709 ns 1.06
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s) 7542 ns 6708 ns 1.12
groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s) 5583 ns 5541 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 7812.5 ns 8875 ns 0.88
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8333 ns 8167 ns 1.02
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8667 ns 8542 ns 1.01
groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8750 ns 8208 ns 1.07
lenet(28, 28, 1, 128)/forward/CPU/2 thread(s) 1558521 ns 1556417 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/4 thread(s) 1261333 ns 1270792 ns 0.99
lenet(28, 28, 1, 128)/forward/CPU/8 thread(s) 1624791.5 ns 1624187.5 ns 1.00
lenet(28, 28, 1, 128)/forward/CPU/1 thread(s) 2151979 ns 2180520.5 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/2 thread(s) 7911312.5 ns 7888792 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/4 thread(s) 6595562.5 ns 6591250 ns 1.00
lenet(28, 28, 1, 128)/zygote/CPU/8 thread(s) 7113500.5 ns 7197854 ns 0.99
lenet(28, 28, 1, 128)/zygote/CPU/1 thread(s) 10486458 ns 10478229.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/2 thread(s) 370375.5 ns 366500 ns 1.01
batchedmm(128, Bsize=4)/forward/CPU/4 thread(s) 370334 ns 371020.5 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/8 thread(s) 457042 ns 457708 ns 1.00
batchedmm(128, Bsize=4)/forward/CPU/1 thread(s) 24083.5 ns 33208.5 ns 0.73
batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s) 740416 ns 723916.5 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s) 810542 ns 801750 ns 1.01
batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s) 1091458.5 ns 1064875 ns 1.02
batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s) 119250 ns 115334 ns 1.03
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s) 397375 ns 397291 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s) 288000 ns 287834 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s) 211583 ns 288166 ns 0.73
dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s) 750270.5 ns 750833 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s) 673041 ns 661875 ns 1.02
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s) 532334 ns 532416 ns 1.00
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s) 474084 ns 535458 ns 0.89
dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s) 973792 ns 973250 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 662833.5 ns 670958 ns 0.99
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 641958 ns 644229 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 544334 ns 680667 ns 0.80
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 670813 ns 648125 ns 1.04
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2467229 ns 2459333 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2462313 ns 2456084 ns 1.00
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 2482583.5 ns 2464542 ns 1.01
layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2448459 ns 2456083 ns 1.00
batchedmm(2, Bsize=32)/forward/CPU/2 thread(s) 3583.5 ns 3708 ns 0.97
batchedmm(2, Bsize=32)/forward/CPU/4 thread(s) 2687.5 ns 3334 ns 0.81
batchedmm(2, Bsize=32)/forward/CPU/8 thread(s) 2959 ns 4334 ns 0.68
batchedmm(2, Bsize=32)/forward/CPU/1 thread(s) 3833 ns 2667 ns 1.44
batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s) 5542 ns 5500 ns 1.01
batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s) 5792 ns 5458 ns 1.06
batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s) 5833 ns 5625 ns 1.04
batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s) 5833.5 ns 5542 ns 1.05
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1460979.5 ns 1458167 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1498958 ns 1500500 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1492334 ns 1499333 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1436709 ns 1437750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5110375 ns 5130750 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5286896 ns 5285584 ns 1.00
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4965208 ns 5315979 ns 0.93
batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4987187.5 ns 4998959 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s) 3709 ns 3708 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s) 3750 ns 3709 ns 1.01
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s) 3709 ns 3750 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s) 3709 ns 3750 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s) 15250 ns 15375 ns 0.99
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s) 15375 ns 15417 ns 1.00
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s) 15208 ns 15500 ns 0.98
dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s) 15542 ns 15167 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s) 71167 ns 70667 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s) 71208 ns 71208 ns 1
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s) 71125 ns 71959 ns 0.99
dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s) 70145.5 ns 71333 ns 0.98
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 318209 ns 318500 ns 1.00
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 321166 ns 318000 ns 1.01
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 331000 ns 323666 ns 1.02
dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 318208 ns 317125 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s) 1000 ns 1084 ns 0.92
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s) 1084 ns 1125 ns 0.96
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s) 1083 ns 1084 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s) 1125 ns 1000 ns 1.13
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s) 8208 ns 8458 ns 0.97
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s) 8333 ns 8334 ns 1.00
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s) 8542 ns 8292 ns 1.03
batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s) 8458 ns 8375 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/2 thread(s) 513416.5 ns 506709 ns 1.01
batchedmm(128, Bsize=32)/forward/CPU/4 thread(s) 491000 ns 492375 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/8 thread(s) 564167 ns 562708 ns 1.00
batchedmm(128, Bsize=32)/forward/CPU/1 thread(s) 219125 ns 222187.5 ns 0.99
batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s) 1389604.5 ns 1387250 ns 1.00
batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s) 1470916.5 ns 1449208 ns 1.01
batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s) 1739750 ns 1788375 ns 0.97
batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s) 867042 ns 865812.5 ns 1.00
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s) 333 ns 375 ns 0.89
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s) 375 ns 292 ns 1.28
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s) 292 ns 416 ns 0.70
batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s) 417 ns 333 ns 1.25
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s) 6792 ns 6667 ns 1.02
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s) 6667 ns 6458 ns 1.03
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s) 6667 ns 6625 ns 1.01
batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s) 6583 ns 6458 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1744875 ns 1722042 ns 1.01
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1720437.5 ns 1723208.5 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1725229 ns 1721083 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1774833.5 ns 1723750 ns 1.03
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 4362875 ns 4362042 ns 1.00
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 4366833.5 ns 4261187.5 ns 1.02
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4017625 ns 4415583.5 ns 0.91
layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4360042 ns 4366958.5 ns 1.00
bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s) 6709 ns 6750 ns 0.99
bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s) 6541 ns 6959 ns 0.94
bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s) 7125 ns 6959 ns 1.02
bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s) 6896 ns 6708.5 ns 1.03
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 32667 ns 51417 ns 0.64
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 51125 ns 32917 ns 1.55
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 33125 ns 33333 ns 0.99
bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 52271 ns 51208.5 ns 1.02
batchedmm(2, Bsize=512)/forward/CPU/2 thread(s) 18166.5 ns 17542 ns 1.04
batchedmm(2, Bsize=512)/forward/CPU/4 thread(s) 17500 ns 17875 ns 0.98
batchedmm(2, Bsize=512)/forward/CPU/8 thread(s) 18875 ns 18916 ns 1.00
batchedmm(2, Bsize=512)/forward/CPU/1 thread(s) 17666.5 ns 17750 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s) 53667 ns 53458 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s) 53584 ns 53334 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s) 53417 ns 53250 ns 1.00
batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s) 54000 ns 53500 ns 1.01
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s) 75334 ns 75292 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s) 75375 ns 75375 ns 1
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s) 75209 ns 75792 ns 0.99
dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s) 74916 ns 75208 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s) 324959 ns 324375 ns 1.00
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s) 340167 ns 327625 ns 1.04
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s) 336875 ns 329583 ns 1.02
dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s) 324833 ns 324208 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 1486958 ns 1484375 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 1526792 ns 1527958 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 1521459 ns 1527583 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 1463834 ns 1462209 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 5117062 ns 5124708 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 5294604 ns 5280333 ns 1.00
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 4960833 ns 5332500 ns 0.93
batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 4987709 ns 4985875 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s) 28167 ns 28250 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s) 28167 ns 28291 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s) 28292 ns 28333 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s) 28292 ns 28291 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s) 66333 ns 66459 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s) 66833 ns 66458 ns 1.01
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s) 66500 ns 66833 ns 1.00
dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s) 66459 ns 66416 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/2 thread(s) 1395354 ns 1501229 ns 0.93
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/4 thread(s) 1059146 ns 1127563 ns 0.94
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/8 thread(s) 814208 ns 1119291.5 ns 0.73
mlp7layer_bn(tanh)(32 x 256)/forward/CPU/1 thread(s) 2269396 ns 2246375 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/2 thread(s) 3090979 ns 3082875 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/4 thread(s) 2740854.5 ns 2738375 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/8 thread(s) 2544104.5 ns 2760354 ns 0.92
mlp7layer_bn(tanh)(32 x 256)/zygote/CPU/1 thread(s) 3812666 ns 3780667 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/2 thread(s) 7882104 ns 7895333 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/4 thread(s) 7902666.5 ns 7893459 ns 1.00
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/8 thread(s) 8008791.5 ns 7944812.5 ns 1.01
mlp7layer_bn(tanh)(32 x 256)/enzyme/CPU/1 thread(s) 4806271 ns 4834521 ns 0.99
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s) 81167 ns 80959 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s) 83208.5 ns 80333 ns 1.04
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s) 81979.5 ns 82166 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s) 80417 ns 134375.5 ns 0.60
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s) 2017166.5 ns 2014625 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s) 2013729 ns 2006229 ns 1.00
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s) 1774125 ns 2047021 ns 0.87
groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s) 2014354.5 ns 2022958 ns 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.