From f2ba92ac4b523688eae2daf876487bafd673805f Mon Sep 17 00:00:00 2001
From: borg323
Date: Wed, 7 Dec 2022 01:46:36 +0200
Subject: [PATCH 01/46] update some shared files for attention body

---
 src/neural/network_legacy.cc             |  10 +
 src/neural/network_legacy.h              |  13 +
 src/neural/shared/attention_policy_map.h | 384 +++++++++++++++++++++--
 3 files changed, 383 insertions(+), 24 deletions(-)

diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc
index f4819d4952..0872ae0ddc 100644
--- a/src/neural/network_legacy.cc
+++ b/src/neural/network_legacy.cc
@@ -30,6 +30,8 @@ static constexpr float kEpsilon = 1e-5f;
 LegacyWeights::LegacyWeights(const pblczero::Weights& weights)
     : input(weights.input()),
+      ip_emb_w(LayerAdapter(weights.ip_emb_w()).as_vector()),
+      ip_emb_b(LayerAdapter(weights.ip_emb_b()).as_vector()),
       policy1(weights.policy1()),
       policy(weights.policy()),
       ip_pol_w(LayerAdapter(weights.ip_pol_w()).as_vector()),
       ip_pol_b(LayerAdapter(weights.ip_pol_b()).as_vector()),
       ip3_pol_b(LayerAdapter(weights.ip3_pol_b()).as_vector()),
       ip4_pol_w(LayerAdapter(weights.ip4_pol_w()).as_vector()),
       value(weights.value()),
+      ip_val_w(LayerAdapter(weights.ip_val_w()).as_vector()),
+      ip_val_b(LayerAdapter(weights.ip_val_b()).as_vector()),
       ip1_val_w(LayerAdapter(weights.ip1_val_w()).as_vector()),
       ip1_val_b(LayerAdapter(weights.ip1_val_b()).as_vector()),
       ip2_val_w(LayerAdapter(weights.ip2_val_w()).as_vector()),
       ip2_val_b(LayerAdapter(weights.ip2_val_b()).as_vector()),
       moves_left(weights.moves_left()),
+      ip_mov_w(LayerAdapter(weights.ip_mov_w()).as_vector()),
+      ip_mov_b(LayerAdapter(weights.ip_mov_b()).as_vector()),
       ip1_mov_w(LayerAdapter(weights.ip1_mov_w()).as_vector()),
       ip1_mov_b(LayerAdapter(weights.ip1_mov_b()).as_vector()),
       ip2_mov_w(LayerAdapter(weights.ip2_mov_w()).as_vector()),
@@ -52,6 +58,10 @@ LegacyWeights::LegacyWeights(const pblczero::Weights& weights)
   for (const auto& res : weights.residual()) {
     residual.emplace_back(res);
   }
+  encoder_head_count = weights.headcount();
+  for (const auto& enc : weights.encoder()) {
+    encoder.emplace_back(enc);
+  }
   pol_encoder_head_count = weights.pol_headcount();
   for (const auto& enc : weights.pol_encoder()) {
     pol_encoder.emplace_back(enc);
diff --git a/src/neural/network_legacy.h b/src/neural/network_legacy.h
index 3ba6028d5e..19284af172 100644
--- a/src/neural/network_legacy.h
+++ b/src/neural/network_legacy.h
@@ -88,6 +88,15 @@ struct LegacyWeights {
   // Input convnet.
   ConvBlock input;
 
+  // Embedding layer
+  Vec ip_emb_w;
+  Vec ip_emb_b;
+
+  // Encoder stack.
+  std::vector<EncoderLayer> encoder;
+  int encoder_head_count;
+
+  // Residual tower.
std::vector residual; @@ -109,6 +118,8 @@ struct LegacyWeights { // Value head ConvBlock value; + Vec ip_val_w; + Vec ip_val_b; Vec ip1_val_w; Vec ip1_val_b; Vec ip2_val_w; @@ -116,6 +127,8 @@ struct LegacyWeights { // Moves left head ConvBlock moves_left; + Vec ip_mov_w; + Vec ip_mov_b; Vec ip1_mov_w; Vec ip1_mov_b; Vec ip2_mov_w; diff --git a/src/neural/shared/attention_policy_map.h b/src/neural/shared/attention_policy_map.h index 5ab0966654..df39df7dc9 100644 --- a/src/neural/shared/attention_policy_map.h +++ b/src/neural/shared/attention_policy_map.h @@ -380,30 +380,366 @@ const short kAttnPolicyMap[] = { 1848, 1849, 1850, 1851, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1852, 1853, 1854, 1855, 1856, 1857}; - -} // namespace lczero - - - - - - - - - - - - - - - - - - - - - - +#if 0 +// used only by Arcturai's T02 network +constexpr int kNumPosEncodingChannels = 6; +__device__ constexpr float kPosEncoding[64][kNumPosEncodingChannels] = { + {0., 0., 0., 0., 0., 0.}, {0., 0., 0., 0., 0., 1.}, + {0., 0., 0., 0., 1., 0.}, {0., 0., 0., 0., 1., 1.}, + {0., 0., 0., 1., 0., 0.}, {0., 0., 0., 1., 0., 1.}, + {0., 0., 0., 1., 1., 0.}, {0., 0., 0., 1., 1., 1.}, + {0., 0., 1., 0., 0., 0.}, {0., 0., 1., 0., 0., 1.}, + {0., 0., 1., 0., 1., 0.}, {0., 0., 1., 0., 1., 1.}, + {0., 0., 1., 1., 0., 0.}, {0., 0., 1., 1., 0., 1.}, + {0., 0., 1., 1., 1., 0.}, {0., 0., 1., 1., 1., 1.}, + {0., 1., 0., 0., 0., 0.}, {0., 1., 0., 0., 0., 1.}, + {0., 1., 0., 0., 1., 0.}, {0., 1., 0., 0., 1., 1.}, + {0., 1., 0., 1., 0., 0.}, {0., 1., 0., 1., 0., 1.}, + {0., 1., 0., 1., 1., 0.}, {0., 1., 0., 1., 1., 1.}, + {0., 1., 1., 0., 0., 0.}, {0., 1., 1., 0., 0., 1.}, + {0., 1., 1., 0., 1., 0.}, {0., 1., 1., 0., 1., 1.}, + {0., 1., 1., 1., 0., 0.}, {0., 1., 1., 1., 0., 1.}, + {0., 1., 1., 1., 1., 0.}, {0., 1., 1., 1., 1., 1.}, + {1., 0., 0., 0., 0., 0.}, {1., 0., 0., 0., 0., 1.}, + {1., 0., 0., 0., 1., 0.}, {1., 0., 0., 0., 1., 1.}, + {1., 0., 0., 1., 0., 0.}, {1., 0., 0., 1., 0., 1.}, + {1., 0., 0., 1., 1., 0.}, {1., 0., 0., 1., 1., 1.}, + {1., 0., 1., 0., 0., 0.}, {1., 0., 1., 0., 0., 1.}, + {1., 0., 1., 0., 1., 0.}, {1., 0., 1., 0., 1., 1.}, + {1., 0., 1., 1., 0., 0.}, {1., 0., 1., 1., 0., 1.}, + {1., 0., 1., 1., 1., 0.}, {1., 0., 1., 1., 1., 1.}, + {1., 1., 0., 0., 0., 0.}, {1., 1., 0., 0., 0., 1.}, + {1., 1., 0., 0., 1., 0.}, {1., 1., 0., 0., 1., 1.}, + {1., 1., 0., 1., 0., 0.}, {1., 1., 0., 1., 0., 1.}, + {1., 1., 0., 1., 1., 0.}, {1., 1., 0., 1., 1., 1.}, + {1., 1., 1., 0., 0., 0.}, {1., 1., 1., 0., 0., 1.}, + {1., 1., 1., 0., 1., 0.}, {1., 1., 1., 0., 1., 1.}, + {1., 1., 1., 1., 0., 0.}, {1., 1., 1., 1., 0., 1.}, + {1., 1., 1., 1., 1., 0.}, {1., 1., 1., 1., 1., 1.}}; +#endif + +constexpr int kNumPosEncodingChannels = 64; +#if defined(__CUDA_ARCH__) +__device__ +#endif +const float kPosEncoding[64][kNumPosEncodingChannels] = { + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
+ 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 
0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 
1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 
1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0}; +} // namespace lczero From f091f5e33996fd2837edeb7fbb793951f092d19a Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Sun, 4 Dec 2022 11:05:21 +0200 Subject: [PATCH 02/46] update default net (#1804) --- appveyor.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index e2784bc6c6..ab5225341e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -43,10 +43,10 @@ install: - cmd: IF %NAME%==cpu-openblas set GTEST=true - cmd: IF %NAME%==onednn set ONEDNN=true - cmd: IF %NAME%==onnx-dml set ONNX_DML=true -- cmd: set NET=744204 -- cmd: set NET_HASH=0f2f738e314bf618384045d4320a55333375d273d093adb805a4268ee53b519c -- cmd: IF NOT %BLAS%==true IF NOT %ANDROID%==true set NET=753723 -- cmd: IF NOT %BLAS%==true IF NOT 
%ANDROID%==true set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198
+- cmd: set NET=753723
+- cmd: set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198
+- cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET=791556
+- cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET_HASH=f404e156ceb2882470fd8c032b8754af0fa0b71168328912eaef14671a256e34
 - cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
 - cmd: set DNNL_NAME=dnnl_win_1.5.0_cpu_vcomp
 - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/oneapi-src/oneDNN/releases/download/v1.5/dnnl_win_1.5.0_cpu_vcomp.zip

From ad045295bc06f4c682b18372497a98b875ae7b6c Mon Sep 17 00:00:00 2001
From: borg323 <39573933+borg323@users.noreply.github.com>
Date: Sun, 4 Dec 2022 11:05:51 +0200
Subject: [PATCH 03/46] option to select onnx opset (#1803)

---
 src/neural/onnx/builder.cc      | 38 +++++++++++++++++++++++++++------
 src/neural/onnx/builder.h       |  3 ++-
 src/neural/onnx/converter.cc    |  2 +-
 src/neural/onnx/converter.h     |  1 +
 src/neural/onnx/network_onnx.cc |  1 +
 5 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc
index c046f69fb1..be97dfd7ac 100644
--- a/src/neural/onnx/builder.cc
+++ b/src/neural/onnx/builder.cc
@@ -29,6 +29,7 @@
 
 #include
 
+#include "neural/onnx/adapters.h"
 #include "neural/onnx/onnx.pb.h"
 #include "utils/exception.h"
 #include "utils/random.h"
 
 namespace lczero {
 
-OnnxBuilder::OnnxBuilder() {
+OnnxBuilder::OnnxBuilder(int opset) : opset_(opset) {
+  if (opset < 7 || opset > 17) {
+    throw Exception("Only ONNX opsets between 7 and 17 are supported.");
+  }
   model_.set_ir_version(4);
   model_.set_domain("org.lczero.models.*");
   model_.set_producer_name("Lc0");
   model_.set_producer_version(GetVersionStr());
-  model_.add_opset_import()->set_version(9);
+  model_.add_opset_import()->set_version(opset);
   model_.mutable_graph()->set_name("org.lczero/converted/" +
                                    Random::Get().GetString(16));
 }
@@ -166,7 +170,12 @@ std::string OnnxBuilder::Squeeze(const std::string& name,
                                  const std::string& input) {
   auto* node = model_.mutable_graph()->add_node();
   auto out = PopulateStdNodeFields(node, name, input, "Squeeze");
-  AddIntsAttribute(node, "axes", {2, 3});
+  if (opset_ < 13) {
+    AddIntsAttribute(node, "axes", {2, 3});
+  } else {
+    node->add_input(
+        AddInitializer(name + "/axes", Int64OnnxConst({2, 3}, {2})));
+  }
   return out;
 }
@@ -287,7 +296,14 @@ std::vector<std::string> OnnxBuilder::Split(const std::string& name,
   node->add_input(input);
   AddIntAttribute(node, "axis", axis);
   if (split.size() > 0) {
-    AddIntsAttribute(node, "split", split);
+    if (opset_ < 13) {
+      AddIntsAttribute(node, "split", split);
+    } else {
+      node->add_input(AddInitializer(
+          name + "/split",
+          Int64OnnxConst(std::vector<int64_t>(begin(split), end(split)),
+                         {static_cast<int>(split.size())})));
+    }
     std::vector<std::string> out;
     for (size_t i = 1; i <= split.size(); i++) {
      out.push_back(name + "/out" + std::to_string(i));
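// Background note (ours, not from the patch): ONNX moved these constant
// operands from node attributes to inputs over successive opsets -- Slice's
// starts/ends in opset 10, Squeeze's axes and Split's split in opset 13 --
// and LayerNormalization only became an official operator in opset 17, which
// is why the constructor above limits the supported range to 7 through 17.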
@@ -306,8 +322,17 @@ std::string OnnxBuilder::Slice(const std::string& name,
                                std::initializer_list<int> ends) {
   auto* node = model_.mutable_graph()->add_node();
   auto out = PopulateStdNodeFields(node, name, input, "Slice");
-  AddIntsAttribute(node, "starts", starts);
-  AddIntsAttribute(node, "ends", ends);
+  if (opset_ < 10) {
+    AddIntsAttribute(node, "starts", starts);
+    AddIntsAttribute(node, "ends", ends);
+  } else {
+    node->add_input(AddInitializer(
+        name + "/starts", Int32OnnxConst(std::vector<int>(starts),
+                                         {static_cast<int>(starts.size())})));
+    node->add_input(AddInitializer(
+        name + "/ends", Int32OnnxConst(std::vector<int>(ends),
+                                       {static_cast<int>(ends.size())})));
+  }
   return out;
 }
@@ -331,6 +356,7 @@ std::string OnnxBuilder::Sigmoid(const std::string& name,
   return PopulateStdNodeFields(node, name, input, "Sigmoid");
 }
 
+// This is only defined in opset 17, but onnxruntime supports it from version 1.
 std::string OnnxBuilder::LayerNormalization(const std::string& name,
                                             const std::string& input,
                                             const OnnxConst& scale,
diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h
index f2f03363f9..3b88cb27ec 100644
--- a/src/neural/onnx/builder.h
+++ b/src/neural/onnx/builder.h
@@ -45,7 +45,7 @@ class OnnxConst {
 // Builds Onnx::ModelProto.
 class OnnxBuilder {
  public:
-  OnnxBuilder();
+  OnnxBuilder(int opset);
   void AddInput(const std::string& name, std::initializer_list<int> dims,
                 pblczero::TensorProto::DataType datatype);
   void AddOutput(const std::string& name, std::initializer_list<int> dims,
@@ -112,6 +112,7 @@ class OnnxBuilder {
   std::string OutputAsString() const { return model_.OutputAsString(); }
 
  private:
+  const int opset_;
   pblczero::ModelProto model_;
 };
diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc
index 6e8254221f..1bf4b88b70 100644
--- a/src/neural/onnx/converter.cc
+++ b/src/neural/onnx/converter.cc
@@ -559,7 +559,7 @@ void Converter::MakeMovesLeftHead(pblczero::OnnxModel* onnx,
 void Converter::GenerateOnnx(pblczero::OnnxModel* onnx) {
   LegacyWeights weights(src_.weights());
 
-  OnnxBuilder builder;
+  OnnxBuilder builder(options_.opset);
 
   AddStdInitializers(&builder);
diff --git a/src/neural/onnx/converter.h b/src/neural/onnx/converter.h
index e22ab9dac9..8980ffb858 100644
--- a/src/neural/onnx/converter.h
+++ b/src/neural/onnx/converter.h
@@ -42,6 +42,7 @@ struct WeightsToOnnxConverterOptions {
   std::string output_value = "/output/value";
   std::string output_mlh = "/output/mlh";
   int batch_size = -1;
+  int opset = 17;
 };
 
 // Converts "classical" weights file to weights file with embedded ONNX model.
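A hedged usage sketch for the new option (struct and field names from the hunk above; the converter entry point is declared in converter.h but not shown here):

  // Pick a lower target opset, e.g. for a runtime without newer operators.
  WeightsToOnnxConverterOptions options;
  options.opset = 13;  // OnnxBuilder throws outside the [7, 17] range.
  // ...then pass `options` to the converter together with the weights file.

On the runtime side, the network_onnx.cc hunk below reads the same key from the backend options, so a command line such as --backend=onnx-dml --backend-opts=opset=13 should reach that GetOrDefault call (assuming lc0's usual key=value backend-opts syntax).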
diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc
index 747ef31ec8..7d7950175c 100644
--- a/src/neural/onnx/network_onnx.cc
+++ b/src/neural/onnx/network_onnx.cc
@@ -409,6 +409,7 @@ std::unique_ptr<Network> MakeOnnxNetwork(const std::optional<WeightsFile>& w,
                       " is not supported by the ONNX backend.");
   }
   WeightsToOnnxConverterOptions converter_options;
+  converter_options.opset = opts.GetOrDefault<int>("opset", 17);
   converter_options.data_type_ =
       fp16 ? WeightsToOnnxConverterOptions::DataType::kFloat16
            : WeightsToOnnxConverterOptions::DataType::kFloat32;

From 2782b94aaf9caa4b5c7a2dbaa54d7acf63f34113 Mon Sep 17 00:00:00 2001
From: trre123 <94855351+trre123@users.noreply.github.com>
Date: Sun, 4 Dec 2022 04:06:51 -0500
Subject: [PATCH 04/46] Add 'simple' time manager (#1764)

---
 meson.build                  |   1 +
 src/mcts/stoppers/factory.cc |   8 ++-
 src/mcts/stoppers/simple.cc  | 127 +++++++++++++++++++++++++++++++++++
 src/mcts/stoppers/simple.h   |  37 ++++++++++
 4 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 src/mcts/stoppers/simple.cc
 create mode 100644 src/mcts/stoppers/simple.h

diff --git a/meson.build b/meson.build
index 38d1fcb147..49b60c1cd6 100644
--- a/meson.build
+++ b/meson.build
@@ -170,6 +170,7 @@ files += [
   'src/mcts/stoppers/common.cc',
   'src/mcts/stoppers/factory.cc',
   'src/mcts/stoppers/legacy.cc',
+  'src/mcts/stoppers/simple.cc',
   'src/mcts/stoppers/smooth.cc',
   'src/mcts/stoppers/stoppers.cc',
   'src/mcts/stoppers/timemgr.cc',
diff --git a/src/mcts/stoppers/factory.cc b/src/mcts/stoppers/factory.cc
index 6a5c116fcd..e08e269117 100644
--- a/src/mcts/stoppers/factory.cc
+++ b/src/mcts/stoppers/factory.cc
@@ -1,6 +1,6 @@
 /*
   This file is part of Leela Chess Zero.
-  Copyright (C) 2019 The LCZero Authors
+  Copyright (C) 2022 The LCZero Authors
 
   Leela Chess is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -32,6 +32,7 @@
 #include "factory.h"
 #include "mcts/stoppers/alphazero.h"
 #include "mcts/stoppers/legacy.h"
+#include "mcts/stoppers/simple.h"
 #include "mcts/stoppers/smooth.h"
 #include "mcts/stoppers/stoppers.h"
 #include "utils/exception.h"
@@ -47,7 +48,7 @@ const OptionId kMoveOverheadId{
 const OptionId kTimeManagerId{
     "time-manager", "TimeManager",
     "Name and config of a time manager. "
-    "Possible names are 'legacy' (default), 'smooth' and 'alphazero'. "
+    "Possible names are 'legacy' (default), 'smooth', 'alphazero', and 'simple'. "
     "See https://lc0.org/timemgr for configuration details."};
 }  // namespace
@@ -82,6 +83,9 @@ std::unique_ptr<TimeManager> MakeTimeManager(const OptionsDict& options) {
   } else if (managers[0] == "smooth") {
     time_manager =
         MakeSmoothTimeManager(move_overhead, tm_options.GetSubdict("smooth"));
+  } else if (managers[0] == "simple") {
+    time_manager =
+        MakeSimpleTimeManager(move_overhead, tm_options.GetSubdict("simple"));
   }
 
   if (!time_manager) {
diff --git a/src/mcts/stoppers/simple.cc b/src/mcts/stoppers/simple.cc
new file mode 100644
index 0000000000..fd93669375
--- /dev/null
+++ b/src/mcts/stoppers/simple.cc
@@ -0,0 +1,127 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2022 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <https://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "mcts/stoppers/stoppers.h"
+
+namespace lczero {
+
+namespace {
+
+class SimpleTimeManager : public TimeManager {
+ public:
+  SimpleTimeManager(int64_t move_overhead, const OptionsDict& params)
+      : move_overhead_(move_overhead),
+        basepct_(params.GetOrDefault<float>("base-pct", 1.83f)),
+        plypct_(params.GetOrDefault<float>("ply-pct", 0.0454f)),
+        timefactor_(params.GetOrDefault<float>("time-factor", 33.4f)),
+        opening_bonus_(params.GetOrDefault<float>("opening-bonus", 82.5f)) {
+    if (basepct_ <= 0.0f || basepct_ > 100.0f) {
+      throw Exception("base-pct value must be in the range (0.0, 100.0]");
+    }
+    if (plypct_ < 0.0f || plypct_ > 1.0f) {
+      throw Exception("ply-pct value must be in the range [0.0, 1.0]");
+    }
+    if (timefactor_ < 0.0f || timefactor_ > 100.0f) {
+      throw Exception("time-factor value must be in the range [0.0, 100.0]");
+    }
+    if (opening_bonus_ < 0.0f || opening_bonus_ > 1000.0f) {
+      throw Exception("opening-bonus value must be in the range [0.0, 1000.0]");
+    }
+  }
+  std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
+                                            const NodeTree& tree) override;
+
+ private:
+  const int64_t move_overhead_;
+  const float basepct_;
+  const float plypct_;
+  const float timefactor_;
+  const float opening_bonus_;
+  float prev_move_time = 0.0f;
+  float prev_total_moves_time = 0.0f;
+  bool bonus_applied = false;
+};
+
+std::unique_ptr<SearchStopper> SimpleTimeManager::GetStopper(
+    const GoParams& params, const NodeTree& tree) {
+  const Position& position = tree.HeadPosition();
+  const bool is_black = position.IsBlackToMove();
+  const std::optional<int64_t>& time = (is_black ? params.btime : params.wtime);
+
+  // If no time limit is given, don't stop on this condition.
+  if (params.infinite || params.ponder || !time) return nullptr;
+
+  const std::optional<int64_t>& inc = is_black ? params.binc : params.winc;
+  const int increment = inc ? std::max(int64_t(0), *inc) : 0;
+
+  const float total_moves_time =
+      static_cast<float>(*time) - static_cast<float>(move_overhead_);
+
+  // Increase the percentage as the ply count increases.
+  float pct = (basepct_ + position.GetGamePly() * plypct_) * 0.01f;
+
+  // Increase the percentage as the ratio of increment time to total time
+  // gets smaller.
+  pct += pct * (static_cast<float>(increment) /
+                static_cast<float>(total_moves_time) * timefactor_);
+
+  float this_move_time = total_moves_time * pct;
+
+  // Immediately spend time saved from smart pruning during the previous move.
+  if (prev_move_time > 0.0f) {
+    const float time_saved =
+        prev_move_time - (prev_total_moves_time -
+                          (total_moves_time - static_cast<float>(increment)));
+
+    this_move_time += time_saved;
+  }
+
+  // Apply any opening bonus and note the next move will also benefit
+  // from an increased time_saved as a result.
+  if (!bonus_applied) {
+    this_move_time += this_move_time * opening_bonus_ * 0.01f;
+    bonus_applied = true;
+  }
+
+  this_move_time = std::min(this_move_time, total_moves_time);
+
+  prev_move_time = this_move_time;
+  prev_total_moves_time = total_moves_time;
+
+  LOGFILE << "Budgeted time for the move: " << this_move_time << "ms. "
+          << "Remaining time " << *time << "ms (-" << move_overhead_
+          << "ms overhead)";
+
+  return std::make_unique<TimeLimitStopper>(this_move_time);
+}
+
+}  // namespace
+
+std::unique_ptr<TimeManager> MakeSimpleTimeManager(
+    int64_t move_overhead, const OptionsDict& params) {
+  return std::make_unique<SimpleTimeManager>(move_overhead, params);
+}
+}  // namespace lczero
diff --git a/src/mcts/stoppers/simple.h b/src/mcts/stoppers/simple.h
new file mode 100644
index 0000000000..b7b6f3a349
--- /dev/null
+++ b/src/mcts/stoppers/simple.h
@@ -0,0 +1,37 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2020 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <https://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "utils/optionsdict.h"
+
+namespace lczero {
+
+std::unique_ptr<TimeManager> MakeSimpleTimeManager(
+    int64_t move_overhead, const OptionsDict& params);
+
+}  // namespace lczero
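Restating the budget rule from GetStopper above as a standalone formula: a minimal sketch with the default parameters (BudgetMs is our name; it omits the time-saved carryover and the opening bonus handled in the patch):

#include <algorithm>
#include <cstdint>

// Core of the 'simple' manager's budget: spend a small percentage of the
// remaining clock, growing with game ply and with the relative increment.
float BudgetMs(int64_t remaining_ms, int64_t increment_ms, int game_ply,
               int64_t move_overhead_ms) {
  const float basepct = 1.83f;     // base-pct default
  const float plypct = 0.0454f;    // ply-pct default
  const float timefactor = 33.4f;  // time-factor default
  const float total = static_cast<float>(remaining_ms - move_overhead_ms);
  float pct = (basepct + game_ply * plypct) * 0.01f;
  pct += pct * (static_cast<float>(increment_ms) / total * timefactor);
  return std::min(total * pct, total);
}

For example, with 60 s left, a 1 s increment, ply 40 and 200 ms overhead this gives pct = 0.0365 * (1 + 1000 / 59800 * 33.4) = 0.057, i.e. roughly 3.4 seconds budgeted for the move.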

From b218726db5c1594b8f922fe0aabd129e86043804 Mon Sep 17 00:00:00 2001
From: zz4032
Date: Sun, 4 Dec 2022 10:07:54 +0100
Subject: [PATCH 05/46] Backendbench assistant with TC-dependent output. (#1546)

---
 src/benchmark/backendbench.cc | 66 ++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/src/benchmark/backendbench.cc b/src/benchmark/backendbench.cc
index d2ff622ec4..6792f9b778 100644
--- a/src/benchmark/backendbench.cc
+++ b/src/benchmark/backendbench.cc
@@ -50,23 +50,28 @@ const OptionId kFenId{"fen", "", "Benchmark initial position FEN."};
 
 const OptionId kClippyId{"clippy", "", "Enable helpful assistant."};
 
-const OptionId kClippyThresholdId{"clippy-threshold", "",
-                                  "Ratio of nps improvement necessary for each "
-                                  "doubling of batchsize to be considered "
-                                  "best."};
-
-void Clippy(std::string msg) {
+void Clippy(std::string title,
+            std::string msg3, std::string best3, std::string msg2,
+            std::string best2, std::string msg, std::string best) {
   std::cout << " __" << std::endl;
   std::cout << " / \\" << std::endl;
-  std::cout << " | |" << std::endl;
-  std::cout << " + + " << std::string(msg.length() + 2, '_') << std::endl;
-  std::cout << "(@)(@) _|" << std::string(msg.length() + 2, ' ') << '|'
+  std::cout << " | | " << std::string(title.length()+2, '_') << std::endl;
+  std::cout << " + + | " << std::string(title.length()+1, ' ') << "|"
+            << std::endl;
+  std::cout << "(@)(@) _| " << title << " |"
             << std::endl;
-  std::cout << " | | \\ " << msg << " |" << std::endl;
-  std::cout << " || |/ |" << std::string(msg.length() + 2, '_') << '|'
+  std::cout << " | | \\ " << std::string(6, ' ') << msg3
+            << std::string(4 - best3.length(), ' ') << best3
+            << std::string(title.length()-33, ' ') << "|" << std::endl;
+  std::cout << " || |/ | " << std::string(6, ' ') << msg2
+            << std::string(4 - best2.length(), ' ') << best2
+            << std::string(title.length()-33, ' ') << "|" << std::endl;
+  std::cout << " || || | " << std::string(6, ' ') << msg
+            << std::string(4 - best.length(), ' ') << best
+            << std::string(title.length()-33, ' ') << "|" << std::endl;
+  std::cout << " |\\_/| |" << std::string(title.length()+2, '_') << "|"
             << std::endl;
-  std::cout << " || ||" << std::endl;
-  std::cout << " |\\_/|" << std::endl;
   std::cout << " \\___/" << std::endl;
 }
 }  // namespace
@@ -82,7 +87,6 @@ void BackendBenchmark::Run() {
     options.Add<IntOption>(kBatchStepId, 1, 256) = 1;
    options.Add<StringOption>(kFenId) = ChessBoard::kStartposFen;
    options.Add<BoolOption>(kClippyId) = false;
-    options.Add<FloatOption>(kClippyThresholdId, 0.0f, 1.0f) = 0.15f;
 
     if (!options.ProcessAllFlags()) return;
@@ -103,8 +107,8 @@ void BackendBenchmark::Run() {
 
     const int batches = option_dict.Get<int>(kBatchesId);
 
-    int best = 1;
-    float best_nps = 0.0f;
+    int best = 1; int best2 = 1; int best3 = 1;
+    float best_nps = 0.0f; float best_nps2 = 0.0f; float best_nps3 = 0.0f;
     std::optional<std::chrono::time_point<std::chrono::steady_clock>> pending;
 
     for (int i = option_dict.Get<int>(kStartBatchSizeId);
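For intuition about the acceptance thresholds introduced in the next hunk (our arithmetic, not part of the patch): with a measured ~40 knps, nps_ingame = pow(40000, 1.085) is roughly 9.8e4 on the first accepted iteration, so

// threshold  (3 min/move): 0.16947 * exp(-4.1695e-6 * 9.8e4 * 180) + 0.02 ~ 0.020
// threshold2 (15 s/move):  0.16947 * exp(-4.1695e-6 * 9.8e4 * 15)  + 0.02 ~ 0.020
// threshold3 (1 s/move):   0.16947 * exp(-4.1695e-6 * 9.8e4 * 1)   + 0.02 ~ 0.13

i.e. at longer per-move times a batch-size doubling only needs a ~2% nps gain to be accepted as the new best, while at 1 s/move it needs roughly 13%.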
@@ -133,12 +137,27 @@ void BackendBenchmark::Run() {
               << " nps."
               << std::endl;
 
       if (option_dict.Get<bool>(kClippyId)) {
-        const float threshold = option_dict.Get<float>(kClippyThresholdId);
+        float nps_ingame = std::pow((nps + best_nps) / 2, 1.085);
+        float nps_ingame2 = std::pow((nps + best_nps2) / 2, 1.085);
+        float nps_ingame3 = std::pow((nps + best_nps3) / 2, 1.085);
+        float threshold = 0.16947 * exp(-4.1695e-6 * nps_ingame * 180) + 0.02;
+        float threshold2 = 0.16947 * exp(-4.1695e-6 * nps_ingame2 * 15) + 0.02;
+        float threshold3 = 0.16947 * exp(-4.1695e-6 * nps_ingame3 * 1) + 0.02;
         if (nps > best_nps &&
             threshold * (i - best) * best_nps < (nps - best_nps) * best) {
           best_nps = nps;
           best = i;
+          if (threshold2 * (i - best2) * best_nps2 <
+              (nps - best_nps2) * best2) {
+            best_nps2 = nps;
+            best2 = i;
+            if (threshold3 * (i - best3) * best_nps3 <
+                (nps - best_nps3) * best3) {
+              best_nps3 = nps;
+              best3 = i;
+            }
+          }
           if (!pending) {
             pending = std::chrono::steady_clock::now();
           }
@@ -147,16 +166,21 @@ void BackendBenchmark::Run() {
         time = std::chrono::steady_clock::now() - *pending;
         if (time.count() > 10) {
           Clippy(
-              std::to_string(best) +
-              " looks like the best minibatch-size for this net (so far).");
+              "Recommended minibatch-size for this net (so far):",
+              "1s/move (Bullet): ", std::to_string(best3),
+              "15s/move (Rapid): ", std::to_string(best2),
+              "3min/move (Tournament): ", std::to_string(best));
           pending.reset();
         }
       }
     }
   }
   if (option_dict.Get<bool>(kClippyId)) {
-    Clippy(std::to_string(best) +
-           " looks like the best minibatch-size for this net.");
+    Clippy(
+        "Recommended minibatch-size for this net:",
+        "1s/move (Bullet): ", std::to_string(best3),
+        "15s/move (Rapid): ", std::to_string(best2),
+        "3min/move (Tournament): ", std::to_string(best));
   }
  } catch (Exception& ex) {
    std::cerr << ex.what() << std::endl;

From 402da93ce0966f221eb68a648514f8a2d24da862 Mon Sep 17 00:00:00 2001
From: Alexander Lyashuk
Date: Sun, 4 Dec 2022 10:26:47 +0100
Subject: [PATCH 06/46] In smooth TM, use piggybank when it seems like the bestmove can be overtaken. (#1762)

fixes https://github.com/LeelaChessZero/lc0/issues/1582
---
 src/mcts/stoppers/smooth.cc   | 155 ++++++++++++++++++++++++++++++----
 src/mcts/stoppers/stoppers.cc |   5 +-
 2 files changed, 143 insertions(+), 17 deletions(-)

diff --git a/src/mcts/stoppers/smooth.cc b/src/mcts/stoppers/smooth.cc
index e0cc38e651..2a1a196247 100644
--- a/src/mcts/stoppers/smooth.cc
+++ b/src/mcts/stoppers/smooth.cc
@@ -29,6 +29,7 @@
 
 #include
 #include
+#include
 
 #include "mcts/stoppers/legacy.h"
 #include "mcts/stoppers/stoppers.h"
@@ -83,6 +84,19 @@ class Params {
   // Max number of avg move times in piggybank.
   float max_piggybank_moves() const { return max_piggybank_moves_; }
 
+  int64_t trend_nps_update_period_ms() const {
+    return trend_nps_update_period_ms_;
+  }
+
+  // Expected ratio of the best move nps in the future to the current nps.
+  float bestmove_optimism() const { return bestmove_optimism_; }
+
+  // Expected ratio of the non-best move nps in the future to the current nps.
+  float overtaker_optimism() const { return overtaker_optimism_; }
+
+  // Force use of the piggybank during the first few milliseconds of the move.
+  float force_piggybank_ms() const { return force_piggybank_ms_; }
+
   // Move overhead.
   int64_t move_overhead_ms() const { return move_overhead_ms_; }
   // Returns a function that estimates remaining moves.
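A worked example of what the two optimism values do in VisitsTrendWatcher::IsBestmoveBeingOvertaken() further below (invented numbers, using the defaults of 0.2 and 4.0 set in the constructor):

// best move:  120000 visits, gaining 10 visits/ms
// runner-up:  100000 visits, gaining 40 visits/ms
// 1000 ms left until the piggybank-extended deadline:
//   projected best      = 120000 + 0.2 * 10 * 1000 = 122000
//   projected runner-up = 100000 + 4.0 * 40 * 1000 = 260000
// The runner-up projects past the best move, so the stopper keeps searching
// (dipping into the piggybank) instead of stopping at the normal deadline.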
@@ -104,6 +118,10 @@ class Params {
   const float per_move_piggybank_fraction_;
   const float max_piggybank_use_;
   const float max_piggybank_moves_;
+  const float trend_nps_update_period_ms_;
+  const float bestmove_optimism_;
+  const float overtaker_optimism_;
+  const float force_piggybank_ms_;
   const MovesLeftEstimator moves_left_estimator_;
 };
@@ -144,6 +162,12 @@ Params::Params(const OptionsDict& params, int64_t move_overhead)
           params.GetOrDefault<float>("max-piggybank-use", 0.94f)),
       max_piggybank_moves_(
          params.GetOrDefault<float>("max-piggybank-moves", 36.5f)),
+      trend_nps_update_period_ms_(
+          params.GetOrDefault<int>("trend-nps-update-period-ms", 3000)),
+      bestmove_optimism_(params.GetOrDefault<float>("bestmove-optimism", 0.2f)),
+      overtaker_optimism_(
+          params.GetOrDefault<float>("overtaker-optimism", 4.0f)),
+      force_piggybank_ms_(params.GetOrDefault<int>("force-piggybank-ms", 1000)),
       moves_left_estimator_(CreateMovesLeftEstimator(params)) {}
 
 // Returns the updated value of @from, towards @to by the number of halves
@@ -198,9 +222,36 @@ float LinearDecay(float cur_value, float target_value,
 
 class SmoothTimeManager;
 
+class VisitsTrendWatcher {
+ public:
+  VisitsTrendWatcher(float nps_update_period, float bestmove_optimism,
+                     float overtaker_optimism)
+      : nps_update_period_(nps_update_period),
+        bestmove_optimism_(bestmove_optimism),
+        overtaker_optimism_(overtaker_optimism) {}
+
+  void Update(uint64_t timestamp, const std::vector<uint32_t>& visits);
+  bool IsBestmoveBeingOvertaken(uint64_t by_which_time) const;
+
+ private:
+  const float nps_update_period_;
+  const float bestmove_optimism_;
+  const float overtaker_optimism_;
+
+  mutable Mutex mutex_;
+  uint64_t prev_timestamp_ GUARDED_BY(mutex_) = 0;
+  std::vector<uint32_t> prev_visits_ GUARDED_BY(mutex_);
+  uint64_t cur_timestamp_ GUARDED_BY(mutex_) = 0;
+  std::vector<uint32_t> cur_visits_ GUARDED_BY(mutex_);
+  uint64_t last_timestamp_ GUARDED_BY(mutex_) = 0;
+  std::vector<uint32_t> last_visits_ GUARDED_BY(mutex_);
+};
+
 class SmoothStopper : public SearchStopper {
  public:
   SmoothStopper(int64_t deadline_ms, int64_t allowed_piggybank_use_ms,
+                float nps_update_period, float bestmove_optimism,
+                float overtaker_optimism, int64_t forces_piggybank_ms,
                 SmoothTimeManager* manager);
 
 private:
@@ -209,7 +260,9 @@ class SmoothStopper : public SearchStopper {
 
   const int64_t deadline_ms_;
   const int64_t allowed_piggybank_use_ms_;
+  const int64_t forced_piggybank_use_ms_;
 
+  VisitsTrendWatcher visits_trend_watcher_;
   SmoothTimeManager* const manager_;
   std::atomic_flag used_piggybank_;
 };
@@ -251,19 +304,19 @@ class SmoothTimeManager : public TimeManager {
                                 : total_move_time / move_allocated_time_ms_;
     // Recompute expected move time for logging.
     const float expected_move_time = move_allocated_time_ms_ * timeuse_;
-    // If piggybank was used, cannot update timeuse_.
+    int64_t piggybank_time_used = 0;
     if (used_piggybank) {
       piggybank_time_used = std::max(int64_t(), total_move_time - time_budget);
       piggybank_time_ -= piggybank_time_used;
-    } else {
-      timeuse_ =
-          ExponentialDecay(timeuse_, this_move_time_use,
-                           params_.smartpruning_timeuse_halfupdate_moves(),
-                           this_move_time_fraction);
-      if (timeuse_ < params_.min_smartpruning_timeuse()) {
-        timeuse_ = params_.min_smartpruning_timeuse();
-      }
+    }
+    // If piggybank was used, time use is 100%.
+    timeuse_ =
+        ExponentialDecay(timeuse_, used_piggybank ?
1.0f : this_move_time_use, + params_.smartpruning_timeuse_halfupdate_moves(), + this_move_time_fraction); + if (timeuse_ < params_.min_smartpruning_timeuse()) { + timeuse_ = params_.min_smartpruning_timeuse(); } // Remember final number of nodes for tree reuse estimation. last_move_final_nodes_ = total_nodes; @@ -404,8 +457,10 @@ class SmoothTimeManager : public TimeManager { << ", moves=" << remaining_moves << ", time=" << total_remaining_ms << "ms, nps=" << nps_; - return std::make_unique(move_allocated_time_ms_, - allowed_piggybank_time_ms, this); + return std::make_unique( + move_allocated_time_ms_, allowed_piggybank_time_ms, + params_.trend_nps_update_period_ms(), params_.bestmove_optimism(), + params_.overtaker_optimism(), params_.force_piggybank_ms(), this); } void UpdateTreeReuseFactor(int64_t new_move_nodes) REQUIRES(mutex_) { @@ -460,11 +515,65 @@ class SmoothTimeManager : public TimeManager { bool is_first_move_ GUARDED_BY(mutex_) = true; }; +void VisitsTrendWatcher::Update(uint64_t timestamp, + const std::vector& visits) { + Mutex::Lock lock(mutex_); + if (timestamp <= last_timestamp_) return; + if (prev_visits_.empty()) { + prev_visits_ = visits; + cur_visits_ = visits; + prev_timestamp_ = timestamp; + cur_timestamp_ = timestamp; + } + last_timestamp_ = timestamp; + last_visits_ = visits; + if (cur_timestamp_ + nps_update_period_ >= timestamp) { + prev_timestamp_ = cur_timestamp_; + prev_visits_ = std::move(cur_visits_); + cur_visits_ = last_visits_; + cur_timestamp_ = last_timestamp_; + } +} + +bool VisitsTrendWatcher::IsBestmoveBeingOvertaken( + uint64_t by_which_time) const { + Mutex::Lock lock(mutex_); + // If we don't have any stats yet, we cannot stop the search. + if (prev_timestamp_ >= last_timestamp_) return false; + if (by_which_time <= last_timestamp_) return false; + std::vector npms; + npms.reserve(last_visits_.size()); + for (size_t i = 0; i < last_visits_.size(); ++i) { + npms.push_back(static_cast(last_visits_[i] - prev_visits_[i]) / + (last_timestamp_ - prev_timestamp_)); + } + const size_t bestmove_idx = + std::max_element(last_visits_.begin(), last_visits_.end()) - + last_visits_.begin(); + const auto planned_bestmove_visits = + last_visits_[bestmove_idx] + bestmove_optimism_ * npms[bestmove_idx] * + (by_which_time - last_timestamp_); + for (size_t i = 0; i < last_visits_.size(); ++i) { + if (i == bestmove_idx) continue; + const auto planned_visits = + last_visits_[i] + + overtaker_optimism_ * npms[i] * (by_which_time - last_timestamp_); + if (planned_visits > planned_bestmove_visits) return true; + } + return false; +} + SmoothStopper::SmoothStopper(int64_t deadline_ms, int64_t allowed_piggybank_use_ms, + float nps_update_period, float bestmove_optimism, + float overtaker_optimism, + int64_t forced_piggybank_use_ms, SmoothTimeManager* manager) : deadline_ms_(deadline_ms), allowed_piggybank_use_ms_(allowed_piggybank_use_ms), + forced_piggybank_use_ms_(forced_piggybank_use_ms), + visits_trend_watcher_(nps_update_period, bestmove_optimism, + overtaker_optimism), manager_(manager) { used_piggybank_.clear(); } @@ -478,10 +587,16 @@ bool SmoothStopper::ShouldStop(const IterationStats& stats, return true; } + visits_trend_watcher_.Update(stats.time_since_movestart, stats.edge_n); + const auto deadline_with_piggybank = deadline_ms_ + allowed_piggybank_use_ms_; + const bool force_use_piggybank = + stats.time_since_first_batch <= forced_piggybank_use_ms_; const bool use_piggybank = - (stats.time_usage_hint_ == IterationStats::TimeUsageHint::kNeedMoreTime); + 
(stats.time_usage_hint_ == IterationStats::TimeUsageHint::kNeedMoreTime || + force_use_piggybank || + visits_trend_watcher_.IsBestmoveBeingOvertaken(deadline_with_piggybank)); const int64_t time_limit = - use_piggybank ? (deadline_ms_ + allowed_piggybank_use_ms_) : deadline_ms_; + use_piggybank ? deadline_with_piggybank : deadline_ms_; hints->UpdateEstimatedNps(nps); hints->UpdateEstimatedRemainingTimeMs(time_limit - stats.time_since_movestart); @@ -489,10 +604,20 @@ bool SmoothStopper::ShouldStop(const IterationStats& stats, // It's not entirely correct as due to extended remaining time smart pruning // will trigger later and we spend more time than if use_piggyback was // false, even before reaching the deadline. - used_piggybank_.test_and_set(); + if (!used_piggybank_.test_and_set()) { + LOGFILE << "Entering piggybank, reason: " + << (stats.time_usage_hint_ == + IterationStats::TimeUsageHint::kNeedMoreTime + ? "requested by search." + : force_use_piggybank + ? "forced used in the beginning of the move." + : "bestmove can be overtaken."); + } } if (stats.time_since_movestart >= time_limit) { - LOGFILE << "Stopping search: Ran out of time."; + LOGFILE << "Stopping search: Ran out of time. elapsed=" << std::fixed + << stats.time_since_movestart << " limit=" << time_limit + << " piggy=" << use_piggybank; return true; } return false; diff --git a/src/mcts/stoppers/stoppers.cc b/src/mcts/stoppers/stoppers.cc index 8d6f7ae426..a66e4f8063 100644 --- a/src/mcts/stoppers/stoppers.cc +++ b/src/mcts/stoppers/stoppers.cc @@ -242,8 +242,9 @@ bool SmartPruningStopper::ShouldStop(const IterationStats& stats, } if (remaining_playouts < (largest_n - second_largest_n)) { - LOGFILE << remaining_playouts << " playouts remaining. Best move has " - << largest_n << " visits, second best -- " << second_largest_n + LOGFILE << std::fixed << remaining_playouts + << " playouts remaining. Best move has " << largest_n + << " visits, second best -- " << second_largest_n << ". 
Difference is " << (largest_n - second_largest_n) << ", so stopping the search after " << stats.batches_since_movestart << " batches."; From 8774cae6726b8e03f00be136af939b2eed3cfca6 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Sun, 4 Dec 2022 13:31:38 +0200 Subject: [PATCH 07/46] only use thread binding with blas backend (#1797) --- src/main.cc | 3 --- src/mcts/node.cc | 4 ---- src/mcts/search.cc | 1 - src/mcts/search.h | 4 +--- src/neural/blas/network_blas.cc | 5 +++++ src/neural/network.h | 1 + src/neural/network_demux.cc | 7 ++----- src/neural/network_mux.cc | 7 ++----- 8 files changed, 11 insertions(+), 21 deletions(-) diff --git a/src/main.cc b/src/main.cc index 3f370b898d..4c8880d4e6 100644 --- a/src/main.cc +++ b/src/main.cc @@ -36,7 +36,6 @@ #include "utils/commandline.h" #include "utils/esc_codes.h" #include "utils/logging.h" -#include "utils/numa.h" #include "version.h" int main(int argc, const char** argv) { @@ -49,8 +48,6 @@ int main(int argc, const char** argv) { << " built " << __DATE__; try { - Numa::Init(); - Numa::BindThread(0); InitializeMagicBitboards(); CommandLine::Init(argc, argv); diff --git a/src/mcts/node.cc b/src/mcts/node.cc index 9b71cff091..9bda7f144a 100644 --- a/src/mcts/node.cc +++ b/src/mcts/node.cc @@ -39,7 +39,6 @@ #include "neural/network.h" #include "utils/exception.h" #include "utils/hashcat.h" -#include "utils/numa.h" namespace lczero { @@ -99,9 +98,6 @@ class NodeGarbageCollector { } void Worker() { - // Keep garbage collection on same core as where search workers are most - // likely to be to make any lock conention on gc mutex cheaper. - Numa::BindThread(0); while (!stop_.load()) { std::this_thread::sleep_for(std::chrono::milliseconds(kGCIntervalMs)); GarbageCollect(); diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 29808e80ad..74077d912b 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -886,7 +886,6 @@ void Search::PopulateCommonIterationStats(IterationStats* stats) { } void Search::WatchdogThread() { - Numa::BindThread(0); LOGFILE << "Start a watchdog thread."; StoppersHints hints; IterationStats stats; diff --git a/src/mcts/search.h b/src/mcts/search.h index 9a1be4c125..1323320bca 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -44,7 +44,6 @@ #include "syzygy/syzygy.h" #include "utils/logging.h" #include "utils/mutex.h" -#include "utils/numa.h" namespace lczero { @@ -216,11 +215,10 @@ class SearchWorker { params_(params), moves_left_support_(search_->network_->GetCapabilities().moves_left != pblczero::NetworkFormat::MOVES_LEFT_NONE) { - Numa::BindThread(id); + search_->network_->InitThread(id); for (int i = 0; i < params.GetTaskWorkersPerSearchWorker(); i++) { task_workspaces_.emplace_back(); task_threads_.emplace_back([this, i]() { - Numa::BindThread(i); this->RunTasks(i); }); } diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 60827dc3f4..1bf2a61196 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -35,6 +35,7 @@ #include "neural/shared/attention_policy_map.h" #include "neural/shared/policy_map.h" #include "neural/shared/winograd_filter.h" +#include "utils/numa.h" #ifdef USE_DNNL #include @@ -135,6 +136,8 @@ class BlasNetwork : public Network { return capabilities_; } + void InitThread(int id) override { Numa::BindThread(id); } + private: // A cap on the max batch size since it consumes a lot of memory static constexpr auto kHardMaxBatchSize = 2048; @@ -648,6 +651,8 @@ BlasNetwork::BlasNetwork(const 
WeightsFile& file, : capabilities_{file.format().network_format().input(), file.format().network_format().moves_left()}, weights_(file.weights()) { + Numa::Init(); + max_batch_size_ = static_cast(options.GetOrDefault("batch_size", 256)); diff --git a/src/neural/network.h b/src/neural/network.h index 2d0a7c720e..fe6337711b 100644 --- a/src/neural/network.h +++ b/src/neural/network.h @@ -107,6 +107,7 @@ class Network { public: virtual const NetworkCapabilities& GetCapabilities() const = 0; virtual std::unique_ptr NewComputation() = 0; + virtual void InitThread(int /*id*/) {} virtual ~Network() = default; }; diff --git a/src/neural/network_demux.cc b/src/neural/network_demux.cc index 3845057ed4..8935b696c6 100644 --- a/src/neural/network_demux.cc +++ b/src/neural/network_demux.cc @@ -31,7 +31,6 @@ #include "neural/factory.h" #include "utils/exception.h" -#include "utils/numa.h" namespace lczero { namespace { @@ -135,7 +134,7 @@ class DemuxingNetwork : public Network { } for (int i = 0; i < nn_threads; ++i) { - threads_.emplace_back([this, i]() { Worker(i); }); + threads_.emplace_back([this, i]() { Worker(); }); } } @@ -163,9 +162,7 @@ class DemuxingNetwork : public Network { } } - void Worker(int id) { - // Add one to the id in order to leave space for an active search thread. - Numa::BindThread(id + 1); + void Worker() { // While Abort() is not called (and it can only be called from destructor). while (!abort_) { { diff --git a/src/neural/network_mux.cc b/src/neural/network_mux.cc index 440a382cda..e8d6ec71d3 100644 --- a/src/neural/network_mux.cc +++ b/src/neural/network_mux.cc @@ -31,7 +31,6 @@ #include "neural/factory.h" #include "utils/exception.h" -#include "utils/numa.h" namespace lczero { namespace { @@ -126,7 +125,7 @@ class MuxingNetwork : public Network { for (int i = 0; i < nn_threads; ++i) { threads_.emplace_back( - [this, net, max_batch, i]() { Worker(net, max_batch, i); }); + [this, net, max_batch]() { Worker(net, max_batch); }); } } @@ -154,9 +153,7 @@ class MuxingNetwork : public Network { } } - void Worker(Network* network, const int max_batch, int id) { - // Add one to the id in order to leave space for an active search thread. - Numa::BindThread(id + 1); + void Worker(Network* network, const int max_batch) { // While Abort() is not called (and it can only be called from destructor). while (!abort_) { std::vector children; From a3646eaf16f7df94ef0125438a9cabe6875b8de0 Mon Sep 17 00:00:00 2001 From: Naphthalin <40385638+Naphthalin@users.noreply.github.com> Date: Sun, 4 Dec 2022 12:32:20 +0100 Subject: [PATCH 08/46] Fix apparent "mate blindness" in endgame due to instamove logic (#1742) --- src/mcts/search.cc | 7 +++++++ src/mcts/stoppers/stoppers.cc | 3 ++- src/mcts/stoppers/timemgr.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 74077d912b..6fe8d1c7e6 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -844,6 +844,7 @@ void Search::PopulateCommonIterationStats(IterationStats* stats) { stats->average_depth = cum_depth_ / (total_playouts_ ? total_playouts_ : 1); stats->edge_n.clear(); stats->win_found = false; + stats->may_resign = true; stats->num_losing_edges = 0; stats->time_usage_hint_ = IterationStats::TimeUsageHint::kNormal; @@ -870,6 +871,12 @@ void Search::PopulateCommonIterationStats(IterationStats* stats) { if (n > 0 && edge.IsTerminal() && edge.GetWL(0.0f) < 0.0f) { stats->num_losing_edges += 1; } + // If game is resignable, no need for moving quicker. 
This allows + // proving mate when losing anyway for better score output. + // Hardcoded resign threshold, because there is no available parameter. + if (n > 0 && q > -0.98f) { + stats->may_resign = false; + } if (max_n < n) { max_n = n; max_n_has_max_q_plus_m = false; diff --git a/src/mcts/stoppers/stoppers.cc b/src/mcts/stoppers/stoppers.cc index a66e4f8063..9f99a4269a 100644 --- a/src/mcts/stoppers/stoppers.cc +++ b/src/mcts/stoppers/stoppers.cc @@ -195,7 +195,8 @@ bool SmartPruningStopper::ShouldStop(const IterationStats& stats, LOGFILE << "Only one possible move. Moving immediately."; return true; } - if (stats.edge_n.size() <= static_cast(stats.num_losing_edges + 1)) { + if (stats.edge_n.size() <= static_cast(stats.num_losing_edges) + + (stats.may_resign ? 0 : 1)) { LOGFILE << "At most one non losing move, stopping search."; return true; } diff --git a/src/mcts/stoppers/timemgr.h b/src/mcts/stoppers/timemgr.h index b545f194c9..64d0c2c05d 100644 --- a/src/mcts/stoppers/timemgr.h +++ b/src/mcts/stoppers/timemgr.h @@ -52,6 +52,7 @@ struct IterationStats { // TODO: remove this in favor of time_usage_hint_=kImmediateMove when // smooth time manager is the default. bool win_found = false; + bool may_resign = false; int num_losing_edges = 0; enum class TimeUsageHint { kNormal, kNeedMoreTime, kImmediateMove }; From 2feda7cb6b8094f75ddec9c9a7386cc19474cef0 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Sun, 4 Dec 2022 13:32:48 +0200 Subject: [PATCH 09/46] initial dfrc support (#1684) --- src/chess/board.cc | 96 +++++++++++++++++--------------- src/chess/board.h | 59 +++++++++++++------- src/chess/pgn.h | 4 +- src/neural/decoder.cc | 15 +++-- src/neural/encoder.cc | 22 ++++++-- src/trainingdata/trainingdata.cc | 20 ++++--- 6 files changed, 131 insertions(+), 85 deletions(-) diff --git a/src/chess/board.cc b/src/chess/board.cc index 0ec7de089d..0ace8142ef 100644 --- a/src/chess/board.cc +++ b/src/chess/board.cc @@ -463,21 +463,19 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const { // For castlings we don't check destination king square for checks, it // will be done in legal move check phase. if (castlings_.we_can_000()) { - const uint8_t qrook = castlings_.queenside_rook(); + const uint8_t qrook = castlings_.our_queenside_rook(); if (walk_free(std::min(static_cast(C1), qrook), std::max(static_cast(D1), king), qrook, king) && !range_attacked(king, C1)) { - result.emplace_back(source, - BoardSquare(RANK_1, castlings_.queenside_rook())); + result.emplace_back(source, BoardSquare(RANK_1, qrook)); } } if (castlings_.we_can_00()) { - const uint8_t krook = castlings_.kingside_rook(); + const uint8_t krook = castlings_.our_kingside_rook(); if (walk_free(std::min(static_cast(F1), king), std::max(static_cast(G1), krook), krook, king) && !range_attacked(king, G1)) { - result.emplace_back(source, - BoardSquare(RANK_1, castlings_.kingside_rook())); + result.emplace_back(source, BoardSquare(RANK_1, krook)); } } continue; @@ -625,10 +623,10 @@ bool ChessBoard::ApplyMove(Move move) { rooks_.reset(to); bishops_.reset(to); pawns_.reset(to); - if (to.as_int() == 56 + castlings_.kingside_rook()) { + if (to.as_int() == A8 + castlings_.their_kingside_rook()) { castlings_.reset_they_can_00(); } - if (to.as_int() == 56 + castlings_.queenside_rook()) { + if (to.as_int() == A8 + castlings_.their_queenside_rook()) { castlings_.reset_they_can_000(); } @@ -672,8 +670,12 @@ bool ChessBoard::ApplyMove(Move move) { // Reset castling rights. 
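    // In DFRC the two sides' rooks can start on different files, so castling
    // rights have to be keyed to each side's own rook squares instead of one
    // shared queenside/kingside pair. Roughly the test performed inline by
    // the two branches below, as a sketch (helper name assumed, not part of
    // this patch):
    //   bool ClearsOurRight(uint8_t from_row, uint8_t from_col,
    //                       uint8_t rook_file) {
    //     return from_row == RANK_1 && from_col == rook_file;
    //   }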
if (from_row == RANK_1 && rooks_.get(from)) { - if (from_col == castlings_.queenside_rook()) castlings_.reset_we_can_000(); - if (from_col == castlings_.kingside_rook()) castlings_.reset_we_can_00(); + if (from_col == castlings_.our_queenside_rook()) { + castlings_.reset_we_can_000(); + } + if (from_col == castlings_.our_kingside_rook()) { + castlings_.reset_we_can_00(); + } } // Ordinary move. @@ -1043,62 +1045,66 @@ void ChessBoard::SetFromFen(std::string fen, int* rule50_ply, int* moves) { } if (castlings != "-") { - uint8_t left_rook = FILE_A; - uint8_t right_rook = FILE_H; + uint8_t our_left_rook = FILE_A; + uint8_t our_right_rook = FILE_H; + uint8_t their_left_rook = FILE_A; + uint8_t their_right_rook = FILE_H; for (char c : castlings) { const bool is_black = std::islower(c); const int king_col = (is_black ? their_king_ : our_king_).col(); - if (!is_black) c = std::tolower(c); const auto rooks = (is_black ? their_pieces_ : our_pieces_) & ChessBoard::rooks(); - if (c == 'k') { - // Finding rightmost rook. - for (right_rook = FILE_H; right_rook > king_col; --right_rook) { - if (rooks.get(is_black ? RANK_8 : RANK_1, right_rook)) break; + auto find_rook = [rooks, king_col, fen](bool forward, uint8_t rank) { + uint8_t rook; + for (rook = forward ? FILE_A : FILE_H; rook != king_col; + rook += 2 * forward - 1) { + if (rooks.get(rank, rook)) break; } - if (right_rook == king_col) { - throw Exception("Bad fen string (no kingside rook): " + fen); + if (rook == king_col) { + throw Exception("Bad fen string (missing rook): " + fen); } - if (is_black) { - castlings_.set_they_can_00(); + return rook; + }; + if (c == 'K') { + // Finding rightmost rook. + our_right_rook = find_rook(false, RANK_1); + castlings_.set_we_can_00(); + } else if (c == 'Q') { + // Finding leftmost rook. + our_left_rook = find_rook(true, RANK_1); + castlings_.set_we_can_000(); + } else if (c >= 'A' && c <= 'H') { + int rook_col = c - 'A'; + if (rook_col < king_col) { + our_left_rook = rook_col; + castlings_.set_we_can_000(); } else { + our_right_rook = rook_col; castlings_.set_we_can_00(); } + } else if (c == 'k') { + // Finding rightmost rook. + their_right_rook = find_rook(false, RANK_8); + castlings_.set_they_can_00(); } else if (c == 'q') { // Finding leftmost rook. - for (left_rook = FILE_A; left_rook < king_col; ++left_rook) { - if (rooks.get(is_black ? 
RANK_8 : RANK_1, left_rook)) break;
-        }
-        if (left_rook == king_col) {
-          throw Exception("Bad fen string (no queenside rook): " + fen);
-        }
-        if (is_black) {
-          castlings_.set_they_can_000();
-        } else {
-          castlings_.set_we_can_000();
-        }
+        their_left_rook = find_rook(true, RANK_8);
+        castlings_.set_they_can_000();
       } else if (c >= 'a' && c <= 'h') {
         int rook_col = c - 'a';
         if (rook_col < king_col) {
-          left_rook = rook_col;
-          if (is_black) {
-            castlings_.set_they_can_000();
-          } else {
-            castlings_.set_we_can_000();
-          }
+          their_left_rook = rook_col;
+          castlings_.set_they_can_000();
         } else {
-          right_rook = rook_col;
-          if (is_black) {
-            castlings_.set_they_can_00();
-          } else {
-            castlings_.set_we_can_00();
-          }
+          their_right_rook = rook_col;
+          castlings_.set_they_can_00();
         }
       } else {
         throw Exception("Bad fen string (unexpected castling symbol): " + fen);
       }
     }
-    castlings_.SetRookPositions(left_rook, right_rook);
+    castlings_.SetRookPositions(our_left_rook, our_right_rook, their_left_rook,
+                                their_right_rook);
   }

   if (en_passant != "-") {
diff --git a/src/chess/board.h b/src/chess/board.h
index c9480b9631..bc0b82fbc5 100644
--- a/src/chess/board.h
+++ b/src/chess/board.h
@@ -116,7 +116,12 @@ class ChessBoard {

   class Castlings {
    public:
-    Castlings() : queenside_rook_(0), kingside_rook_(7) {}
+    Castlings()
+        : our_queenside_rook_(FILE_A),
+          their_queenside_rook_(FILE_A),
+          our_kingside_rook_(FILE_H),
+          their_kingside_rook_(FILE_H),
+          data_(0) {}

     void set_we_can_00() { data_ |= 1; }
     void set_we_can_000() { data_ |= 2; }
@@ -134,7 +139,11 @@
     bool they_can_000() const { return data_ & 8; }
     bool no_legal_castle() const { return data_ == 0; }

-    void Mirror() { data_ = ((data_ & 0b11) << 2) + ((data_ & 0b1100) >> 2); }
+    void Mirror() {
+      std::swap(our_queenside_rook_, their_queenside_rook_);
+      std::swap(our_kingside_rook_, their_kingside_rook_);
+      data_ = ((data_ & 0b11) << 2) + ((data_ & 0b1100) >> 2);
+    }

    // Note: this is not a strict xfen compatible output.
Without access to the
    // board it's not possible to know whether there is ambiguity, so all
    // cases with non-standard rook files are emitted with file letters.
    std::string as_string() const {
      if (data_ == 0) return "-";
      std::string result;
-      if (queenside_rook() == FILE_A && kingside_rook() == FILE_H) {
+      if (our_queenside_rook() == FILE_A && our_kingside_rook() == FILE_H &&
+          their_queenside_rook() == FILE_A && their_kingside_rook() == FILE_H) {
        if (we_can_00()) result += 'K';
        if (we_can_000()) result += 'Q';
        if (they_can_00()) result += 'k';
        if (they_can_000()) result += 'q';
      } else {
-        if (we_can_00()) result += 'A' + kingside_rook();
-        if (we_can_000()) result += 'A' + queenside_rook();
-        if (they_can_00()) result += 'a' + kingside_rook();
-        if (they_can_000()) result += 'a' + queenside_rook();
+        if (we_can_00()) result += 'A' + our_kingside_rook();
+        if (we_can_000()) result += 'A' + our_queenside_rook();
+        if (they_can_00()) result += 'a' + their_kingside_rook();
+        if (they_can_000()) result += 'a' + their_queenside_rook();
      }
      return result;
    }
@@ -164,8 +174,10 @@
      if (they_can_00()) result += 'k';
      if (they_can_000()) result += 'q';
      result += '[';
-      result += 'a' + queenside_rook();
-      result += 'a' + kingside_rook();
+      result += 'A' + our_queenside_rook();
+      result += 'A' + our_kingside_rook();
+      result += 'a' + their_queenside_rook();
+      result += 'a' + their_kingside_rook();
      result += ']';
      return result;
    }

    uint8_t as_int() const { return data_; }

    bool operator==(const Castlings& other) const {
-      assert(queenside_rook_ == other.queenside_rook_ &&
-             kingside_rook_ == other.kingside_rook_);
+      assert(our_queenside_rook_ == other.our_queenside_rook_ &&
+             our_kingside_rook_ == other.our_kingside_rook_ &&
+             their_queenside_rook_ == other.their_queenside_rook_ &&
+             their_kingside_rook_ == other.their_kingside_rook_);
      return data_ == other.data_;
    }

-    uint8_t queenside_rook() const { return queenside_rook_; }
-    uint8_t kingside_rook() const { return kingside_rook_; }
-    void SetRookPositions(std::uint8_t left, std::uint8_t right) {
-      queenside_rook_ = left;
-      kingside_rook_ = right;
+    uint8_t our_queenside_rook() const { return our_queenside_rook_; }
+    uint8_t our_kingside_rook() const { return our_kingside_rook_; }
+    uint8_t their_queenside_rook() const { return their_queenside_rook_; }
+    uint8_t their_kingside_rook() const { return their_kingside_rook_; }
+    void SetRookPositions(uint8_t our_left, uint8_t our_right,
+                          uint8_t their_left, uint8_t their_right) {
+      our_queenside_rook_ = our_left;
+      our_kingside_rook_ = our_right;
+      their_queenside_rook_ = their_left;
+      their_kingside_rook_ = their_right;
    }

   private:
    // Position of "left" (queenside) rook in starting game position.
-    std::uint8_t queenside_rook_ : 3;
+    uint8_t our_queenside_rook_;
+    uint8_t their_queenside_rook_;
    // Position of "right" (kingside) rook in starting position.
-    std::uint8_t kingside_rook_ : 3;
+    uint8_t our_kingside_rook_;
+    uint8_t their_kingside_rook_;
    // - Bit 0 -- "our" side's kingside castle.
    // - Bit 1 -- "our" side's queenside castle.
    // - Bit 2 -- opponent's side's kingside castle.
    // - Bit 3 -- opponent's side's queenside castle.
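    // Worked example of the Mirror() arithmetic above: with data_ == 0b0110
    // ("we" can castle queenside, "they" can castle kingside),
    //   ((0b0110 & 0b11) << 2) + ((0b0110 & 0b1100) >> 2) == 0b1001,
    // i.e. "we" can now castle kingside and "they" queenside -- the rights
    // change sides together with the rook files swapped by std::swap above.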
- std::uint8_t data_ = 0; + uint8_t data_; }; std::string DebugString() const; diff --git a/src/chess/pgn.h b/src/chess/pgn.h index 0f38d406c4..512d4c673b 100644 --- a/src/chess/pgn.h +++ b/src/chess/pgn.h @@ -213,10 +213,10 @@ class PgnReader { BoardSquare king_sq(GetLowestBit(king_board.as_int())); if (san.size() > 4 && san[3] == '-' && san[4] == 'O') { m = Move(BoardSquare(0, king_sq.col()), - BoardSquare(0, board.castlings().queenside_rook())); + BoardSquare(0, board.castlings().our_queenside_rook())); } else { m = Move(BoardSquare(0, king_sq.col()), - BoardSquare(0, board.castlings().kingside_rook())); + BoardSquare(0, board.castlings().our_kingside_rook())); } return m; } diff --git a/src/neural/decoder.cc b/src/neural/decoder.cc index 13007caca6..1798523730 100644 --- a/src/neural/decoder.cc +++ b/src/neural/decoder.cc @@ -95,29 +95,34 @@ void PopulateBoard(pblczero::NetworkFormat::InputFormat input_format, case pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION_V2: case pblczero::NetworkFormat:: INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON: { - auto queenside = 0; - auto kingside = 7; + int our_queenside = ChessBoard::FILE_A; + int their_queenside = ChessBoard::FILE_A; + int our_kingside = ChessBoard::FILE_H; + int their_kingside = ChessBoard::FILE_H; if (planes[kAuxPlaneBase + 0].mask != 0) { auto mask = planes[kAuxPlaneBase + 0].mask; - queenside = GetLowestBit((mask >> 56) | mask); if ((mask & 0xFFLL) != 0) { + our_queenside = GetLowestBit(mask & 0xFFLL); castlings.set_we_can_000(); } if (mask >> 56 != 0) { + their_queenside = GetLowestBit(mask >> 56); castlings.set_they_can_000(); } } if (planes[kAuxPlaneBase + 1].mask != 0) { auto mask = planes[kAuxPlaneBase + 1].mask; - kingside = GetLowestBit((mask >> 56) | mask); if ((mask & 0xFFLL) != 0) { + our_kingside = GetLowestBit(mask & 0xFFLL); castlings.set_we_can_00(); } if (mask >> 56 != 0) { + their_kingside = GetLowestBit(mask >> 56); castlings.set_they_can_00(); } } - castlings.SetRookPositions(queenside, kingside); + castlings.SetRookPositions(our_queenside, our_kingside, their_queenside, + their_kingside); break; } diff --git a/src/neural/encoder.cc b/src/neural/encoder.cc index ef8ef6524a..63851fad48 100644 --- a/src/neural/encoder.cc +++ b/src/neural/encoder.cc @@ -182,13 +182,23 @@ InputPlanes EncodePositionForNN( // h-side (kingside) castling right. const auto& cast = board.castlings(); result[kAuxPlaneBase + 0].mask = - ((cast.we_can_000() ? BoardSquare(ChessBoard::A1).as_board() : 0) | - (cast.they_can_000() ? BoardSquare(ChessBoard::A8).as_board() : 0)) - << cast.queenside_rook(); + (cast.we_can_000() + ? BoardSquare(ChessBoard::A1 + cast.our_queenside_rook()) + .as_board() + : 0) | + (cast.they_can_000() + ? BoardSquare(ChessBoard::A8 + cast.their_queenside_rook()) + .as_board() + : 0); result[kAuxPlaneBase + 1].mask = - ((cast.we_can_00() ? BoardSquare(ChessBoard::A1).as_board() : 0) | - (cast.they_can_00() ? BoardSquare(ChessBoard::A8).as_board() : 0)) - << cast.kingside_rook(); + (cast.we_can_00() + ? BoardSquare(ChessBoard::A1 + cast.our_kingside_rook()) + .as_board() + : 0) | + (cast.they_can_00() + ? 
BoardSquare(ChessBoard::A8 + cast.their_kingside_rook()) + .as_board() + : 0); break; } default: diff --git a/src/trainingdata/trainingdata.cc b/src/trainingdata/trainingdata.cc index cb4b0c82bd..1285dc7b49 100644 --- a/src/trainingdata/trainingdata.cc +++ b/src/trainingdata/trainingdata.cc @@ -191,18 +191,22 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, const auto& castlings = position.GetBoard().castlings(); // Populate castlings. // For non-frc trained nets, just send 1 like we used to. - uint8_t queen_side = 1; - uint8_t king_side = 1; + uint8_t our_queen_side = 1; + uint8_t our_king_side = 1; + uint8_t their_queen_side = 1; + uint8_t their_king_side = 1; // If frc trained, send the bit mask representing rook position. if (Is960CastlingFormat(input_format_)) { - queen_side <<= castlings.queenside_rook(); - king_side <<= castlings.kingside_rook(); + our_queen_side <<= castlings.our_queenside_rook(); + our_king_side <<= castlings.our_kingside_rook(); + their_queen_side <<= castlings.their_queenside_rook(); + their_king_side <<= castlings.their_kingside_rook(); } - result.castling_us_ooo = castlings.we_can_000() ? queen_side : 0; - result.castling_us_oo = castlings.we_can_00() ? king_side : 0; - result.castling_them_ooo = castlings.they_can_000() ? queen_side : 0; - result.castling_them_oo = castlings.they_can_00() ? king_side : 0; + result.castling_us_ooo = castlings.we_can_000() ? our_queen_side : 0; + result.castling_us_oo = castlings.we_can_00() ? our_king_side : 0; + result.castling_them_ooo = castlings.they_can_000() ? their_queen_side : 0; + result.castling_them_oo = castlings.they_can_00() ? their_king_side : 0; // Other params. if (IsCanonicalFormat(input_format_)) { From 2efcd4f19034ed23d22f7e367818e18bc8388e41 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Wed, 7 Dec 2022 17:58:41 +0100 Subject: [PATCH 10/46] Fix error in mha scaling in encoder layers. (#1808) --- src/neural/blas/network_blas.cc | 2 +- src/neural/metal/mps/MetalNetworkBuilder.mm | 2 -- src/neural/metal/mps/NetworkGraph.h | 2 -- src/neural/metal/mps/NetworkGraph.mm | 17 +++++++++-------- src/neural/onnx/adapters.cc | 1 + src/neural/onnx/converter.cc | 4 ++-- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 1bf2a61196..3edc1f15b0 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -352,7 +352,7 @@ void BlasComputation::ComputeBlocking() { const int d_model = layer.mha.q_b.size(); const int heads = weights_.pol_encoder_head_count; const int depth = d_model / heads; - const float scaling = 1.0f / sqrtf(d_model); + const float scaling = 1.0f / sqrtf(depth); // MHA is done per batch since there's a fourth dimension introduced. for (auto batch = size_t{0}; batch < batch_size; batch++) { diff --git a/src/neural/metal/mps/MetalNetworkBuilder.mm b/src/neural/metal/mps/MetalNetworkBuilder.mm index 9940920307..2a25accdfd 100644 --- a/src/neural/metal/mps/MetalNetworkBuilder.mm +++ b/src/neural/metal/mps/MetalNetworkBuilder.mm @@ -148,8 +148,6 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a withKeys:mhaK withValues:mhaV heads:weights.pol_encoder_head_count - dModel:dModel - scale:1.0f / sqrt(dModel) label:[NSString stringWithFormat:@"policy/encoder_%zu/mha", i]]; // MHA final dense layer. 
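The corrected factor above follows the standard multi-head attention convention: the QK^T logits are scaled by 1/sqrt(depth) of a single head, where depth = d_model / heads, so the softmax temperature does not drift with the head count. A minimal single-head sketch in plain C++ (the function name and array layout are illustrative only, not code from this patch):

#include <cmath>

// One attention head over the 64 board squares.
// q and k are [64 x depth] row-major; attn receives the scaled logits.
void ScaledQKLogits(const float* q, const float* k, float* attn, int depth) {
  // Per-head depth, not d_model: this is the bug the patch fixes.
  const float scale = 1.0f / sqrtf(static_cast<float>(depth));
  for (int i = 0; i < 64; ++i) {
    for (int j = 0; j < 64; ++j) {
      float dot = 0.0f;
      for (int c = 0; c < depth; ++c) dot += q[i * depth + c] * k[j * depth + c];
      attn[i * 64 + j] = dot * scale;  // previously scaled by 1/sqrt(d_model)
    }
  }
}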
diff --git a/src/neural/metal/mps/NetworkGraph.h b/src/neural/metal/mps/NetworkGraph.h index 353a367bbc..8df9a1058c 100644 --- a/src/neural/metal/mps/NetworkGraph.h +++ b/src/neural/metal/mps/NetworkGraph.h @@ -120,8 +120,6 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat withKeys:(MPSGraphTensor * __nonnull)keys withValues:(MPSGraphTensor * __nonnull)values heads:(NSUInteger)heads - dModel:(NSUInteger)dModel - scale:(float)scale label:(NSString * __nonnull)label; -(nonnull MPSGraphTensor *) scaledQKMatmulWithQueries:(MPSGraphTensor * __nonnull)queries diff --git a/src/neural/metal/mps/NetworkGraph.mm b/src/neural/metal/mps/NetworkGraph.mm index c5e46e7e23..2a437ffe23 100644 --- a/src/neural/metal/mps/NetworkGraph.mm +++ b/src/neural/metal/mps/NetworkGraph.mm @@ -611,12 +611,12 @@ -(nonnull MPSGraphTensor *) scaledMHAMatmulWithQueries:(MPSGraphTensor * __nonnu withKeys:(MPSGraphTensor * __nonnull)keys withValues:(MPSGraphTensor * __nonnull)values heads:(NSUInteger)heads - dModel:(NSUInteger)dModel - scale:(float)scale label:(NSString * __nonnull)label { // Split heads. - const NSUInteger depth = dModel / heads; + const NSUInteger dmodel = [[queries.shape lastObject] intValue]; + const NSUInteger depth = dmodel / heads; + queries = [_graph reshapeTensor:queries withShape:@[@(-1), @64, @(heads), @(depth)] name:[NSString stringWithFormat:@"%@/reshape_q", label]]; queries = [_graph transposeTensor:queries dimension:1 withDimension:2 name:[NSString stringWithFormat:@"%@/transpose_q", label]]; @@ -631,10 +631,11 @@ -(nonnull MPSGraphTensor *) scaledMHAMatmulWithQueries:(MPSGraphTensor * __nonnu MPSGraphTensor * attn = [_graph matrixMultiplicationWithPrimaryTensor:queries secondaryTensor:keys name:[NSString stringWithFormat:@"%@/matmul_qk", label]]; - attn = [_graph multiplicationWithPrimaryTensor:attn - secondaryTensor:[_graph constantWithScalar:scale - shape:@[@1] dataType:attn.dataType] - name:[NSString stringWithFormat:@"%@/scale", label]]; + attn = [_graph divisionWithPrimaryTensor:attn + secondaryTensor:[_graph constantWithScalar:sqrt(depth) + shape:@[@1] + dataType:attn.dataType] + name:[NSString stringWithFormat:@"%@/scale", label]]; attn = [self applyActivationWithTensor:attn activation:@"softmax" label:label]; @@ -645,7 +646,7 @@ -(nonnull MPSGraphTensor *) scaledMHAMatmulWithQueries:(MPSGraphTensor * __nonnu attn = [_graph transposeTensor:attn dimension:1 withDimension:2 name:[NSString stringWithFormat:@"%@/transpose_a", label]]; - return [_graph reshapeTensor:attn withShape:@[@(-1), @(dModel)] name:[NSString stringWithFormat:@"%@/reshape_a", label]]; + return [_graph reshapeTensor:attn withShape:@[@(-1), @(dmodel)] name:[NSString stringWithFormat:@"%@/reshape_a", label]]; } -(nonnull MPSGraphTensor *) scaledQKMatmulWithQueries:(MPSGraphTensor * __nonnull)queries diff --git a/src/neural/onnx/adapters.cc b/src/neural/onnx/adapters.cc index e0f5485046..96f59ea616 100644 --- a/src/neural/onnx/adapters.cc +++ b/src/neural/onnx/adapters.cc @@ -26,6 +26,7 @@ Program grant you additional permission to convey the resulting work. 
*/ #include "neural/onnx/adapters.h" +#include #include "utils/fp16_utils.h" #include "utils/transpose.h" diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc index 1bf4b88b70..aaf200120d 100644 --- a/src/neural/onnx/converter.cc +++ b/src/neural/onnx/converter.cc @@ -275,10 +275,10 @@ std::string Converter::MakeEncoderLayer( std::unique_ptr scale; if (GetDataType() == pblczero::TensorProto::FLOAT16) { scale = std::make_unique( - Float16OnnxConst({FP32toFP16(1.0f / sqrtf(d_model))}, {1})); + Float16OnnxConst({FP32toFP16(1.0f / sqrtf(depth))}, {1})); } else { scale = std::make_unique( - FloatOnnxConst({1.0f / sqrtf(d_model)}, {1})); + FloatOnnxConst({1.0f / sqrtf(depth)}, {1})); } flow = builder->Mul(name + "/mha/QK/scale", flow, *scale); flow = builder->Softmax(name + "/mha/QK/softmax", flow, 3); From eeeab14f929daa81d908c97d57b7c494b086d994 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Thu, 8 Dec 2022 15:43:56 +0200 Subject: [PATCH 11/46] add probability for early termination of selfplay openings (#1681) --- src/selfplay/game.cc | 35 +++++++++++++++++++++++++++++++++-- src/selfplay/game.h | 4 ++++ src/selfplay/tournament.cc | 31 +++++++++++++++---------------- 3 files changed, 52 insertions(+), 18 deletions(-) diff --git a/src/selfplay/game.cc b/src/selfplay/game.cc index d56e1552e0..c06a0e3d85 100644 --- a/src/selfplay/game.cc +++ b/src/selfplay/game.cc @@ -31,6 +31,7 @@ #include "mcts/stoppers/common.h" #include "mcts/stoppers/factory.h" +#include "utils/random.h" namespace lczero { @@ -60,6 +61,10 @@ const OptionId kSyzygyTablebaseId{ "List of Syzygy tablebase directories, list entries separated by system " "separator (\";\" for Windows, \":\" for Linux).", 's'}; +const OptionId kOpeningStopProbId{ + "opening-stop-prob", "OpeningStopProb", + "From each opening move, start a self-play game with probability max(p, " + "1/n), where p is the value given and n the opening moves remaining."}; } // namespace void SelfPlayGame::PopulateUciParams(OptionsParser* options) { @@ -71,6 +76,7 @@ void SelfPlayGame::PopulateUciParams(OptionsParser* options) { options->Add(kUciChess960) = false; PopulateTimeManagementOptions(RunType::kSelfplay, options); options->Add(kSyzygyTablebaseId); + options->Add(kOpeningStopProbId, 0.0f, 1.0f) = 0.0f; } SelfPlayGame::SelfPlayGame(PlayerOptions white, PlayerOptions black, @@ -91,10 +97,34 @@ SelfPlayGame::SelfPlayGame(PlayerOptions white, PlayerOptions black, tree_[1] = std::make_shared(); tree_[1]->ResetToPosition(orig_fen_, {}); } + int ply = 0; + auto white_prob = white.uci_options->Get(kOpeningStopProbId); + auto black_prob = black.uci_options->Get(kOpeningStopProbId); + if (white_prob != black_prob && white_prob != 0 && black_prob != 0) { + throw Exception("Stop probabilities must be both equal or zero!"); + } + for (Move m : opening.moves) { + // For early exit from the opening, we support two cases: a) where both + // sides have the same exit probability and b) where one side's exit + // probability is zero. In the following formula, `positions` is the number + // of possible exit points remaining, used for adjusting the exit + // probability (to avoid favoring the last position). + auto exit_prob_now = tree_[0]->IsBlackToMove() ? black_prob : white_prob; + auto exit_prob_next = tree_[0]->IsBlackToMove() ? 
white_prob : black_prob; + int positions = opening.moves.size() - ply + 1; + if (exit_prob_now > 0.0f && + Random::Get().GetFloat(1.0f) < + std::max(exit_prob_now, + exit_prob_now / (exit_prob_now * ((positions + 1) / 2) + + exit_prob_next * (positions / 2)))) { + break; + } tree_[0]->MakeMove(m); if (tree_[0] != tree_[1]) tree_[1]->MakeMove(m); + ply++; } + start_ply_ = ply; } void SelfPlayGame::Play(int white_threads, int black_threads, bool training, @@ -102,8 +132,9 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training, bool blacks_move = tree_[0]->IsBlackToMove(); // If we are training, verify that input formats are consistent. - if (training && options_[0].network->GetCapabilities().input_format != - options_[1].network->GetCapabilities().input_format) { + if (training && + options_[0].network->GetCapabilities().input_format != + options_[1].network->GetCapabilities().input_format) { throw Exception("Can't mix networks with different input format!"); } // Take syzygy tablebases from player1 options. diff --git a/src/selfplay/game.h b/src/selfplay/game.h index 7ad7c2b2ae..918c328ce1 100644 --- a/src/selfplay/game.h +++ b/src/selfplay/game.h @@ -85,6 +85,9 @@ class SelfPlayGame { // not. void Abort(); + // Number of ply used from the given opening. + int GetStartPly() const { return start_ply_; } + // Writes training data to a file. void WriteTrainingData(TrainingDataWriter* writer) const; @@ -103,6 +106,7 @@ class SelfPlayGame { // tree_[0] == tree_[1]. std::shared_ptr tree_[2]; std::string orig_fen_; + int start_ply_; // Search that is currently in progress. Stored in members so that Abort() // can stop it. diff --git a/src/selfplay/tournament.cc b/src/selfplay/tournament.cc index 084e62126f..cd9b7f79c8 100644 --- a/src/selfplay/tournament.cc +++ b/src/selfplay/tournament.cc @@ -80,10 +80,10 @@ const OptionId kOpeningsMirroredId{ const OptionId kOpeningsModeId{"openings-mode", "OpeningsMode", "A choice of sequential, shuffled, or random."}; const OptionId kSyzygyTablebaseId{ - "syzygy-paths", "SyzygyPath", - "List of Syzygy tablebase directories, list entries separated by system " - "separator (\";\" for Windows, \":\" for Linux).", - 's' }; + "syzygy-paths", "SyzygyPath", + "List of Syzygy tablebase directories, list entries separated by system " + "separator (\";\" for Windows, \":\" for Linux).", + 's'}; } // namespace @@ -218,17 +218,15 @@ SelfPlayTournament::SelfPlayTournament( } // Take syzygy tablebases from options. - std::string tb_paths = - options.Get(kSyzygyTablebaseId); + std::string tb_paths = options.Get(kSyzygyTablebaseId); if (!tb_paths.empty()) { - syzygy_tb_ = std::make_unique(); - CERR << "Loading Syzygy tablebases from " << tb_paths; - if (!syzygy_tb_->init(tb_paths)) { - CERR << "Failed to load Syzygy tablebases!"; - syzygy_tb_ = nullptr; - } + syzygy_tb_ = std::make_unique(); + CERR << "Loading Syzygy tablebases from " << tb_paths; + if (!syzygy_tb_->init(tb_paths)) { + CERR << "Failed to load Syzygy tablebases!"; + syzygy_tb_ = nullptr; + } } - } void SelfPlayTournament::PlayOneGame(int game_number) { @@ -352,7 +350,7 @@ void SelfPlayTournament::PlayOneGame(int game_number) { auto player2_threads = player_options_[1][color_idx[1]].Get(kThreadsId); game.Play(player1_threads, player2_threads, kTraining, syzygy_tb_.get(), enable_resign); - + // If game was aborted, it's still undecided. if (game.GetGameResult() != GameResult::UNDECIDED) { // Game callback. 
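When both sides use the same stop probability p, the expression above reduces to max(p, 1/positions), since under integer division (positions + 1) / 2 + positions / 2 equals positions. For small p each remaining opening ply is then chosen as the start point with equal probability, in the style of reservoir sampling. A small self-contained sketch of that behavior (illustrative only, not part of the patch; lc0 itself draws from Random::Get()):

#include <algorithm>
#include <random>
#include <vector>

// Simulates where self-play games start inside an opening of `plies` moves
// when both players share the same opening-stop-prob p.
std::vector<int> StartPlyHistogram(int plies, float p, int games) {
  std::vector<int> hist(plies + 1, 0);
  std::mt19937 rng(42);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (int g = 0; g < games; ++g) {
    int ply = 0;
    for (; ply < plies; ++ply) {
      const int remaining = plies - ply + 1;  // start points still possible
      // max(p, 1/n): for small p this yields a uniform choice of start ply.
      if (dist(rng) < std::max(p, 1.0f / remaining)) break;
    }
    ++hist[ply];  // game starts after `ply` opening moves
  }
  return hist;
}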
@@ -362,12 +360,13 @@ void SelfPlayTournament::PlayOneGame(int game_number) { game_info.game_id = game_number; game_info.initial_fen = opening.start_fen; game_info.moves = game.GetMoves(); - game_info.play_start_ply = opening.moves.size(); + game_info.play_start_ply = game.GetStartPly(); if (!enable_resign) { game_info.min_false_positive_threshold = game.GetWorstEvalForWinnerOrDraw(); } - if (kTraining) { + if (kTraining && + game_info.play_start_ply < static_cast(game_info.moves.size())) { TrainingDataWriter writer(game_number); game.WriteTrainingData(&writer); writer.Finalize(); From bb238006f9db90226082a375184919cda3aa361c Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Mon, 12 Dec 2022 13:57:46 +0200 Subject: [PATCH 12/46] update some shared files for smolgen --- appveyor.yml | 6 +++--- src/neural/network_legacy.cc | 22 ++++++++++++++++++++-- src/neural/network_legacy.h | 25 ++++++++++++++++++++++++- src/neural/shared/activation.h | 2 +- 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index ab5225341e..fb64dc3d18 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -51,9 +51,9 @@ install: - cmd: set DNNL_NAME=dnnl_win_1.5.0_cpu_vcomp - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/oneapi-src/oneDNN/releases/download/v1.5/dnnl_win_1.5.0_cpu_vcomp.zip - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_1.5.0_cpu_vcomp.zip -oC:\cache -- cmd: IF %NAME%==onednn set DNNL_NAME=dnnl_win_2.6.0_cpu_vcomp_gpu_vcomp -- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/borg323/oneDNN/releases/download/v2.6/dnnl_win_2.6.0_cpu_vcomp_gpu_vcomp.zip -- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_2.6.0_cpu_vcomp_gpu_vcomp.zip -oC:\cache +- cmd: IF %NAME%==onednn set DNNL_NAME=dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp +- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/borg323/oneDNN/releases/download/v2.7.2/dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip +- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip -oC:\cache - cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 appveyor DownloadFile https://github.com/borg323/onnxruntime/releases/download/v1.13.1/onnxruntime-win-x64-dml-1.13.1.zip - cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 7z x onnxruntime-win-x64-dml-1.13.1.zip -oC:\cache - cmd: IF %NAME%==onnx-dml set ONNX_NAME=onnxruntime-win-x64-dml-1.13.1 diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc index 0872ae0ddc..387590de6b 100644 --- a/src/neural/network_legacy.cc +++ b/src/neural/network_legacy.cc @@ -32,6 +32,8 @@ LegacyWeights::LegacyWeights(const pblczero::Weights& weights) : input(weights.input()), ip_emb_w(LayerAdapter(weights.ip_emb_w()).as_vector()), ip_emb_b(LayerAdapter(weights.ip_emb_b()).as_vector()), + ip_mult_gate(LayerAdapter(weights.ip_mult_gate()).as_vector()), + ip_add_gate(LayerAdapter(weights.ip_add_gate()).as_vector()), policy1(weights.policy1()), policy(weights.policy()), ip_pol_w(LayerAdapter(weights.ip_pol_w()).as_vector()), @@ -54,7 +56,9 @@ LegacyWeights::LegacyWeights(const pblczero::Weights& weights) ip1_mov_w(LayerAdapter(weights.ip1_mov_w()).as_vector()), ip1_mov_b(LayerAdapter(weights.ip1_mov_b()).as_vector()), ip2_mov_w(LayerAdapter(weights.ip2_mov_w()).as_vector()), - 
ip2_mov_b(LayerAdapter(weights.ip2_mov_b()).as_vector()) { + ip2_mov_b(LayerAdapter(weights.ip2_mov_b()).as_vector()), + smolgen_w(LayerAdapter(weights.smolgen_w()).as_vector()), + has_smolgen(weights.has_smolgen_w()) { for (const auto& res : weights.residual()) { residual.emplace_back(res); } @@ -145,7 +149,9 @@ LegacyWeights::MHA::MHA(const pblczero::Weights::MHA& mha) v_w(LayerAdapter(mha.v_w()).as_vector()), v_b(LayerAdapter(mha.v_b()).as_vector()), dense_w(LayerAdapter(mha.dense_w()).as_vector()), - dense_b(LayerAdapter(mha.dense_b()).as_vector()) {} + dense_b(LayerAdapter(mha.dense_b()).as_vector()), + smolgen(Smolgen(mha.smolgen())), + has_smolgen(mha.has_smolgen()) {} LegacyWeights::FFN::FFN(const pblczero::Weights::FFN& ffn) : dense1_w(LayerAdapter(ffn.dense1_w()).as_vector()), @@ -162,4 +168,16 @@ LegacyWeights::EncoderLayer::EncoderLayer( ln2_gammas(LayerAdapter(encoder.ln2_gammas()).as_vector()), ln2_betas(LayerAdapter(encoder.ln2_betas()).as_vector()) {} +LegacyWeights::Smolgen::Smolgen( + const pblczero::Weights::Smolgen& smolgen) + : compress(LayerAdapter(smolgen.compress()).as_vector()), + dense1_w(LayerAdapter(smolgen.dense1_w()).as_vector()), + dense1_b(LayerAdapter(smolgen.dense1_b()).as_vector()), + ln1_gammas(LayerAdapter(smolgen.ln1_gammas()).as_vector()), + ln1_betas(LayerAdapter(smolgen.ln1_betas()).as_vector()), + dense2_w(LayerAdapter(smolgen.dense2_w()).as_vector()), + dense2_b(LayerAdapter(smolgen.dense2_b()).as_vector()), + ln2_gammas(LayerAdapter(smolgen.ln2_gammas()).as_vector()), + ln2_betas(LayerAdapter(smolgen.ln2_betas()).as_vector()) {} + } // namespace lczero diff --git a/src/neural/network_legacy.h b/src/neural/network_legacy.h index 19284af172..5715c40fbb 100644 --- a/src/neural/network_legacy.h +++ b/src/neural/network_legacy.h @@ -55,6 +55,19 @@ struct LegacyWeights { bool has_se; }; + struct Smolgen { + explicit Smolgen(const pblczero::Weights::Smolgen& smolgen); + Vec compress; + Vec dense1_w; + Vec dense1_b; + Vec ln1_gammas; + Vec ln1_betas; + Vec dense2_w; + Vec dense2_b; + Vec ln2_gammas; + Vec ln2_betas; + }; + struct MHA { explicit MHA(const pblczero::Weights::MHA& mha); Vec q_w; @@ -65,6 +78,8 @@ struct LegacyWeights { Vec v_b; Vec dense_w; Vec dense_b; + Smolgen smolgen; + bool has_smolgen; }; struct FFN { @@ -92,11 +107,14 @@ struct LegacyWeights { Vec ip_emb_w; Vec ip_emb_b; + // Input gating + Vec ip_mult_gate; + Vec ip_add_gate; + // Encoder stack. std::vector encoder; int encoder_head_count; - // Residual tower. 
std::vector residual; @@ -133,6 +151,11 @@ struct LegacyWeights { Vec ip1_mov_b; Vec ip2_mov_w; Vec ip2_mov_b; + + // Smolgen global weights + Vec smolgen_w; + Vec smolgen_b; + bool has_smolgen; }; } // namespace lczero diff --git a/src/neural/shared/activation.h b/src/neural/shared/activation.h index 5937126b32..8a55df486b 100644 --- a/src/neural/shared/activation.h +++ b/src/neural/shared/activation.h @@ -22,7 +22,7 @@ #include namespace lczero { -enum ActivationFunction { NONE, RELU, TANH, SIGMOID, SELU, MISH }; +enum ActivationFunction { NONE, RELU, TANH, SIGMOID, SELU, MISH, SWISH }; // Softmax activation void SoftmaxActivation(const size_t size, const float* input, float* output); From c80a4b72a2271be3cc54344169267b2d60ebbd5a Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 16 Dec 2022 22:57:01 +0200 Subject: [PATCH 13/46] add swish activation --- src/neural/shared/activation.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index ecf154f2ed..9f6b7489c4 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -78,6 +78,8 @@ float Activate(const float val, const ActivationFunction activation) { return 1.0f / (1.0f + expf(-val)); case SELU: return selu(val); + case SWISH: + return val / (1.0f + expf(-val)); case NONE: // Nothing to do. break; From 45965dda5665a50d709d54afece6285b3e92f7ed Mon Sep 17 00:00:00 2001 From: Ankan Banerjee Date: Sat, 17 Dec 2022 09:13:08 +0530 Subject: [PATCH 14/46] Persistent L2 cache opt for cuda backend (#1815) * add persistent L2 cache opt - goal is to fit activations in residual block in L2 cache. - around 6.7% improvement in T80 networks. * add checks for cuda version - prevent compile errors when compiled with old CUDA toolkit. * fix typo with cudart_version --- src/neural/cuda/layers.cc | 6 +++ src/neural/cuda/network_cuda.cc | 67 +++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 643974b375..47feb4eb9a 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1204,6 +1204,12 @@ void ResidualBlock::Eval(int N, DataType* output, DataType* transformed_output = transformed_input + scratch_size / (2 * sizeof(DataType)); + // caller wants us to sub-allocate all memory we need from "output" tensor + if (!scratch) { + transformed_input = output; // this is true in normal cases too! + transformed_output = transformed_input + (N * C * 8 * 8 * 36 / 16); + } + if (first_block_) { InputTransform(N, c_input_, transformed_input, input, stream); diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index b374d27561..5052b78021 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -178,6 +178,10 @@ class CudaNetwork : public Network { cudaGetDeviceProperties(&deviceProp, gpu_id_); showDeviceInfo(deviceProp); + l2_cache_size_ = deviceProp.l2CacheSize; + + allow_cache_opt_ = options.GetOrDefault("cache_opt", true); + // Select GPU to run on (for *the current* thread). 
ReportCUDAErrors(cudaSetDevice(gpu_id_));
@@ -225,6 +229,7 @@
     const int kNumInputPlanes = kInputPlanes;
     const int kNumFilters = (int)weights.input.biases.size();
     numBlocks_ = (int)weights.residual.size();
+    numFilters_ = kNumFilters;

     // Warn if the memory required for storing transformed weights is
     // going to exceed 40% of total video memory, force custom_winograd off
@@ -533,19 +538,52 @@
     float* opVal = io->op_value_mem_gpu_;
     float* opMov = io->op_moves_left_mem_gpu_;

+    // Figure out if the memory requirement for running the res block would
+    // fit in the L2 cache.
+    bool enableCacheOpt = false;
+    DataType* skip_connection =
+        use_res_block_winograd_fuse_opt_ ? tensor_mem[1] : tensor_mem[2];
+
+#if CUDART_VERSION >= 11000
+    const int pre_transform_tensor_size =
+        batchSize * numFilters_ * 8 * 8 * sizeof(DataType);
+    const int transformed_tensor_size = pre_transform_tensor_size * 36 / 16;
+    const int res_block_mem =
+        transformed_tensor_size * 2 + pre_transform_tensor_size;
+
+    cudaStreamAttrValue stream_attribute = {};
+    stream_attribute.accessPolicyWindow.base_ptr = tensor_mem[2];
+    stream_attribute.accessPolicyWindow.num_bytes = res_block_mem;
+    stream_attribute.accessPolicyWindow.hitRatio = 1.0f;
+    stream_attribute.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
+    stream_attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
+
+    if (allow_cache_opt_ && use_res_block_winograd_fuse_opt_ &&
+        (res_block_mem <= scratch_size_) && (res_block_mem <= l2_cache_size_)) {
+      // We can use a single alloc to hold all the required tensors, and
+      // enable persistent L2 caching on it.
+      ReportCUDAErrors(cudaStreamSetAttribute(
+          stream, cudaStreamAttributeAccessPolicyWindow, &stream_attribute));
+
+      enableCacheOpt = true;
+      skip_connection =
+          tensor_mem[2] + 2 * transformed_tensor_size / sizeof(DataType);
+    }
+#endif
+
     int l = 0;

     // Input.
-    network_[l++]->Eval(
-        batchSize,
-        use_res_block_winograd_fuse_opt_ ? tensor_mem[1] : tensor_mem[2],
-        tensor_mem[0], nullptr, scratch_mem, scratch_size_, nullptr, cublas,
-        stream);  // input conv
+    network_[l++]->Eval(batchSize, skip_connection, tensor_mem[0], nullptr,
+                        scratch_mem, scratch_size_, nullptr, cublas,
+                        stream);  // input conv

     // Residual block.
     for (int block = 0; block < numBlocks_; block++) {
       if (use_res_block_winograd_fuse_opt_) {
-        network_[l++]->Eval(batchSize, tensor_mem[2], tensor_mem[1], nullptr,
-                            scratch_mem, scratch_size_, nullptr, cublas,
+        network_[l++]->Eval(batchSize, tensor_mem[2], skip_connection, nullptr,
+                            enableCacheOpt ? nullptr : scratch_mem,
+                            scratch_size_, nullptr, cublas,
                             stream);  // block
       } else {
         network_[l++]->Eval(batchSize, tensor_mem[0], tensor_mem[2], nullptr,
@@ -558,6 +596,16 @@
       }
     }

+#if CUDART_VERSION >= 11000
+    if (enableCacheOpt) {
+      // Reset the cache settings.
+      stream_attribute.accessPolicyWindow.num_bytes = 0;
+      cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow,
+                             &stream_attribute);
+      cudaCtxResetPersistingL2Cache();
+    }
+#endif
+
     // Policy head.
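  // (Sizing note for the persisting-cache window used above, with
  // illustrative numbers: in fp16 with batch N and C filters,
  // pre_transform = N * C * 64 * 2 bytes and transformed =
  // pre_transform * 36 / 16, so res_block_mem = 2 * transformed +
  // pre_transform. For N = 64, C = 384 that is roughly
  // 3.1 MB + 2 * 7.1 MB = 17.3 MB, small enough for the 40 MB L2 of an
  // A100 but not for most consumer GPUs, hence the l2_cache_size_ check.)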
if (attn_policy_) {
       network_[l++]->Eval(
@@ -761,18 +809,21 @@
  private:
   const NetworkCapabilities capabilities_;
   int gpu_id_;
+  int l2_cache_size_;
   int max_batch_size_;
   bool wdl_;
   bool moves_left_;
   bool use_res_block_winograd_fuse_opt_;  // fuse operations inside the
                                           // residual tower
   bool multi_stream_;     // run multiple parallel network evals
+  bool allow_cache_opt_;  // try to fit residual block activations in L2 cache

   // Currently only one NN Eval can happen at a time (we can fix this if
   // needed by allocating more memory).
   mutable std::mutex lock_;

   int numBlocks_;
+  int numFilters_;
   bool has_se_;
   bool conv_policy_;
   bool attn_policy_;
@@ -840,7 +891,7 @@
     CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz";
     CERR << "GPU compute capability: " << deviceProp.major << "."
          << deviceProp.minor;
-
+    CERR << "L2 cache capacity: " << deviceProp.l2CacheSize;
     if (std::is_same<float, DataType>::value && deviceProp.major >= 7) {
       CERR << "WARNING: you will probably get better performance from the "
              "cuda-fp16 backend.";

From a127abd1f348bafdd5b94c4a16a1d5380d834a6b Mon Sep 17 00:00:00 2001
From: borg323 <39573933+borg323@users.noreply.github.com>
Date: Sun, 18 Dec 2022 02:12:13 +0200
Subject: [PATCH 15/46] fix undefined behavior (#1817)

---
 src/neural/cuda/layers.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc
index 47feb4eb9a..0415b24a92 100644
--- a/src/neural/cuda/layers.cc
+++ b/src/neural/cuda/layers.cc
@@ -1200,14 +1200,16 @@ void ResidualBlock<DataType>::Eval(int N, DataType* output,

   // Split the scratch space into two parts - use first part for holding
   // transformed input and second part for transformed output.
-  DataType* transformed_input = (DataType*)scratch;
-  DataType* transformed_output =
-      transformed_input + scratch_size / (2 * sizeof(DataType));
-
-  // caller wants us to sub-allocate all memory we need from "output" tensor
+  DataType* transformed_input;
+  DataType* transformed_output;
   if (!scratch) {
-    transformed_input = output;  // this is true in normal cases too!
+    // Caller wants us to sub-allocate all memory we need from "output" tensor.
+    transformed_input = output;  // This is true in normal cases too!
     transformed_output = transformed_input + (N * C * 8 * 8 * 36 / 16);
+  } else {
+    transformed_input = (DataType*)scratch;
+    transformed_output =
+        transformed_input + scratch_size / (2 * sizeof(DataType));
   }

   if (first_block_) {

From f4a918a19be24e8c8119209f22379e8e8147b495 Mon Sep 17 00:00:00 2001
From: borg323 <39573933+borg323@users.noreply.github.com>
Date: Sun, 18 Dec 2022 02:12:46 +0200
Subject: [PATCH 16/46] set MADV_RANDOM for syzygy mmap (#1818)

---
 src/syzygy/syzygy.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/syzygy/syzygy.cc b/src/syzygy/syzygy.cc
index 89352e52c9..554186f7d9 100644
--- a/src/syzygy/syzygy.cc
+++ b/src/syzygy/syzygy.cc
@@ -1047,6 +1047,11 @@ class SyzygyTablebaseImpl {
     }
     *mapping = statbuf.st_size;
     base_address = mmap(nullptr, statbuf.st_size, PROT_READ, MAP_SHARED, fd, 0);
+#if defined(MADV_RANDOM)
+    // Tablebase probes touch pages in an essentially random order, so advise
+    // the kernel not to waste effort on readahead.
+    madvise(base_address, statbuf.st_size, MADV_RANDOM);
+#endif
     ::close(fd);
     if (base_address == MAP_FAILED) {
       throw Exception("Could not mmap() " + fname);

From 6a774591ce34cc84da842e74dad65c4297678be2 Mon Sep 17 00:00:00 2001
From: almaudoh
Date: Sun, 25 Dec 2022 05:45:28 +0100
Subject: [PATCH 17/46] Fix softmax cuda (#1822)

* Fix softmax function in cuda.
* Add fix for negative zero. * Code style fixes. Co-authored-by: Alma --- src/neural/cuda/common_kernels.cu | 29 ++++++++++++++++++++++------- src/neural/cuda/winograd_helper.inc | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index a1ecb36d97..684d8d1e8a 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -682,8 +682,12 @@ __global__ void softmax_opt_64_kernel(T* output, const T* input, int N) { copyAs(&x[0], &input[index * 2]); } - ex[0] = exp(x[0]); - ex[1] = exp(x[1]); + float threadMax = max(x[0], x[1]); + float maxval = warpMax(threadMax); + maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); + + ex[0] = exp(x[0] - maxval); + ex[1] = exp(x[1] - maxval); float threadSum = ex[0] + ex[1]; float Sum = warpReduce(threadSum); @@ -716,14 +720,25 @@ __global__ void softmax_kernel(T* output, const T* input) { int C = blockDim.x; int index = n * C + c; - __shared__ float sum; - if (c == 0) sum = 0; - __syncthreads(); - // softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis) float x = (float)input[index]; - float ex = exp(x); + + __shared__ float sum, maxval; + if (c == 0) { + sum = 0; + maxval = x; + } + + __syncthreads(); + + // Get max across warp first, and then update across C dimension + float warpmax = warpMax(x); + if ((c & 0x1F) == 0) atomicMaxFloat(&maxval, warpmax); + + __syncthreads(); + + float ex = exp(x - maxval); // compute warp wide sums first float val = warpReduce(ex); diff --git a/src/neural/cuda/winograd_helper.inc b/src/neural/cuda/winograd_helper.inc index 456649ba87..3f362f3074 100644 --- a/src/neural/cuda/winograd_helper.inc +++ b/src/neural/cuda/winograd_helper.inc @@ -419,6 +419,24 @@ __device__ __forceinline__ float warpReduce(float x) { return x; } +// fast max reduction for the warp +__device__ __forceinline__ float warpMax(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + x = max(x, __shfl_xor_sync(0xFFFFFFFF, x, mask)); + + return x; +} + +// atomic max implementation for floats +__device__ __forceinline__ float atomicMaxFloat(float* addr, float val) { + float max; + max = !signbit(val) ? __int_as_float(atomicMax((int*)addr, __float_as_int(val))) : + __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(val))); + + return max; +} + // Helper fuction to do vector loads/stores template __device__ __forceinline__ void copyAs(void* dst, const void* src) { From 662648ac742c846a0a5cc65321d4c9c29c7f501f Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Mon, 26 Dec 2022 13:10:27 +0200 Subject: [PATCH 18/46] set cache_opt default to false (#1823) --- src/neural/cuda/network_cuda.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index 5052b78021..dce3b28265 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -180,7 +180,7 @@ class CudaNetwork : public Network { l2_cache_size_ = deviceProp.l2CacheSize; - allow_cache_opt_ = options.GetOrDefault("cache_opt", true); + allow_cache_opt_ = options.GetOrDefault("cache_opt", false); // Select GPU to run on (for *the current* thread). 
ReportCUDAErrors(cudaSetDevice(gpu_id_)); From a5de019e08ef49516b1c48223795f00367bf5589 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Wed, 28 Dec 2022 00:04:40 +0200 Subject: [PATCH 19/46] fix onnx final mlh activation (#1825) --- src/neural/onnx/converter.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc index aaf200120d..6004e4d56f 100644 --- a/src/neural/onnx/converter.cc +++ b/src/neural/onnx/converter.cc @@ -552,7 +552,8 @@ void Converter::MakeMovesLeftHead(pblczero::OnnxModel* onnx, *GetWeghtsConverter(weights.ip2_mov_w, {mlh_fc1_outputs, 1}, {1, 0})); flow = builder->Add("/mlh/dense2/add", flow, *GetWeghtsConverter(weights.ip2_mov_b, {1})); - auto output = builder->Relu(options_.output_mlh, flow); + flow = MakeActivation(builder, flow, "/mlh/dense2", default_activation_); + auto output = builder->Identity(options_.output_mlh, flow); builder->AddOutput(output, {options_.batch_size, 1}, GetDataType()); onnx->set_output_mlh(output); } From c41ba16945c2520b8aad44b11ec9575bcaf86cb3 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Wed, 28 Dec 2022 15:15:35 +0200 Subject: [PATCH 20/46] inline fp16 conversions (#1824) --- appveyor.yml | 4 +- meson.build | 1 - src/neural/onnx/network_onnx.cc | 9 ++- src/utils/fp16_utils.cc | 107 -------------------------------- src/utils/fp16_utils.h | 83 ++++++++++++++++++++++++- 5 files changed, 88 insertions(+), 116 deletions(-) delete mode 100644 src/utils/fp16_utils.cc diff --git a/appveyor.yml b/appveyor.yml index fb64dc3d18..187c8cef95 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -116,10 +116,12 @@ before_build: - cmd: IF %DX%==true SET BUILD_BLAS=true - cmd: SET EMBED=false - cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true SET EMBED=true +- cmd: SET POPCNT=true +- cmd: IF %NAME%==cpu-openblas SET POPCNT=false - cmd: SET EXTRA= - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md - cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include -- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% +- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=%POPCNT% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" 
-Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false build_script: diff --git a/meson.build b/meson.build index 49b60c1cd6..54f9210a94 100644 --- a/meson.build +++ b/meson.build @@ -209,7 +209,6 @@ files += [ 'src/utils/random.cc', 'src/utils/string.cc', 'src/utils/weights_adapter.cc', - 'src/utils/fp16_utils.cc', 'src/version.cc', ] includes += include_directories('src') diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc index 7d7950175c..6908ea47df 100644 --- a/src/neural/onnx/network_onnx.cc +++ b/src/neural/onnx/network_onnx.cc @@ -196,12 +196,11 @@ Ort::Value OnnxComputation::PrepareInputs(int start, int batch_size) { int end = std::min(start + batch_size, static_cast(raw_input_.size())); for (int i = start; i < end; i++) { for (const auto& plane : raw_input_[i]) { + DataType value = std::is_same::value + ? FP32toFP16(plane.value) + : plane.value; for (auto bit : IterateBits(plane.mask)) { - if (std::is_same::value) { - *(iter + bit) = FP32toFP16(plane.value); - } else { - *(iter + bit) = plane.value; - } + *(iter + bit) = value; } iter += 64; } diff --git a/src/utils/fp16_utils.cc b/src/utils/fp16_utils.cc deleted file mode 100644 index 826389b1e9..0000000000 --- a/src/utils/fp16_utils.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* - This file is part of Leela Chess Zero. - Copyright (C) 2020 The LCZero Authors - - Leela Chess is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Leela Chess is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Leela Chess. If not, see . - - Additional permission under GNU GPL version 3 section 7 - - If you modify this Program, or any covered work, by linking or - combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA - Toolkit and the NVIDIA CUDA Deep Neural Network library (or a - modified version of those libraries), containing parts covered by the - terms of the respective license agreement, the licensors of this - Program grant you additional permission to convey the resulting work. -*/ - -#include -#include - -// Define NO_F16C to avoid the F16C intrinsics. Also disabled with NO_POPCNT -// since it catches most processors without F16C instructions. 
- -#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__) -#include -#else -#define NO_F16C -#endif - -namespace lczero { - -uint16_t FP32toFP16(float f32) { -#if defined(NO_POPCNT) || defined(NO_F16C) || \ - (defined(__GNUC__) && !defined(__F16C__)) - unsigned int x; - unsigned int sign = 0; - memcpy(&x, &f32, sizeof(float)); - if (x & 0x80000000) sign = 0x8000; - x &= 0x7fffffff; - if (x >= 0x477ff000) { - if ((x & 0x7f800000) == 0x7f800000 && (x & 0x7fffff)) { - x = ((x >> 13) - 0x38000) | 0x200; - } else { - x = 0x7c00; - } - } else if (x <= 0x33000000) - x = 0; - else if (x <= 0x387fefff) { - int shift = 126 - ((x >> 23) & 0xff); - x = (x & 0x7fffff) | 0x800000; - if (x & (0x17fffff >> (24 - shift))) x += 0x800000 >> (24 - shift); - x >>= shift; - } else { - // Adjust exponent and round to nearest even. - if (x & 0x2fff) { - x -= 0x37fff000; - } else { - x -= 0x38000000; - } - x >>= 13; - } - return x | sign; -#else - __m128 A = _mm_set_ss(f32); - __m128i H = _mm_cvtps_ph(A, 0); - return _mm_extract_epi16(H, 0); -#endif -} - -float FP16toFP32(uint16_t f16) { -#if defined(NO_POPCNT) || defined(NO_F16C) || \ - (defined(__GNUC__) && !defined(__F16C__)) - unsigned int x; - float f; - x = f16 & 0x7fff; - if ((x & 0x7c00) == 0) { - f = 5.9604645e-8f * x; - memcpy(&x, &f, sizeof(float)); - } else if (x >= 0x7c00) { - if (x & 0x1ff) x |= 0x200; - x = (x + 0x38000) << 13; - } else { - x = (x + 0x1c000) << 13; - } - if (f16 & 0x8000) x |= 0x80000000; - memcpy(&f, &x, sizeof(float)); - return f; -#else - __m128i H = _mm_setzero_si128(); - H = _mm_insert_epi16(H, f16, 0); - __m128 A = _mm_cvtph_ps(H); - return _mm_cvtss_f32(A); -#endif -} - -} // namespace lczero diff --git a/src/utils/fp16_utils.h b/src/utils/fp16_utils.h index fadf83d031..2680536599 100644 --- a/src/utils/fp16_utils.h +++ b/src/utils/fp16_utils.h @@ -25,9 +25,88 @@ Program grant you additional permission to convey the resulting work. */ #pragma once + +#include +#include + +// Define NO_F16C to avoid the F16C intrinsics. Also disabled with NO_POPCNT +// since it catches most processors without F16C instructions. +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__) +#include +#else +#define NO_F16C +#endif + namespace lczero { -uint16_t FP32toFP16(float f32); -float FP16toFP32(uint16_t f16); +#if defined(NO_POPCNT) || defined(NO_F16C) || \ + (defined(__GNUC__) && !defined(__F16C__)) + +inline uint16_t FP32toFP16(float f32) { + unsigned int x; + unsigned int sign = 0; + memcpy(&x, &f32, sizeof(float)); + if (x & 0x80000000) sign = 0x8000; + x &= 0x7fffffff; + if (x >= 0x477ff000) { + if ((x & 0x7f800000) == 0x7f800000 && (x & 0x7fffff)) { + x = ((x >> 13) - 0x38000) | 0x200; + } else { + x = 0x7c00; + } + } else if (x <= 0x33000000) + x = 0; + else if (x <= 0x387fefff) { + int shift = 126 - ((x >> 23) & 0xff); + x = (x & 0x7fffff) | 0x800000; + if (x & (0x17fffff >> (24 - shift))) x += 0x800000 >> (24 - shift); + x >>= shift; + } else { + // Adjust exponent and round to nearest even. 
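+ // (The rebias from exponent bias 127 to 15 is 112 << 23 == 0x38000000;
+ // subtracting 0x37fff000 instead also adds 0x1000, half an fp16 ULP,
+ // rounding the result up after the 13-bit shift. The (x & 0x2fff) test
+ // fails only for exact values and for halfway cases whose result bit 13
+ // is already 0; those truncate, giving round-to-nearest-even overall.)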
+ if (x & 0x2fff) { + x -= 0x37fff000; + } else { + x -= 0x38000000; + } + x >>= 13; + } + return x | sign; +} + +inline float FP16toFP32(uint16_t f16) { + unsigned int x; + float f; + x = f16 & 0x7fff; + if ((x & 0x7c00) == 0) { + f = 5.9604645e-8f * x; + memcpy(&x, &f, sizeof(float)); + } else if (x >= 0x7c00) { + if (x & 0x1ff) x |= 0x200; + x = (x + 0x38000) << 13; + } else { + x = (x + 0x1c000) << 13; + } + if (f16 & 0x8000) x |= 0x80000000; + memcpy(&f, &x, sizeof(float)); + return f; +} + +#else + +inline uint16_t FP32toFP16(float f32) { + __m128 A = _mm_set_ss(f32); + __m128i H = _mm_cvtps_ph(A, 0); + return _mm_extract_epi16(H, 0); +} + +inline float FP16toFP32(uint16_t f16) { + __m128i H = _mm_setzero_si128(); + H = _mm_insert_epi16(H, f16, 0); + __m128 A = _mm_cvtph_ps(H); + return _mm_cvtss_f32(A); +} + +#endif } // namespace lczero From 9ff390be73fc54a18e804014b96993461ce584ae Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Wed, 28 Dec 2022 15:15:55 +0200 Subject: [PATCH 21/46] onnx squeeze cleanup (#1826) --- src/neural/onnx/builder.cc | 11 +++++++---- src/neural/onnx/builder.h | 3 ++- src/neural/onnx/converter.cc | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc index be97dfd7ac..5cafe518a0 100644 --- a/src/neural/onnx/builder.cc +++ b/src/neural/onnx/builder.cc @@ -167,14 +167,17 @@ std::string OnnxBuilder::GlobalAveragePool(const std::string& name, } std::string OnnxBuilder::Squeeze(const std::string& name, - const std::string& input) { + const std::string& input, + std::initializer_list axes) { auto* node = model_.mutable_graph()->add_node(); auto out = PopulateStdNodeFields(node, name, input, "Squeeze"); if (opset_ < 13) { - AddIntsAttribute(node, "axes", {2, 3}); + AddIntsAttribute(node, "axes", axes); } else { - node->add_input( - AddInitializer(name + "/axes", Int64OnnxConst({2, 3}, {2}))); + node->add_input(AddInitializer( + name + "/axes", + Int64OnnxConst(std::vector(begin(axes), end(axes)), + {static_cast(axes.size())}))); } return out; } diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h index 3b88cb27ec..96fa4e82ee 100644 --- a/src/neural/onnx/builder.h +++ b/src/neural/onnx/builder.h @@ -68,7 +68,8 @@ class OnnxBuilder { const OnnxConst&); std::string GlobalAveragePool(const std::string& name, const std::string& input); - std::string Squeeze(const std::string& name, const std::string& input); + std::string Squeeze(const std::string& name, const std::string& input, + std::initializer_list axes); std::string MatMul(const std::string& name, const std::string& input1, const OnnxConst& input2); std::string MatMul(const std::string& name, const std::string& input1, diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc index 6004e4d56f..8e1c43eb18 100644 --- a/src/neural/onnx/converter.cc +++ b/src/neural/onnx/converter.cc @@ -184,7 +184,7 @@ std::string Converter::MakeSqueezeAndExcite( const int se_filters = se_unit.b1.size(); auto flow = builder->GlobalAveragePool(name + "/pooled", input); - flow = builder->Squeeze(name + "/squeeze", flow); + flow = builder->Squeeze(name + "/squeeze", flow, {2, 3}); flow = builder->MatMul( name + "/matmul1", flow, *GetWeghtsConverter(se_unit.w1, {NumFilters(), se_filters}, {1, 0})); From ec887c5766ea3bc5292b66f56e2b03cac968e1c8 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 30 Dec 2022 01:00:02 +0200 Subject: [PATCH 22/46] update submodule --- libs/lczero-common | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/lczero-common b/libs/lczero-common index 4dfa4ce833..2165d35bf6 160000 --- a/libs/lczero-common +++ b/libs/lczero-common @@ -1 +1 @@ -Subproject commit 4dfa4ce8339357819f7de01517e6297d4c768cdf +Subproject commit 2165d35bf63e95549eb4feff06a755ec88af5264 From 825d0d093f00e999027ce1284ad6044e5780b0a1 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 15 Jan 2023 01:54:42 +0200 Subject: [PATCH 23/46] update for new net.proto --- libs/lczero-common | 2 +- src/neural/shared/activation.cc | 4 ++++ src/neural/shared/activation.h | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/libs/lczero-common b/libs/lczero-common index 2165d35bf6..1e1138673c 160000 --- a/libs/lczero-common +++ b/libs/lczero-common @@ -1 +1 @@ -Subproject commit 2165d35bf63e95549eb4feff06a755ec88af5264 +Subproject commit 1e1138673cb171bec090e1da54f3f2f57854620a diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 9f6b7489c4..5ba2e17a40 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -21,6 +21,8 @@ #include #include +#include "utils/exception.h" + #ifdef USE_ISPC #include "activation_ispc.h" #endif @@ -83,6 +85,8 @@ float Activate(const float val, const ActivationFunction activation) { case NONE: // Nothing to do. break; + default: + throw Exception("unsupported activation function"); } return val; } diff --git a/src/neural/shared/activation.h b/src/neural/shared/activation.h index 8a55df486b..1ef3e0de48 100644 --- a/src/neural/shared/activation.h +++ b/src/neural/shared/activation.h @@ -21,17 +21,29 @@ #include #include +#include "proto/net.pb.h" + namespace lczero { -enum ActivationFunction { NONE, RELU, TANH, SIGMOID, SELU, MISH, SWISH }; +enum ActivationFunction { + NONE = pblczero::NetworkFormat::ActivationFunction::NONE, + RELU = pblczero::NetworkFormat::ActivationFunction::RELU, + TANH = pblczero::NetworkFormat::ActivationFunction::TANH, + SIGMOID = pblczero::NetworkFormat::ActivationFunction::SIGMOID, + SELU = pblczero::NetworkFormat::ActivationFunction::SELU, + MISH = pblczero::NetworkFormat::ActivationFunction::MISH, + SWISH = pblczero::NetworkFormat::ActivationFunction::SWISH, + RELU_2 = pblczero::NetworkFormat::ActivationFunction::RELU_2, + SOFTMAX = pblczero::NetworkFormat::ActivationFunction::SOFTMAX +}; // Softmax activation void SoftmaxActivation(const size_t size, const float* input, float* output); -void BiasResidual(const size_t batch_size, const size_t channels, float * data, +void BiasResidual(const size_t batch_size, const size_t channels, float* data, const float* biases, const float* eltwise, const ActivationFunction activation = RELU); -void BiasActivate(const size_t batch_size, const size_t channels, float * data, +void BiasActivate(const size_t batch_size, const size_t channels, float* data, const float* biases, const ActivationFunction activation = RELU); From 155c54a02338fb96a13312427b064dd9349d5bb5 Mon Sep 17 00:00:00 2001 From: borg323 Date: Wed, 8 Feb 2023 01:46:50 +0200 Subject: [PATCH 24/46] ispc alternatives for attention policy parts --- meson.build | 1 + src/neural/blas/encoder.h | 18 +++++++-- src/neural/blas/layer_norm.ispc | 63 +++++++++++++++++++++++++++++++ src/neural/shared/activation.cc | 9 +++++ src/neural/shared/activation.ispc | 20 +++++++++- 5 files changed, 106 insertions(+), 5 deletions(-) create mode 100644 src/neural/blas/layer_norm.ispc diff --git a/meson.build b/meson.build index 54f9210a94..56d391c432 100644 --- 
a/meson.build +++ b/meson.build @@ -376,6 +376,7 @@ if get_option('build_backends') if get_option('ispc') and ispc.found() files += iscp_gen.process('src/neural/blas/winograd_transform.ispc') + files += iscp_gen.process('src/neural/blas/layer_norm.ispc') files += iscp_gen.process('src/neural/shared/activation.ispc') add_project_arguments('-DUSE_ISPC', language : 'cpp') endif diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 99d2752752..5314525507 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2018-2019 The LCZero Authors + Copyright (C) 2022-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,6 +25,10 @@ #include "neural/shared/activation.h" #include "utils/exception.h" +#ifdef USE_ISPC +#include "layer_norm_ispc.h" +#endif + namespace lczero { namespace { @@ -44,6 +48,7 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, const float* skip, const float* gammas, const float* betas, float epsilon) { for (size_t i = 0; i < batch_size; i++) { +#ifndef USE_ISPC // Mean taken in dimension C. float mean = 0; for (size_t c = 0; c < channels; ++c) { @@ -61,11 +66,16 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, var /= channels; // Norm. + float den = 1.0f / std::sqrt(var + epsilon); for (size_t c = 0; c < channels; ++c) { - data[i * channels + c] = betas[c] + gammas[c] * - (data[i * channels + c] - mean) / - std::sqrt(var + epsilon); + data[i * channels + c] = + betas[c] + gammas[c] * (data[i * channels + c] - mean) * den; } +#else + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + skip + i * channels, gammas, betas, + epsilon); +#endif } } diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc new file mode 100644 index 0000000000..7995c9b0e2 --- /dev/null +++ b/src/neural/blas/layer_norm.ispc @@ -0,0 +1,63 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2023 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + */ + +export void LayerNorm2DWithSkipConnection(uniform const size_t channels, + uniform float data[], + const uniform float skip[], + const uniform float gammas[], + const uniform float betas[], + uniform const float epsilon) { +#if 0 + // Faster but potentially less stable version for future testing. + // One pass mean and variance taken in dimension C. Uses shifted variance calculation. + float imean = 0; + float ivar = 0; + float k = data[0] + skip[0]; + foreach (c = 0 ... channels) { + float t = data[c] + skip[c]; + data[c] = t; + t -= k; + imean += t; + ivar += t * t; + } + float mean = reduce_add(imean) / channels; + float var = (reduce_add(ivar) - channels * mean * mean) / channels; + mean += k; +#else + // Mean taken in dimension C. + float imean = 0; + foreach (c = 0 ... 
channels) { + data[c] += skip[c]; + imean += data[c]; + } + float mean = reduce_add(imean) / channels; + + // Variance. + float ivar = 0; + foreach (c = 0 ... channels) { + float diff = data[c] - mean; + ivar += diff * diff; + } + float var = reduce_add(ivar) / channels; +#endif + + float den = rsqrt(var + epsilon); + foreach (c = 0 ... channels) { + data[c] = betas[c] + gammas[c] * (data[c] - mean) * den; + } +} diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 5ba2e17a40..1a2dd34ead 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -110,6 +110,15 @@ void Activate(const size_t len, const float* data, const float* bias, } #else ispc::ActivateMish(len, 1.0f, data, bias, 0.0f, output); +#endif + } else if (activation == SELU) { +#ifndef USE_ISPC + for (size_t b = 0; b < len; b++) { + float val = data[b] + bias[b]; + output[b] = selu(val); + } +#else + ispc::ActivateSelu(len, 1.0f, data, bias, 0.0f, output); #endif } else { for (size_t b = 0; b < len; b++) { diff --git a/src/neural/shared/activation.ispc b/src/neural/shared/activation.ispc index 27cc36d116..ef209e512d 100644 --- a/src/neural/shared/activation.ispc +++ b/src/neural/shared/activation.ispc @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2022 The LCZero Authors + Copyright (C) 2022-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,3 +35,21 @@ export void ActivateMish(uniform const size_t len, uniform float gamma, output[b] = mish(val); } } + +static inline float selu(float val) { + float alpha = 1.67326324f, scale = 1.05070098f; + if (val > 0) { + return scale * val; + } else { + return scale * alpha * (exp(val) - 1.0f); + } +} + +export void ActivateSelu(uniform const size_t len, uniform float gamma, + const uniform float data[], const uniform float bias[], + uniform float beta, uniform float output[]) { + foreach (b = 0 ... 
len) { + float val = gamma * data[b] + bias[b] + beta; + output[b] = selu(val); + } +} From cab4c0415e280133ca6fe88de821f8ce807a3c3f Mon Sep 17 00:00:00 2001 From: borg323 Date: Thu, 9 Feb 2023 22:47:34 +0200 Subject: [PATCH 25/46] remove mha transpose --- src/neural/blas/encoder.h | 55 ------------------ src/neural/blas/network_blas.cc | 100 +++++++++++++++++++------------- 2 files changed, 59 insertions(+), 96 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 5314525507..567db1be40 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -18,12 +18,9 @@ #pragma once -#include #include -#include #include "neural/shared/activation.h" -#include "utils/exception.h" #ifdef USE_ISPC #include "layer_norm_ispc.h" @@ -31,18 +28,6 @@ namespace lczero { -namespace { - -template -using EigenMatrixMap = - Eigen::Map>; - -template -using ConstEigenMatrixMap = - Eigen::Map>; - -} // namespace - void LayerNorm2DWithSkipConnection(const size_t batch_size, const size_t channels, float* data, const float* skip, const float* gammas, @@ -79,44 +64,4 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, } } -template -void AttentionMatmul2D(const bool transpose_a, const bool transpose_b, - const size_t batch_size, const size_t M, const size_t N, - const size_t K, const float scaling, const float* input1, - const float* input2, float* output) { - for (auto batch = size_t{0}; batch < batch_size; batch++) { - const float* A = &input1[batch * M * K]; - const float* B = &input2[batch * N * K]; - float* C = &output[batch * M * N]; - if (use_eigen) { - auto C_mat = EigenMatrixMap(C, N, M); - - if (transpose_a && transpose_b) { - C_mat.noalias() = scaling * - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, M, K).transpose(); - } else if (transpose_a) { - C_mat.noalias() = scaling * ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, M, K).transpose(); - } else if (transpose_b) { - C_mat.noalias() = scaling * - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, K, M); - } else { - C_mat.noalias() = scaling * ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, K, M); - } - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, transpose_a ? CblasTrans : CblasNoTrans, - transpose_b ? CblasTrans : CblasNoTrans, M, N, K, scaling, A, - transpose_a ? M : K, B, transpose_b ? K : N, 0.0f, C, N); -#else - // Should never get here. 
- throw Exception("Blas backend internal error"); -#endif - } - } -} - } // namespace lczero diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 3edc1f15b0..72117410fa 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -177,6 +177,14 @@ using EigenMatrixMap = template using ConstEigenMatrixMap = Eigen::Map>; +template +using EigenStridedMatrixMap = + Eigen::Map, 0, + Eigen::OuterStride<>>; +template +using ConstEigenStridedMatrixMap = + Eigen::Map, 0, + Eigen::OuterStride<>>; template void BlasComputation::ComputeBlocking() { @@ -327,9 +335,8 @@ void BlasComputation::ComputeBlocking() { if (weights_.pol_encoder.size() > 0) { std::vector head_buffer4(largest_batch_size * max_channel_size * kSquares); - std::vector temp_buffer1(policy_d_model * kSquares); - std::vector temp_buffer2(policy_d_model * kSquares); - std::vector temp_buffer3(policy_d_model * kSquares); + std::vector temp_buffer(weights_.pol_encoder_head_count * + kSquares * kSquares); for (auto layer : weights_.pol_encoder) { // Q @@ -358,51 +365,62 @@ void BlasComputation::ComputeBlocking() { for (auto batch = size_t{0}; batch < batch_size; batch++) { auto batchStart = batch * kSquares * d_model; - // Reshape and transpose for each head. - const float* Q = temp_buffer1.data(); - const float* K = temp_buffer2.data(); - const float* V = temp_buffer3.data(); - - for (int head = 0; head < heads; head++) { - for (int j = 0; j < kSquares; j++) { - auto channelStart = batchStart + j * d_model + head * depth; - auto transposeStart = head * kSquares * depth + j * depth; - std::copy(head_buffer2.begin() + channelStart, - head_buffer2.begin() + channelStart + depth, - temp_buffer1.begin() + transposeStart); - std::copy(head_buffer3.begin() + channelStart, - head_buffer3.begin() + channelStart + depth, - temp_buffer2.begin() + transposeStart); - std::copy(head_buffer4.begin() + channelStart, - head_buffer4.begin() + channelStart + depth, - temp_buffer3.begin() + transposeStart); - } - } + const float* Q = &head_buffer2[batchStart]; + const float* K = &head_buffer3[batchStart]; + const float* V = &head_buffer4[batchStart]; // matmul(Q, K) for all heads per batch. - float* QK = &head_buffer2[batchStart]; - AttentionMatmul2D(false, true, heads, kSquares, kSquares, - depth, scaling, Q, K, QK); - + float* QK = temp_buffer.data(); + for (auto h = 0; h < heads; h++) { + const float* A = &Q[h * depth]; + const float* B = &K[h * depth]; + float* C = &QK[h * kSquares * kSquares]; + if (use_eigen) { + auto C_mat = EigenMatrixMap(C, kSquares, kSquares); + C_mat.noalias() = + scaling * + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) + .transpose() * + ConstEigenStridedMatrixMap( + A, depth, kSquares, + Eigen::OuterStride<>(heads * depth)); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, + kSquares, depth, scaling, A, heads * depth, B, + heads * depth, 0.0f, C, kSquares); +#else + // Should never get here. + throw Exception("Blas backend internal error"); +#endif + } + } // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { SoftmaxActivation(kSquares, QK + h, QK + h); } // matmul(softmax(QK), V) for all heads per batch. - float* attn = &head_buffer3[batchStart]; - AttentionMatmul2D(false, false, heads, kSquares, depth, - kSquares, 1.0, QK, V, attn); - - // Transpose back into N x 64 x H x D. 
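The reshape/copy loops being deleted here existed only to make each head contiguous. The strided Eigen maps above, and the heads * depth leading dimensions handed to cblas_sgemm, let the per-head GEMMs address the interleaved [64 x heads*depth] layout in place. A tiny self-contained illustration of the equivalence (shapes made up for the example, not taken from the patch):

    #include <Eigen/Dense>
    #include <vector>

    int main() {
      const int rows = 64, heads = 4, depth = 8, h = 2;
      std::vector<float> buf(rows * heads * depth);
      for (size_t i = 0; i < buf.size(); i++) buf[i] = static_cast<float>(i);

      using Mat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>;
      // depth x rows view of head h: one column per square, columns spaced
      // heads * depth floats apart in the interleaved buffer.
      Eigen::Map<const Mat, 0, Eigen::OuterStride<>> view(
          buf.data() + h * depth, depth, rows,
          Eigen::OuterStride<>(heads * depth));

      // Element (d, r) of the view is buf[r * heads * depth + h * depth + d],
      // exactly what the deleted per-head copy used to gather.
      return view(3, 5) == buf[5 * heads * depth + h * depth + 3] ? 0 : 1;
    }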
- for (int j = 0; j < kSquares; j++) { - for (int head = 0; head < heads; head++) { - auto transposeStart = - batchStart + head * kSquares * depth + j * depth; - std::copy(head_buffer3.begin() + transposeStart, - head_buffer3.begin() + transposeStart + depth, - head_buffer2.begin() + batchStart + j * d_model + - head * depth); + float* attn = &head_buffer2[batchStart]; + for (auto h = 0; h < heads; h++) { + const float* A = &QK[h * kSquares * kSquares]; + const float* B = &V[h * depth]; + float* C = &attn[h * depth]; + if (use_eigen) { + auto C_mat = EigenStridedMatrixMap( + C, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + C_mat.noalias() = + ConstEigenStridedMatrixMap( + B, depth, kSquares, + Eigen::OuterStride<>(heads * depth)) * + ConstEigenMatrixMap(A, kSquares, kSquares); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, kSquares, + depth, kSquares, 1.0f, A, kSquares, B, + heads * depth, 0.0f, C, heads * depth); +#endif } } } @@ -651,7 +669,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, : capabilities_{file.format().network_format().input(), file.format().network_format().moves_left()}, weights_(file.weights()) { - Numa::Init(); + Numa::Init(); max_batch_size_ = static_cast(options.GetOrDefault("batch_size", 256)); From ce1460846d9ff155cfdfb40c5cfb889adbc79a75 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 10 Feb 2023 01:19:28 +0200 Subject: [PATCH 26/46] ispc softmax --- src/neural/blas/network_blas.cc | 8 ++++++++ src/neural/shared/activation.ispc | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 72117410fa..9a92682b3b 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -41,6 +41,10 @@ #include #endif +#ifdef USE_ISPC +#include "activation_ispc.h" +#endif + namespace lczero { namespace { @@ -398,7 +402,11 @@ void BlasComputation::ComputeBlocking() { } // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { +#ifdef USE_ISPC SoftmaxActivation(kSquares, QK + h, QK + h); +#else + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); +#endif } // matmul(softmax(QK), V) for all heads per batch. diff --git a/src/neural/shared/activation.ispc b/src/neural/shared/activation.ispc index ef209e512d..6190198515 100644 --- a/src/neural/shared/activation.ispc +++ b/src/neural/shared/activation.ispc @@ -53,3 +53,25 @@ export void ActivateSelu(uniform const size_t len, uniform float gamma, output[b] = selu(val); } } + +export void SoftmaxActivation(uniform const size_t size, + const uniform float input[], + uniform float output[]) { + float vmax = -3.4e38f; + foreach (c = 0 ... size) { + if (input[c] > vmax) vmax = input[c]; + } + uniform float alpha = reduce_max(vmax); + + float t = 0.0f; + foreach (c = 0 ... size) { + float val = exp(input[c] - alpha); + output[c] = val; + t += val; + } + uniform float denom = 1.0f / reduce_add(t); + + foreach (c = 0 ... 
size) { + output[c] *= denom; + } +} From bd8aafed4a9f882e08e0484f6491e7301aa68996 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 10 Feb 2023 23:40:53 +0200 Subject: [PATCH 27/46] important typo fix --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9a92682b3b..a5e6976ee4 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -402,7 +402,7 @@ void BlasComputation::ComputeBlocking() { } // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { -#ifdef USE_ISPC +#ifndef USE_ISPC SoftmaxActivation(kSquares, QK + h, QK + h); #else ispc::SoftmaxActivation(kSquares, QK + h, QK + h); From 8d67b1f7109534f431192916e8898ead02498890 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 11 Feb 2023 02:24:34 +0200 Subject: [PATCH 28/46] refactor blas encoder --- src/neural/blas/network_blas.cc | 277 +++++++++++++++++--------------- 1 file changed, 150 insertions(+), 127 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index a5e6976ee4..9a3e51ce7b 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -102,6 +102,12 @@ class BlasComputation : public NetworkComputation { private: void EncodePlanes(const InputPlanes& sample, float* buffer); + void MakeEncoderLayer(std::vector& head_buffer, + std::vector& head_buffer2, + std::vector& head_buffer3, size_t batch_size, + const LegacyWeights::EncoderLayer& layer, + int embedding_size, int heads, + ActivationFunction activation, float alpha = 1.0f); static constexpr auto kWidth = 8; static constexpr auto kHeight = 8; @@ -190,6 +196,146 @@ using ConstEigenStridedMatrixMap = Eigen::Map, 0, Eigen::OuterStride<>>; +template +void BlasComputation::MakeEncoderLayer( + std::vector& head_buffer, std::vector& head_buffer2, + std::vector& head_buffer3, size_t batch_size, + const LegacyWeights::EncoderLayer& layer, int embedding_size, int heads, + ActivationFunction activation, float alpha) { + const int d_model = layer.mha.q_b.size(); + static std::vector head_buffer4; + head_buffer4.clear(); + head_buffer4.resize(batch_size * d_model * kSquares); + static std::vector temp_buffer; + temp_buffer.clear(); + temp_buffer.resize(heads * kSquares * kSquares); + // Q + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.q_w.data(), layer.mha.q_b.data(), NONE, head_buffer2.data()); + // K + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.k_w.data(), layer.mha.k_b.data(), NONE, head_buffer3.data()); + // V + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.v_w.data(), layer.mha.v_b.data(), NONE, head_buffer4.data()); + + // MHA (Q, K, V) + const int depth = d_model / heads; + const float scaling = 1.0f / sqrtf(depth); + + // MHA is done per batch since there's a fourth dimension introduced. + for (auto batch = size_t{0}; batch < batch_size; batch++) { + auto batchStart = batch * kSquares * d_model; + + const float* Q = &head_buffer2[batchStart]; + const float* K = &head_buffer3[batchStart]; + const float* V = &head_buffer4[batchStart]; + + // matmul(Q, K) for all heads per batch. 
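+ // Each iteration computes one head's 64 x 64 attention-logit matrix. A
+ // and B point at head h's depth-wide column block inside the shared Q
+ // and K buffers; the heads * depth outer stride (leading dimension in
+ // the BLAS call) steps from one square to the next without any copies.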
+ float* QK = temp_buffer.data(); + for (auto h = 0; h < heads; h++) { + const float* A = &Q[h * depth]; + const float* B = &K[h * depth]; + float* C = &QK[h * kSquares * kSquares]; + if (use_eigen) { + auto C_mat = EigenMatrixMap(C, kSquares, kSquares); + C_mat.noalias() = + scaling * + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) + .transpose() * + ConstEigenStridedMatrixMap( + A, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, kSquares, + depth, scaling, A, heads * depth, B, heads * depth, 0.0f, C, + kSquares); +#else + // Should never get here. + throw Exception("Blas backend internal error"); +#endif + } + } + // Apply Softmax. + for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { +#ifndef USE_ISPC + SoftmaxActivation(kSquares, QK + h, QK + h); +#else + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); +#endif + } + + // matmul(softmax(QK), V) for all heads per batch. + float* attn = &head_buffer2[batchStart]; + for (auto h = 0; h < heads; h++) { + const float* A = &QK[h * kSquares * kSquares]; + const float* B = &V[h * depth]; + float* C = &attn[h * depth]; + if (use_eigen) { + auto C_mat = EigenStridedMatrixMap( + C, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + C_mat.noalias() = + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) * + ConstEigenMatrixMap(A, kSquares, kSquares); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, kSquares, depth, + kSquares, 1.0f, A, kSquares, B, heads * depth, 0.0f, C, + heads * depth); +#endif + } + } + } + + // Fully connected final MHA layer. + FullyConnectedLayer::Forward1D( + batch_size * kSquares, d_model, embedding_size, head_buffer2.data(), + layer.mha.dense_w.data(), layer.mha.dense_b.data(), NONE, + head_buffer3.data()); + + if (alpha != 1.0f) { + for (size_t i = 0; i < batch_size * kSquares * embedding_size; i++) { + head_buffer[i] *= alpha; + } + } + + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, + head_buffer.data(), head_buffer3.data(), + layer.ln1_gammas.data(), layer.ln1_betas.data(), + 1e-6); + + // FFN. + const size_t dff_size = layer.ffn.dense1_b.size(); + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, dff_size, head_buffer.data(), + layer.ffn.dense1_w.data(), layer.ffn.dense1_b.data(), activation, + head_buffer2.data()); + + FullyConnectedLayer::Forward1D( + batch_size * kSquares, dff_size, layer.ffn.dense2_b.size(), + head_buffer2.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), + NONE, head_buffer3.data()); + + if (alpha != 1.0f) { + for (size_t i = 0; i < batch_size * kSquares * layer.ffn.dense2_b.size(); + i++) { + head_buffer[i] *= alpha; + } + } + + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, + head_buffer.data(), head_buffer3.data(), + layer.ln2_gammas.data(), layer.ln2_betas.data(), + 1e-6); +} + template void BlasComputation::ComputeBlocking() { // Retrieve network key dimensions from the weights structure. 
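Taken together, MakeEncoderLayer is one post-layer-norm transformer block, shared by the policy encoders (called with alpha == 1) and, from the attention-body patch further below, by the body encoders, which pass the DeepNorm-style alpha == (2 * num_encoders)^0.25 to scale the skip input. A rough shape-level summary of the code above (pseudocode, not from the source), with x holding the [batch * 64, embedding] activations:

    // q = x Wq + bq;  k = x Wk + bk;  v = x Wv + bv      (d_model channels)
    // att = concat_h(softmax(q_h k_h' / sqrt(depth)) v_h) Wo + bo
    // x   = LayerNorm(alpha * x + att)                   (ln1 + skip)
    // ffn = act(x W1 + b1) W2 + b2                       (dff hidden units)
    // x   = LayerNorm(alpha * x + ffn)                   (ln2 + skip)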
@@ -336,133 +482,10 @@ void BlasComputation::ComputeBlocking() { std::vector head_buffer3(largest_batch_size * max_channel_size * kSquares); - if (weights_.pol_encoder.size() > 0) { - std::vector head_buffer4(largest_batch_size * max_channel_size * - kSquares); - std::vector temp_buffer(weights_.pol_encoder_head_count * - kSquares * kSquares); - - for (auto layer : weights_.pol_encoder) { - // Q - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, layer.mha.q_b.size(), - head_buffer.data(), layer.mha.q_w.data(), layer.mha.q_b.data(), - NONE, head_buffer2.data()); - // K - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, layer.mha.k_b.size(), - head_buffer.data(), layer.mha.k_w.data(), layer.mha.k_b.data(), - NONE, head_buffer3.data()); - // V - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, layer.mha.v_b.size(), - head_buffer.data(), layer.mha.v_w.data(), layer.mha.v_b.data(), - NONE, head_buffer4.data()); - - // MHA (Q, K, V) - const int d_model = layer.mha.q_b.size(); - const int heads = weights_.pol_encoder_head_count; - const int depth = d_model / heads; - const float scaling = 1.0f / sqrtf(depth); - - // MHA is done per batch since there's a fourth dimension introduced. - for (auto batch = size_t{0}; batch < batch_size; batch++) { - auto batchStart = batch * kSquares * d_model; - - const float* Q = &head_buffer2[batchStart]; - const float* K = &head_buffer3[batchStart]; - const float* V = &head_buffer4[batchStart]; - - // matmul(Q, K) for all heads per batch. - float* QK = temp_buffer.data(); - for (auto h = 0; h < heads; h++) { - const float* A = &Q[h * depth]; - const float* B = &K[h * depth]; - float* C = &QK[h * kSquares * kSquares]; - if (use_eigen) { - auto C_mat = EigenMatrixMap(C, kSquares, kSquares); - C_mat.noalias() = - scaling * - ConstEigenStridedMatrixMap( - B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) - .transpose() * - ConstEigenStridedMatrixMap( - A, depth, kSquares, - Eigen::OuterStride<>(heads * depth)); - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, - kSquares, depth, scaling, A, heads * depth, B, - heads * depth, 0.0f, C, kSquares); -#else - // Should never get here. - throw Exception("Blas backend internal error"); -#endif - } - } - // Apply Softmax. - for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { -#ifndef USE_ISPC - SoftmaxActivation(kSquares, QK + h, QK + h); -#else - ispc::SoftmaxActivation(kSquares, QK + h, QK + h); -#endif - } - - // matmul(softmax(QK), V) for all heads per batch. - float* attn = &head_buffer2[batchStart]; - for (auto h = 0; h < heads; h++) { - const float* A = &QK[h * kSquares * kSquares]; - const float* B = &V[h * depth]; - float* C = &attn[h * depth]; - if (use_eigen) { - auto C_mat = EigenStridedMatrixMap( - C, depth, kSquares, Eigen::OuterStride<>(heads * depth)); - C_mat.noalias() = - ConstEigenStridedMatrixMap( - B, depth, kSquares, - Eigen::OuterStride<>(heads * depth)) * - ConstEigenMatrixMap(A, kSquares, kSquares); - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, kSquares, - depth, kSquares, 1.0f, A, kSquares, B, - heads * depth, 0.0f, C, heads * depth); -#endif - } - } - } - - // Fully connected final MHA layer. - FullyConnectedLayer::Forward1D( - batch_size * kSquares, d_model, embedding_size, - head_buffer2.data(), layer.mha.dense_w.data(), - layer.mha.dense_b.data(), NONE, head_buffer3.data()); - - // Layer Norm + skip connection. 
- LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln1_gammas.data(), - layer.ln1_betas.data(), 1e-6); - - // FFN. - const size_t dff_size = layer.ffn.dense1_b.size(); - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, dff_size, - head_buffer.data(), layer.ffn.dense1_w.data(), - layer.ffn.dense1_b.data(), SELU, head_buffer2.data()); - - FullyConnectedLayer::Forward1D( - batch_size * kSquares, dff_size, layer.ffn.dense2_b.size(), - head_buffer2.data(), layer.ffn.dense2_w.data(), - layer.ffn.dense2_b.data(), NONE, head_buffer3.data()); - - // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln2_gammas.data(), - layer.ln2_betas.data(), 1e-6); - } + for (auto layer : weights_.pol_encoder) { + MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, + layer, embedding_size, weights_.pol_encoder_head_count, + SELU); } // Q From a96c4c048c6dbf7bb03a1f2f6212d028e011f9cb Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 11 Feb 2023 02:32:03 +0200 Subject: [PATCH 29/46] blas performance fix --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9a3e51ce7b..48ea687ce7 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -482,7 +482,7 @@ void BlasComputation::ComputeBlocking() { std::vector head_buffer3(largest_batch_size * max_channel_size * kSquares); - for (auto layer : weights_.pol_encoder) { + for (auto& layer : weights_.pol_encoder) { MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, layer, embedding_size, weights_.pol_encoder_head_count, SELU); From ffd6bbaab1569236472c1abdd2b06ef7b9f35f69 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 01:49:30 +0100 Subject: [PATCH 30/46] Attention body support to blas. --- src/neural/blas/encoder.h | 13 +- src/neural/blas/fully_connected_layer.cc | 4 +- src/neural/blas/layer_norm.ispc | 36 ++- src/neural/blas/network_blas.cc | 331 +++++++++++++++++------ src/neural/shared/activation.cc | 2 + 5 files changed, 294 insertions(+), 92 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 567db1be40..b9940a4b14 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -36,9 +36,16 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, #ifndef USE_ISPC // Mean taken in dimension C. 
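// (skip may now be nullptr, meaning no residual input: the smolgen layers
// added further below call this layer norm that way, so both this path and
// the matching ispc kernel grow a null check.)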
float mean = 0; - for (size_t c = 0; c < channels; ++c) { - data[i * channels + c] += skip[i * channels + c]; - mean += data[i * channels + c]; + if (skip != nullptr) { + for (size_t c = 0; c < channels; ++c) { + data[i * channels + c] += skip[i * channels + c]; + mean += data[i * channels + c]; + } + } + else { + for (size_t c = 0; c < channels; ++c) { + mean += data[i * channels + c]; + } } mean /= channels; diff --git a/src/neural/blas/fully_connected_layer.cc b/src/neural/blas/fully_connected_layer.cc index 2465779ed1..b4879d3580 100644 --- a/src/neural/blas/fully_connected_layer.cc +++ b/src/neural/blas/fully_connected_layer.cc @@ -103,7 +103,9 @@ void FullyConnectedLayer::Forward1D( outputs, // C (int)output_size); // ldc, leading rank of C } - ApplyBias(batch_size, output_size, biases, activation, outputs); + if (biases != nullptr) { + ApplyBias(batch_size, output_size, biases, activation, outputs); + } } template <> diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index 7995c9b0e2..9d7255e49c 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -27,13 +27,23 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, // One pass mean and variance taken in dimension C. Uses shifted variance calculation. float imean = 0; float ivar = 0; - float k = data[0] + skip[0]; - foreach (c = 0 ... channels) { - float t = data[c] + skip[c]; - data[c] = t; - t -= k; - imean += t; - ivar += t * t; + if (skip != nullptr) { + float k = data[0] + skip[0]; + foreach (c = 0 ... channels) { + float t = data[c] + skip[c]; + data[c] = t; + t -= k; + imean += t; + ivar += t * t; + } + } else { + float k = data[0]; + foreach (c = 0 ... channels) { + float t = data[c]; + t -= k; + imean += t; + ivar += t * t; + } } float mean = reduce_add(imean) / channels; float var = (reduce_add(ivar) - channels * mean * mean) / channels; @@ -41,9 +51,15 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, #else // Mean taken in dimension C. float imean = 0; - foreach (c = 0 ... channels) { - data[c] += skip[c]; - imean += data[c]; + if (skip != nullptr) { + foreach (c = 0 ... channels) { + data[c] += skip[c]; + imean += data[c]; + } + } else { + foreach (c = 0 ... 
channels) { + imean += data[c]; + } } float mean = reduce_add(imean) / channels; diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 48ea687ce7..2cf050bc62 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -54,7 +54,10 @@ class BlasComputation : public NetworkComputation { BlasComputation(const LegacyWeights& weights, const size_t max_batch_size, const bool wdl, const bool moves_left, const bool conv_policy, const ActivationFunction default_activation, - const bool attn_policy); + const ActivationFunction smolgen_activation, + const ActivationFunction ffn_activation, + const bool attn_policy, + const bool attn_body); virtual ~BlasComputation() {} @@ -107,7 +110,9 @@ class BlasComputation : public NetworkComputation { std::vector& head_buffer3, size_t batch_size, const LegacyWeights::EncoderLayer& layer, int embedding_size, int heads, - ActivationFunction activation, float alpha = 1.0f); + ActivationFunction smolgen_activation = SWISH, + ActivationFunction ffn_activation = RELU_2, + float alpha = 1.0f); static constexpr auto kWidth = 8; static constexpr auto kHeight = 8; @@ -127,7 +132,10 @@ class BlasComputation : public NetworkComputation { bool moves_left_; bool conv_policy_; ActivationFunction default_activation_; + ActivationFunction smolgen_activation_; + ActivationFunction ffn_activation_; bool attn_policy_; + bool attn_body_; }; template @@ -139,7 +147,7 @@ class BlasNetwork : public Network { std::unique_ptr NewComputation() override { return std::make_unique>( weights_, max_batch_size_, wdl_, moves_left_, conv_policy_, - default_activation_, attn_policy_); + default_activation_, smolgen_activation_, ffn_activation_, attn_policy_, attn_body_); } const NetworkCapabilities& GetCapabilities() const override { @@ -159,14 +167,21 @@ class BlasNetwork : public Network { bool moves_left_; bool conv_policy_; ActivationFunction default_activation_; + ActivationFunction smolgen_activation_; + ActivationFunction ffn_activation_; bool attn_policy_; + bool attn_body_; }; template BlasComputation::BlasComputation( const LegacyWeights& weights, const size_t max_batch_size, const bool wdl, const bool moves_left, const bool conv_policy, - const ActivationFunction default_activation, const bool attn_policy) + const ActivationFunction default_activation, + const ActivationFunction smolgen_activation, + const ActivationFunction ffn_activation, + const bool attn_policy, + const bool attn_body) : weights_(weights), max_batch_size_(max_batch_size), policies_(0), @@ -175,7 +190,10 @@ BlasComputation::BlasComputation( moves_left_(moves_left), conv_policy_(conv_policy), default_activation_(default_activation), - attn_policy_(attn_policy) { + smolgen_activation_(smolgen_activation), + ffn_activation_(ffn_activation), + attn_policy_(attn_policy), + attn_body_(attn_body) { #ifdef USE_DNNL omp_set_num_threads(1); #endif @@ -201,11 +219,15 @@ void BlasComputation::MakeEncoderLayer( std::vector& head_buffer, std::vector& head_buffer2, std::vector& head_buffer3, size_t batch_size, const LegacyWeights::EncoderLayer& layer, int embedding_size, int heads, - ActivationFunction activation, float alpha) { + ActivationFunction smolgen_activation, ActivationFunction ffn_activation, + float alpha) { const int d_model = layer.mha.q_b.size(); + const int dff_size = layer.ffn.dense1_b.size(); + static std::vector head_buffer4; head_buffer4.clear(); - head_buffer4.resize(batch_size * d_model * kSquares); + head_buffer4.resize(batch_size * std::max(d_model, 
dff_size) * kSquares); + static std::vector temp_buffer; temp_buffer.clear(); temp_buffer.resize(heads * kSquares * kSquares); @@ -260,6 +282,57 @@ void BlasComputation::MakeEncoderLayer( #endif } } + + // Smolgen. + if (layer.mha.has_smolgen) { + float* input = &head_buffer[batchStart]; + float* temp1 = &head_buffer2[batchStart]; + float* temp2 = &head_buffer3[batchStart]; + + // Compress. + const auto hidden_channels = layer.mha.smolgen.compress.size() / d_model; + FullyConnectedLayer::Forward1D( + kSquares, d_model, hidden_channels, + input, layer.mha.smolgen.compress.data(), + (const float*)nullptr, NONE, temp1); + + // Dense 1. + const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); + FullyConnectedLayer::Forward1D( + 1, kSquares * hidden_channels, hidden_sz, + temp1, layer.mha.smolgen.dense1_w.data(), + layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, hidden_sz, + temp2, (const float*)nullptr, + layer.mha.smolgen.ln1_gammas.data(), + layer.mha.smolgen.ln1_betas.data(), + 1e-6); + + // Dense 2. + const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); + FullyConnectedLayer::Forward1D( + 1, hidden_sz, gen_sz_outputs, temp2, + layer.mha.smolgen.dense2_w.data(), + layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp1); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, + temp1, (const float*)nullptr, + layer.mha.smolgen.ln2_gammas.data(), + layer.mha.smolgen.ln2_betas.data(), + 1e-6); + + // Global smolgen weights (use bias to add already calculated attention weights). + FullyConnectedLayer::Forward1D( + heads, gen_sz_outputs / heads, 64 * 64, temp1, + weights_.smolgen_w.data(), (const float*)nullptr, NONE, temp2); + + // Add smolgen weights to QK. + for (auto i = 0; i < heads * kSquares * kSquares; i++) { + QK[i] += temp2[i]; + } + } + // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { #ifndef USE_ISPC @@ -311,15 +384,14 @@ void BlasComputation::MakeEncoderLayer( 1e-6); // FFN. - const size_t dff_size = layer.ffn.dense1_b.size(); FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, dff_size, head_buffer.data(), - layer.ffn.dense1_w.data(), layer.ffn.dense1_b.data(), activation, - head_buffer2.data()); + layer.ffn.dense1_w.data(), layer.ffn.dense1_b.data(), ffn_activation, + head_buffer4.data()); FullyConnectedLayer::Forward1D( batch_size * kSquares, dff_size, layer.ffn.dense2_b.size(), - head_buffer2.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), + head_buffer4.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), NONE, head_buffer3.data()); if (alpha != 1.0f) { @@ -341,17 +413,21 @@ void BlasComputation::ComputeBlocking() { // Retrieve network key dimensions from the weights structure. const auto num_value_channels = weights_.ip1_val_b.size(); const auto num_moves_channels = weights_.ip1_mov_b.size(); - const auto num_value_input_planes = weights_.value.biases.size(); + const auto num_value_input_planes = attn_body_ ? weights_.ip_val_b.size() + : weights_.value.biases.size(); const auto num_policy_input_planes = weights_.policy.biases.size(); - const auto num_moves_input_planes = weights_.moves_left.biases.size(); + const auto num_moves_input_planes = attn_body_ ? 
weights_.ip_mov_b.size() + : weights_.moves_left.biases.size(); const auto num_output_policy = static_cast(kPolicyOutputs); - const auto output_channels = weights_.input.biases.size(); + const auto output_channels = attn_body_ ? weights_.ip_emb_b.size() + : weights_.input.biases.size(); + const auto num_res_blocks = weights_.residual.size(); // max_channels is the maximum number of input channels of any // convolution. // Residual blocks are identical, but the first convolution might be bigger // when the network has very few filters - const auto input_channels = static_cast(kInputPlanes); + const auto input_channels = static_cast(kInputPlanes + (attn_body_ ? kNumPosEncodingChannels : 0)); const auto max_channels = std::max(output_channels, input_channels); // The policy head may increase convolution max output size. @@ -361,8 +437,8 @@ void BlasComputation::ComputeBlocking() { : output_channels; // Determine the largest batch for allocations. - const auto plane_count = planes_.size(); - const auto largest_batch_size = std::min(max_batch_size_, plane_count); + const auto total_batches = planes_.size(); + const auto largest_batch_size = std::min(max_batch_size_, total_batches); /* Typically input_channels = 112 @@ -380,10 +456,8 @@ void BlasComputation::ComputeBlocking() { std::vector output_fc(largest_batch_size * max_fc_channels); std::vector res_buffer1(largest_batch_size * max_channels * kSquares); - std::vector res_buffer2(largest_batch_size * output_channels * - kSquares); - std::vector res_buffer3(largest_batch_size * output_channels * - kSquares); + std::vector res_buffer2(largest_batch_size * max_channels * kSquares); + std::vector res_buffer3(largest_batch_size * max_channels * kSquares); WinogradConvolution3 convolve3(largest_batch_size, max_channels, max_output_channels); @@ -403,73 +477,136 @@ void BlasComputation::ComputeBlocking() { float* conv_out = res_buffer2.data(); float* res = res_buffer3.data(); - for (size_t i = 0; i < plane_count; i += largest_batch_size) { - const auto batch_size = std::min(plane_count - i, largest_batch_size); + for (size_t i = 0; i < total_batches; i += largest_batch_size) { + const auto batch_size = std::min(total_batches - i, largest_batch_size); for (size_t j = 0; j < batch_size; j++) { EncodePlanes(planes_[i + j], &conv_in[j * kSquares * kInputPlanes]); } - // Input convolution + if (num_res_blocks > 0) { + // Input convolution - convolve3.Forward(batch_size, kInputPlanes, output_channels, conv_in, - weights_.input.weights.data(), conv_out); + convolve3.Forward(batch_size, kInputPlanes, output_channels, conv_in, + weights_.input.weights.data(), conv_out); - BiasActivate(batch_size, output_channels, conv_out, - weights_.input.biases.data(), default_activation_); + BiasActivate(batch_size, output_channels, conv_out, + weights_.input.biases.data(), default_activation_); - // Residual tower + // Residual tower - for (auto& residual : weights_.residual) { - const auto& conv1 = residual.conv1; - const auto& conv2 = residual.conv2; - const auto& se = residual.se; + for (auto& residual : weights_.residual) { + const auto& conv1 = residual.conv1; + const auto& conv2 = residual.conv2; + const auto& se = residual.se; - std::swap(conv_out, conv_in); + std::swap(conv_out, conv_in); - convolve3.Forward(batch_size, output_channels, output_channels, conv_in, - conv1.weights.data(), conv_out); + convolve3.Forward(batch_size, output_channels, output_channels, conv_in, + conv1.weights.data(), conv_out); - BiasActivate(batch_size, output_channels, &conv_out[0], - 
conv1.biases.data(), default_activation_); + BiasActivate(batch_size, output_channels, &conv_out[0], + conv1.biases.data(), default_activation_); - std::swap(conv_in, res); - std::swap(conv_out, conv_in); + std::swap(conv_in, res); + std::swap(conv_out, conv_in); - convolve3.Forward(batch_size, output_channels, output_channels, conv_in, - conv2.weights.data(), conv_out); + convolve3.Forward(batch_size, output_channels, output_channels, conv_in, + conv2.weights.data(), conv_out); - if (residual.has_se) { - // No relu if followed by SE-unit and residual/bias is added later - std::swap(conv_out, conv_in); + if (residual.has_se) { + // No relu if followed by SE-unit and residual/bias is added later + std::swap(conv_out, conv_in); - auto se_fc_outputs = se.b1.size(); - ApplySEUnit(batch_size, output_channels, se_fc_outputs, - conv_in, conv2.biases.data(), res, se.w1.data(), - se.b1.data(), se.w2.data(), se.b2.data(), - conv_out, default_activation_); - } else { - BiasResidual(batch_size, output_channels, &conv_out[0], - conv2.biases.data(), res, default_activation_); + auto se_fc_outputs = se.b1.size(); + ApplySEUnit(batch_size, output_channels, se_fc_outputs, + conv_in, conv2.biases.data(), res, se.w1.data(), + se.b1.data(), se.w2.data(), se.b2.data(), + conv_out, default_activation_); + } else { + BiasResidual(batch_size, output_channels, &conv_out[0], + conv2.biases.data(), res, default_activation_); + } } } + if (attn_body_) { + const auto embedding_size = weights_.ip_emb_b.size(); + assert(embedding_size > 0); + const auto input_size = num_res_blocks == 0 ? input_channels : weights_.input.biases.size(); + + if (num_res_blocks == 0) { + // No residual means pure transformer, so process input position encoding. + // Preprocess for attention body. + for (auto batch = size_t{0}; batch < batch_size; batch++) { + for (auto i = 0; i < kSquares; i++) { + // NCHW to NHWC conversion. + for (size_t j = 0; j < kInputPlanes; j++) { + res[batch * kSquares * input_size + i * input_size + j] = + conv_in[batch * kSquares * kInputPlanes + j * kSquares + i]; + } + // Position encoding. + for (size_t j = kInputPlanes; j < input_size; j++) { + res[batch * kSquares * input_size + i * input_size + j] = + kPosEncoding[i][j - kInputPlanes]; + } + } + } + } + + // Input embedding. + FullyConnectedLayer::Forward1D( + batch_size * kSquares, input_size, embedding_size, + res_buffer3.data(), weights_.ip_emb_w.data(), + weights_.ip_emb_b.data(), default_activation_, res_buffer1.data()); + + // Input gating + if (weights_.ip_mult_gate.size() > 0) { + int idx; + for (auto batch = size_t{0}; batch < batch_size; batch++) { + for (auto i = 0; i < kSquares; i++) { + for (auto j = 0; j < embedding_size; j++) { + idx = batch * kSquares * embedding_size + i * embedding_size + j; + res_buffer1[idx] = res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] + + weights_.ip_add_gate[j * kSquares + i]; + } + } + }; + } + + // Attention body encoders. + + float alpha = (float) pow(2.0 * weights_.encoder.size(), 0.25); + for (auto& layer : weights_.encoder) { + MakeEncoderLayer(res_buffer1, res_buffer2, res_buffer3, batch_size, + layer, embedding_size, weights_.encoder_head_count, + smolgen_activation_, ffn_activation_, alpha); + } + + // for (auto i=0; i::Forward1D( - batch_size * kSquares, output_channels, embedding_size, res, + batch_size * kSquares, output_channels, policy_embedding_size, res, weights_.ip_pol_w.data(), weights_.ip_pol_b.data(), - SELU, // SELU activation for attention head. + attn_body_ ? 
default_activation_ : SELU, // SELU activation hardcoded for apmish nets. head_buffer.data()); const size_t policy_d_model = weights_.ip2_pol_b.size(); @@ -484,18 +621,20 @@ void BlasComputation::ComputeBlocking() { for (auto& layer : weights_.pol_encoder) { MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, - layer, embedding_size, weights_.pol_encoder_head_count, - SELU); + layer, policy_embedding_size, weights_.pol_encoder_head_count, + attn_body_ ? smolgen_activation_ : NONE, + attn_body_ ? ffn_activation_ : SELU, + 1.0f); } // Q FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, policy_d_model, + batch_size * kSquares, policy_embedding_size, policy_d_model, head_buffer.data(), weights_.ip2_pol_w.data(), weights_.ip2_pol_b.data(), NONE, head_buffer2.data()); // K FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, policy_d_model, + batch_size * kSquares, policy_embedding_size, policy_d_model, head_buffer.data(), weights_.ip3_pol_w.data(), weights_.ip3_pol_b.data(), NONE, head_buffer3.data()); const float scaling = 1.0f / sqrtf(policy_d_model); @@ -564,6 +703,7 @@ void BlasComputation::ComputeBlocking() { } } } else if (conv_policy_) { + assert(!attn_body_); // not supported with attention body convolve3.Forward(batch_size, output_channels, output_channels, conv_out, weights_.policy1.weights.data(), res); @@ -589,6 +729,7 @@ void BlasComputation::ComputeBlocking() { } } else { + assert(!attn_body_); // not supported with attention body Convolution1::Forward( batch_size, output_channels, num_policy_input_planes, conv_out, weights_.policy.weights.data(), head_buffer.data()); @@ -614,12 +755,22 @@ void BlasComputation::ComputeBlocking() { } // Value head - Convolution1::Forward( - batch_size, output_channels, num_value_input_planes, conv_out, - weights_.value.weights.data(), head_buffer.data()); + if (attn_body_) { + FullyConnectedLayer::Forward1D( + batch_size * kSquares, weights_.ip_emb_b.size(), num_value_input_planes, + res, weights_.ip_val_w.data(), + weights_.ip_val_b.data(), + default_activation_, + head_buffer.data()); + } + else { + Convolution1::Forward( + batch_size, output_channels, num_value_input_planes, conv_out, + weights_.value.weights.data(), head_buffer.data()); - BiasActivate(batch_size, num_value_input_planes, &head_buffer[0], - weights_.value.biases.data(), default_activation_); + BiasActivate(batch_size, num_value_input_planes, &head_buffer[0], + weights_.value.biases.data(), default_activation_); + } FullyConnectedLayer::Forward1D( batch_size, num_value_input_planes * kSquares, num_value_channels, @@ -656,12 +807,22 @@ void BlasComputation::ComputeBlocking() { } } if (moves_left_) { - Convolution1::Forward( - batch_size, output_channels, num_moves_input_planes, conv_out, - weights_.moves_left.weights.data(), head_buffer.data()); + if (attn_body_) { + FullyConnectedLayer::Forward1D( + batch_size * kSquares, weights_.ip_emb_b.size(), num_moves_input_planes, + res, weights_.ip_mov_w.data(), + weights_.ip_mov_b.data(), + default_activation_, + head_buffer.data()); + } + else { + Convolution1::Forward( + batch_size, output_channels, num_moves_input_planes, conv_out, + weights_.moves_left.weights.data(), head_buffer.data()); - BiasActivate(batch_size, num_moves_input_planes, &head_buffer[0], - weights_.moves_left.biases.data(), default_activation_); + BiasActivate(batch_size, num_moves_input_planes, &head_buffer[0], + weights_.moves_left.biases.data(), default_activation_); + } FullyConnectedLayer::Forward1D( 
batch_size, num_moves_input_planes * kSquares, num_moves_channels, @@ -718,11 +879,23 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, attn_policy_ = file.format().network_format().policy() == pblczero::NetworkFormat::POLICY_ATTENTION; + attn_body_ = file.format().network_format().network() == + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT; + default_activation_ = file.format().network_format().default_activation() == pblczero::NetworkFormat::DEFAULT_ACTIVATION_MISH ? MISH : RELU; + if (attn_body_) { + const auto smol_act = file.format().network_format().smolgen_activation(); + smolgen_activation_ = smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT + ? default_activation_ : static_cast(smol_act); + const auto ffn_act = file.format().network_format().ffn_activation(); + ffn_activation_ = ffn_act == pblczero::NetworkFormat::FFN_ACTIVATION_INHERIT + ? default_activation_ : static_cast(ffn_act); + } + if (max_batch_size_ > kHardMaxBatchSize) { max_batch_size_ = kHardMaxBatchSize; } @@ -804,7 +977,9 @@ std::unique_ptr MakeBlasNetwork(const std::optional& w, if (weights.format().network_format().network() != pblczero::NetworkFormat::NETWORK_CLASSICAL_WITH_HEADFORMAT && weights.format().network_format().network() != - pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT) { + pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT && + weights.format().network_format().network() != + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT) { throw Exception("Network format " + pblczero::NetworkFormat::NetworkStructure_Name( weights.format().network_format().network()) + diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 1a2dd34ead..9c90936c00 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -72,6 +72,8 @@ float Activate(const float val, const ActivationFunction activation) { switch (activation) { case RELU: return val > 0 ? val : 0; + case RELU_2: + return val > 0 ? val * val : 0; case MISH: return mish(val); case TANH: From 919ef8e558bbaed23a022de7bcecb5092eebd102 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 11:51:28 +0100 Subject: [PATCH 31/46] Minor bug fixes (thanks @borg). --- src/neural/blas/fully_connected_layer.cc | 4 +++- src/neural/blas/layer_norm.ispc | 4 ++-- src/neural/blas/network_blas.cc | 14 ++++---------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/neural/blas/fully_connected_layer.cc b/src/neural/blas/fully_connected_layer.cc index b4879d3580..d228d2523f 100644 --- a/src/neural/blas/fully_connected_layer.cc +++ b/src/neural/blas/fully_connected_layer.cc @@ -136,7 +136,9 @@ void FullyConnectedLayer::Forward1D( .transpose() * ConstEigenMatrixMap(inputs, input_size, batch_size); } - ApplyBias(batch_size, output_size, biases, activation, outputs); + if (biases != nullptr) { + ApplyBias(batch_size, output_size, biases, activation, outputs); + } } template <> diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index 9d7255e49c..43fa3fe12a 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -27,7 +27,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, // One pass mean and variance taken in dimension C. Uses shifted variance calculation. float imean = 0; float ivar = 0; - if (skip != nullptr) { + if (skip != NULL) { float k = data[0] + skip[0]; foreach (c = 0 ... 
channels) { float t = data[c] + skip[c]; @@ -51,7 +51,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, #else // Mean taken in dimension C. float imean = 0; - if (skip != nullptr) { + if (skip != NULL) { foreach (c = 0 ... channels) { data[c] += skip[c]; imean += data[c]; diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 2cf050bc62..b15903a771 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -223,14 +223,8 @@ void BlasComputation::MakeEncoderLayer( float alpha) { const int d_model = layer.mha.q_b.size(); const int dff_size = layer.ffn.dense1_b.size(); - - static std::vector head_buffer4; - head_buffer4.clear(); - head_buffer4.resize(batch_size * std::max(d_model, dff_size) * kSquares); - - static std::vector temp_buffer; - temp_buffer.clear(); - temp_buffer.resize(heads * kSquares * kSquares); + std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * kSquares); + std::vector temp_buffer(heads * kSquares * kSquares); // Q FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, d_model, head_buffer.data(), @@ -303,7 +297,7 @@ void BlasComputation::MakeEncoderLayer( temp1, layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(batch_size, hidden_sz, + LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), layer.mha.smolgen.ln1_betas.data(), @@ -316,7 +310,7 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp1); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, + LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), layer.mha.smolgen.ln2_betas.data(), From 8ebb2226f370001f90b479783951f0fce5ee5d28 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:20:36 +0100 Subject: [PATCH 32/46] Comment fix. --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index b15903a771..d36fd88d4c 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -316,7 +316,7 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.ln2_betas.data(), 1e-6); - // Global smolgen weights (use bias to add already calculated attention weights). + // Global smolgen weights. FullyConnectedLayer::Forward1D( heads, gen_sz_outputs / heads, 64 * 64, temp1, weights_.smolgen_w.data(), (const float*)nullptr, NONE, temp2); From ce2c2e79286543b381b8b69cde4c7dddedc0c99a Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:29:08 +0100 Subject: [PATCH 33/46] Add check for add_gating. 
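// (Context: the input gating computes, per square and embedding channel,
// x = x * ip_mult_gate + ip_add_gate; the tightened check in this patch
// requires both gate vectors to be present before applying it.)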
--- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index d36fd88d4c..25a5b22c56 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -554,7 +554,7 @@ void BlasComputation::ComputeBlocking() { weights_.ip_emb_b.data(), default_activation_, res_buffer1.data()); // Input gating - if (weights_.ip_mult_gate.size() > 0) { + if (weights_.ip_mult_gate.size() > 0 && weights_.ip_add_gate.size() > 0) { int idx; for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { From d299d01262e6b1a566d327f0afb58abc1f1ed9ed Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:30:30 +0100 Subject: [PATCH 34/46] Comment fix. --- src/neural/blas/network_blas.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 25a5b22c56..2c6b0d0881 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -568,7 +568,6 @@ void BlasComputation::ComputeBlocking() { } // Attention body encoders. - float alpha = (float) pow(2.0 * weights_.encoder.size(), 0.25); for (auto& layer : weights_.encoder) { MakeEncoderLayer(res_buffer1, res_buffer2, res_buffer3, batch_size, From def48b5c91a701f7b2637106703814db746f950c Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:30:50 +0100 Subject: [PATCH 35/46] Comment fix. --- src/neural/blas/network_blas.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 2c6b0d0881..9a86e58b18 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -575,7 +575,6 @@ void BlasComputation::ComputeBlocking() { smolgen_activation_, ffn_activation_, alpha); } - // for (auto i=0; i Date: Mon, 20 Feb 2023 18:12:34 +0200 Subject: [PATCH 36/46] code formatting --- src/neural/blas/network_blas.cc | 138 ++++++++++++++++---------------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9a86e58b18..60e61d0447 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -56,8 +56,7 @@ class BlasComputation : public NetworkComputation { const ActivationFunction default_activation, const ActivationFunction smolgen_activation, const ActivationFunction ffn_activation, - const bool attn_policy, - const bool attn_body); + const bool attn_policy, const bool attn_body); virtual ~BlasComputation() {} @@ -147,7 +146,8 @@ class BlasNetwork : public Network { std::unique_ptr NewComputation() override { return std::make_unique>( weights_, max_batch_size_, wdl_, moves_left_, conv_policy_, - default_activation_, smolgen_activation_, ffn_activation_, attn_policy_, attn_body_); + default_activation_, smolgen_activation_, ffn_activation_, attn_policy_, + attn_body_); } const NetworkCapabilities& GetCapabilities() const override { @@ -179,8 +179,7 @@ BlasComputation::BlasComputation( const bool moves_left, const bool conv_policy, const ActivationFunction default_activation, const ActivationFunction smolgen_activation, - const ActivationFunction ffn_activation, - const bool attn_policy, + const ActivationFunction ffn_activation, const bool attn_policy, const bool attn_body) : weights_(weights), max_batch_size_(max_batch_size), @@ -223,7 +222,8 @@ void BlasComputation::MakeEncoderLayer( float alpha) { const int d_model 
= layer.mha.q_b.size(); const int dff_size = layer.ffn.dense1_b.size(); - std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * kSquares); + std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * + kSquares); std::vector temp_buffer(heads * kSquares * kSquares); // Q FullyConnectedLayer::Forward1D( @@ -286,35 +286,32 @@ void BlasComputation::MakeEncoderLayer( // Compress. const auto hidden_channels = layer.mha.smolgen.compress.size() / d_model; FullyConnectedLayer::Forward1D( - kSquares, d_model, hidden_channels, - input, layer.mha.smolgen.compress.data(), - (const float*)nullptr, NONE, temp1); + kSquares, d_model, hidden_channels, input, + layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, + temp1); // Dense 1. const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); FullyConnectedLayer::Forward1D( - 1, kSquares * hidden_channels, hidden_sz, - temp1, layer.mha.smolgen.dense1_w.data(), - layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2); + 1, kSquares * hidden_channels, hidden_sz, temp1, + layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), + smolgen_activation, temp2); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, hidden_sz, - temp2, (const float*)nullptr, + LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), - layer.mha.smolgen.ln1_betas.data(), - 1e-6); + layer.mha.smolgen.ln1_betas.data(), 1e-6); // Dense 2. const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); FullyConnectedLayer::Forward1D( 1, hidden_sz, gen_sz_outputs, temp2, - layer.mha.smolgen.dense2_w.data(), - layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp1); + layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), + smolgen_activation, temp1); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, gen_sz_outputs, - temp1, (const float*)nullptr, + LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, + (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), - layer.mha.smolgen.ln2_betas.data(), - 1e-6); + layer.mha.smolgen.ln2_betas.data(), 1e-6); // Global smolgen weights. FullyConnectedLayer::Forward1D( @@ -407,21 +404,22 @@ void BlasComputation::ComputeBlocking() { // Retrieve network key dimensions from the weights structure. const auto num_value_channels = weights_.ip1_val_b.size(); const auto num_moves_channels = weights_.ip1_mov_b.size(); - const auto num_value_input_planes = attn_body_ ? weights_.ip_val_b.size() - : weights_.value.biases.size(); + const auto num_value_input_planes = + attn_body_ ? weights_.ip_val_b.size() : weights_.value.biases.size(); const auto num_policy_input_planes = weights_.policy.biases.size(); - const auto num_moves_input_planes = attn_body_ ? weights_.ip_mov_b.size() - : weights_.moves_left.biases.size(); + const auto num_moves_input_planes = + attn_body_ ? weights_.ip_mov_b.size() : weights_.moves_left.biases.size(); const auto num_output_policy = static_cast(kPolicyOutputs); - const auto output_channels = attn_body_ ? weights_.ip_emb_b.size() - : weights_.input.biases.size(); + const auto output_channels = + attn_body_ ? weights_.ip_emb_b.size() : weights_.input.biases.size(); const auto num_res_blocks = weights_.residual.size(); // max_channels is the maximum number of input channels of any // convolution. 
// Residual blocks are identical, but the first convolution might be bigger // when the network has very few filters - const auto input_channels = static_cast(kInputPlanes + (attn_body_ ? kNumPosEncodingChannels : 0)); + const auto input_channels = static_cast( + kInputPlanes + (attn_body_ ? kNumPosEncodingChannels : 0)); const auto max_channels = std::max(output_channels, input_channels); // The policy head may increase convolution max output size. @@ -484,7 +482,7 @@ void BlasComputation::ComputeBlocking() { weights_.input.weights.data(), conv_out); BiasActivate(batch_size, output_channels, conv_out, - weights_.input.biases.data(), default_activation_); + weights_.input.biases.data(), default_activation_); // Residual tower @@ -499,7 +497,7 @@ void BlasComputation::ComputeBlocking() { conv1.weights.data(), conv_out); BiasActivate(batch_size, output_channels, &conv_out[0], - conv1.biases.data(), default_activation_); + conv1.biases.data(), default_activation_); std::swap(conv_in, res); std::swap(conv_out, conv_in); @@ -513,12 +511,12 @@ void BlasComputation::ComputeBlocking() { auto se_fc_outputs = se.b1.size(); ApplySEUnit(batch_size, output_channels, se_fc_outputs, - conv_in, conv2.biases.data(), res, se.w1.data(), - se.b1.data(), se.w2.data(), se.b2.data(), - conv_out, default_activation_); + conv_in, conv2.biases.data(), res, + se.w1.data(), se.b1.data(), se.w2.data(), + se.b2.data(), conv_out, default_activation_); } else { BiasResidual(batch_size, output_channels, &conv_out[0], - conv2.biases.data(), res, default_activation_); + conv2.biases.data(), res, default_activation_); } } } @@ -526,10 +524,12 @@ void BlasComputation::ComputeBlocking() { if (attn_body_) { const auto embedding_size = weights_.ip_emb_b.size(); assert(embedding_size > 0); - const auto input_size = num_res_blocks == 0 ? input_channels : weights_.input.biases.size(); + const auto input_size = + num_res_blocks == 0 ? input_channels : weights_.input.biases.size(); if (num_res_blocks == 0) { - // No residual means pure transformer, so process input position encoding. + // No residual means pure transformer, so process input position + // encoding. // Preprocess for attention body. for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { @@ -549,9 +549,9 @@ void BlasComputation::ComputeBlocking() { // Input embedding. FullyConnectedLayer::Forward1D( - batch_size * kSquares, input_size, embedding_size, - res_buffer3.data(), weights_.ip_emb_w.data(), - weights_.ip_emb_b.data(), default_activation_, res_buffer1.data()); + batch_size * kSquares, input_size, embedding_size, res_buffer3.data(), + weights_.ip_emb_w.data(), weights_.ip_emb_b.data(), + default_activation_, res_buffer1.data()); // Input gating if (weights_.ip_mult_gate.size() > 0 && weights_.ip_add_gate.size() > 0) { @@ -560,15 +560,16 @@ void BlasComputation::ComputeBlocking() { for (auto i = 0; i < kSquares; i++) { for (auto j = 0; j < embedding_size; j++) { idx = batch * kSquares * embedding_size + i * embedding_size + j; - res_buffer1[idx] = res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] - + weights_.ip_add_gate[j * kSquares + i]; + res_buffer1[idx] = + res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] + + weights_.ip_add_gate[j * kSquares + i]; } } }; } // Attention body encoders. 
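// (A note on alpha, computed just below as (2 * encoder_count)^(1/4): this
// matches the DeepNorm-style residual scaling for encoder-only transformers,
// under which each encoder layer computes
//   x = LayerNorm(alpha * x + MHA(x))
//   x = LayerNorm(alpha * x + FFN(x))
// to keep deep stacks of encoder layers numerically stable.)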
- float alpha = (float) pow(2.0 * weights_.encoder.size(), 0.25); + float alpha = (float)pow(2.0 * weights_.encoder.size(), 0.25); for (auto& layer : weights_.encoder) { MakeEncoderLayer(res_buffer1, res_buffer2, res_buffer3, batch_size, layer, embedding_size, weights_.encoder_head_count, @@ -587,8 +588,9 @@ void BlasComputation::ComputeBlocking() { for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { for (size_t j = 0; j < output_channels; j++) { - res[batch * kSquares * output_channels + i * output_channels + j] = - conv_out[batch * kSquares * output_channels + j * kSquares + i]; + res[batch * kSquares * output_channels + i * output_channels + + j] = conv_out[batch * kSquares * output_channels + + j * kSquares + i]; } } } @@ -598,7 +600,8 @@ void BlasComputation::ComputeBlocking() { FullyConnectedLayer::Forward1D( batch_size * kSquares, output_channels, policy_embedding_size, res, weights_.ip_pol_w.data(), weights_.ip_pol_b.data(), - attn_body_ ? default_activation_ : SELU, // SELU activation hardcoded for apmish nets. + attn_body_ ? default_activation_ + : SELU, // SELU activation hardcoded for apmish nets. head_buffer.data()); const size_t policy_d_model = weights_.ip2_pol_b.size(); @@ -613,10 +616,10 @@ void BlasComputation::ComputeBlocking() { for (auto& layer : weights_.pol_encoder) { MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, - layer, policy_embedding_size, weights_.pol_encoder_head_count, + layer, policy_embedding_size, + weights_.pol_encoder_head_count, attn_body_ ? smolgen_activation_ : NONE, - attn_body_ ? ffn_activation_ : SELU, - 1.0f); + attn_body_ ? ffn_activation_ : SELU, 1.0f); } // Q @@ -749,19 +752,16 @@ void BlasComputation::ComputeBlocking() { // Value head if (attn_body_) { FullyConnectedLayer::Forward1D( - batch_size * kSquares, weights_.ip_emb_b.size(), num_value_input_planes, - res, weights_.ip_val_w.data(), - weights_.ip_val_b.data(), - default_activation_, - head_buffer.data()); - } - else { + batch_size * kSquares, weights_.ip_emb_b.size(), + num_value_input_planes, res, weights_.ip_val_w.data(), + weights_.ip_val_b.data(), default_activation_, head_buffer.data()); + } else { Convolution1::Forward( batch_size, output_channels, num_value_input_planes, conv_out, weights_.value.weights.data(), head_buffer.data()); BiasActivate(batch_size, num_value_input_planes, &head_buffer[0], - weights_.value.biases.data(), default_activation_); + weights_.value.biases.data(), default_activation_); } FullyConnectedLayer::Forward1D( @@ -801,19 +801,16 @@ void BlasComputation::ComputeBlocking() { if (moves_left_) { if (attn_body_) { FullyConnectedLayer::Forward1D( - batch_size * kSquares, weights_.ip_emb_b.size(), num_moves_input_planes, - res, weights_.ip_mov_w.data(), - weights_.ip_mov_b.data(), - default_activation_, - head_buffer.data()); - } - else { + batch_size * kSquares, weights_.ip_emb_b.size(), + num_moves_input_planes, res, weights_.ip_mov_w.data(), + weights_.ip_mov_b.data(), default_activation_, head_buffer.data()); + } else { Convolution1::Forward( batch_size, output_channels, num_moves_input_planes, conv_out, weights_.moves_left.weights.data(), head_buffer.data()); BiasActivate(batch_size, num_moves_input_planes, &head_buffer[0], - weights_.moves_left.biases.data(), default_activation_); + weights_.moves_left.biases.data(), default_activation_); } FullyConnectedLayer::Forward1D( @@ -872,7 +869,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, pblczero::NetworkFormat::POLICY_ATTENTION; 
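// (The constructor hunks below gate the new code paths: attn_body_ is set
// from the NETWORK_ATTENTIONBODY_WITH_HEADFORMAT network format, and the
// smolgen/FFN activations resolve per this illustrative sketch:
//   ActivationFunction Resolve(int from_file, int inherit_tag,
//                              ActivationFunction fallback) {
//     return from_file == inherit_tag
//                ? fallback
//                : static_cast<ActivationFunction>(from_file);
//   }
// i.e. a net file may pin an activation or defer to the backend default.)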
attn_body_ = file.format().network_format().network() == - pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT; + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT; default_activation_ = file.format().network_format().default_activation() == pblczero::NetworkFormat::DEFAULT_ACTIVATION_MISH @@ -881,11 +878,14 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, if (attn_body_) { const auto smol_act = file.format().network_format().smolgen_activation(); - smolgen_activation_ = smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT - ? default_activation_ : static_cast(smol_act); + smolgen_activation_ = + smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT + ? default_activation_ + : static_cast(smol_act); const auto ffn_act = file.format().network_format().ffn_activation(); ffn_activation_ = ffn_act == pblczero::NetworkFormat::FFN_ACTIVATION_INHERIT - ? default_activation_ : static_cast(ffn_act); + ? default_activation_ + : static_cast(ffn_act); } if (max_batch_size_ > kHardMaxBatchSize) { From add3c458386238e8372f215d723c68dbfd8e35f4 Mon Sep 17 00:00:00 2001 From: borg323 Date: Tue, 21 Feb 2023 02:05:27 +0200 Subject: [PATCH 37/46] assorted fixes --- src/neural/blas/layer_norm.ispc | 4 ++-- src/neural/blas/network_blas.cc | 16 +++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index 43fa3fe12a..fb42814ffa 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -27,8 +27,9 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, // One pass mean and variance taken in dimension C. Uses shifted variance calculation. float imean = 0; float ivar = 0; + float k = data[0]; if (skip != NULL) { - float k = data[0] + skip[0]; + k += skip[0]; foreach (c = 0 ... channels) { float t = data[c] + skip[c]; data[c] = t; @@ -37,7 +38,6 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, ivar += t * t; } } else { - float k = data[0]; foreach (c = 0 ... channels) { float t = data[c]; t -= k; diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 60e61d0447..9091cd6c46 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -299,7 +299,7 @@ void BlasComputation::MakeEncoderLayer( // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), - layer.mha.smolgen.ln1_betas.data(), 1e-6); + layer.mha.smolgen.ln1_betas.data(), 1e-3); // Dense 2. const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); @@ -311,7 +311,7 @@ void BlasComputation::MakeEncoderLayer( LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), - layer.mha.smolgen.ln2_betas.data(), 1e-6); + layer.mha.smolgen.ln2_betas.data(), 1e-3); // Global smolgen weights. FullyConnectedLayer::Forward1D( @@ -326,11 +326,13 @@ void BlasComputation::MakeEncoderLayer( // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { -#ifndef USE_ISPC - SoftmaxActivation(kSquares, QK + h, QK + h); -#else - ispc::SoftmaxActivation(kSquares, QK + h, QK + h); +#if defined(USE_ISPC) + if (!use_eigen) { + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); + continue; + } #endif + SoftmaxActivation(kSquares, QK + h, QK + h); } // matmul(softmax(QK), V) for all heads per batch. 
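// For reference, a minimal scalar sketch of the per-head attention that the
// GEMM + softmax code above implements (row-major [64 x depth] Q/K/V slices;
// illustrative only, and assuming no smolgen logits are added):
#include <algorithm>
#include <cmath>
#include <vector>

void AttentionOneHeadSketch(const float* Q, const float* K, const float* V,
                            int depth, float* out /* [64 x depth] */) {
  const float scale = 1.0f / std::sqrt(static_cast<float>(depth));
  std::vector<float> w(64);
  for (int i = 0; i < 64; i++) {
    // Scaled dot products of square i's query against every key.
    float maxv = -1e30f;
    for (int j = 0; j < 64; j++) {
      float dot = 0.0f;
      for (int d = 0; d < depth; d++) dot += Q[i * depth + d] * K[j * depth + d];
      w[j] = dot * scale;
      maxv = std::max(maxv, w[j]);
    }
    // Softmax over the 64 keys (max-shifted for numerical stability).
    float sum = 0.0f;
    for (int j = 0; j < 64; j++) {
      w[j] = std::exp(w[j] - maxv);
      sum += w[j];
    }
    // Attention-weighted sum of values.
    for (int d = 0; d < depth; d++) {
      float acc = 0.0f;
      for (int j = 0; j < 64; j++) acc += w[j] * V[j * depth + d];
      out[i * depth + d] = acc / sum;
    }
  }
}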
@@ -558,7 +560,7 @@ void BlasComputation::ComputeBlocking() { int idx; for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { - for (auto j = 0; j < embedding_size; j++) { + for (size_t j = 0; j < embedding_size; j++) { idx = batch * kSquares * embedding_size + i * embedding_size + j; res_buffer1[idx] = res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] + From e70462d281c865fcc243e4ecd09aea669386be78 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 21 Feb 2023 13:38:17 +0100 Subject: [PATCH 38/46] Minor cs fix --- src/neural/blas/network_blas.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9091cd6c46..ce029c35bc 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -590,9 +590,8 @@ void BlasComputation::ComputeBlocking() { for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { for (size_t j = 0; j < output_channels; j++) { - res[batch * kSquares * output_channels + i * output_channels + - j] = conv_out[batch * kSquares * output_channels + - j * kSquares + i]; + res[batch * kSquares * output_channels + i * output_channels + j] = + conv_out[batch * kSquares * output_channels + j * kSquares + i]; } } } From 53f73de9e3f113659f3a04e1e61ff47e7502bb75 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Sat, 11 Mar 2023 10:46:11 +0100 Subject: [PATCH 39/46] Add hack to fix nets with wrong proto. --- src/neural/blas/network_blas.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index ce029c35bc..6703d62760 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -1008,6 +1008,25 @@ std::unique_ptr MakeBlasNetwork(const std::optional& w, weights.format().network_format().default_activation()) + " is not supported by BLAS backend."); } + + // @todo Hack for old encoding compatibility. REMOVE BEFORE MERGING. + if (w->format().network_format().network() == + pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT && + w->weights().encoder().size() > 0) { + CERR << "Attention body detected, hacking network format."; + WeightsFile x = *w; + x.mutable_format()->mutable_network_format()->set_network( + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT); + if (w->weights().has_smolgen_w()) { + CERR << "BT2 detected, hacking activations."; + x.mutable_format()->mutable_network_format()->set_ffn_activation( + pblczero::NetworkFormat::FFN_ACTIVATION_RELU_2); + x.mutable_format()->mutable_network_format()->set_smolgen_activation( + pblczero::NetworkFormat::SMOLGEN_ACTIVATION_SWISH); + } + return std::make_unique>(x, options); + } + return std::make_unique>(weights, options); } From 971e33d7ddfc012d678ee8269a1e01613aa1024e Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 21 Mar 2023 23:54:07 +0100 Subject: [PATCH 40/46] Fix bug in smolgen for small nets. --- src/neural/blas/network_blas.cc | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 6703d62760..a8201457d3 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -279,14 +279,15 @@ void BlasComputation::MakeEncoderLayer( // Smolgen. 
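// (Overview of the smolgen block below: from the layer input it derives,
// per head, an extra 64x64 grid of attention logits that is added to QK
// before the softmax. Pipeline: biasless "compress" projection per square,
// dense1 + layernorm, dense2 + layernorm, then a shared final projection
// producing the per-head 64x64 maps.)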
if (layer.mha.has_smolgen) { - float* input = &head_buffer[batchStart]; + float* input = &head_buffer[batch * kSquares * embedding_size]; float* temp1 = &head_buffer2[batchStart]; float* temp2 = &head_buffer3[batchStart]; // Compress. - const auto hidden_channels = layer.mha.smolgen.compress.size() / d_model; + const auto hidden_channels = + layer.mha.smolgen.compress.size() / embedding_size; FullyConnectedLayer::Forward1D( - kSquares, d_model, hidden_channels, input, + kSquares, embedding_size, hidden_channels, input, layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, temp1); @@ -314,13 +315,21 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.ln2_betas.data(), 1e-3); // Global smolgen weights. - FullyConnectedLayer::Forward1D( - heads, gen_sz_outputs / heads, 64 * 64, temp1, - weights_.smolgen_w.data(), (const float*)nullptr, NONE, temp2); - - // Add smolgen weights to QK. - for (auto i = 0; i < heads * kSquares * kSquares; i++) { - QK[i] += temp2[i]; + const float* A = temp1; + const float* B = weights_.smolgen_w.data(); + float* C = QK; + if (use_eigen) { + auto C_mat = EigenMatrixMap(C, 64 * 64, heads); + C_mat.noalias() += + ConstEigenMatrixMap(B, gen_sz_outputs / heads, 64 * 64) + .transpose() * + ConstEigenMatrixMap(A, gen_sz_outputs / heads, heads); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, heads, 64 * 64, + gen_sz_outputs / heads, 1.0f, A, gen_sz_outputs / heads, B, + gen_sz_outputs / heads, 1.0f, C, 64 * 64); +#endif } } From 124386106dacaf611ed607d677c3dd8a651eb431 Mon Sep 17 00:00:00 2001 From: borg323 Date: Tue, 21 Feb 2023 02:44:11 +0200 Subject: [PATCH 41/46] batch blas smolgen --- src/neural/blas/encoder.h | 15 ++-- src/neural/blas/network_blas.cc | 136 ++++++++++++++++---------------- 2 files changed, 77 insertions(+), 74 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index b9940a4b14..2e3a3f9a7b 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -41,8 +41,7 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, data[i * channels + c] += skip[i * channels + c]; mean += data[i * channels + c]; } - } - else { + } else { for (size_t c = 0; c < channels; ++c) { mean += data[i * channels + c]; } @@ -64,9 +63,15 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, betas[c] + gammas[c] * (data[i * channels + c] - mean) * den; } #else - ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, - skip + i * channels, gammas, betas, - epsilon); + if (skip != nullptr) { + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + skip + i * channels, gammas, betas, + epsilon); + } else { + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + nullptr, gammas, betas, epsilon); + } + #endif } } diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index a8201457d3..cd6d08d546 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -224,7 +224,52 @@ void BlasComputation::MakeEncoderLayer( const int dff_size = layer.ffn.dense1_b.size(); std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * kSquares); - std::vector temp_buffer(heads * kSquares * kSquares); + + // Smolgen. + if (layer.mha.has_smolgen) { + float* input = &head_buffer[0]; + float* QK = &head_buffer4[0]; + // Compress. 
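// Shape flow for this batched smolgen path, per batch entry (dimensions
// read off the weight vector sizes; names as in the code):
//   input     [64 x embedding_size]   (the encoder layer input)
//   compress  -> [64 x hidden_channels], biasless per-square matmul
//   dense1    -> [hidden_sz] on the flattened 64*hidden_channels, + layernorm
//   dense2    -> [gen_sz_outputs] = heads * (gen_sz_outputs / heads), + layernorm
//   smolgen_w -> per head, 64*64 attention logits written into QK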
+ const auto hidden_channels = + layer.mha.smolgen.compress.size() / embedding_size; + std::vector temp1(batch_size * kSquares * hidden_channels); + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, hidden_channels, input, + layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, + temp1.data()); + + // Dense 1. + const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); + std::vector temp2(batch_size * hidden_sz); + FullyConnectedLayer::Forward1D( + batch_size, kSquares * hidden_channels, hidden_sz, &temp1[0], + layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), + smolgen_activation, temp2.data()); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, hidden_sz, temp2.data(), + (const float*)nullptr, + layer.mha.smolgen.ln1_gammas.data(), + layer.mha.smolgen.ln1_betas.data(), 1e-3); + + // Dense 2. + const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); + std::vector temp3(batch_size * gen_sz_outputs); + FullyConnectedLayer::Forward1D( + batch_size, hidden_sz, gen_sz_outputs, &temp2[0], + layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), + smolgen_activation, temp3.data()); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, temp3.data(), + (const float*)nullptr, + layer.mha.smolgen.ln2_gammas.data(), + layer.mha.smolgen.ln2_betas.data(), 1e-3); + + // Global smolgen weights. + FullyConnectedLayer::Forward1D( + batch_size * heads, gen_sz_outputs / heads, 64 * 64, temp3.data(), + weights_.smolgen_w.data(), (const float*)nullptr, NONE, &QK[0]); + } + // Q FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, d_model, head_buffer.data(), @@ -233,10 +278,6 @@ void BlasComputation::MakeEncoderLayer( FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, d_model, head_buffer.data(), layer.mha.k_w.data(), layer.mha.k_b.data(), NONE, head_buffer3.data()); - // V - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, d_model, head_buffer.data(), - layer.mha.v_w.data(), layer.mha.v_b.data(), NONE, head_buffer4.data()); // MHA (Q, K, V) const int depth = d_model / heads; @@ -246,29 +287,32 @@ void BlasComputation::MakeEncoderLayer( for (auto batch = size_t{0}; batch < batch_size; batch++) { auto batchStart = batch * kSquares * d_model; + float* QK = &head_buffer4[batch * kSquares * kSquares * heads]; + const float* Q = &head_buffer2[batchStart]; const float* K = &head_buffer3[batchStart]; - const float* V = &head_buffer4[batchStart]; // matmul(Q, K) for all heads per batch. - float* QK = temp_buffer.data(); + for (auto h = 0; h < heads; h++) { const float* A = &Q[h * depth]; const float* B = &K[h * depth]; float* C = &QK[h * kSquares * kSquares]; + const float beta = layer.mha.has_smolgen ? 
1.0f : 0.0f; if (use_eigen) { auto C_mat = EigenMatrixMap(C, kSquares, kSquares); C_mat.noalias() = + beta * C_mat + scaling * - ConstEigenStridedMatrixMap( - B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) - .transpose() * - ConstEigenStridedMatrixMap( - A, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) + .transpose() * + ConstEigenStridedMatrixMap( + A, depth, kSquares, Eigen::OuterStride<>(heads * depth)); } else { #ifdef USE_BLAS cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, kSquares, - depth, scaling, A, heads * depth, B, heads * depth, 0.0f, C, + depth, scaling, A, heads * depth, B, heads * depth, beta, C, kSquares); #else // Should never get here. @@ -277,62 +321,6 @@ void BlasComputation::MakeEncoderLayer( } } - // Smolgen. - if (layer.mha.has_smolgen) { - float* input = &head_buffer[batch * kSquares * embedding_size]; - float* temp1 = &head_buffer2[batchStart]; - float* temp2 = &head_buffer3[batchStart]; - - // Compress. - const auto hidden_channels = - layer.mha.smolgen.compress.size() / embedding_size; - FullyConnectedLayer::Forward1D( - kSquares, embedding_size, hidden_channels, input, - layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, - temp1); - - // Dense 1. - const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); - FullyConnectedLayer::Forward1D( - 1, kSquares * hidden_channels, hidden_sz, temp1, - layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), - smolgen_activation, temp2); - // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, - layer.mha.smolgen.ln1_gammas.data(), - layer.mha.smolgen.ln1_betas.data(), 1e-3); - - // Dense 2. - const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); - FullyConnectedLayer::Forward1D( - 1, hidden_sz, gen_sz_outputs, temp2, - layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), - smolgen_activation, temp1); - // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, - (const float*)nullptr, - layer.mha.smolgen.ln2_gammas.data(), - layer.mha.smolgen.ln2_betas.data(), 1e-3); - - // Global smolgen weights. - const float* A = temp1; - const float* B = weights_.smolgen_w.data(); - float* C = QK; - if (use_eigen) { - auto C_mat = EigenMatrixMap(C, 64 * 64, heads); - C_mat.noalias() += - ConstEigenMatrixMap(B, gen_sz_outputs / heads, 64 * 64) - .transpose() * - ConstEigenMatrixMap(A, gen_sz_outputs / heads, heads); - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, heads, 64 * 64, - gen_sz_outputs / heads, 1.0f, A, gen_sz_outputs / heads, B, - gen_sz_outputs / heads, 1.0f, C, 64 * 64); -#endif - } - } - // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { #if defined(USE_ISPC) @@ -343,9 +331,19 @@ void BlasComputation::MakeEncoderLayer( #endif SoftmaxActivation(kSquares, QK + h, QK + h); } + } + + // V + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.v_w.data(), layer.mha.v_b.data(), NONE, head_buffer3.data()); + for (auto batch = size_t{0}; batch < batch_size; batch++) { + auto batchStart = batch * kSquares * d_model; // matmul(softmax(QK), V) for all heads per batch. 
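// (Notes on this restructuring: the smolgen logits are computed up front
// into head_buffer4 and the Q*K^T GEMM accumulates onto them through its
// beta term; the softmax now runs once over the whole batch instead of
// inside the per-entry loop; and the V projection is deferred until after
// the softmax, which lets head_buffer3, first used for K, be reused for V.)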
float* attn = &head_buffer2[batchStart]; + const float* V = &head_buffer3[batchStart]; + const float* QK = &head_buffer4[batch * kSquares * kSquares * heads]; for (auto h = 0; h < heads; h++) { const float* A = &QK[h * kSquares * kSquares]; const float* B = &V[h * depth]; From d011548717163f0e3cc6d9e262464df9e95896cc Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Wed, 22 Mar 2023 12:26:25 +0100 Subject: [PATCH 42/46] Minor nits. --- src/neural/blas/network_blas.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index cd6d08d546..8bc85f9f2d 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -227,8 +227,9 @@ void BlasComputation::MakeEncoderLayer( // Smolgen. if (layer.mha.has_smolgen) { - float* input = &head_buffer[0]; + const float* input = &head_buffer[0]; float* QK = &head_buffer4[0]; + // Compress. const auto hidden_channels = layer.mha.smolgen.compress.size() / embedding_size; @@ -242,7 +243,7 @@ void BlasComputation::MakeEncoderLayer( const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); std::vector temp2(batch_size * hidden_sz); FullyConnectedLayer::Forward1D( - batch_size, kSquares * hidden_channels, hidden_sz, &temp1[0], + batch_size, kSquares * hidden_channels, hidden_sz, temp1.data(), layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2.data()); // Layer Norm + skip connection. @@ -255,7 +256,7 @@ void BlasComputation::MakeEncoderLayer( const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); std::vector temp3(batch_size * gen_sz_outputs); FullyConnectedLayer::Forward1D( - batch_size, hidden_sz, gen_sz_outputs, &temp2[0], + batch_size, hidden_sz, gen_sz_outputs, temp2.data(), layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp3.data()); // Layer Norm + skip connection. @@ -267,7 +268,7 @@ void BlasComputation::MakeEncoderLayer( // Global smolgen weights. FullyConnectedLayer::Forward1D( batch_size * heads, gen_sz_outputs / heads, 64 * 64, temp3.data(), - weights_.smolgen_w.data(), (const float*)nullptr, NONE, &QK[0]); + weights_.smolgen_w.data(), (const float*)nullptr, NONE, QK); } // Q From c73e5d22c15a8361a0482bba5d2937df1118ba49 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Wed, 22 Mar 2023 12:37:23 +0100 Subject: [PATCH 43/46] Replace 64 with kSquares --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 8bc85f9f2d..0dd2711c70 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -267,7 +267,7 @@ void BlasComputation::MakeEncoderLayer( // Global smolgen weights. 
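// (Note: smolgen_w is stored at network scope in weights_, i.e. a single
// projection matrix shared by every head of every encoder layer; it expands
// each head's gen_sz_outputs / heads generated features into a full
// kSquares * kSquares grid of attention logits.)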
FullyConnectedLayer::Forward1D( - batch_size * heads, gen_sz_outputs / heads, 64 * 64, temp3.data(), + batch_size * heads, gen_sz_outputs / heads, kSquares * kSquares, temp3.data(), weights_.smolgen_w.data(), (const float*)nullptr, NONE, QK); } From 143b9cc3d4d635c0f429159cc98030abd47fe116 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Thu, 23 Mar 2023 17:43:29 +0200 Subject: [PATCH 44/46] Blas performance improvements (#6) * vectorize activation functions * minor speedups --------- Co-authored-by: borg323 --- src/neural/blas/encoder.h | 11 ++++---- src/neural/blas/layer_norm.ispc | 7 +++-- src/neural/blas/network_blas.cc | 47 ++++++++++++------------------- src/neural/shared/activation.cc | 33 +++++++++++++++++++++- src/neural/shared/activation.ispc | 32 ++++++++++++++++++--- 5 files changed, 88 insertions(+), 42 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 2e3a3f9a7b..9a1e4e9f2d 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -30,15 +30,16 @@ namespace lczero { void LayerNorm2DWithSkipConnection(const size_t batch_size, const size_t channels, float* data, - const float* skip, const float* gammas, - const float* betas, float epsilon) { + const float alpha, const float* skip, + const float* gammas, const float* betas, + float epsilon) { for (size_t i = 0; i < batch_size; i++) { #ifndef USE_ISPC // Mean taken in dimension C. float mean = 0; if (skip != nullptr) { for (size_t c = 0; c < channels; ++c) { - data[i * channels + c] += skip[i * channels + c]; + data[i * channels + c] += alpha * skip[i * channels + c]; mean += data[i * channels + c]; } } else { @@ -64,11 +65,11 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, } #else if (skip != nullptr) { - ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, alpha, skip + i * channels, gammas, betas, epsilon); } else { - ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, 0.0f, nullptr, gammas, betas, epsilon); } diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index fb42814ffa..063bb74476 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -18,6 +18,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, uniform float data[], + const uniform float alpha, const uniform float skip[], const uniform float gammas[], const uniform float betas[], @@ -29,9 +30,9 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, float ivar = 0; float k = data[0]; if (skip != NULL) { - k += skip[0]; + k += alpha * skip[0]; foreach (c = 0 ... channels) { - float t = data[c] + skip[c]; + float t = data[c] + alpha * skip[c]; data[c] = t; t -= k; imean += t; @@ -53,7 +54,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, float imean = 0; if (skip != NULL) { foreach (c = 0 ... channels) { - data[c] += skip[c]; + data[c] += alpha * skip[c]; imean += data[c]; } } else { diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 0dd2711c70..6c1cb17782 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -247,7 +247,7 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2.data()); // Layer Norm + skip connection. 
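// (The hunks below thread a new skip-scale parameter through
// LayerNorm2DWithSkipConnection. For the smolgen-internal norms it is simply
// 0.0f, since they have no skip input. For the encoder residual norms,
// passing 1.0f/alpha is sound because layer normalization is invariant, up
// to the epsilon term, to a positive rescaling of its input: LN(c*v) ==
// LN(v) for c > 0, hence LN(x + (1/alpha)*f(x)) == LN(alpha*x + f(x)).
// This reproduces the old behaviour while dropping the separate
// "head_buffer[i] *= alpha" passes over the buffer.)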
- LayerNorm2DWithSkipConnection(batch_size, hidden_sz, temp2.data(), + LayerNorm2DWithSkipConnection(batch_size, hidden_sz, temp2.data(), 0.0f, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), layer.mha.smolgen.ln1_betas.data(), 1e-3); @@ -261,7 +261,7 @@ void BlasComputation::MakeEncoderLayer( smolgen_activation, temp3.data()); // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, temp3.data(), - (const float*)nullptr, + 0.0f, (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), layer.mha.smolgen.ln2_betas.data(), 1e-3); @@ -321,17 +321,19 @@ void BlasComputation::MakeEncoderLayer( #endif } } + } - // Apply Softmax. - for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { + // Apply Softmax. + float* QK = &head_buffer4[0]; + for (size_t h = 0; h < batch_size * heads * kSquares * kSquares; + h += kSquares) { #if defined(USE_ISPC) - if (!use_eigen) { - ispc::SoftmaxActivation(kSquares, QK + h, QK + h); - continue; - } -#endif - SoftmaxActivation(kSquares, QK + h, QK + h); + if (!use_eigen) { + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); + continue; } +#endif + SoftmaxActivation(kSquares, QK + h, QK + h); } // V @@ -372,17 +374,11 @@ void BlasComputation::MakeEncoderLayer( layer.mha.dense_w.data(), layer.mha.dense_b.data(), NONE, head_buffer3.data()); - if (alpha != 1.0f) { - for (size_t i = 0; i < batch_size * kSquares * embedding_size; i++) { - head_buffer[i] *= alpha; - } - } - // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln1_gammas.data(), layer.ln1_betas.data(), - 1e-6); + head_buffer.data(), 1.0f / alpha, + head_buffer3.data(), layer.ln1_gammas.data(), + layer.ln1_betas.data(), 1e-6); // FFN. FullyConnectedLayer::Forward1D( @@ -395,18 +391,11 @@ void BlasComputation::MakeEncoderLayer( head_buffer4.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), NONE, head_buffer3.data()); - if (alpha != 1.0f) { - for (size_t i = 0; i < batch_size * kSquares * layer.ffn.dense2_b.size(); - i++) { - head_buffer[i] *= alpha; - } - } - // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln2_gammas.data(), layer.ln2_betas.data(), - 1e-6); + head_buffer.data(), 1.0f / alpha, + head_buffer3.data(), layer.ln2_gammas.data(), + layer.ln2_betas.data(), 1e-6); } template diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 9c90936c00..f7d97ecca0 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -100,10 +100,14 @@ void Activate(const size_t len, const float* data, const float* bias, output[b] = data[b] + bias[b]; } } else if (activation == RELU) { +#ifndef USE_ISPC for (size_t b = 0; b < len; b++) { float val = data[b] + bias[b]; output[b] = val > 0 ? val : 0; } +#else + ispc::ActivateRelu(len, 1.0f, data, bias, 0.0f, output); +#endif } else if (activation == MISH) { #ifndef USE_ISPC for (size_t b = 0; b < len; b++) { @@ -112,6 +116,25 @@ void Activate(const size_t len, const float* data, const float* bias, } #else ispc::ActivateMish(len, 1.0f, data, bias, 0.0f, output); +#endif + } else if (activation == RELU_2) { +#ifndef USE_ISPC + for (size_t b = 0; b < len; b++) { + float val = data[b] + bias[b]; + output[b] = val > 0 ? 
val * val : 0; + } +#else + ispc::ActivateRelu_2(len, data, bias, output); +#endif + } else if (activation == SWISH) { +#ifndef USE_ISPC + for (size_t b = 0; b < len; b++) { + float val = data[b] + bias[b]; + output[b] = val / (1.0f + exp(-val)); + ; + } +#else + ispc::ActivateSwish(len, data, bias, output); #endif } else if (activation == SELU) { #ifndef USE_ISPC @@ -120,7 +143,7 @@ void Activate(const size_t len, const float* data, const float* bias, output[b] = selu(val); } #else - ispc::ActivateSelu(len, 1.0f, data, bias, 0.0f, output); + ispc::ActivateSelu(len, data, bias, output); #endif } else { for (size_t b = 0; b < len; b++) { @@ -139,10 +162,14 @@ void Activate(const size_t len, float gamma, const float* data, output[b] = val; } } else if (activation == RELU) { +#ifndef USE_ISPC for (size_t b = 0; b < len; b++) { float val = gamma * data[b] + bias[b] + beta; output[b] = val > 0 ? val : 0; } +#else + ispc::ActivateRelu(len, gamma, data, bias, beta, output); +#endif } else if (activation == MISH) { #ifndef USE_ISPC for (size_t b = 0; b < len; b++) { @@ -174,10 +201,14 @@ void BiasResidual(const size_t batch_size, const size_t channels, float* data, arr[b] = val; } } else if (activation == RELU) { +#ifndef USE_ISPC for (size_t b = 0; b < kSquares; b++) { float val = res[b] + arr[b] + bias; arr[b] = val > 0 ? val : 0; } +#else + ispc::ActivateRelu(kSquares, 1.0f, res, arr, bias, arr); +#endif } else if (activation == MISH) { #ifndef USE_ISPC for (size_t b = 0; b < kSquares; b++) { diff --git a/src/neural/shared/activation.ispc b/src/neural/shared/activation.ispc index 6190198515..987dc3e689 100644 --- a/src/neural/shared/activation.ispc +++ b/src/neural/shared/activation.ispc @@ -36,6 +36,31 @@ export void ActivateMish(uniform const size_t len, uniform float gamma, } } +export void ActivateRelu(uniform const size_t len, uniform float gamma, + const uniform float data[], const uniform float bias[], + uniform float beta, uniform float output[]) { + foreach (b = 0 ... len) { + float val = gamma * data[b] + bias[b] + beta; + output[b] = val > 0 ? val : 0; + } +} + +export void ActivateSwish(uniform const size_t len, const uniform float data[], + const uniform float bias[], uniform float output[]) { + foreach (b = 0 ... len) { + float val = data[b] + bias[b]; + output[b] = val / (1.0f + exp(-val)); + } +} + +export void ActivateRelu_2(uniform const size_t len, const uniform float data[], + const uniform float bias[], uniform float output[]) { + foreach (b = 0 ... len) { + float val = data[b] + bias[b]; + output[b] = val > 0 ? val * val : 0; + } +} + static inline float selu(float val) { float alpha = 1.67326324f, scale = 1.05070098f; if (val > 0) { @@ -45,11 +70,10 @@ static inline float selu(float val) { } } -export void ActivateSelu(uniform const size_t len, uniform float gamma, - const uniform float data[], const uniform float bias[], - uniform float beta, uniform float output[]) { +export void ActivateSelu(uniform const size_t len, const uniform float data[], + const uniform float bias[], uniform float output[]) { foreach (b = 0 ... len) { - float val = gamma * data[b] + bias[b] + beta; + float val = data[b] + bias[b]; output[b] = selu(val); } } From 9a718518e42716ba0be39c5900a49052d9dd5833 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 28 Mar 2023 11:43:39 +0100 Subject: [PATCH 45/46] Remove duplicate code, fix proto changes. 
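// (Context: this patch drops the temporary format-detection hack introduced
// in patch 39, now that net files carry the proper network format, and
// follows a proto rename that merges the smolgen/FFN "inherit" markers into
// a single ACTIVATION_DEFAULT value.)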
--- src/neural/blas/network_blas.cc | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 6c1cb17782..8a5400ffe7 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -877,11 +877,11 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, if (attn_body_) { const auto smol_act = file.format().network_format().smolgen_activation(); smolgen_activation_ = - smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT + smol_act == pblczero::NetworkFormat::ACTIVATION_DEFAULT ? default_activation_ : static_cast(smol_act); const auto ffn_act = file.format().network_format().ffn_activation(); - ffn_activation_ = ffn_act == pblczero::NetworkFormat::FFN_ACTIVATION_INHERIT + ffn_activation_ = ffn_act == pblczero::NetworkFormat::ACTIVATION_DEFAULT ? default_activation_ : static_cast(ffn_act); } @@ -1005,25 +1005,6 @@ std::unique_ptr MakeBlasNetwork(const std::optional& w, weights.format().network_format().default_activation()) + " is not supported by BLAS backend."); } - - // @todo Hack for old encoding compatibility. REMOVE BEFORE MERGING. - if (w->format().network_format().network() == - pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT && - w->weights().encoder().size() > 0) { - CERR << "Attention body detected, hacking network format."; - WeightsFile x = *w; - x.mutable_format()->mutable_network_format()->set_network( - pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT); - if (w->weights().has_smolgen_w()) { - CERR << "BT2 detected, hacking activations."; - x.mutable_format()->mutable_network_format()->set_ffn_activation( - pblczero::NetworkFormat::FFN_ACTIVATION_RELU_2); - x.mutable_format()->mutable_network_format()->set_smolgen_activation( - pblczero::NetworkFormat::SMOLGEN_ACTIVATION_SWISH); - } - return std::make_unique>(x, options); - } - return std::make_unique>(weights, options); } From 1d327af3c210341d1dfe509ac8f5c6d5b15eb93a Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 28 Mar 2023 11:51:20 +0100 Subject: [PATCH 46/46] Remove missed import --- src/neural/shared/activation.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/neural/shared/activation.h b/src/neural/shared/activation.h index fba71a5229..3d1140765a 100644 --- a/src/neural/shared/activation.h +++ b/src/neural/shared/activation.h @@ -21,8 +21,6 @@ #include #include -#include "proto/net.pb.h" - namespace lczero { // The following list matches the one in net.proto. Ideally this would be done // by including proto/net.pb.h, but this is incompatible with nvcc.
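// For reference, scalar definitions of the activations this series adds to
// the BLAS backend, matching the ispc kernels above (a sketch for
// illustration, not the lc0 source):
#include <cmath>

inline float Relu2(float x) { return x > 0.0f ? x * x : 0.0f; }  // squared ReLU

inline float Swish(float x) {  // x * sigmoid(x)
  return x / (1.0f + std::exp(-x));
}

inline float Selu(float x) {
  const float alpha = 1.67326324f, scale = 1.05070098f;
  return scale * (x > 0.0f ? x : alpha * (std::exp(x) - 1.0f));
}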