From f2ba92ac4b523688eae2daf876487bafd673805f Mon Sep 17 00:00:00 2001
From: borg323
Date: Wed, 7 Dec 2022 01:46:36 +0200
Subject: [PATCH 01/46] update some shared files for attention body

---
 src/neural/network_legacy.cc             |  10 +
 src/neural/network_legacy.h              |  13 +
 src/neural/shared/attention_policy_map.h | 384 +++++++++++++++++++++--
 3 files changed, 383 insertions(+), 24 deletions(-)

diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc
index f4819d4952..0872ae0ddc 100644
--- a/src/neural/network_legacy.cc
+++ b/src/neural/network_legacy.cc
@@ -30,6 +30,8 @@ static constexpr float kEpsilon = 1e-5f;
 LegacyWeights::LegacyWeights(const pblczero::Weights& weights)
     : input(weights.input()),
+      ip_emb_w(LayerAdapter(weights.ip_emb_w()).as_vector()),
+      ip_emb_b(LayerAdapter(weights.ip_emb_b()).as_vector()),
       policy1(weights.policy1()),
       policy(weights.policy()),
       ip_pol_w(LayerAdapter(weights.ip_pol_w()).as_vector()),
       ip_pol_b(LayerAdapter(weights.ip_pol_b()).as_vector()),
       ip3_pol_b(LayerAdapter(weights.ip3_pol_b()).as_vector()),
       ip4_pol_w(LayerAdapter(weights.ip4_pol_w()).as_vector()),
       value(weights.value()),
+      ip_val_w(LayerAdapter(weights.ip_val_w()).as_vector()),
+      ip_val_b(LayerAdapter(weights.ip_val_b()).as_vector()),
       ip1_val_w(LayerAdapter(weights.ip1_val_w()).as_vector()),
       ip1_val_b(LayerAdapter(weights.ip1_val_b()).as_vector()),
       ip2_val_w(LayerAdapter(weights.ip2_val_w()).as_vector()),
       ip2_val_b(LayerAdapter(weights.ip2_val_b()).as_vector()),
       moves_left(weights.moves_left()),
+      ip_mov_w(LayerAdapter(weights.ip_mov_w()).as_vector()),
+      ip_mov_b(LayerAdapter(weights.ip_mov_b()).as_vector()),
       ip1_mov_w(LayerAdapter(weights.ip1_mov_w()).as_vector()),
       ip1_mov_b(LayerAdapter(weights.ip1_mov_b()).as_vector()),
       ip2_mov_w(LayerAdapter(weights.ip2_mov_w()).as_vector()),
@@ -52,6 +58,10 @@ LegacyWeights::LegacyWeights(const pblczero::Weights& weights)
   for (const auto& res : weights.residual()) {
     residual.emplace_back(res);
   }
+  encoder_head_count = weights.headcount();
+  for (const auto& enc : weights.encoder()) {
+    encoder.emplace_back(enc);
+  }
   pol_encoder_head_count = weights.pol_headcount();
   for (const auto& enc : weights.pol_encoder()) {
     pol_encoder.emplace_back(enc);
diff --git a/src/neural/network_legacy.h b/src/neural/network_legacy.h
index 3ba6028d5e..19284af172 100644
--- a/src/neural/network_legacy.h
+++ b/src/neural/network_legacy.h
@@ -88,6 +88,15 @@ struct LegacyWeights {
   // Input convnet.
   ConvBlock input;
 
+  // Embedding layer
+  Vec ip_emb_w;
+  Vec ip_emb_b;
+
+  // Encoder stack.
+  std::vector<EncoderLayer> encoder;
+  int encoder_head_count;
+
+  // Residual tower.
std::vector residual; @@ -109,6 +118,8 @@ struct LegacyWeights { // Value head ConvBlock value; + Vec ip_val_w; + Vec ip_val_b; Vec ip1_val_w; Vec ip1_val_b; Vec ip2_val_w; @@ -116,6 +127,8 @@ struct LegacyWeights { // Moves left head ConvBlock moves_left; + Vec ip_mov_w; + Vec ip_mov_b; Vec ip1_mov_w; Vec ip1_mov_b; Vec ip2_mov_w; diff --git a/src/neural/shared/attention_policy_map.h b/src/neural/shared/attention_policy_map.h index 5ab0966654..df39df7dc9 100644 --- a/src/neural/shared/attention_policy_map.h +++ b/src/neural/shared/attention_policy_map.h @@ -380,30 +380,366 @@ const short kAttnPolicyMap[] = { 1848, 1849, 1850, 1851, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1852, 1853, 1854, 1855, 1856, 1857}; - -} // namespace lczero - - - - - - - - - - - - - - - - - - - - - - +#if 0 +// used only by Arcturai's T02 network +constexpr int kNumPosEncodingChannels = 6; +__device__ constexpr float kPosEncoding[64][kNumPosEncodingChannels] = { + {0., 0., 0., 0., 0., 0.}, {0., 0., 0., 0., 0., 1.}, + {0., 0., 0., 0., 1., 0.}, {0., 0., 0., 0., 1., 1.}, + {0., 0., 0., 1., 0., 0.}, {0., 0., 0., 1., 0., 1.}, + {0., 0., 0., 1., 1., 0.}, {0., 0., 0., 1., 1., 1.}, + {0., 0., 1., 0., 0., 0.}, {0., 0., 1., 0., 0., 1.}, + {0., 0., 1., 0., 1., 0.}, {0., 0., 1., 0., 1., 1.}, + {0., 0., 1., 1., 0., 0.}, {0., 0., 1., 1., 0., 1.}, + {0., 0., 1., 1., 1., 0.}, {0., 0., 1., 1., 1., 1.}, + {0., 1., 0., 0., 0., 0.}, {0., 1., 0., 0., 0., 1.}, + {0., 1., 0., 0., 1., 0.}, {0., 1., 0., 0., 1., 1.}, + {0., 1., 0., 1., 0., 0.}, {0., 1., 0., 1., 0., 1.}, + {0., 1., 0., 1., 1., 0.}, {0., 1., 0., 1., 1., 1.}, + {0., 1., 1., 0., 0., 0.}, {0., 1., 1., 0., 0., 1.}, + {0., 1., 1., 0., 1., 0.}, {0., 1., 1., 0., 1., 1.}, + {0., 1., 1., 1., 0., 0.}, {0., 1., 1., 1., 0., 1.}, + {0., 1., 1., 1., 1., 0.}, {0., 1., 1., 1., 1., 1.}, + {1., 0., 0., 0., 0., 0.}, {1., 0., 0., 0., 0., 1.}, + {1., 0., 0., 0., 1., 0.}, {1., 0., 0., 0., 1., 1.}, + {1., 0., 0., 1., 0., 0.}, {1., 0., 0., 1., 0., 1.}, + {1., 0., 0., 1., 1., 0.}, {1., 0., 0., 1., 1., 1.}, + {1., 0., 1., 0., 0., 0.}, {1., 0., 1., 0., 0., 1.}, + {1., 0., 1., 0., 1., 0.}, {1., 0., 1., 0., 1., 1.}, + {1., 0., 1., 1., 0., 0.}, {1., 0., 1., 1., 0., 1.}, + {1., 0., 1., 1., 1., 0.}, {1., 0., 1., 1., 1., 1.}, + {1., 1., 0., 0., 0., 0.}, {1., 1., 0., 0., 0., 1.}, + {1., 1., 0., 0., 1., 0.}, {1., 1., 0., 0., 1., 1.}, + {1., 1., 0., 1., 0., 0.}, {1., 1., 0., 1., 0., 1.}, + {1., 1., 0., 1., 1., 0.}, {1., 1., 0., 1., 1., 1.}, + {1., 1., 1., 0., 0., 0.}, {1., 1., 1., 0., 0., 1.}, + {1., 1., 1., 0., 1., 0.}, {1., 1., 1., 0., 1., 1.}, + {1., 1., 1., 1., 0., 0.}, {1., 1., 1., 1., 0., 1.}, + {1., 1., 1., 1., 1., 0.}, {1., 1., 1., 1., 1., 1.}}; +#endif + +constexpr int kNumPosEncodingChannels = 64; +#if defined(__CUDA_ARCH__) +__device__ +#endif +const float kPosEncoding[64][kNumPosEncodingChannels] = { + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
+ 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 
0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 
1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 
1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, + 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + -1.0}; +} // namespace lczero From f091f5e33996fd2837edeb7fbb793951f092d19a Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Sun, 4 Dec 2022 11:05:21 +0200 Subject: [PATCH 02/46] update default net (#1804) --- appveyor.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index e2784bc6c6..ab5225341e 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -43,10 +43,10 @@ install: - cmd: IF %NAME%==cpu-openblas set GTEST=true - cmd: IF %NAME%==onednn set ONEDNN=true - cmd: IF %NAME%==onnx-dml set ONNX_DML=true -- cmd: set NET=744204 -- cmd: set NET_HASH=0f2f738e314bf618384045d4320a55333375d273d093adb805a4268ee53b519c -- cmd: IF NOT %BLAS%==true IF NOT %ANDROID%==true set NET=753723 -- cmd: IF NOT %BLAS%==true IF NOT 
%ANDROID%==true set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198
+- cmd: set NET=753723
+- cmd: set NET_HASH=3e3444370b9fe413244fdc79671a490e19b93d3cca1669710ffeac890493d198
+- cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET=791556
+- cmd: IF NOT %OPENCL%==true IF NOT %DX%==true set NET_HASH=f404e156ceb2882470fd8c032b8754af0fa0b71168328912eaef14671a256e34
 - cmd: call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" amd64
 - cmd: set DNNL_NAME=dnnl_win_1.5.0_cpu_vcomp
 - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/oneapi-src/oneDNN/releases/download/v1.5/dnnl_win_1.5.0_cpu_vcomp.zip

From ad045295bc06f4c682b18372497a98b875ae7b6c Mon Sep 17 00:00:00 2001
From: borg323 <39573933+borg323@users.noreply.github.com>
Date: Sun, 4 Dec 2022 11:05:51 +0200
Subject: [PATCH 03/46] option to select onnx opset (#1803)

---
 src/neural/onnx/builder.cc      | 38 +++++++++++++++++++++++++++------
 src/neural/onnx/builder.h       |  3 ++-
 src/neural/onnx/converter.cc    |  2 +-
 src/neural/onnx/converter.h     |  1 +
 src/neural/onnx/network_onnx.cc |  1 +
 5 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc
index c046f69fb1..be97dfd7ac 100644
--- a/src/neural/onnx/builder.cc
+++ b/src/neural/onnx/builder.cc
@@ -29,6 +29,7 @@
 
 #include
 
+#include "neural/onnx/adapters.h"
 #include "neural/onnx/onnx.pb.h"
 #include "utils/exception.h"
 #include "utils/random.h"
 
 namespace lczero {
 
-OnnxBuilder::OnnxBuilder() {
+OnnxBuilder::OnnxBuilder(int opset) : opset_(opset) {
+  if (opset < 7 || opset > 17) {
+    throw Exception("Only ONNX opsets between 7 and 17 are supported.");
+  }
   model_.set_ir_version(4);
   model_.set_domain("org.lczero.models.*");
   model_.set_producer_name("Lc0");
   model_.set_producer_version(GetVersionStr());
-  model_.add_opset_import()->set_version(9);
+  model_.add_opset_import()->set_version(opset);
   model_.mutable_graph()->set_name("org.lczero/converted/" +
                                    Random::Get().GetString(16));
 }
@@ -166,7 +170,12 @@ std::string OnnxBuilder::Squeeze(const std::string& name,
                                  const std::string& input) {
   auto* node = model_.mutable_graph()->add_node();
   auto out = PopulateStdNodeFields(node, name, input, "Squeeze");
-  AddIntsAttribute(node, "axes", {2, 3});
+  if (opset_ < 13) {
+    AddIntsAttribute(node, "axes", {2, 3});
+  } else {
+    node->add_input(
+        AddInitializer(name + "/axes", Int64OnnxConst({2, 3}, {2})));
+  }
   return out;
 }
@@ -287,7 +296,14 @@ std::vector<std::string> OnnxBuilder::Split(const std::string& name,
   node->add_input(input);
   AddIntAttribute(node, "axis", axis);
   if (split.size() > 0) {
-    AddIntsAttribute(node, "split", split);
+    if (opset_ < 13) {
+      AddIntsAttribute(node, "split", split);
+    } else {
+      node->add_input(AddInitializer(
+          name + "/split",
+          Int64OnnxConst(std::vector<int64_t>(begin(split), end(split)),
+                         {static_cast<int>(split.size())})));
+    }
     std::vector<std::string> out;
     for (size_t i = 1; i <= split.size(); i++) {
      out.push_back(name + "/out" + std::to_string(i));
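// Background note (ours, not from the patch): ONNX moved these constant
// operands from node attributes to inputs over successive opsets -- Slice's
// starts/ends in opset 10, Squeeze's axes and Split's split in opset 13 --
// and LayerNormalization only became an official operator in opset 17, which
// is why the constructor above limits the supported range to 7 through 17.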
@@ -306,8 +322,17 @@ std::string OnnxBuilder::Slice(const std::string& name,
                                std::initializer_list<int> ends) {
   auto* node = model_.mutable_graph()->add_node();
   auto out = PopulateStdNodeFields(node, name, input, "Slice");
-  AddIntsAttribute(node, "starts", starts);
-  AddIntsAttribute(node, "ends", ends);
+  if (opset_ < 10) {
+    AddIntsAttribute(node, "starts", starts);
+    AddIntsAttribute(node, "ends", ends);
+  } else {
+    node->add_input(AddInitializer(
+        name + "/starts", Int32OnnxConst(std::vector<int>(starts),
+                                         {static_cast<int>(starts.size())})));
+    node->add_input(AddInitializer(
+        name + "/ends", Int32OnnxConst(std::vector<int>(ends),
+                                       {static_cast<int>(ends.size())})));
+  }
   return out;
 }
@@ -331,6 +356,7 @@ std::string OnnxBuilder::Sigmoid(const std::string& name,
   return PopulateStdNodeFields(node, name, input, "Sigmoid");
 }
 
+// This is only defined in opset 17, but onnxruntime supports it from version 1.
 std::string OnnxBuilder::LayerNormalization(const std::string& name,
                                             const std::string& input,
                                             const OnnxConst& scale,
diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h
index f2f03363f9..3b88cb27ec 100644
--- a/src/neural/onnx/builder.h
+++ b/src/neural/onnx/builder.h
@@ -45,7 +45,7 @@ class OnnxConst {
 // Builds Onnx::ModelProto.
 class OnnxBuilder {
  public:
-  OnnxBuilder();
+  OnnxBuilder(int opset);
   void AddInput(const std::string& name, std::initializer_list<int> dims,
                 pblczero::TensorProto::DataType datatype);
   void AddOutput(const std::string& name, std::initializer_list<int> dims,
@@ -112,6 +112,7 @@ class OnnxBuilder {
   std::string OutputAsString() const { return model_.OutputAsString(); }
 
  private:
+  const int opset_;
   pblczero::ModelProto model_;
 };
diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc
index 6e8254221f..1bf4b88b70 100644
--- a/src/neural/onnx/converter.cc
+++ b/src/neural/onnx/converter.cc
@@ -559,7 +559,7 @@ void Converter::MakeMovesLeftHead(pblczero::OnnxModel* onnx,
 void Converter::GenerateOnnx(pblczero::OnnxModel* onnx) {
   LegacyWeights weights(src_.weights());
 
-  OnnxBuilder builder;
+  OnnxBuilder builder(options_.opset);
 
   AddStdInitializers(&builder);
diff --git a/src/neural/onnx/converter.h b/src/neural/onnx/converter.h
index e22ab9dac9..8980ffb858 100644
--- a/src/neural/onnx/converter.h
+++ b/src/neural/onnx/converter.h
@@ -42,6 +42,7 @@ struct WeightsToOnnxConverterOptions {
   std::string output_value = "/output/value";
   std::string output_mlh = "/output/mlh";
   int batch_size = -1;
+  int opset = 17;
 };
 
 // Converts "classical" weights file to weights file with embedded ONNX model.
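A hedged usage sketch for the new option (struct and field names from the hunk above; the converter entry point is declared in converter.h but not shown here):

  // Pick a lower target opset, e.g. for a runtime without newer operators.
  WeightsToOnnxConverterOptions options;
  options.opset = 13;  // OnnxBuilder throws outside the [7, 17] range.
  // ...then pass `options` to the converter together with the weights file.

On the runtime side, the network_onnx.cc hunk below reads the same key from the backend options, so a command line such as --backend=onnx-dml --backend-opts=opset=13 should reach that GetOrDefault call (assuming lc0's usual key=value backend-opts syntax).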
diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc
index 747ef31ec8..7d7950175c 100644
--- a/src/neural/onnx/network_onnx.cc
+++ b/src/neural/onnx/network_onnx.cc
@@ -409,6 +409,7 @@ std::unique_ptr<Network> MakeOnnxNetwork(const std::optional<WeightsFile>& w,
                       " is not supported by the ONNX backend.");
   }
   WeightsToOnnxConverterOptions converter_options;
+  converter_options.opset = opts.GetOrDefault<int>("opset", 17);
   converter_options.data_type_ =
       fp16 ? WeightsToOnnxConverterOptions::DataType::kFloat16
            : WeightsToOnnxConverterOptions::DataType::kFloat32;

From 2782b94aaf9caa4b5c7a2dbaa54d7acf63f34113 Mon Sep 17 00:00:00 2001
From: trre123 <94855351+trre123@users.noreply.github.com>
Date: Sun, 4 Dec 2022 04:06:51 -0500
Subject: [PATCH 04/46] Add 'simple' time manager (#1764)

---
 meson.build                  |   1 +
 src/mcts/stoppers/factory.cc |   8 ++-
 src/mcts/stoppers/simple.cc  | 127 +++++++++++++++++++++++++++++++++++
 src/mcts/stoppers/simple.h   |  37 ++++++++++
 4 files changed, 171 insertions(+), 2 deletions(-)
 create mode 100644 src/mcts/stoppers/simple.cc
 create mode 100644 src/mcts/stoppers/simple.h

diff --git a/meson.build b/meson.build
index 38d1fcb147..49b60c1cd6 100644
--- a/meson.build
+++ b/meson.build
@@ -170,6 +170,7 @@ files += [
   'src/mcts/stoppers/common.cc',
   'src/mcts/stoppers/factory.cc',
   'src/mcts/stoppers/legacy.cc',
+  'src/mcts/stoppers/simple.cc',
   'src/mcts/stoppers/smooth.cc',
   'src/mcts/stoppers/stoppers.cc',
   'src/mcts/stoppers/timemgr.cc',
diff --git a/src/mcts/stoppers/factory.cc b/src/mcts/stoppers/factory.cc
index 6a5c116fcd..e08e269117 100644
--- a/src/mcts/stoppers/factory.cc
+++ b/src/mcts/stoppers/factory.cc
@@ -1,6 +1,6 @@
 /*
   This file is part of Leela Chess Zero.
-  Copyright (C) 2019 The LCZero Authors
+  Copyright (C) 2022 The LCZero Authors
 
   Leela Chess is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
@@ -32,6 +32,7 @@
 #include "factory.h"
 #include "mcts/stoppers/alphazero.h"
 #include "mcts/stoppers/legacy.h"
+#include "mcts/stoppers/simple.h"
 #include "mcts/stoppers/smooth.h"
 #include "mcts/stoppers/stoppers.h"
 #include "utils/exception.h"
@@ -47,7 +48,7 @@ const OptionId kMoveOverheadId{
 const OptionId kTimeManagerId{
     "time-manager", "TimeManager",
     "Name and config of a time manager. "
-    "Possible names are 'legacy' (default), 'smooth' and 'alphazero'. "
+    "Possible names are 'legacy' (default), 'smooth', 'alphazero', and 'simple'. "
     "See https://lc0.org/timemgr for configuration details."};
 }  // namespace
@@ -82,6 +83,9 @@ std::unique_ptr<TimeManager> MakeTimeManager(const OptionsDict& options) {
   } else if (managers[0] == "smooth") {
     time_manager =
         MakeSmoothTimeManager(move_overhead, tm_options.GetSubdict("smooth"));
+  } else if (managers[0] == "simple") {
+    time_manager =
+        MakeSimpleTimeManager(move_overhead, tm_options.GetSubdict("simple"));
   }
 
   if (!time_manager) {
diff --git a/src/mcts/stoppers/simple.cc b/src/mcts/stoppers/simple.cc
new file mode 100644
index 0000000000..fd93669375
--- /dev/null
+++ b/src/mcts/stoppers/simple.cc
@@ -0,0 +1,127 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2022 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <https://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#include "mcts/stoppers/stoppers.h"
+
+namespace lczero {
+
+namespace {
+
+class SimpleTimeManager : public TimeManager {
+ public:
+  SimpleTimeManager(int64_t move_overhead, const OptionsDict& params)
+      : move_overhead_(move_overhead),
+        basepct_(params.GetOrDefault<float>("base-pct", 1.83f)),
+        plypct_(params.GetOrDefault<float>("ply-pct", 0.0454f)),
+        timefactor_(params.GetOrDefault<float>("time-factor", 33.4f)),
+        opening_bonus_(params.GetOrDefault<float>("opening-bonus", 82.5f)) {
+    if (basepct_ <= 0.0f || basepct_ > 100.0f) {
+      throw Exception("base-pct value must be in the range (0.0, 100.0]");
+    }
+    if (plypct_ < 0.0f || plypct_ > 1.0f) {
+      throw Exception("ply-pct value must be in the range [0.0, 1.0]");
+    }
+    if (timefactor_ < 0.0f || timefactor_ > 100.0f) {
+      throw Exception("time-factor value must be in the range [0.0, 100.0]");
+    }
+    if (opening_bonus_ < 0.0f || opening_bonus_ > 1000.0f) {
+      throw Exception("opening-bonus value must be in the range [0.0, 1000.0]");
+    }
+  }
+  std::unique_ptr<SearchStopper> GetStopper(const GoParams& params,
+                                            const NodeTree& tree) override;
+
+ private:
+  const int64_t move_overhead_;
+  const float basepct_;
+  const float plypct_;
+  const float timefactor_;
+  const float opening_bonus_;
+  float prev_move_time = 0.0f;
+  float prev_total_moves_time = 0.0f;
+  bool bonus_applied = false;
+};
+
+std::unique_ptr<SearchStopper> SimpleTimeManager::GetStopper(
+    const GoParams& params, const NodeTree& tree) {
+  const Position& position = tree.HeadPosition();
+  const bool is_black = position.IsBlackToMove();
+  const std::optional<int64_t>& time = (is_black ? params.btime : params.wtime);
+
+  // If no time limit is given, don't stop on this condition.
+  if (params.infinite || params.ponder || !time) return nullptr;
+
+  const std::optional<int64_t>& inc = is_black ? params.binc : params.winc;
+  const int increment = inc ? std::max(int64_t(0), *inc) : 0;
+
+  const float total_moves_time =
+      static_cast<float>(*time) - static_cast<float>(move_overhead_);
+
+  // Increase the percentage as the ply count increases.
+  float pct = (basepct_ + position.GetGamePly() * plypct_) * 0.01f;
+
+  // Increase the percentage as the ratio of increment time to total time
+  // gets smaller.
+  pct += pct * (static_cast<float>(increment) /
+                static_cast<float>(total_moves_time) * timefactor_);
+
+  float this_move_time = total_moves_time * pct;
+
+  // Immediately spend time saved from smart pruning during the previous move.
+  if (prev_move_time > 0.0f) {
+    const float time_saved =
+        prev_move_time - (prev_total_moves_time -
+                          (total_moves_time - static_cast<float>(increment)));
+
+    this_move_time += time_saved;
+  }
+
+  // Apply any opening bonus and note the next move will also benefit
+  // from an increased time_saved as a result.
+  if (!bonus_applied) {
+    this_move_time += this_move_time * opening_bonus_ * 0.01f;
+    bonus_applied = true;
+  }
+
+  this_move_time = std::min(this_move_time, total_moves_time);
+
+  prev_move_time = this_move_time;
+  prev_total_moves_time = total_moves_time;
+
+  LOGFILE << "Budgeted time for the move: " << this_move_time << "ms. "
+          << "Remaining time " << *time << "ms (-" << move_overhead_
+          << "ms overhead)";
+
+  return std::make_unique<TimeLimitStopper>(this_move_time);
+}
+
+}  // namespace
+
+std::unique_ptr<TimeManager> MakeSimpleTimeManager(
+    int64_t move_overhead, const OptionsDict& params) {
+  return std::make_unique<SimpleTimeManager>(move_overhead, params);
+}
+}  // namespace lczero
diff --git a/src/mcts/stoppers/simple.h b/src/mcts/stoppers/simple.h
new file mode 100644
index 0000000000..b7b6f3a349
--- /dev/null
+++ b/src/mcts/stoppers/simple.h
@@ -0,0 +1,37 @@
+/*
+  This file is part of Leela Chess Zero.
+  Copyright (C) 2020 The LCZero Authors
+
+  Leela Chess is free software: you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation, either version 3 of the License, or
+  (at your option) any later version.
+
+  Leela Chess is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with Leela Chess.  If not, see <https://www.gnu.org/licenses/>.
+
+  Additional permission under GNU GPL version 3 section 7
+
+  If you modify this Program, or any covered work, by linking or
+  combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+  Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+  modified version of those libraries), containing parts covered by the
+  terms of the respective license agreement, the licensors of this
+  Program grant you additional permission to convey the resulting work.
+*/
+
+#pragma once
+
+#include "utils/optionsdict.h"
+
+namespace lczero {
+
+std::unique_ptr<TimeManager> MakeSimpleTimeManager(
+    int64_t move_overhead, const OptionsDict& params);
+
+}  // namespace lczero
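Restating the budget rule from GetStopper above as a standalone formula: a minimal sketch with the default parameters (BudgetMs is our name; it omits the time-saved carryover and the opening bonus handled in the patch):

#include <algorithm>
#include <cstdint>

// Core of the 'simple' manager's budget: spend a small percentage of the
// remaining clock, growing with game ply and with the relative increment.
float BudgetMs(int64_t remaining_ms, int64_t increment_ms, int game_ply,
               int64_t move_overhead_ms) {
  const float basepct = 1.83f;     // base-pct default
  const float plypct = 0.0454f;    // ply-pct default
  const float timefactor = 33.4f;  // time-factor default
  const float total = static_cast<float>(remaining_ms - move_overhead_ms);
  float pct = (basepct + game_ply * plypct) * 0.01f;
  pct += pct * (static_cast<float>(increment_ms) / total * timefactor);
  return std::min(total * pct, total);
}

For example, with 60 s left, a 1 s increment, ply 40 and 200 ms overhead this gives pct = 0.0365 * (1 + 1000 / 59800 * 33.4) = 0.057, i.e. roughly 3.4 seconds budgeted for the move.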

From b218726db5c1594b8f922fe0aabd129e86043804 Mon Sep 17 00:00:00 2001
From: zz4032
Date: Sun, 4 Dec 2022 10:07:54 +0100
Subject: [PATCH 05/46] Backendbench assistant with TC-dependent output. (#1546)

---
 src/benchmark/backendbench.cc | 66 ++++++++++++++++++++++++-----------
 1 file changed, 45 insertions(+), 21 deletions(-)

diff --git a/src/benchmark/backendbench.cc b/src/benchmark/backendbench.cc
index d2ff622ec4..6792f9b778 100644
--- a/src/benchmark/backendbench.cc
+++ b/src/benchmark/backendbench.cc
@@ -50,23 +50,28 @@ const OptionId kFenId{"fen", "", "Benchmark initial position FEN."};
 
 const OptionId kClippyId{"clippy", "", "Enable helpful assistant."};
 
-const OptionId kClippyThresholdId{"clippy-threshold", "",
-                                  "Ratio of nps improvement necessary for each "
-                                  "doubling of batchsize to be considered "
-                                  "best."};
-
-void Clippy(std::string msg) {
+void Clippy(std::string title,
+            std::string msg3, std::string best3, std::string msg2,
+            std::string best2, std::string msg, std::string best) {
   std::cout << " __" << std::endl;
   std::cout << " / \\" << std::endl;
-  std::cout << " | |" << std::endl;
-  std::cout << " + + " << std::string(msg.length() + 2, '_') << std::endl;
-  std::cout << "(@)(@) _|" << std::string(msg.length() + 2, ' ') << '|'
+  std::cout << " | | " << std::string(title.length()+2, '_') << std::endl;
+  std::cout << " + + | " << std::string(title.length()+1, ' ') << "|"
+            << std::endl;
+  std::cout << "(@)(@) _| " << title << " |"
             << std::endl;
-  std::cout << " | | \\ " << msg << " |" << std::endl;
-  std::cout << " || |/ |" << std::string(msg.length() + 2, '_') << '|'
+  std::cout << " | | \\ " << std::string(6, ' ') << msg3
+            << std::string(4 - best3.length(), ' ') << best3
+            << std::string(title.length()-33, ' ') << "|" << std::endl;
+  std::cout << " || |/ | " << std::string(6, ' ') << msg2
+            << std::string(4 - best2.length(), ' ') << best2
+            << std::string(title.length()-33, ' ') << "|" << std::endl;
+  std::cout << " || || | " << std::string(6, ' ') << msg
+            << std::string(4 - best.length(), ' ') << best
+            << std::string(title.length()-33, ' ') << "|" << std::endl;
+  std::cout << " |\\_/| |" << std::string(title.length()+2, '_') << "|"
             << std::endl;
-  std::cout << " || ||" << std::endl;
-  std::cout << " |\\_/|" << std::endl;
   std::cout << " \\___/" << std::endl;
 }
 }  // namespace
@@ -82,7 +87,6 @@ void BackendBenchmark::Run() {
     options.Add<IntOption>(kBatchStepId, 1, 256) = 1;
    options.Add<StringOption>(kFenId) = ChessBoard::kStartposFen;
    options.Add<BoolOption>(kClippyId) = false;
-    options.Add<FloatOption>(kClippyThresholdId, 0.0f, 1.0f) = 0.15f;
 
     if (!options.ProcessAllFlags()) return;
@@ -103,8 +107,8 @@ void BackendBenchmark::Run() {
 
     const int batches = option_dict.Get<int>(kBatchesId);
 
-    int best = 1;
-    float best_nps = 0.0f;
+    int best = 1; int best2 = 1; int best3 = 1;
+    float best_nps = 0.0f; float best_nps2 = 0.0f; float best_nps3 = 0.0f;
     std::optional<std::chrono::time_point<std::chrono::steady_clock>> pending;
 
     for (int i = option_dict.Get<int>(kStartBatchSizeId);
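For intuition about the acceptance thresholds introduced in the next hunk (our arithmetic, not part of the patch): with a measured ~40 knps, nps_ingame = pow(40000, 1.085) is roughly 9.8e4 on the first accepted iteration, so

// threshold  (3 min/move): 0.16947 * exp(-4.1695e-6 * 9.8e4 * 180) + 0.02 ~ 0.020
// threshold2 (15 s/move):  0.16947 * exp(-4.1695e-6 * 9.8e4 * 15)  + 0.02 ~ 0.020
// threshold3 (1 s/move):   0.16947 * exp(-4.1695e-6 * 9.8e4 * 1)   + 0.02 ~ 0.13

i.e. at longer per-move times a batch-size doubling only needs a ~2% nps gain to be accepted as the new best, while at 1 s/move it needs roughly 13%.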
@@ -133,12 +137,27 @@ void BackendBenchmark::Run() {
               << " nps."
               << std::endl;
 
       if (option_dict.Get<bool>(kClippyId)) {
-        const float threshold = option_dict.Get<float>(kClippyThresholdId);
+        float nps_ingame = std::pow((nps + best_nps) / 2, 1.085);
+        float nps_ingame2 = std::pow((nps + best_nps2) / 2, 1.085);
+        float nps_ingame3 = std::pow((nps + best_nps3) / 2, 1.085);
+        float threshold = 0.16947 * exp(-4.1695e-6 * nps_ingame * 180) + 0.02;
+        float threshold2 = 0.16947 * exp(-4.1695e-6 * nps_ingame2 * 15) + 0.02;
+        float threshold3 = 0.16947 * exp(-4.1695e-6 * nps_ingame3 * 1) + 0.02;
         if (nps > best_nps &&
             threshold * (i - best) * best_nps < (nps - best_nps) * best) {
           best_nps = nps;
           best = i;
+          if (threshold2 * (i - best2) * best_nps2 <
+              (nps - best_nps2) * best2) {
+            best_nps2 = nps;
+            best2 = i;
+            if (threshold3 * (i - best3) * best_nps3 <
+                (nps - best_nps3) * best3) {
+              best_nps3 = nps;
+              best3 = i;
+            }
+          }
           if (!pending) {
             pending = std::chrono::steady_clock::now();
           }
@@ -147,16 +166,21 @@ void BackendBenchmark::Run() {
         time = std::chrono::steady_clock::now() - *pending;
         if (time.count() > 10) {
           Clippy(
-              std::to_string(best) +
-              " looks like the best minibatch-size for this net (so far).");
+              "Recommended minibatch-size for this net (so far):",
+              "1s/move (Bullet): ", std::to_string(best3),
+              "15s/move (Rapid): ", std::to_string(best2),
+              "3min/move (Tournament): ", std::to_string(best));
           pending.reset();
         }
       }
     }
   }
   if (option_dict.Get<bool>(kClippyId)) {
-    Clippy(std::to_string(best) +
-           " looks like the best minibatch-size for this net.");
+    Clippy(
+        "Recommended minibatch-size for this net:",
+        "1s/move (Bullet): ", std::to_string(best3),
+        "15s/move (Rapid): ", std::to_string(best2),
+        "3min/move (Tournament): ", std::to_string(best));
   }
  } catch (Exception& ex) {
    std::cerr << ex.what() << std::endl;

From 402da93ce0966f221eb68a648514f8a2d24da862 Mon Sep 17 00:00:00 2001
From: Alexander Lyashuk
Date: Sun, 4 Dec 2022 10:26:47 +0100
Subject: [PATCH 06/46] In smooth TM, use piggybank when it seems like the bestmove can be overtaken. (#1762)

fixes https://github.com/LeelaChessZero/lc0/issues/1582
---
 src/mcts/stoppers/smooth.cc   | 155 ++++++++++++++++++++++++++++++----
 src/mcts/stoppers/stoppers.cc |   5 +-
 2 files changed, 143 insertions(+), 17 deletions(-)

diff --git a/src/mcts/stoppers/smooth.cc b/src/mcts/stoppers/smooth.cc
index e0cc38e651..2a1a196247 100644
--- a/src/mcts/stoppers/smooth.cc
+++ b/src/mcts/stoppers/smooth.cc
@@ -29,6 +29,7 @@
 
 #include
 #include
+#include
 
 #include "mcts/stoppers/legacy.h"
 #include "mcts/stoppers/stoppers.h"
@@ -83,6 +84,19 @@ class Params {
   // Max number of avg move times in piggybank.
   float max_piggybank_moves() const { return max_piggybank_moves_; }
 
+  int64_t trend_nps_update_period_ms() const {
+    return trend_nps_update_period_ms_;
+  }
+
+  // Expected ratio of the best move nps in the future to the current nps.
+  float bestmove_optimism() const { return bestmove_optimism_; }
+
+  // Expected ratio of the non-best move nps in the future to the current nps.
+  float overtaker_optimism() const { return overtaker_optimism_; }
+
+  // Force use of the piggybank during the first few milliseconds of the move.
+  float force_piggybank_ms() const { return force_piggybank_ms_; }
+
   // Move overhead.
   int64_t move_overhead_ms() const { return move_overhead_ms_; }
   // Returns a function that estimates remaining moves.
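A worked example of what the two optimism values do in VisitsTrendWatcher::IsBestmoveBeingOvertaken() further below (invented numbers, using the defaults of 0.2 and 4.0 set in the constructor):

// best move:  120000 visits, gaining 10 visits/ms
// runner-up:  100000 visits, gaining 40 visits/ms
// 1000 ms left until the piggybank-extended deadline:
//   projected best      = 120000 + 0.2 * 10 * 1000 = 122000
//   projected runner-up = 100000 + 4.0 * 40 * 1000 = 260000
// The runner-up projects past the best move, so the stopper keeps searching
// (dipping into the piggybank) instead of stopping at the normal deadline.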
@@ -104,6 +118,10 @@ class Params {
   const float per_move_piggybank_fraction_;
   const float max_piggybank_use_;
   const float max_piggybank_moves_;
+  const float trend_nps_update_period_ms_;
+  const float bestmove_optimism_;
+  const float overtaker_optimism_;
+  const float force_piggybank_ms_;
   const MovesLeftEstimator moves_left_estimator_;
 };
@@ -144,6 +162,12 @@ Params::Params(const OptionsDict& params, int64_t move_overhead)
           params.GetOrDefault<float>("max-piggybank-use", 0.94f)),
       max_piggybank_moves_(
          params.GetOrDefault<float>("max-piggybank-moves", 36.5f)),
+      trend_nps_update_period_ms_(
+          params.GetOrDefault<int>("trend-nps-update-period-ms", 3000)),
+      bestmove_optimism_(params.GetOrDefault<float>("bestmove-optimism", 0.2f)),
+      overtaker_optimism_(
+          params.GetOrDefault<float>("overtaker-optimism", 4.0f)),
+      force_piggybank_ms_(params.GetOrDefault<int>("force-piggybank-ms", 1000)),
       moves_left_estimator_(CreateMovesLeftEstimator(params)) {}
 
 // Returns the updated value of @from, towards @to by the number of halves
@@ -198,9 +222,36 @@ float LinearDecay(float cur_value, float target_value,
 
 class SmoothTimeManager;
 
+class VisitsTrendWatcher {
+ public:
+  VisitsTrendWatcher(float nps_update_period, float bestmove_optimism,
+                     float overtaker_optimism)
+      : nps_update_period_(nps_update_period),
+        bestmove_optimism_(bestmove_optimism),
+        overtaker_optimism_(overtaker_optimism) {}
+
+  void Update(uint64_t timestamp, const std::vector<uint32_t>& visits);
+  bool IsBestmoveBeingOvertaken(uint64_t by_which_time) const;
+
+ private:
+  const float nps_update_period_;
+  const float bestmove_optimism_;
+  const float overtaker_optimism_;
+
+  mutable Mutex mutex_;
+  uint64_t prev_timestamp_ GUARDED_BY(mutex_) = 0;
+  std::vector<uint32_t> prev_visits_ GUARDED_BY(mutex_);
+  uint64_t cur_timestamp_ GUARDED_BY(mutex_) = 0;
+  std::vector<uint32_t> cur_visits_ GUARDED_BY(mutex_);
+  uint64_t last_timestamp_ GUARDED_BY(mutex_) = 0;
+  std::vector<uint32_t> last_visits_ GUARDED_BY(mutex_);
+};
+
 class SmoothStopper : public SearchStopper {
  public:
   SmoothStopper(int64_t deadline_ms, int64_t allowed_piggybank_use_ms,
+                float nps_update_period, float bestmove_optimism,
+                float overtaker_optimism, int64_t forces_piggybank_ms,
                 SmoothTimeManager* manager);
 
 private:
@@ -209,7 +260,9 @@ class SmoothStopper : public SearchStopper {
 
   const int64_t deadline_ms_;
   const int64_t allowed_piggybank_use_ms_;
+  const int64_t forced_piggybank_use_ms_;
 
+  VisitsTrendWatcher visits_trend_watcher_;
   SmoothTimeManager* const manager_;
   std::atomic_flag used_piggybank_;
 };
@@ -251,19 +304,19 @@ class SmoothTimeManager : public TimeManager {
                                 : total_move_time / move_allocated_time_ms_;
     // Recompute expected move time for logging.
     const float expected_move_time = move_allocated_time_ms_ * timeuse_;
-    // If piggybank was used, cannot update timeuse_.
+    int64_t piggybank_time_used = 0;
     if (used_piggybank) {
       piggybank_time_used = std::max(int64_t(), total_move_time - time_budget);
       piggybank_time_ -= piggybank_time_used;
-    } else {
-      timeuse_ =
-          ExponentialDecay(timeuse_, this_move_time_use,
-                           params_.smartpruning_timeuse_halfupdate_moves(),
-                           this_move_time_fraction);
-      if (timeuse_ < params_.min_smartpruning_timeuse()) {
-        timeuse_ = params_.min_smartpruning_timeuse();
-      }
+    }
+    // If piggybank was used, time use is 100%.
+    timeuse_ =
+        ExponentialDecay(timeuse_, used_piggybank ?
1.0f : this_move_time_use, + params_.smartpruning_timeuse_halfupdate_moves(), + this_move_time_fraction); + if (timeuse_ < params_.min_smartpruning_timeuse()) { + timeuse_ = params_.min_smartpruning_timeuse(); } // Remember final number of nodes for tree reuse estimation. last_move_final_nodes_ = total_nodes; @@ -404,8 +457,10 @@ class SmoothTimeManager : public TimeManager { << ", moves=" << remaining_moves << ", time=" << total_remaining_ms << "ms, nps=" << nps_; - return std::make_unique(move_allocated_time_ms_, - allowed_piggybank_time_ms, this); + return std::make_unique( + move_allocated_time_ms_, allowed_piggybank_time_ms, + params_.trend_nps_update_period_ms(), params_.bestmove_optimism(), + params_.overtaker_optimism(), params_.force_piggybank_ms(), this); } void UpdateTreeReuseFactor(int64_t new_move_nodes) REQUIRES(mutex_) { @@ -460,11 +515,65 @@ class SmoothTimeManager : public TimeManager { bool is_first_move_ GUARDED_BY(mutex_) = true; }; +void VisitsTrendWatcher::Update(uint64_t timestamp, + const std::vector& visits) { + Mutex::Lock lock(mutex_); + if (timestamp <= last_timestamp_) return; + if (prev_visits_.empty()) { + prev_visits_ = visits; + cur_visits_ = visits; + prev_timestamp_ = timestamp; + cur_timestamp_ = timestamp; + } + last_timestamp_ = timestamp; + last_visits_ = visits; + if (cur_timestamp_ + nps_update_period_ >= timestamp) { + prev_timestamp_ = cur_timestamp_; + prev_visits_ = std::move(cur_visits_); + cur_visits_ = last_visits_; + cur_timestamp_ = last_timestamp_; + } +} + +bool VisitsTrendWatcher::IsBestmoveBeingOvertaken( + uint64_t by_which_time) const { + Mutex::Lock lock(mutex_); + // If we don't have any stats yet, we cannot stop the search. + if (prev_timestamp_ >= last_timestamp_) return false; + if (by_which_time <= last_timestamp_) return false; + std::vector npms; + npms.reserve(last_visits_.size()); + for (size_t i = 0; i < last_visits_.size(); ++i) { + npms.push_back(static_cast(last_visits_[i] - prev_visits_[i]) / + (last_timestamp_ - prev_timestamp_)); + } + const size_t bestmove_idx = + std::max_element(last_visits_.begin(), last_visits_.end()) - + last_visits_.begin(); + const auto planned_bestmove_visits = + last_visits_[bestmove_idx] + bestmove_optimism_ * npms[bestmove_idx] * + (by_which_time - last_timestamp_); + for (size_t i = 0; i < last_visits_.size(); ++i) { + if (i == bestmove_idx) continue; + const auto planned_visits = + last_visits_[i] + + overtaker_optimism_ * npms[i] * (by_which_time - last_timestamp_); + if (planned_visits > planned_bestmove_visits) return true; + } + return false; +} + SmoothStopper::SmoothStopper(int64_t deadline_ms, int64_t allowed_piggybank_use_ms, + float nps_update_period, float bestmove_optimism, + float overtaker_optimism, + int64_t forced_piggybank_use_ms, SmoothTimeManager* manager) : deadline_ms_(deadline_ms), allowed_piggybank_use_ms_(allowed_piggybank_use_ms), + forced_piggybank_use_ms_(forced_piggybank_use_ms), + visits_trend_watcher_(nps_update_period, bestmove_optimism, + overtaker_optimism), manager_(manager) { used_piggybank_.clear(); } @@ -478,10 +587,16 @@ bool SmoothStopper::ShouldStop(const IterationStats& stats, return true; } + visits_trend_watcher_.Update(stats.time_since_movestart, stats.edge_n); + const auto deadline_with_piggybank = deadline_ms_ + allowed_piggybank_use_ms_; + const bool force_use_piggybank = + stats.time_since_first_batch <= forced_piggybank_use_ms_; const bool use_piggybank = - (stats.time_usage_hint_ == IterationStats::TimeUsageHint::kNeedMoreTime); + 
(stats.time_usage_hint_ == IterationStats::TimeUsageHint::kNeedMoreTime || + force_use_piggybank || + visits_trend_watcher_.IsBestmoveBeingOvertaken(deadline_with_piggybank)); const int64_t time_limit = - use_piggybank ? (deadline_ms_ + allowed_piggybank_use_ms_) : deadline_ms_; + use_piggybank ? deadline_with_piggybank : deadline_ms_; hints->UpdateEstimatedNps(nps); hints->UpdateEstimatedRemainingTimeMs(time_limit - stats.time_since_movestart); @@ -489,10 +604,20 @@ bool SmoothStopper::ShouldStop(const IterationStats& stats, // It's not entirely correct as due to extended remaining time smart pruning // will trigger later and we spend more time than if use_piggyback was // false, even before reaching the deadline. - used_piggybank_.test_and_set(); + if (!used_piggybank_.test_and_set()) { + LOGFILE << "Entering piggybank, reason: " + << (stats.time_usage_hint_ == + IterationStats::TimeUsageHint::kNeedMoreTime + ? "requested by search." + : force_use_piggybank + ? "forced used in the beginning of the move." + : "bestmove can be overtaken."); + } } if (stats.time_since_movestart >= time_limit) { - LOGFILE << "Stopping search: Ran out of time."; + LOGFILE << "Stopping search: Ran out of time. elapsed=" << std::fixed + << stats.time_since_movestart << " limit=" << time_limit + << " piggy=" << use_piggybank; return true; } return false; diff --git a/src/mcts/stoppers/stoppers.cc b/src/mcts/stoppers/stoppers.cc index 8d6f7ae426..a66e4f8063 100644 --- a/src/mcts/stoppers/stoppers.cc +++ b/src/mcts/stoppers/stoppers.cc @@ -242,8 +242,9 @@ bool SmartPruningStopper::ShouldStop(const IterationStats& stats, } if (remaining_playouts < (largest_n - second_largest_n)) { - LOGFILE << remaining_playouts << " playouts remaining. Best move has " - << largest_n << " visits, second best -- " << second_largest_n + LOGFILE << std::fixed << remaining_playouts + << " playouts remaining. Best move has " << largest_n + << " visits, second best -- " << second_largest_n << ". 
Difference is " << (largest_n - second_largest_n) << ", so stopping the search after " << stats.batches_since_movestart << " batches."; From 8774cae6726b8e03f00be136af939b2eed3cfca6 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Sun, 4 Dec 2022 13:31:38 +0200 Subject: [PATCH 07/46] only use thread binding with blas backend (#1797) --- src/main.cc | 3 --- src/mcts/node.cc | 4 ---- src/mcts/search.cc | 1 - src/mcts/search.h | 4 +--- src/neural/blas/network_blas.cc | 5 +++++ src/neural/network.h | 1 + src/neural/network_demux.cc | 7 ++----- src/neural/network_mux.cc | 7 ++----- 8 files changed, 11 insertions(+), 21 deletions(-) diff --git a/src/main.cc b/src/main.cc index 3f370b898d..4c8880d4e6 100644 --- a/src/main.cc +++ b/src/main.cc @@ -36,7 +36,6 @@ #include "utils/commandline.h" #include "utils/esc_codes.h" #include "utils/logging.h" -#include "utils/numa.h" #include "version.h" int main(int argc, const char** argv) { @@ -49,8 +48,6 @@ int main(int argc, const char** argv) { << " built " << __DATE__; try { - Numa::Init(); - Numa::BindThread(0); InitializeMagicBitboards(); CommandLine::Init(argc, argv); diff --git a/src/mcts/node.cc b/src/mcts/node.cc index 9b71cff091..9bda7f144a 100644 --- a/src/mcts/node.cc +++ b/src/mcts/node.cc @@ -39,7 +39,6 @@ #include "neural/network.h" #include "utils/exception.h" #include "utils/hashcat.h" -#include "utils/numa.h" namespace lczero { @@ -99,9 +98,6 @@ class NodeGarbageCollector { } void Worker() { - // Keep garbage collection on same core as where search workers are most - // likely to be to make any lock conention on gc mutex cheaper. - Numa::BindThread(0); while (!stop_.load()) { std::this_thread::sleep_for(std::chrono::milliseconds(kGCIntervalMs)); GarbageCollect(); diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 29808e80ad..74077d912b 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -886,7 +886,6 @@ void Search::PopulateCommonIterationStats(IterationStats* stats) { } void Search::WatchdogThread() { - Numa::BindThread(0); LOGFILE << "Start a watchdog thread."; StoppersHints hints; IterationStats stats; diff --git a/src/mcts/search.h b/src/mcts/search.h index 9a1be4c125..1323320bca 100644 --- a/src/mcts/search.h +++ b/src/mcts/search.h @@ -44,7 +44,6 @@ #include "syzygy/syzygy.h" #include "utils/logging.h" #include "utils/mutex.h" -#include "utils/numa.h" namespace lczero { @@ -216,11 +215,10 @@ class SearchWorker { params_(params), moves_left_support_(search_->network_->GetCapabilities().moves_left != pblczero::NetworkFormat::MOVES_LEFT_NONE) { - Numa::BindThread(id); + search_->network_->InitThread(id); for (int i = 0; i < params.GetTaskWorkersPerSearchWorker(); i++) { task_workspaces_.emplace_back(); task_threads_.emplace_back([this, i]() { - Numa::BindThread(i); this->RunTasks(i); }); } diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 60827dc3f4..1bf2a61196 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -35,6 +35,7 @@ #include "neural/shared/attention_policy_map.h" #include "neural/shared/policy_map.h" #include "neural/shared/winograd_filter.h" +#include "utils/numa.h" #ifdef USE_DNNL #include @@ -135,6 +136,8 @@ class BlasNetwork : public Network { return capabilities_; } + void InitThread(int id) override { Numa::BindThread(id); } + private: // A cap on the max batch size since it consumes a lot of memory static constexpr auto kHardMaxBatchSize = 2048; @@ -648,6 +651,8 @@ BlasNetwork::BlasNetwork(const 
WeightsFile& file, : capabilities_{file.format().network_format().input(), file.format().network_format().moves_left()}, weights_(file.weights()) { + Numa::Init(); + max_batch_size_ = static_cast(options.GetOrDefault("batch_size", 256)); diff --git a/src/neural/network.h b/src/neural/network.h index 2d0a7c720e..fe6337711b 100644 --- a/src/neural/network.h +++ b/src/neural/network.h @@ -107,6 +107,7 @@ class Network { public: virtual const NetworkCapabilities& GetCapabilities() const = 0; virtual std::unique_ptr NewComputation() = 0; + virtual void InitThread(int /*id*/) {} virtual ~Network() = default; }; diff --git a/src/neural/network_demux.cc b/src/neural/network_demux.cc index 3845057ed4..8935b696c6 100644 --- a/src/neural/network_demux.cc +++ b/src/neural/network_demux.cc @@ -31,7 +31,6 @@ #include "neural/factory.h" #include "utils/exception.h" -#include "utils/numa.h" namespace lczero { namespace { @@ -135,7 +134,7 @@ class DemuxingNetwork : public Network { } for (int i = 0; i < nn_threads; ++i) { - threads_.emplace_back([this, i]() { Worker(i); }); + threads_.emplace_back([this, i]() { Worker(); }); } } @@ -163,9 +162,7 @@ class DemuxingNetwork : public Network { } } - void Worker(int id) { - // Add one to the id in order to leave space for an active search thread. - Numa::BindThread(id + 1); + void Worker() { // While Abort() is not called (and it can only be called from destructor). while (!abort_) { { diff --git a/src/neural/network_mux.cc b/src/neural/network_mux.cc index 440a382cda..e8d6ec71d3 100644 --- a/src/neural/network_mux.cc +++ b/src/neural/network_mux.cc @@ -31,7 +31,6 @@ #include "neural/factory.h" #include "utils/exception.h" -#include "utils/numa.h" namespace lczero { namespace { @@ -126,7 +125,7 @@ class MuxingNetwork : public Network { for (int i = 0; i < nn_threads; ++i) { threads_.emplace_back( - [this, net, max_batch, i]() { Worker(net, max_batch, i); }); + [this, net, max_batch]() { Worker(net, max_batch); }); } } @@ -154,9 +153,7 @@ class MuxingNetwork : public Network { } } - void Worker(Network* network, const int max_batch, int id) { - // Add one to the id in order to leave space for an active search thread. - Numa::BindThread(id + 1); + void Worker(Network* network, const int max_batch) { // While Abort() is not called (and it can only be called from destructor). while (!abort_) { std::vector children; From a3646eaf16f7df94ef0125438a9cabe6875b8de0 Mon Sep 17 00:00:00 2001 From: Naphthalin <40385638+Naphthalin@users.noreply.github.com> Date: Sun, 4 Dec 2022 12:32:20 +0100 Subject: [PATCH 08/46] Fix apparent "mate blindness" in endgame due to instamove logic (#1742) --- src/mcts/search.cc | 7 +++++++ src/mcts/stoppers/stoppers.cc | 3 ++- src/mcts/stoppers/timemgr.h | 1 + 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/mcts/search.cc b/src/mcts/search.cc index 74077d912b..6fe8d1c7e6 100644 --- a/src/mcts/search.cc +++ b/src/mcts/search.cc @@ -844,6 +844,7 @@ void Search::PopulateCommonIterationStats(IterationStats* stats) { stats->average_depth = cum_depth_ / (total_playouts_ ? total_playouts_ : 1); stats->edge_n.clear(); stats->win_found = false; + stats->may_resign = true; stats->num_losing_edges = 0; stats->time_usage_hint_ = IterationStats::TimeUsageHint::kNormal; @@ -870,6 +871,12 @@ void Search::PopulateCommonIterationStats(IterationStats* stats) { if (n > 0 && edge.IsTerminal() && edge.GetWL(0.0f) < 0.0f) { stats->num_losing_edges += 1; } + // If game is resignable, no need for moving quicker. 
This allows + // proving mate when losing anyway for better score output. + // Hardcoded resign threshold, because there is no available parameter. + if (n > 0 && q > -0.98f) { + stats->may_resign = false; + } if (max_n < n) { max_n = n; max_n_has_max_q_plus_m = false; diff --git a/src/mcts/stoppers/stoppers.cc b/src/mcts/stoppers/stoppers.cc index a66e4f8063..9f99a4269a 100644 --- a/src/mcts/stoppers/stoppers.cc +++ b/src/mcts/stoppers/stoppers.cc @@ -195,7 +195,8 @@ bool SmartPruningStopper::ShouldStop(const IterationStats& stats, LOGFILE << "Only one possible move. Moving immediately."; return true; } - if (stats.edge_n.size() <= static_cast(stats.num_losing_edges + 1)) { + if (stats.edge_n.size() <= static_cast(stats.num_losing_edges) + + (stats.may_resign ? 0 : 1)) { LOGFILE << "At most one non losing move, stopping search."; return true; } diff --git a/src/mcts/stoppers/timemgr.h b/src/mcts/stoppers/timemgr.h index b545f194c9..64d0c2c05d 100644 --- a/src/mcts/stoppers/timemgr.h +++ b/src/mcts/stoppers/timemgr.h @@ -52,6 +52,7 @@ struct IterationStats { // TODO: remove this in favor of time_usage_hint_=kImmediateMove when // smooth time manager is the default. bool win_found = false; + bool may_resign = false; int num_losing_edges = 0; enum class TimeUsageHint { kNormal, kNeedMoreTime, kImmediateMove }; From 2feda7cb6b8094f75ddec9c9a7386cc19474cef0 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Sun, 4 Dec 2022 13:32:48 +0200 Subject: [PATCH 09/46] initial dfrc support (#1684) --- src/chess/board.cc | 96 +++++++++++++++++--------------- src/chess/board.h | 59 +++++++++++++------- src/chess/pgn.h | 4 +- src/neural/decoder.cc | 15 +++-- src/neural/encoder.cc | 22 ++++++-- src/trainingdata/trainingdata.cc | 20 ++++--- 6 files changed, 131 insertions(+), 85 deletions(-) diff --git a/src/chess/board.cc b/src/chess/board.cc index 0ec7de089d..0ace8142ef 100644 --- a/src/chess/board.cc +++ b/src/chess/board.cc @@ -463,21 +463,19 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const { // For castlings we don't check destination king square for checks, it // will be done in legal move check phase. if (castlings_.we_can_000()) { - const uint8_t qrook = castlings_.queenside_rook(); + const uint8_t qrook = castlings_.our_queenside_rook(); if (walk_free(std::min(static_cast(C1), qrook), std::max(static_cast(D1), king), qrook, king) && !range_attacked(king, C1)) { - result.emplace_back(source, - BoardSquare(RANK_1, castlings_.queenside_rook())); + result.emplace_back(source, BoardSquare(RANK_1, qrook)); } } if (castlings_.we_can_00()) { - const uint8_t krook = castlings_.kingside_rook(); + const uint8_t krook = castlings_.our_kingside_rook(); if (walk_free(std::min(static_cast(F1), king), std::max(static_cast(G1), krook), krook, king) && !range_attacked(king, G1)) { - result.emplace_back(source, - BoardSquare(RANK_1, castlings_.kingside_rook())); + result.emplace_back(source, BoardSquare(RANK_1, krook)); } } continue; @@ -625,10 +623,10 @@ bool ChessBoard::ApplyMove(Move move) { rooks_.reset(to); bishops_.reset(to); pawns_.reset(to); - if (to.as_int() == 56 + castlings_.kingside_rook()) { + if (to.as_int() == A8 + castlings_.their_kingside_rook()) { castlings_.reset_they_can_00(); } - if (to.as_int() == 56 + castlings_.queenside_rook()) { + if (to.as_int() == A8 + castlings_.their_queenside_rook()) { castlings_.reset_they_can_000(); } @@ -672,8 +670,12 @@ bool ChessBoard::ApplyMove(Move move) { // Reset castling rights. 
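    // In DFRC the two sides' rooks can start on different files, so castling
    // rights have to be keyed to each side's own rook squares instead of one
    // shared queenside/kingside pair. Roughly the test performed inline by
    // the two branches below, as a sketch (helper name assumed, not part of
    // this patch):
    //   bool ClearsOurRight(uint8_t from_row, uint8_t from_col,
    //                       uint8_t rook_file) {
    //     return from_row == RANK_1 && from_col == rook_file;
    //   }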
if (from_row == RANK_1 && rooks_.get(from)) { - if (from_col == castlings_.queenside_rook()) castlings_.reset_we_can_000(); - if (from_col == castlings_.kingside_rook()) castlings_.reset_we_can_00(); + if (from_col == castlings_.our_queenside_rook()) { + castlings_.reset_we_can_000(); + } + if (from_col == castlings_.our_kingside_rook()) { + castlings_.reset_we_can_00(); + } } // Ordinary move. @@ -1043,62 +1045,66 @@ void ChessBoard::SetFromFen(std::string fen, int* rule50_ply, int* moves) { } if (castlings != "-") { - uint8_t left_rook = FILE_A; - uint8_t right_rook = FILE_H; + uint8_t our_left_rook = FILE_A; + uint8_t our_right_rook = FILE_H; + uint8_t their_left_rook = FILE_A; + uint8_t their_right_rook = FILE_H; for (char c : castlings) { const bool is_black = std::islower(c); const int king_col = (is_black ? their_king_ : our_king_).col(); - if (!is_black) c = std::tolower(c); const auto rooks = (is_black ? their_pieces_ : our_pieces_) & ChessBoard::rooks(); - if (c == 'k') { - // Finding rightmost rook. - for (right_rook = FILE_H; right_rook > king_col; --right_rook) { - if (rooks.get(is_black ? RANK_8 : RANK_1, right_rook)) break; + auto find_rook = [rooks, king_col, fen](bool forward, uint8_t rank) { + uint8_t rook; + for (rook = forward ? FILE_A : FILE_H; rook != king_col; + rook += 2 * forward - 1) { + if (rooks.get(rank, rook)) break; } - if (right_rook == king_col) { - throw Exception("Bad fen string (no kingside rook): " + fen); + if (rook == king_col) { + throw Exception("Bad fen string (missing rook): " + fen); } - if (is_black) { - castlings_.set_they_can_00(); + return rook; + }; + if (c == 'K') { + // Finding rightmost rook. + our_right_rook = find_rook(false, RANK_1); + castlings_.set_we_can_00(); + } else if (c == 'Q') { + // Finding leftmost rook. + our_left_rook = find_rook(true, RANK_1); + castlings_.set_we_can_000(); + } else if (c >= 'A' && c <= 'H') { + int rook_col = c - 'A'; + if (rook_col < king_col) { + our_left_rook = rook_col; + castlings_.set_we_can_000(); } else { + our_right_rook = rook_col; castlings_.set_we_can_00(); } + } else if (c == 'k') { + // Finding rightmost rook. + their_right_rook = find_rook(false, RANK_8); + castlings_.set_they_can_00(); } else if (c == 'q') { // Finding leftmost rook. - for (left_rook = FILE_A; left_rook < king_col; ++left_rook) { - if (rooks.get(is_black ? 
RANK_8 : RANK_1, left_rook)) break;
-        }
-        if (left_rook == king_col) {
-          throw Exception("Bad fen string (no queenside rook): " + fen);
-        }
-        if (is_black) {
-          castlings_.set_they_can_000();
-        } else {
-          castlings_.set_we_can_000();
-        }
+        their_left_rook = find_rook(true, RANK_8);
+        castlings_.set_they_can_000();
       } else if (c >= 'a' && c <= 'h') {
         int rook_col = c - 'a';
         if (rook_col < king_col) {
-          left_rook = rook_col;
-          if (is_black) {
-            castlings_.set_they_can_000();
-          } else {
-            castlings_.set_we_can_000();
-          }
+          their_left_rook = rook_col;
+          castlings_.set_they_can_000();
         } else {
-          right_rook = rook_col;
-          if (is_black) {
-            castlings_.set_they_can_00();
-          } else {
-            castlings_.set_we_can_00();
-          }
+          their_right_rook = rook_col;
+          castlings_.set_they_can_00();
         }
       } else {
         throw Exception("Bad fen string (unexpected castling symbol): " + fen);
       }
     }
-    castlings_.SetRookPositions(left_rook, right_rook);
+    castlings_.SetRookPositions(our_left_rook, our_right_rook, their_left_rook,
+                                their_right_rook);
   }

   if (en_passant != "-") {
diff --git a/src/chess/board.h b/src/chess/board.h
index c9480b9631..bc0b82fbc5 100644
--- a/src/chess/board.h
+++ b/src/chess/board.h
@@ -116,7 +116,12 @@ class ChessBoard {

   class Castlings {
    public:
-    Castlings() : queenside_rook_(0), kingside_rook_(7) {}
+    Castlings()
+        : our_queenside_rook_(FILE_A),
+          their_queenside_rook_(FILE_A),
+          our_kingside_rook_(FILE_H),
+          their_kingside_rook_(FILE_H),
+          data_(0) {}

     void set_we_can_00() { data_ |= 1; }
     void set_we_can_000() { data_ |= 2; }
@@ -134,7 +139,11 @@
     bool they_can_000() const { return data_ & 8; }
     bool no_legal_castle() const { return data_ == 0; }

-    void Mirror() { data_ = ((data_ & 0b11) << 2) + ((data_ & 0b1100) >> 2); }
+    void Mirror() {
+      std::swap(our_queenside_rook_, their_queenside_rook_);
+      std::swap(our_kingside_rook_, their_kingside_rook_);
+      data_ = ((data_ & 0b11) << 2) + ((data_ & 0b1100) >> 2);
+    }

    // Note: this is not a strict xfen compatible output.
Without access to the
    // board it's not possible to know whether there is ambiguity, so all
    // cases with non-standard rook files are emitted with file letters.
    std::string as_string() const {
      if (data_ == 0) return "-";
      std::string result;
-      if (queenside_rook() == FILE_A && kingside_rook() == FILE_H) {
+      if (our_queenside_rook() == FILE_A && our_kingside_rook() == FILE_H &&
+          their_queenside_rook() == FILE_A && their_kingside_rook() == FILE_H) {
        if (we_can_00()) result += 'K';
        if (we_can_000()) result += 'Q';
        if (they_can_00()) result += 'k';
        if (they_can_000()) result += 'q';
      } else {
-        if (we_can_00()) result += 'A' + kingside_rook();
-        if (we_can_000()) result += 'A' + queenside_rook();
-        if (they_can_00()) result += 'a' + kingside_rook();
-        if (they_can_000()) result += 'a' + queenside_rook();
+        if (we_can_00()) result += 'A' + our_kingside_rook();
+        if (we_can_000()) result += 'A' + our_queenside_rook();
+        if (they_can_00()) result += 'a' + their_kingside_rook();
+        if (they_can_000()) result += 'a' + their_queenside_rook();
      }
      return result;
    }
@@ -164,8 +174,10 @@
      if (they_can_00()) result += 'k';
      if (they_can_000()) result += 'q';
      result += '[';
-      result += 'a' + queenside_rook();
-      result += 'a' + kingside_rook();
+      result += 'A' + our_queenside_rook();
+      result += 'A' + our_kingside_rook();
+      result += 'a' + their_queenside_rook();
+      result += 'a' + their_kingside_rook();
      result += ']';
      return result;
    }

    uint8_t as_int() const { return data_; }

    bool operator==(const Castlings& other) const {
-      assert(queenside_rook_ == other.queenside_rook_ &&
-             kingside_rook_ == other.kingside_rook_);
+      assert(our_queenside_rook_ == other.our_queenside_rook_ &&
+             our_kingside_rook_ == other.our_kingside_rook_ &&
+             their_queenside_rook_ == other.their_queenside_rook_ &&
+             their_kingside_rook_ == other.their_kingside_rook_);
      return data_ == other.data_;
    }

-    uint8_t queenside_rook() const { return queenside_rook_; }
-    uint8_t kingside_rook() const { return kingside_rook_; }
-    void SetRookPositions(std::uint8_t left, std::uint8_t right) {
-      queenside_rook_ = left;
-      kingside_rook_ = right;
+    uint8_t our_queenside_rook() const { return our_queenside_rook_; }
+    uint8_t our_kingside_rook() const { return our_kingside_rook_; }
+    uint8_t their_queenside_rook() const { return their_queenside_rook_; }
+    uint8_t their_kingside_rook() const { return their_kingside_rook_; }
+    void SetRookPositions(uint8_t our_left, uint8_t our_right,
+                          uint8_t their_left, uint8_t their_right) {
+      our_queenside_rook_ = our_left;
+      our_kingside_rook_ = our_right;
+      their_queenside_rook_ = their_left;
+      their_kingside_rook_ = their_right;
    }

   private:
    // Position of "left" (queenside) rook in starting game position.
-    std::uint8_t queenside_rook_ : 3;
+    uint8_t our_queenside_rook_;
+    uint8_t their_queenside_rook_;
    // Position of "right" (kingside) rook in starting position.
-    std::uint8_t kingside_rook_ : 3;
+    uint8_t our_kingside_rook_;
+    uint8_t their_kingside_rook_;
    // - Bit 0 -- "our" side's kingside castle.
    // - Bit 1 -- "our" side's queenside castle.
    // - Bit 2 -- opponent's side's kingside castle.
    // - Bit 3 -- opponent's side's queenside castle.
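    // Worked example of the Mirror() arithmetic above: with data_ == 0b0110
    // ("we" can castle queenside, "they" can castle kingside),
    //   ((0b0110 & 0b11) << 2) + ((0b0110 & 0b1100) >> 2) == 0b1001,
    // i.e. "we" can now castle kingside and "they" queenside -- the rights
    // change sides together with the rook files swapped by std::swap above.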
- std::uint8_t data_ = 0; + uint8_t data_; }; std::string DebugString() const; diff --git a/src/chess/pgn.h b/src/chess/pgn.h index 0f38d406c4..512d4c673b 100644 --- a/src/chess/pgn.h +++ b/src/chess/pgn.h @@ -213,10 +213,10 @@ class PgnReader { BoardSquare king_sq(GetLowestBit(king_board.as_int())); if (san.size() > 4 && san[3] == '-' && san[4] == 'O') { m = Move(BoardSquare(0, king_sq.col()), - BoardSquare(0, board.castlings().queenside_rook())); + BoardSquare(0, board.castlings().our_queenside_rook())); } else { m = Move(BoardSquare(0, king_sq.col()), - BoardSquare(0, board.castlings().kingside_rook())); + BoardSquare(0, board.castlings().our_kingside_rook())); } return m; } diff --git a/src/neural/decoder.cc b/src/neural/decoder.cc index 13007caca6..1798523730 100644 --- a/src/neural/decoder.cc +++ b/src/neural/decoder.cc @@ -95,29 +95,34 @@ void PopulateBoard(pblczero::NetworkFormat::InputFormat input_format, case pblczero::NetworkFormat::INPUT_112_WITH_CANONICALIZATION_V2: case pblczero::NetworkFormat:: INPUT_112_WITH_CANONICALIZATION_V2_ARMAGEDDON: { - auto queenside = 0; - auto kingside = 7; + int our_queenside = ChessBoard::FILE_A; + int their_queenside = ChessBoard::FILE_A; + int our_kingside = ChessBoard::FILE_H; + int their_kingside = ChessBoard::FILE_H; if (planes[kAuxPlaneBase + 0].mask != 0) { auto mask = planes[kAuxPlaneBase + 0].mask; - queenside = GetLowestBit((mask >> 56) | mask); if ((mask & 0xFFLL) != 0) { + our_queenside = GetLowestBit(mask & 0xFFLL); castlings.set_we_can_000(); } if (mask >> 56 != 0) { + their_queenside = GetLowestBit(mask >> 56); castlings.set_they_can_000(); } } if (planes[kAuxPlaneBase + 1].mask != 0) { auto mask = planes[kAuxPlaneBase + 1].mask; - kingside = GetLowestBit((mask >> 56) | mask); if ((mask & 0xFFLL) != 0) { + our_kingside = GetLowestBit(mask & 0xFFLL); castlings.set_we_can_00(); } if (mask >> 56 != 0) { + their_kingside = GetLowestBit(mask >> 56); castlings.set_they_can_00(); } } - castlings.SetRookPositions(queenside, kingside); + castlings.SetRookPositions(our_queenside, our_kingside, their_queenside, + their_kingside); break; } diff --git a/src/neural/encoder.cc b/src/neural/encoder.cc index ef8ef6524a..63851fad48 100644 --- a/src/neural/encoder.cc +++ b/src/neural/encoder.cc @@ -182,13 +182,23 @@ InputPlanes EncodePositionForNN( // h-side (kingside) castling right. const auto& cast = board.castlings(); result[kAuxPlaneBase + 0].mask = - ((cast.we_can_000() ? BoardSquare(ChessBoard::A1).as_board() : 0) | - (cast.they_can_000() ? BoardSquare(ChessBoard::A8).as_board() : 0)) - << cast.queenside_rook(); + (cast.we_can_000() + ? BoardSquare(ChessBoard::A1 + cast.our_queenside_rook()) + .as_board() + : 0) | + (cast.they_can_000() + ? BoardSquare(ChessBoard::A8 + cast.their_queenside_rook()) + .as_board() + : 0); result[kAuxPlaneBase + 1].mask = - ((cast.we_can_00() ? BoardSquare(ChessBoard::A1).as_board() : 0) | - (cast.they_can_00() ? BoardSquare(ChessBoard::A8).as_board() : 0)) - << cast.kingside_rook(); + (cast.we_can_00() + ? BoardSquare(ChessBoard::A1 + cast.our_kingside_rook()) + .as_board() + : 0) | + (cast.they_can_00() + ? 
BoardSquare(ChessBoard::A8 + cast.their_kingside_rook()) + .as_board() + : 0); break; } default: diff --git a/src/trainingdata/trainingdata.cc b/src/trainingdata/trainingdata.cc index cb4b0c82bd..1285dc7b49 100644 --- a/src/trainingdata/trainingdata.cc +++ b/src/trainingdata/trainingdata.cc @@ -191,18 +191,22 @@ void V6TrainingDataArray::Add(const Node* node, const PositionHistory& history, const auto& castlings = position.GetBoard().castlings(); // Populate castlings. // For non-frc trained nets, just send 1 like we used to. - uint8_t queen_side = 1; - uint8_t king_side = 1; + uint8_t our_queen_side = 1; + uint8_t our_king_side = 1; + uint8_t their_queen_side = 1; + uint8_t their_king_side = 1; // If frc trained, send the bit mask representing rook position. if (Is960CastlingFormat(input_format_)) { - queen_side <<= castlings.queenside_rook(); - king_side <<= castlings.kingside_rook(); + our_queen_side <<= castlings.our_queenside_rook(); + our_king_side <<= castlings.our_kingside_rook(); + their_queen_side <<= castlings.their_queenside_rook(); + their_king_side <<= castlings.their_kingside_rook(); } - result.castling_us_ooo = castlings.we_can_000() ? queen_side : 0; - result.castling_us_oo = castlings.we_can_00() ? king_side : 0; - result.castling_them_ooo = castlings.they_can_000() ? queen_side : 0; - result.castling_them_oo = castlings.they_can_00() ? king_side : 0; + result.castling_us_ooo = castlings.we_can_000() ? our_queen_side : 0; + result.castling_us_oo = castlings.we_can_00() ? our_king_side : 0; + result.castling_them_ooo = castlings.they_can_000() ? their_queen_side : 0; + result.castling_them_oo = castlings.they_can_00() ? their_king_side : 0; // Other params. if (IsCanonicalFormat(input_format_)) { From 2efcd4f19034ed23d22f7e367818e18bc8388e41 Mon Sep 17 00:00:00 2001 From: almaudoh Date: Wed, 7 Dec 2022 17:58:41 +0100 Subject: [PATCH 10/46] Fix error in mha scaling in encoder layers. (#1808) --- src/neural/blas/network_blas.cc | 2 +- src/neural/metal/mps/MetalNetworkBuilder.mm | 2 -- src/neural/metal/mps/NetworkGraph.h | 2 -- src/neural/metal/mps/NetworkGraph.mm | 17 +++++++++-------- src/neural/onnx/adapters.cc | 1 + src/neural/onnx/converter.cc | 4 ++-- 6 files changed, 13 insertions(+), 15 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 1bf2a61196..3edc1f15b0 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -352,7 +352,7 @@ void BlasComputation::ComputeBlocking() { const int d_model = layer.mha.q_b.size(); const int heads = weights_.pol_encoder_head_count; const int depth = d_model / heads; - const float scaling = 1.0f / sqrtf(d_model); + const float scaling = 1.0f / sqrtf(depth); // MHA is done per batch since there's a fourth dimension introduced. for (auto batch = size_t{0}; batch < batch_size; batch++) { diff --git a/src/neural/metal/mps/MetalNetworkBuilder.mm b/src/neural/metal/mps/MetalNetworkBuilder.mm index 9940920307..2a25accdfd 100644 --- a/src/neural/metal/mps/MetalNetworkBuilder.mm +++ b/src/neural/metal/mps/MetalNetworkBuilder.mm @@ -148,8 +148,6 @@ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a withKeys:mhaK withValues:mhaV heads:weights.pol_encoder_head_count - dModel:dModel - scale:1.0f / sqrt(dModel) label:[NSString stringWithFormat:@"policy/encoder_%zu/mha", i]]; // MHA final dense layer. 
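The corrected factor above follows the standard multi-head attention convention: the QK^T logits are scaled by 1/sqrt(depth) of a single head, where depth = d_model / heads, so the softmax temperature does not drift with the head count. A minimal single-head sketch in plain C++ (the function name and array layout are illustrative only, not code from this patch):

#include <cmath>

// One attention head over the 64 board squares.
// q and k are [64 x depth] row-major; attn receives the scaled logits.
void ScaledQKLogits(const float* q, const float* k, float* attn, int depth) {
  // Per-head depth, not d_model: this is the bug the patch fixes.
  const float scale = 1.0f / sqrtf(static_cast<float>(depth));
  for (int i = 0; i < 64; ++i) {
    for (int j = 0; j < 64; ++j) {
      float dot = 0.0f;
      for (int c = 0; c < depth; ++c) dot += q[i * depth + c] * k[j * depth + c];
      attn[i * 64 + j] = dot * scale;  // previously scaled by 1/sqrt(d_model)
    }
  }
}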
diff --git a/src/neural/metal/mps/NetworkGraph.h b/src/neural/metal/mps/NetworkGraph.h index 353a367bbc..8df9a1058c 100644 --- a/src/neural/metal/mps/NetworkGraph.h +++ b/src/neural/metal/mps/NetworkGraph.h @@ -120,8 +120,6 @@ static MPSImageFeatureChannelFormat fcFormat = MPSImageFeatureChannelFormatFloat withKeys:(MPSGraphTensor * __nonnull)keys withValues:(MPSGraphTensor * __nonnull)values heads:(NSUInteger)heads - dModel:(NSUInteger)dModel - scale:(float)scale label:(NSString * __nonnull)label; -(nonnull MPSGraphTensor *) scaledQKMatmulWithQueries:(MPSGraphTensor * __nonnull)queries diff --git a/src/neural/metal/mps/NetworkGraph.mm b/src/neural/metal/mps/NetworkGraph.mm index c5e46e7e23..2a437ffe23 100644 --- a/src/neural/metal/mps/NetworkGraph.mm +++ b/src/neural/metal/mps/NetworkGraph.mm @@ -611,12 +611,12 @@ -(nonnull MPSGraphTensor *) scaledMHAMatmulWithQueries:(MPSGraphTensor * __nonnu withKeys:(MPSGraphTensor * __nonnull)keys withValues:(MPSGraphTensor * __nonnull)values heads:(NSUInteger)heads - dModel:(NSUInteger)dModel - scale:(float)scale label:(NSString * __nonnull)label { // Split heads. - const NSUInteger depth = dModel / heads; + const NSUInteger dmodel = [[queries.shape lastObject] intValue]; + const NSUInteger depth = dmodel / heads; + queries = [_graph reshapeTensor:queries withShape:@[@(-1), @64, @(heads), @(depth)] name:[NSString stringWithFormat:@"%@/reshape_q", label]]; queries = [_graph transposeTensor:queries dimension:1 withDimension:2 name:[NSString stringWithFormat:@"%@/transpose_q", label]]; @@ -631,10 +631,11 @@ -(nonnull MPSGraphTensor *) scaledMHAMatmulWithQueries:(MPSGraphTensor * __nonnu MPSGraphTensor * attn = [_graph matrixMultiplicationWithPrimaryTensor:queries secondaryTensor:keys name:[NSString stringWithFormat:@"%@/matmul_qk", label]]; - attn = [_graph multiplicationWithPrimaryTensor:attn - secondaryTensor:[_graph constantWithScalar:scale - shape:@[@1] dataType:attn.dataType] - name:[NSString stringWithFormat:@"%@/scale", label]]; + attn = [_graph divisionWithPrimaryTensor:attn + secondaryTensor:[_graph constantWithScalar:sqrt(depth) + shape:@[@1] + dataType:attn.dataType] + name:[NSString stringWithFormat:@"%@/scale", label]]; attn = [self applyActivationWithTensor:attn activation:@"softmax" label:label]; @@ -645,7 +646,7 @@ -(nonnull MPSGraphTensor *) scaledMHAMatmulWithQueries:(MPSGraphTensor * __nonnu attn = [_graph transposeTensor:attn dimension:1 withDimension:2 name:[NSString stringWithFormat:@"%@/transpose_a", label]]; - return [_graph reshapeTensor:attn withShape:@[@(-1), @(dModel)] name:[NSString stringWithFormat:@"%@/reshape_a", label]]; + return [_graph reshapeTensor:attn withShape:@[@(-1), @(dmodel)] name:[NSString stringWithFormat:@"%@/reshape_a", label]]; } -(nonnull MPSGraphTensor *) scaledQKMatmulWithQueries:(MPSGraphTensor * __nonnull)queries diff --git a/src/neural/onnx/adapters.cc b/src/neural/onnx/adapters.cc index e0f5485046..96f59ea616 100644 --- a/src/neural/onnx/adapters.cc +++ b/src/neural/onnx/adapters.cc @@ -26,6 +26,7 @@ Program grant you additional permission to convey the resulting work. 
*/ #include "neural/onnx/adapters.h" +#include #include "utils/fp16_utils.h" #include "utils/transpose.h" diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc index 1bf4b88b70..aaf200120d 100644 --- a/src/neural/onnx/converter.cc +++ b/src/neural/onnx/converter.cc @@ -275,10 +275,10 @@ std::string Converter::MakeEncoderLayer( std::unique_ptr scale; if (GetDataType() == pblczero::TensorProto::FLOAT16) { scale = std::make_unique( - Float16OnnxConst({FP32toFP16(1.0f / sqrtf(d_model))}, {1})); + Float16OnnxConst({FP32toFP16(1.0f / sqrtf(depth))}, {1})); } else { scale = std::make_unique( - FloatOnnxConst({1.0f / sqrtf(d_model)}, {1})); + FloatOnnxConst({1.0f / sqrtf(depth)}, {1})); } flow = builder->Mul(name + "/mha/QK/scale", flow, *scale); flow = builder->Softmax(name + "/mha/QK/softmax", flow, 3); From eeeab14f929daa81d908c97d57b7c494b086d994 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Thu, 8 Dec 2022 15:43:56 +0200 Subject: [PATCH 11/46] add probability for early termination of selfplay openings (#1681) --- src/selfplay/game.cc | 35 +++++++++++++++++++++++++++++++++-- src/selfplay/game.h | 4 ++++ src/selfplay/tournament.cc | 31 +++++++++++++++---------------- 3 files changed, 52 insertions(+), 18 deletions(-) diff --git a/src/selfplay/game.cc b/src/selfplay/game.cc index d56e1552e0..c06a0e3d85 100644 --- a/src/selfplay/game.cc +++ b/src/selfplay/game.cc @@ -31,6 +31,7 @@ #include "mcts/stoppers/common.h" #include "mcts/stoppers/factory.h" +#include "utils/random.h" namespace lczero { @@ -60,6 +61,10 @@ const OptionId kSyzygyTablebaseId{ "List of Syzygy tablebase directories, list entries separated by system " "separator (\";\" for Windows, \":\" for Linux).", 's'}; +const OptionId kOpeningStopProbId{ + "opening-stop-prob", "OpeningStopProb", + "From each opening move, start a self-play game with probability max(p, " + "1/n), where p is the value given and n the opening moves remaining."}; } // namespace void SelfPlayGame::PopulateUciParams(OptionsParser* options) { @@ -71,6 +76,7 @@ void SelfPlayGame::PopulateUciParams(OptionsParser* options) { options->Add(kUciChess960) = false; PopulateTimeManagementOptions(RunType::kSelfplay, options); options->Add(kSyzygyTablebaseId); + options->Add(kOpeningStopProbId, 0.0f, 1.0f) = 0.0f; } SelfPlayGame::SelfPlayGame(PlayerOptions white, PlayerOptions black, @@ -91,10 +97,34 @@ SelfPlayGame::SelfPlayGame(PlayerOptions white, PlayerOptions black, tree_[1] = std::make_shared(); tree_[1]->ResetToPosition(orig_fen_, {}); } + int ply = 0; + auto white_prob = white.uci_options->Get(kOpeningStopProbId); + auto black_prob = black.uci_options->Get(kOpeningStopProbId); + if (white_prob != black_prob && white_prob != 0 && black_prob != 0) { + throw Exception("Stop probabilities must be both equal or zero!"); + } + for (Move m : opening.moves) { + // For early exit from the opening, we support two cases: a) where both + // sides have the same exit probability and b) where one side's exit + // probability is zero. In the following formula, `positions` is the number + // of possible exit points remaining, used for adjusting the exit + // probability (to avoid favoring the last position). + auto exit_prob_now = tree_[0]->IsBlackToMove() ? black_prob : white_prob; + auto exit_prob_next = tree_[0]->IsBlackToMove() ? 
white_prob : black_prob; + int positions = opening.moves.size() - ply + 1; + if (exit_prob_now > 0.0f && + Random::Get().GetFloat(1.0f) < + std::max(exit_prob_now, + exit_prob_now / (exit_prob_now * ((positions + 1) / 2) + + exit_prob_next * (positions / 2)))) { + break; + } tree_[0]->MakeMove(m); if (tree_[0] != tree_[1]) tree_[1]->MakeMove(m); + ply++; } + start_ply_ = ply; } void SelfPlayGame::Play(int white_threads, int black_threads, bool training, @@ -102,8 +132,9 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training, bool blacks_move = tree_[0]->IsBlackToMove(); // If we are training, verify that input formats are consistent. - if (training && options_[0].network->GetCapabilities().input_format != - options_[1].network->GetCapabilities().input_format) { + if (training && + options_[0].network->GetCapabilities().input_format != + options_[1].network->GetCapabilities().input_format) { throw Exception("Can't mix networks with different input format!"); } // Take syzygy tablebases from player1 options. diff --git a/src/selfplay/game.h b/src/selfplay/game.h index 7ad7c2b2ae..918c328ce1 100644 --- a/src/selfplay/game.h +++ b/src/selfplay/game.h @@ -85,6 +85,9 @@ class SelfPlayGame { // not. void Abort(); + // Number of ply used from the given opening. + int GetStartPly() const { return start_ply_; } + // Writes training data to a file. void WriteTrainingData(TrainingDataWriter* writer) const; @@ -103,6 +106,7 @@ class SelfPlayGame { // tree_[0] == tree_[1]. std::shared_ptr tree_[2]; std::string orig_fen_; + int start_ply_; // Search that is currently in progress. Stored in members so that Abort() // can stop it. diff --git a/src/selfplay/tournament.cc b/src/selfplay/tournament.cc index 084e62126f..cd9b7f79c8 100644 --- a/src/selfplay/tournament.cc +++ b/src/selfplay/tournament.cc @@ -80,10 +80,10 @@ const OptionId kOpeningsMirroredId{ const OptionId kOpeningsModeId{"openings-mode", "OpeningsMode", "A choice of sequential, shuffled, or random."}; const OptionId kSyzygyTablebaseId{ - "syzygy-paths", "SyzygyPath", - "List of Syzygy tablebase directories, list entries separated by system " - "separator (\";\" for Windows, \":\" for Linux).", - 's' }; + "syzygy-paths", "SyzygyPath", + "List of Syzygy tablebase directories, list entries separated by system " + "separator (\";\" for Windows, \":\" for Linux).", + 's'}; } // namespace @@ -218,17 +218,15 @@ SelfPlayTournament::SelfPlayTournament( } // Take syzygy tablebases from options. - std::string tb_paths = - options.Get(kSyzygyTablebaseId); + std::string tb_paths = options.Get(kSyzygyTablebaseId); if (!tb_paths.empty()) { - syzygy_tb_ = std::make_unique(); - CERR << "Loading Syzygy tablebases from " << tb_paths; - if (!syzygy_tb_->init(tb_paths)) { - CERR << "Failed to load Syzygy tablebases!"; - syzygy_tb_ = nullptr; - } + syzygy_tb_ = std::make_unique(); + CERR << "Loading Syzygy tablebases from " << tb_paths; + if (!syzygy_tb_->init(tb_paths)) { + CERR << "Failed to load Syzygy tablebases!"; + syzygy_tb_ = nullptr; + } } - } void SelfPlayTournament::PlayOneGame(int game_number) { @@ -352,7 +350,7 @@ void SelfPlayTournament::PlayOneGame(int game_number) { auto player2_threads = player_options_[1][color_idx[1]].Get(kThreadsId); game.Play(player1_threads, player2_threads, kTraining, syzygy_tb_.get(), enable_resign); - + // If game was aborted, it's still undecided. if (game.GetGameResult() != GameResult::UNDECIDED) { // Game callback. 
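When both sides use the same stop probability p, the expression above reduces to max(p, 1/positions), since under integer division (positions + 1) / 2 + positions / 2 equals positions. For small p each remaining opening ply is then chosen as the start point with equal probability, in the style of reservoir sampling. A small self-contained sketch of that behavior (illustrative only, not part of the patch; lc0 itself draws from Random::Get()):

#include <algorithm>
#include <random>
#include <vector>

// Simulates where self-play games start inside an opening of `plies` moves
// when both players share the same opening-stop-prob p.
std::vector<int> StartPlyHistogram(int plies, float p, int games) {
  std::vector<int> hist(plies + 1, 0);
  std::mt19937 rng(42);
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  for (int g = 0; g < games; ++g) {
    int ply = 0;
    for (; ply < plies; ++ply) {
      const int remaining = plies - ply + 1;  // start points still possible
      // max(p, 1/n): for small p this yields a uniform choice of start ply.
      if (dist(rng) < std::max(p, 1.0f / remaining)) break;
    }
    ++hist[ply];  // game starts after `ply` opening moves
  }
  return hist;
}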
@@ -362,12 +360,13 @@ void SelfPlayTournament::PlayOneGame(int game_number) { game_info.game_id = game_number; game_info.initial_fen = opening.start_fen; game_info.moves = game.GetMoves(); - game_info.play_start_ply = opening.moves.size(); + game_info.play_start_ply = game.GetStartPly(); if (!enable_resign) { game_info.min_false_positive_threshold = game.GetWorstEvalForWinnerOrDraw(); } - if (kTraining) { + if (kTraining && + game_info.play_start_ply < static_cast(game_info.moves.size())) { TrainingDataWriter writer(game_number); game.WriteTrainingData(&writer); writer.Finalize(); From bb238006f9db90226082a375184919cda3aa361c Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Mon, 12 Dec 2022 13:57:46 +0200 Subject: [PATCH 12/46] update some shared files for smolgen --- appveyor.yml | 6 +++--- src/neural/network_legacy.cc | 22 ++++++++++++++++++++-- src/neural/network_legacy.h | 25 ++++++++++++++++++++++++- src/neural/shared/activation.h | 2 +- 4 files changed, 48 insertions(+), 7 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index ab5225341e..fb64dc3d18 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -51,9 +51,9 @@ install: - cmd: set DNNL_NAME=dnnl_win_1.5.0_cpu_vcomp - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/oneapi-src/oneDNN/releases/download/v1.5/dnnl_win_1.5.0_cpu_vcomp.zip - cmd: IF %NAME%==cpu-dnnl IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_1.5.0_cpu_vcomp.zip -oC:\cache -- cmd: IF %NAME%==onednn set DNNL_NAME=dnnl_win_2.6.0_cpu_vcomp_gpu_vcomp -- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/borg323/oneDNN/releases/download/v2.6/dnnl_win_2.6.0_cpu_vcomp_gpu_vcomp.zip -- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_2.6.0_cpu_vcomp_gpu_vcomp.zip -oC:\cache +- cmd: IF %NAME%==onednn set DNNL_NAME=dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp +- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% appveyor DownloadFile https://github.com/borg323/oneDNN/releases/download/v2.7.2/dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip +- cmd: IF %NAME%==onednn IF NOT EXIST C:\cache\%DNNL_NAME% 7z x dnnl_win_2.7.2_cpu_vcomp_gpu_vcomp.zip -oC:\cache - cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 appveyor DownloadFile https://github.com/borg323/onnxruntime/releases/download/v1.13.1/onnxruntime-win-x64-dml-1.13.1.zip - cmd: IF %NAME%==onnx-dml IF NOT EXIST C:\cache\onnxruntime-win-x64-dml-1.13.1 7z x onnxruntime-win-x64-dml-1.13.1.zip -oC:\cache - cmd: IF %NAME%==onnx-dml set ONNX_NAME=onnxruntime-win-x64-dml-1.13.1 diff --git a/src/neural/network_legacy.cc b/src/neural/network_legacy.cc index 0872ae0ddc..387590de6b 100644 --- a/src/neural/network_legacy.cc +++ b/src/neural/network_legacy.cc @@ -32,6 +32,8 @@ LegacyWeights::LegacyWeights(const pblczero::Weights& weights) : input(weights.input()), ip_emb_w(LayerAdapter(weights.ip_emb_w()).as_vector()), ip_emb_b(LayerAdapter(weights.ip_emb_b()).as_vector()), + ip_mult_gate(LayerAdapter(weights.ip_mult_gate()).as_vector()), + ip_add_gate(LayerAdapter(weights.ip_add_gate()).as_vector()), policy1(weights.policy1()), policy(weights.policy()), ip_pol_w(LayerAdapter(weights.ip_pol_w()).as_vector()), @@ -54,7 +56,9 @@ LegacyWeights::LegacyWeights(const pblczero::Weights& weights) ip1_mov_w(LayerAdapter(weights.ip1_mov_w()).as_vector()), ip1_mov_b(LayerAdapter(weights.ip1_mov_b()).as_vector()), ip2_mov_w(LayerAdapter(weights.ip2_mov_w()).as_vector()), - 
ip2_mov_b(LayerAdapter(weights.ip2_mov_b()).as_vector()) { + ip2_mov_b(LayerAdapter(weights.ip2_mov_b()).as_vector()), + smolgen_w(LayerAdapter(weights.smolgen_w()).as_vector()), + has_smolgen(weights.has_smolgen_w()) { for (const auto& res : weights.residual()) { residual.emplace_back(res); } @@ -145,7 +149,9 @@ LegacyWeights::MHA::MHA(const pblczero::Weights::MHA& mha) v_w(LayerAdapter(mha.v_w()).as_vector()), v_b(LayerAdapter(mha.v_b()).as_vector()), dense_w(LayerAdapter(mha.dense_w()).as_vector()), - dense_b(LayerAdapter(mha.dense_b()).as_vector()) {} + dense_b(LayerAdapter(mha.dense_b()).as_vector()), + smolgen(Smolgen(mha.smolgen())), + has_smolgen(mha.has_smolgen()) {} LegacyWeights::FFN::FFN(const pblczero::Weights::FFN& ffn) : dense1_w(LayerAdapter(ffn.dense1_w()).as_vector()), @@ -162,4 +168,16 @@ LegacyWeights::EncoderLayer::EncoderLayer( ln2_gammas(LayerAdapter(encoder.ln2_gammas()).as_vector()), ln2_betas(LayerAdapter(encoder.ln2_betas()).as_vector()) {} +LegacyWeights::Smolgen::Smolgen( + const pblczero::Weights::Smolgen& smolgen) + : compress(LayerAdapter(smolgen.compress()).as_vector()), + dense1_w(LayerAdapter(smolgen.dense1_w()).as_vector()), + dense1_b(LayerAdapter(smolgen.dense1_b()).as_vector()), + ln1_gammas(LayerAdapter(smolgen.ln1_gammas()).as_vector()), + ln1_betas(LayerAdapter(smolgen.ln1_betas()).as_vector()), + dense2_w(LayerAdapter(smolgen.dense2_w()).as_vector()), + dense2_b(LayerAdapter(smolgen.dense2_b()).as_vector()), + ln2_gammas(LayerAdapter(smolgen.ln2_gammas()).as_vector()), + ln2_betas(LayerAdapter(smolgen.ln2_betas()).as_vector()) {} + } // namespace lczero diff --git a/src/neural/network_legacy.h b/src/neural/network_legacy.h index 19284af172..5715c40fbb 100644 --- a/src/neural/network_legacy.h +++ b/src/neural/network_legacy.h @@ -55,6 +55,19 @@ struct LegacyWeights { bool has_se; }; + struct Smolgen { + explicit Smolgen(const pblczero::Weights::Smolgen& smolgen); + Vec compress; + Vec dense1_w; + Vec dense1_b; + Vec ln1_gammas; + Vec ln1_betas; + Vec dense2_w; + Vec dense2_b; + Vec ln2_gammas; + Vec ln2_betas; + }; + struct MHA { explicit MHA(const pblczero::Weights::MHA& mha); Vec q_w; @@ -65,6 +78,8 @@ struct LegacyWeights { Vec v_b; Vec dense_w; Vec dense_b; + Smolgen smolgen; + bool has_smolgen; }; struct FFN { @@ -92,11 +107,14 @@ struct LegacyWeights { Vec ip_emb_w; Vec ip_emb_b; + // Input gating + Vec ip_mult_gate; + Vec ip_add_gate; + // Encoder stack. std::vector encoder; int encoder_head_count; - // Residual tower. 
std::vector residual; @@ -133,6 +151,11 @@ struct LegacyWeights { Vec ip1_mov_b; Vec ip2_mov_w; Vec ip2_mov_b; + + // Smolgen global weights + Vec smolgen_w; + Vec smolgen_b; + bool has_smolgen; }; } // namespace lczero diff --git a/src/neural/shared/activation.h b/src/neural/shared/activation.h index 5937126b32..8a55df486b 100644 --- a/src/neural/shared/activation.h +++ b/src/neural/shared/activation.h @@ -22,7 +22,7 @@ #include namespace lczero { -enum ActivationFunction { NONE, RELU, TANH, SIGMOID, SELU, MISH }; +enum ActivationFunction { NONE, RELU, TANH, SIGMOID, SELU, MISH, SWISH }; // Softmax activation void SoftmaxActivation(const size_t size, const float* input, float* output); From c80a4b72a2271be3cc54344169267b2d60ebbd5a Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 16 Dec 2022 22:57:01 +0200 Subject: [PATCH 13/46] add swish activation --- src/neural/shared/activation.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index ecf154f2ed..9f6b7489c4 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -78,6 +78,8 @@ float Activate(const float val, const ActivationFunction activation) { return 1.0f / (1.0f + expf(-val)); case SELU: return selu(val); + case SWISH: + return val / (1.0f + expf(-val)); case NONE: // Nothing to do. break; From 45965dda5665a50d709d54afece6285b3e92f7ed Mon Sep 17 00:00:00 2001 From: Ankan Banerjee Date: Sat, 17 Dec 2022 09:13:08 +0530 Subject: [PATCH 14/46] Persistent L2 cache opt for cuda backend (#1815) * add persistent L2 cache opt - goal is to fit activations in residual block in L2 cache. - around 6.7% improvement in T80 networks. * add checks for cuda version - prevent compile errors when compiled with old CUDA toolkit. * fix typo with cudart_version --- src/neural/cuda/layers.cc | 6 +++ src/neural/cuda/network_cuda.cc | 67 +++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 8 deletions(-) diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc index 643974b375..47feb4eb9a 100644 --- a/src/neural/cuda/layers.cc +++ b/src/neural/cuda/layers.cc @@ -1204,6 +1204,12 @@ void ResidualBlock::Eval(int N, DataType* output, DataType* transformed_output = transformed_input + scratch_size / (2 * sizeof(DataType)); + // caller wants us to sub-allocate all memory we need from "output" tensor + if (!scratch) { + transformed_input = output; // this is true in normal cases too! + transformed_output = transformed_input + (N * C * 8 * 8 * 36 / 16); + } + if (first_block_) { InputTransform(N, c_input_, transformed_input, input, stream); diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index b374d27561..5052b78021 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -178,6 +178,10 @@ class CudaNetwork : public Network { cudaGetDeviceProperties(&deviceProp, gpu_id_); showDeviceInfo(deviceProp); + l2_cache_size_ = deviceProp.l2CacheSize; + + allow_cache_opt_ = options.GetOrDefault("cache_opt", true); + // Select GPU to run on (for *the current* thread). 
ReportCUDAErrors(cudaSetDevice(gpu_id_));
@@ -225,6 +229,7 @@
     const int kNumInputPlanes = kInputPlanes;
     const int kNumFilters = (int)weights.input.biases.size();
     numBlocks_ = (int)weights.residual.size();
+    numFilters_ = kNumFilters;

     // Warn if the memory required for storing transformed weights is
     // going to exceed 40% of total video memory, force custom_winograd off
@@ -533,19 +538,52 @@
     float* opVal = io->op_value_mem_gpu_;
     float* opMov = io->op_moves_left_mem_gpu_;

+    // Figure out if the memory requirement for running the res block would
+    // fit in the L2 cache.
+    bool enableCacheOpt = false;
+    DataType* skip_connection =
+        use_res_block_winograd_fuse_opt_ ? tensor_mem[1] : tensor_mem[2];
+
+#if CUDART_VERSION >= 11000
+    const int pre_transform_tensor_size =
+        batchSize * numFilters_ * 8 * 8 * sizeof(DataType);
+    const int transformed_tensor_size = pre_transform_tensor_size * 36 / 16;
+    const int res_block_mem =
+        transformed_tensor_size * 2 + pre_transform_tensor_size;
+
+    cudaStreamAttrValue stream_attribute = {};
+    stream_attribute.accessPolicyWindow.base_ptr = tensor_mem[2];
+    stream_attribute.accessPolicyWindow.num_bytes = res_block_mem;
+    stream_attribute.accessPolicyWindow.hitRatio = 1.0f;
+    stream_attribute.accessPolicyWindow.hitProp = cudaAccessPropertyPersisting;
+    stream_attribute.accessPolicyWindow.missProp = cudaAccessPropertyStreaming;
+
+    if (allow_cache_opt_ && use_res_block_winograd_fuse_opt_ &&
+        (res_block_mem <= scratch_size_) && (res_block_mem <= l2_cache_size_)) {
+      // We can use a single alloc to hold all the required tensors, and
+      // enable persistent L2 caching on it.
+      ReportCUDAErrors(cudaStreamSetAttribute(
+          stream, cudaStreamAttributeAccessPolicyWindow, &stream_attribute));
+
+      enableCacheOpt = true;
+      skip_connection =
+          tensor_mem[2] + 2 * transformed_tensor_size / sizeof(DataType);
+    }
+#endif
+
     int l = 0;

     // Input.
-    network_[l++]->Eval(
-        batchSize,
-        use_res_block_winograd_fuse_opt_ ? tensor_mem[1] : tensor_mem[2],
-        tensor_mem[0], nullptr, scratch_mem, scratch_size_, nullptr, cublas,
-        stream);  // input conv
+    network_[l++]->Eval(batchSize, skip_connection, tensor_mem[0], nullptr,
+                        scratch_mem, scratch_size_, nullptr, cublas,
+                        stream);  // input conv

     // Residual block.
     for (int block = 0; block < numBlocks_; block++) {
       if (use_res_block_winograd_fuse_opt_) {
-        network_[l++]->Eval(batchSize, tensor_mem[2], tensor_mem[1], nullptr,
-                            scratch_mem, scratch_size_, nullptr, cublas,
+        network_[l++]->Eval(batchSize, tensor_mem[2], skip_connection, nullptr,
+                            enableCacheOpt ? nullptr : scratch_mem,
+                            scratch_size_, nullptr, cublas,
                             stream);  // block
       } else {
         network_[l++]->Eval(batchSize, tensor_mem[0], tensor_mem[2], nullptr,
@@ -558,6 +596,16 @@
       }
     }

+#if CUDART_VERSION >= 11000
+    if (enableCacheOpt) {
+      // Reset the cache settings.
+      stream_attribute.accessPolicyWindow.num_bytes = 0;
+      cudaStreamSetAttribute(stream, cudaStreamAttributeAccessPolicyWindow,
+                             &stream_attribute);
+      cudaCtxResetPersistingL2Cache();
+    }
+#endif
+
     // Policy head.
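  // (Sizing note for the persisting-cache window used above, with
  // illustrative numbers: in fp16 with batch N and C filters,
  // pre_transform = N * C * 64 * 2 bytes and transformed =
  // pre_transform * 36 / 16, so res_block_mem = 2 * transformed +
  // pre_transform. For N = 64, C = 384 that is roughly
  // 3.1 MB + 2 * 7.1 MB = 17.3 MB, small enough for the 40 MB L2 of an
  // A100 but not for most consumer GPUs, hence the l2_cache_size_ check.)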
if (attn_policy_) {
       network_[l++]->Eval(
@@ -761,18 +809,21 @@
  private:
   const NetworkCapabilities capabilities_;
   int gpu_id_;
+  int l2_cache_size_;
   int max_batch_size_;
   bool wdl_;
   bool moves_left_;
   bool use_res_block_winograd_fuse_opt_;  // fuse operations inside the
                                           // residual tower
   bool multi_stream_;     // run multiple parallel network evals
+  bool allow_cache_opt_;  // try to fit residual block activations in L2 cache

   // Currently only one NN Eval can happen at a time (we can fix this if
   // needed by allocating more memory).
   mutable std::mutex lock_;

   int numBlocks_;
+  int numFilters_;
   bool has_se_;
   bool conv_policy_;
   bool attn_policy_;
@@ -840,7 +891,7 @@
     CERR << "GPU clock frequency: " << deviceProp.clockRate / 1e3f << " MHz";
     CERR << "GPU compute capability: " << deviceProp.major << "."
          << deviceProp.minor;
-
+    CERR << "L2 cache capacity: " << deviceProp.l2CacheSize;
     if (std::is_same<float, DataType>::value && deviceProp.major >= 7) {
       CERR << "WARNING: you will probably get better performance from the "
              "cuda-fp16 backend.";

From a127abd1f348bafdd5b94c4a16a1d5380d834a6b Mon Sep 17 00:00:00 2001
From: borg323 <39573933+borg323@users.noreply.github.com>
Date: Sun, 18 Dec 2022 02:12:13 +0200
Subject: [PATCH 15/46] fix undefined behavior (#1817)

---
 src/neural/cuda/layers.cc | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc
index 47feb4eb9a..0415b24a92 100644
--- a/src/neural/cuda/layers.cc
+++ b/src/neural/cuda/layers.cc
@@ -1200,14 +1200,16 @@ void ResidualBlock<DataType>::Eval(int N, DataType* output,

   // Split the scratch space into two parts - use first part for holding
   // transformed input and second part for transformed output.
-  DataType* transformed_input = (DataType*)scratch;
-  DataType* transformed_output =
-      transformed_input + scratch_size / (2 * sizeof(DataType));
-
-  // caller wants us to sub-allocate all memory we need from "output" tensor
+  DataType* transformed_input;
+  DataType* transformed_output;
   if (!scratch) {
-    transformed_input = output;  // this is true in normal cases too!
+    // Caller wants us to sub-allocate all memory we need from "output" tensor.
+    transformed_input = output;  // This is true in normal cases too!
     transformed_output = transformed_input + (N * C * 8 * 8 * 36 / 16);
+  } else {
+    transformed_input = (DataType*)scratch;
+    transformed_output =
+        transformed_input + scratch_size / (2 * sizeof(DataType));
   }

   if (first_block_) {

From f4a918a19be24e8c8119209f22379e8e8147b495 Mon Sep 17 00:00:00 2001
From: borg323 <39573933+borg323@users.noreply.github.com>
Date: Sun, 18 Dec 2022 02:12:46 +0200
Subject: [PATCH 16/46] set MADV_RANDOM for syzygy mmap (#1818)

---
 src/syzygy/syzygy.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/syzygy/syzygy.cc b/src/syzygy/syzygy.cc
index 89352e52c9..554186f7d9 100644
--- a/src/syzygy/syzygy.cc
+++ b/src/syzygy/syzygy.cc
@@ -1047,6 +1047,11 @@ class SyzygyTablebaseImpl {
     }
     *mapping = statbuf.st_size;
     base_address = mmap(nullptr, statbuf.st_size, PROT_READ, MAP_SHARED, fd, 0);
+#if defined(MADV_RANDOM)
+    // Tablebase probes touch pages in an essentially random order, so advise
+    // the kernel not to waste effort on readahead.
+    madvise(base_address, statbuf.st_size, MADV_RANDOM);
+#endif
     ::close(fd);
     if (base_address == MAP_FAILED) {
       throw Exception("Could not mmap() " + fname);

From 6a774591ce34cc84da842e74dad65c4297678be2 Mon Sep 17 00:00:00 2001
From: almaudoh
Date: Sun, 25 Dec 2022 05:45:28 +0100
Subject: [PATCH 17/46] Fix softmax cuda (#1822)

* Fix softmax function in cuda.
* Add fix for negative zero. * Code style fixes. Co-authored-by: Alma --- src/neural/cuda/common_kernels.cu | 29 ++++++++++++++++++++++------- src/neural/cuda/winograd_helper.inc | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+), 7 deletions(-) diff --git a/src/neural/cuda/common_kernels.cu b/src/neural/cuda/common_kernels.cu index a1ecb36d97..684d8d1e8a 100644 --- a/src/neural/cuda/common_kernels.cu +++ b/src/neural/cuda/common_kernels.cu @@ -682,8 +682,12 @@ __global__ void softmax_opt_64_kernel(T* output, const T* input, int N) { copyAs(&x[0], &input[index * 2]); } - ex[0] = exp(x[0]); - ex[1] = exp(x[1]); + float threadMax = max(x[0], x[1]); + float maxval = warpMax(threadMax); + maxval = __shfl_sync(0xFFFFFFFF, maxval, 0); + + ex[0] = exp(x[0] - maxval); + ex[1] = exp(x[1] - maxval); float threadSum = ex[0] + ex[1]; float Sum = warpReduce(threadSum); @@ -716,14 +720,25 @@ __global__ void softmax_kernel(T* output, const T* input) { int C = blockDim.x; int index = n * C + c; - __shared__ float sum; - if (c == 0) sum = 0; - __syncthreads(); - // softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis) float x = (float)input[index]; - float ex = exp(x); + + __shared__ float sum, maxval; + if (c == 0) { + sum = 0; + maxval = x; + } + + __syncthreads(); + + // Get max across warp first, and then update across C dimension + float warpmax = warpMax(x); + if ((c & 0x1F) == 0) atomicMaxFloat(&maxval, warpmax); + + __syncthreads(); + + float ex = exp(x - maxval); // compute warp wide sums first float val = warpReduce(ex); diff --git a/src/neural/cuda/winograd_helper.inc b/src/neural/cuda/winograd_helper.inc index 456649ba87..3f362f3074 100644 --- a/src/neural/cuda/winograd_helper.inc +++ b/src/neural/cuda/winograd_helper.inc @@ -419,6 +419,24 @@ __device__ __forceinline__ float warpReduce(float x) { return x; } +// fast max reduction for the warp +__device__ __forceinline__ float warpMax(float x) { +#pragma unroll + for (int mask = 16; mask > 0; mask >>= 1) + x = max(x, __shfl_xor_sync(0xFFFFFFFF, x, mask)); + + return x; +} + +// atomic max implementation for floats +__device__ __forceinline__ float atomicMaxFloat(float* addr, float val) { + float max; + max = !signbit(val) ? __int_as_float(atomicMax((int*)addr, __float_as_int(val))) : + __uint_as_float(atomicMin((unsigned int*)addr, __float_as_uint(val))); + + return max; +} + // Helper fuction to do vector loads/stores template __device__ __forceinline__ void copyAs(void* dst, const void* src) { From 662648ac742c846a0a5cc65321d4c9c29c7f501f Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Mon, 26 Dec 2022 13:10:27 +0200 Subject: [PATCH 18/46] set cache_opt default to false (#1823) --- src/neural/cuda/network_cuda.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/cuda/network_cuda.cc b/src/neural/cuda/network_cuda.cc index 5052b78021..dce3b28265 100644 --- a/src/neural/cuda/network_cuda.cc +++ b/src/neural/cuda/network_cuda.cc @@ -180,7 +180,7 @@ class CudaNetwork : public Network { l2_cache_size_ = deviceProp.l2CacheSize; - allow_cache_opt_ = options.GetOrDefault("cache_opt", true); + allow_cache_opt_ = options.GetOrDefault("cache_opt", false); // Select GPU to run on (for *the current* thread). 
ReportCUDAErrors(cudaSetDevice(gpu_id_)); From a5de019e08ef49516b1c48223795f00367bf5589 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Wed, 28 Dec 2022 00:04:40 +0200 Subject: [PATCH 19/46] fix onnx final mlh activation (#1825) --- src/neural/onnx/converter.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc index aaf200120d..6004e4d56f 100644 --- a/src/neural/onnx/converter.cc +++ b/src/neural/onnx/converter.cc @@ -552,7 +552,8 @@ void Converter::MakeMovesLeftHead(pblczero::OnnxModel* onnx, *GetWeghtsConverter(weights.ip2_mov_w, {mlh_fc1_outputs, 1}, {1, 0})); flow = builder->Add("/mlh/dense2/add", flow, *GetWeghtsConverter(weights.ip2_mov_b, {1})); - auto output = builder->Relu(options_.output_mlh, flow); + flow = MakeActivation(builder, flow, "/mlh/dense2", default_activation_); + auto output = builder->Identity(options_.output_mlh, flow); builder->AddOutput(output, {options_.batch_size, 1}, GetDataType()); onnx->set_output_mlh(output); } From c41ba16945c2520b8aad44b11ec9575bcaf86cb3 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Wed, 28 Dec 2022 15:15:35 +0200 Subject: [PATCH 20/46] inline fp16 conversions (#1824) --- appveyor.yml | 4 +- meson.build | 1 - src/neural/onnx/network_onnx.cc | 9 ++- src/utils/fp16_utils.cc | 107 -------------------------------- src/utils/fp16_utils.h | 83 ++++++++++++++++++++++++- 5 files changed, 88 insertions(+), 116 deletions(-) delete mode 100644 src/utils/fp16_utils.cc diff --git a/appveyor.yml b/appveyor.yml index fb64dc3d18..187c8cef95 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -116,10 +116,12 @@ before_build: - cmd: IF %DX%==true SET BUILD_BLAS=true - cmd: SET EMBED=false - cmd: IF %APPVEYOR_REPO_TAG%==true IF %ANDROID%==true SET EMBED=true +- cmd: SET POPCNT=true +- cmd: IF %NAME%==cpu-openblas SET POPCNT=false - cmd: SET EXTRA= - cmd: IF %ANDROID%==false SET EXTRA=-Db_vscrt=md - cmd: IF %ONNX_DML%==true SET EXTRA=-Db_vscrt=md -Donnx_libdir=C:\cache\%ONNX_NAME%\lib -Donnx_include=C:\cache\%ONNX_NAME%\include -- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% +- cmd: IF %ANDROID%==false meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BUILD_BLAS% -Ddnnl=true -Ddx=%DX% -Dcudnn=%CUDNN% -Donednn=%ONEDNN% -Dispc_native_only=false -Dpopcnt=%POPCNT% -Dcudnn_include="%CUDA_PATH%\include","%CUDA_PATH%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%CUDA_PATH%\cuda\lib\x64" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Ddnnl_dir="%PKG_FOLDER%\%DNNL_NAME%" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\include" 
-Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.77\build\native\lib\x64" -Ddefault_library=static -Dmalloc=mimalloc -Dmimalloc_libdir="%MIMALLOC_PATH%"\out\msvc-x64\Release %EXTRA% - cmd: IF %ANDROID%==true meson arm64-v8a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-aarch64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-aarch64\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-aarch64 - cmd: IF %ANDROID%==true meson armeabi-v7a --buildtype release -Dgtest=false -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\android-armv7a\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\android-armv7a\lib" -Dembed=%EMBED% -Ddefault_library=static --cross-file crossfile-armv7a -Dispc=false -Dneon=false build_script: diff --git a/meson.build b/meson.build index 49b60c1cd6..54f9210a94 100644 --- a/meson.build +++ b/meson.build @@ -209,7 +209,6 @@ files += [ 'src/utils/random.cc', 'src/utils/string.cc', 'src/utils/weights_adapter.cc', - 'src/utils/fp16_utils.cc', 'src/version.cc', ] includes += include_directories('src') diff --git a/src/neural/onnx/network_onnx.cc b/src/neural/onnx/network_onnx.cc index 7d7950175c..6908ea47df 100644 --- a/src/neural/onnx/network_onnx.cc +++ b/src/neural/onnx/network_onnx.cc @@ -196,12 +196,11 @@ Ort::Value OnnxComputation::PrepareInputs(int start, int batch_size) { int end = std::min(start + batch_size, static_cast(raw_input_.size())); for (int i = start; i < end; i++) { for (const auto& plane : raw_input_[i]) { + DataType value = std::is_same::value + ? FP32toFP16(plane.value) + : plane.value; for (auto bit : IterateBits(plane.mask)) { - if (std::is_same::value) { - *(iter + bit) = FP32toFP16(plane.value); - } else { - *(iter + bit) = plane.value; - } + *(iter + bit) = value; } iter += 64; } diff --git a/src/utils/fp16_utils.cc b/src/utils/fp16_utils.cc deleted file mode 100644 index 826389b1e9..0000000000 --- a/src/utils/fp16_utils.cc +++ /dev/null @@ -1,107 +0,0 @@ -/* - This file is part of Leela Chess Zero. - Copyright (C) 2020 The LCZero Authors - - Leela Chess is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - Leela Chess is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with Leela Chess. If not, see . - - Additional permission under GNU GPL version 3 section 7 - - If you modify this Program, or any covered work, by linking or - combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA - Toolkit and the NVIDIA CUDA Deep Neural Network library (or a - modified version of those libraries), containing parts covered by the - terms of the respective license agreement, the licensors of this - Program grant you additional permission to convey the resulting work. -*/ - -#include -#include - -// Define NO_F16C to avoid the F16C intrinsics. Also disabled with NO_POPCNT -// since it catches most processors without F16C instructions. 
- -#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ - defined(__x86_64__) -#include -#else -#define NO_F16C -#endif - -namespace lczero { - -uint16_t FP32toFP16(float f32) { -#if defined(NO_POPCNT) || defined(NO_F16C) || \ - (defined(__GNUC__) && !defined(__F16C__)) - unsigned int x; - unsigned int sign = 0; - memcpy(&x, &f32, sizeof(float)); - if (x & 0x80000000) sign = 0x8000; - x &= 0x7fffffff; - if (x >= 0x477ff000) { - if ((x & 0x7f800000) == 0x7f800000 && (x & 0x7fffff)) { - x = ((x >> 13) - 0x38000) | 0x200; - } else { - x = 0x7c00; - } - } else if (x <= 0x33000000) - x = 0; - else if (x <= 0x387fefff) { - int shift = 126 - ((x >> 23) & 0xff); - x = (x & 0x7fffff) | 0x800000; - if (x & (0x17fffff >> (24 - shift))) x += 0x800000 >> (24 - shift); - x >>= shift; - } else { - // Adjust exponent and round to nearest even. - if (x & 0x2fff) { - x -= 0x37fff000; - } else { - x -= 0x38000000; - } - x >>= 13; - } - return x | sign; -#else - __m128 A = _mm_set_ss(f32); - __m128i H = _mm_cvtps_ph(A, 0); - return _mm_extract_epi16(H, 0); -#endif -} - -float FP16toFP32(uint16_t f16) { -#if defined(NO_POPCNT) || defined(NO_F16C) || \ - (defined(__GNUC__) && !defined(__F16C__)) - unsigned int x; - float f; - x = f16 & 0x7fff; - if ((x & 0x7c00) == 0) { - f = 5.9604645e-8f * x; - memcpy(&x, &f, sizeof(float)); - } else if (x >= 0x7c00) { - if (x & 0x1ff) x |= 0x200; - x = (x + 0x38000) << 13; - } else { - x = (x + 0x1c000) << 13; - } - if (f16 & 0x8000) x |= 0x80000000; - memcpy(&f, &x, sizeof(float)); - return f; -#else - __m128i H = _mm_setzero_si128(); - H = _mm_insert_epi16(H, f16, 0); - __m128 A = _mm_cvtph_ps(H); - return _mm_cvtss_f32(A); -#endif -} - -} // namespace lczero diff --git a/src/utils/fp16_utils.h b/src/utils/fp16_utils.h index fadf83d031..2680536599 100644 --- a/src/utils/fp16_utils.h +++ b/src/utils/fp16_utils.h @@ -25,9 +25,88 @@ Program grant you additional permission to convey the resulting work. */ #pragma once + +#include +#include + +// Define NO_F16C to avoid the F16C intrinsics. Also disabled with NO_POPCNT +// since it catches most processors without F16C instructions. +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || \ + defined(__x86_64__) +#include +#else +#define NO_F16C +#endif + namespace lczero { -uint16_t FP32toFP16(float f32); -float FP16toFP32(uint16_t f16); +#if defined(NO_POPCNT) || defined(NO_F16C) || \ + (defined(__GNUC__) && !defined(__F16C__)) + +inline uint16_t FP32toFP16(float f32) { + unsigned int x; + unsigned int sign = 0; + memcpy(&x, &f32, sizeof(float)); + if (x & 0x80000000) sign = 0x8000; + x &= 0x7fffffff; + if (x >= 0x477ff000) { + if ((x & 0x7f800000) == 0x7f800000 && (x & 0x7fffff)) { + x = ((x >> 13) - 0x38000) | 0x200; + } else { + x = 0x7c00; + } + } else if (x <= 0x33000000) + x = 0; + else if (x <= 0x387fefff) { + int shift = 126 - ((x >> 23) & 0xff); + x = (x & 0x7fffff) | 0x800000; + if (x & (0x17fffff >> (24 - shift))) x += 0x800000 >> (24 - shift); + x >>= shift; + } else { + // Adjust exponent and round to nearest even. 
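+ // (The rebias from exponent bias 127 to 15 is 112 << 23 == 0x38000000;
+ // subtracting 0x37fff000 instead also adds 0x1000, half an fp16 ULP,
+ // rounding the result up after the 13-bit shift. The (x & 0x2fff) test
+ // fails only for exact values and for halfway cases whose result bit 13
+ // is already 0; those truncate, giving round-to-nearest-even overall.)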
+ if (x & 0x2fff) { + x -= 0x37fff000; + } else { + x -= 0x38000000; + } + x >>= 13; + } + return x | sign; +} + +inline float FP16toFP32(uint16_t f16) { + unsigned int x; + float f; + x = f16 & 0x7fff; + if ((x & 0x7c00) == 0) { + f = 5.9604645e-8f * x; + memcpy(&x, &f, sizeof(float)); + } else if (x >= 0x7c00) { + if (x & 0x1ff) x |= 0x200; + x = (x + 0x38000) << 13; + } else { + x = (x + 0x1c000) << 13; + } + if (f16 & 0x8000) x |= 0x80000000; + memcpy(&f, &x, sizeof(float)); + return f; +} + +#else + +inline uint16_t FP32toFP16(float f32) { + __m128 A = _mm_set_ss(f32); + __m128i H = _mm_cvtps_ph(A, 0); + return _mm_extract_epi16(H, 0); +} + +inline float FP16toFP32(uint16_t f16) { + __m128i H = _mm_setzero_si128(); + H = _mm_insert_epi16(H, f16, 0); + __m128 A = _mm_cvtph_ps(H); + return _mm_cvtss_f32(A); +} + +#endif } // namespace lczero From 9ff390be73fc54a18e804014b96993461ce584ae Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Wed, 28 Dec 2022 15:15:55 +0200 Subject: [PATCH 21/46] onnx squeeze cleanup (#1826) --- src/neural/onnx/builder.cc | 11 +++++++---- src/neural/onnx/builder.h | 3 ++- src/neural/onnx/converter.cc | 2 +- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/neural/onnx/builder.cc b/src/neural/onnx/builder.cc index be97dfd7ac..5cafe518a0 100644 --- a/src/neural/onnx/builder.cc +++ b/src/neural/onnx/builder.cc @@ -167,14 +167,17 @@ std::string OnnxBuilder::GlobalAveragePool(const std::string& name, } std::string OnnxBuilder::Squeeze(const std::string& name, - const std::string& input) { + const std::string& input, + std::initializer_list axes) { auto* node = model_.mutable_graph()->add_node(); auto out = PopulateStdNodeFields(node, name, input, "Squeeze"); if (opset_ < 13) { - AddIntsAttribute(node, "axes", {2, 3}); + AddIntsAttribute(node, "axes", axes); } else { - node->add_input( - AddInitializer(name + "/axes", Int64OnnxConst({2, 3}, {2}))); + node->add_input(AddInitializer( + name + "/axes", + Int64OnnxConst(std::vector(begin(axes), end(axes)), + {static_cast(axes.size())}))); } return out; } diff --git a/src/neural/onnx/builder.h b/src/neural/onnx/builder.h index 3b88cb27ec..96fa4e82ee 100644 --- a/src/neural/onnx/builder.h +++ b/src/neural/onnx/builder.h @@ -68,7 +68,8 @@ class OnnxBuilder { const OnnxConst&); std::string GlobalAveragePool(const std::string& name, const std::string& input); - std::string Squeeze(const std::string& name, const std::string& input); + std::string Squeeze(const std::string& name, const std::string& input, + std::initializer_list axes); std::string MatMul(const std::string& name, const std::string& input1, const OnnxConst& input2); std::string MatMul(const std::string& name, const std::string& input1, diff --git a/src/neural/onnx/converter.cc b/src/neural/onnx/converter.cc index 6004e4d56f..8e1c43eb18 100644 --- a/src/neural/onnx/converter.cc +++ b/src/neural/onnx/converter.cc @@ -184,7 +184,7 @@ std::string Converter::MakeSqueezeAndExcite( const int se_filters = se_unit.b1.size(); auto flow = builder->GlobalAveragePool(name + "/pooled", input); - flow = builder->Squeeze(name + "/squeeze", flow); + flow = builder->Squeeze(name + "/squeeze", flow, {2, 3}); flow = builder->MatMul( name + "/matmul1", flow, *GetWeghtsConverter(se_unit.w1, {NumFilters(), se_filters}, {1, 0})); From ec887c5766ea3bc5292b66f56e2b03cac968e1c8 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 30 Dec 2022 01:00:02 +0200 Subject: [PATCH 22/46] update submodule --- libs/lczero-common | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/lczero-common b/libs/lczero-common index 4dfa4ce833..2165d35bf6 160000 --- a/libs/lczero-common +++ b/libs/lczero-common @@ -1 +1 @@ -Subproject commit 4dfa4ce8339357819f7de01517e6297d4c768cdf +Subproject commit 2165d35bf63e95549eb4feff06a755ec88af5264 From 825d0d093f00e999027ce1284ad6044e5780b0a1 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sun, 15 Jan 2023 01:54:42 +0200 Subject: [PATCH 23/46] update for new net.proto --- libs/lczero-common | 2 +- src/neural/shared/activation.cc | 4 ++++ src/neural/shared/activation.h | 18 +++++++++++++++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/libs/lczero-common b/libs/lczero-common index 2165d35bf6..1e1138673c 160000 --- a/libs/lczero-common +++ b/libs/lczero-common @@ -1 +1 @@ -Subproject commit 2165d35bf63e95549eb4feff06a755ec88af5264 +Subproject commit 1e1138673cb171bec090e1da54f3f2f57854620a diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 9f6b7489c4..5ba2e17a40 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -21,6 +21,8 @@ #include #include +#include "utils/exception.h" + #ifdef USE_ISPC #include "activation_ispc.h" #endif @@ -83,6 +85,8 @@ float Activate(const float val, const ActivationFunction activation) { case NONE: // Nothing to do. break; + default: + throw Exception("unsupported activation function"); } return val; } diff --git a/src/neural/shared/activation.h b/src/neural/shared/activation.h index 8a55df486b..1ef3e0de48 100644 --- a/src/neural/shared/activation.h +++ b/src/neural/shared/activation.h @@ -21,17 +21,29 @@ #include #include +#include "proto/net.pb.h" + namespace lczero { -enum ActivationFunction { NONE, RELU, TANH, SIGMOID, SELU, MISH, SWISH }; +enum ActivationFunction { + NONE = pblczero::NetworkFormat::ActivationFunction::NONE, + RELU = pblczero::NetworkFormat::ActivationFunction::RELU, + TANH = pblczero::NetworkFormat::ActivationFunction::TANH, + SIGMOID = pblczero::NetworkFormat::ActivationFunction::SIGMOID, + SELU = pblczero::NetworkFormat::ActivationFunction::SELU, + MISH = pblczero::NetworkFormat::ActivationFunction::MISH, + SWISH = pblczero::NetworkFormat::ActivationFunction::SWISH, + RELU_2 = pblczero::NetworkFormat::ActivationFunction::RELU_2, + SOFTMAX = pblczero::NetworkFormat::ActivationFunction::SOFTMAX +}; // Softmax activation void SoftmaxActivation(const size_t size, const float* input, float* output); -void BiasResidual(const size_t batch_size, const size_t channels, float * data, +void BiasResidual(const size_t batch_size, const size_t channels, float* data, const float* biases, const float* eltwise, const ActivationFunction activation = RELU); -void BiasActivate(const size_t batch_size, const size_t channels, float * data, +void BiasActivate(const size_t batch_size, const size_t channels, float* data, const float* biases, const ActivationFunction activation = RELU); From 155c54a02338fb96a13312427b064dd9349d5bb5 Mon Sep 17 00:00:00 2001 From: borg323 Date: Wed, 8 Feb 2023 01:46:50 +0200 Subject: [PATCH 24/46] ispc alternatives for attention policy parts --- meson.build | 1 + src/neural/blas/encoder.h | 18 +++++++-- src/neural/blas/layer_norm.ispc | 63 +++++++++++++++++++++++++++++++ src/neural/shared/activation.cc | 9 +++++ src/neural/shared/activation.ispc | 20 +++++++++- 5 files changed, 106 insertions(+), 5 deletions(-) create mode 100644 src/neural/blas/layer_norm.ispc diff --git a/meson.build b/meson.build index 54f9210a94..56d391c432 100644 --- 
a/meson.build +++ b/meson.build @@ -376,6 +376,7 @@ if get_option('build_backends') if get_option('ispc') and ispc.found() files += iscp_gen.process('src/neural/blas/winograd_transform.ispc') + files += iscp_gen.process('src/neural/blas/layer_norm.ispc') files += iscp_gen.process('src/neural/shared/activation.ispc') add_project_arguments('-DUSE_ISPC', language : 'cpp') endif diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 99d2752752..5314525507 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2018-2019 The LCZero Authors + Copyright (C) 2022-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -25,6 +25,10 @@ #include "neural/shared/activation.h" #include "utils/exception.h" +#ifdef USE_ISPC +#include "layer_norm_ispc.h" +#endif + namespace lczero { namespace { @@ -44,6 +48,7 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, const float* skip, const float* gammas, const float* betas, float epsilon) { for (size_t i = 0; i < batch_size; i++) { +#ifndef USE_ISPC // Mean taken in dimension C. float mean = 0; for (size_t c = 0; c < channels; ++c) { @@ -61,11 +66,16 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, var /= channels; // Norm. + float den = 1.0f / std::sqrt(var + epsilon); for (size_t c = 0; c < channels; ++c) { - data[i * channels + c] = betas[c] + gammas[c] * - (data[i * channels + c] - mean) / - std::sqrt(var + epsilon); + data[i * channels + c] = + betas[c] + gammas[c] * (data[i * channels + c] - mean) * den; } +#else + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + skip + i * channels, gammas, betas, + epsilon); +#endif } } diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc new file mode 100644 index 0000000000..7995c9b0e2 --- /dev/null +++ b/src/neural/blas/layer_norm.ispc @@ -0,0 +1,63 @@ +/* + This file is part of Leela Chess Zero. + Copyright (C) 2023 The LCZero Authors + + Leela Chess is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Leela Chess is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with Leela Chess. If not, see . + */ + +export void LayerNorm2DWithSkipConnection(uniform const size_t channels, + uniform float data[], + const uniform float skip[], + const uniform float gammas[], + const uniform float betas[], + uniform const float epsilon) { +#if 0 + // Faster but potentially less stable version for future testing. + // One pass mean and variance taken in dimension C. Uses shifted variance calculation. + float imean = 0; + float ivar = 0; + float k = data[0] + skip[0]; + foreach (c = 0 ... channels) { + float t = data[c] + skip[c]; + data[c] = t; + t -= k; + imean += t; + ivar += t * t; + } + float mean = reduce_add(imean) / channels; + float var = (reduce_add(ivar) - channels * mean * mean) / channels; + mean += k; +#else + // Mean taken in dimension C. + float imean = 0; + foreach (c = 0 ... 
channels) { + data[c] += skip[c]; + imean += data[c]; + } + float mean = reduce_add(imean) / channels; + + // Variance. + float ivar = 0; + foreach (c = 0 ... channels) { + float diff = data[c] - mean; + ivar += diff * diff; + } + float var = reduce_add(ivar) / channels; +#endif + + float den = rsqrt(var + epsilon); + foreach (c = 0 ... channels) { + data[c] = betas[c] + gammas[c] * (data[c] - mean) * den; + } +} diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 5ba2e17a40..1a2dd34ead 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -110,6 +110,15 @@ void Activate(const size_t len, const float* data, const float* bias, } #else ispc::ActivateMish(len, 1.0f, data, bias, 0.0f, output); +#endif + } else if (activation == SELU) { +#ifndef USE_ISPC + for (size_t b = 0; b < len; b++) { + float val = data[b] + bias[b]; + output[b] = selu(val); + } +#else + ispc::ActivateSelu(len, 1.0f, data, bias, 0.0f, output); #endif } else { for (size_t b = 0; b < len; b++) { diff --git a/src/neural/shared/activation.ispc b/src/neural/shared/activation.ispc index 27cc36d116..ef209e512d 100644 --- a/src/neural/shared/activation.ispc +++ b/src/neural/shared/activation.ispc @@ -1,6 +1,6 @@ /* This file is part of Leela Chess Zero. - Copyright (C) 2022 The LCZero Authors + Copyright (C) 2022-2023 The LCZero Authors Leela Chess is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -35,3 +35,21 @@ export void ActivateMish(uniform const size_t len, uniform float gamma, output[b] = mish(val); } } + +static inline float selu(float val) { + float alpha = 1.67326324f, scale = 1.05070098f; + if (val > 0) { + return scale * val; + } else { + return scale * alpha * (exp(val) - 1.0f); + } +} + +export void ActivateSelu(uniform const size_t len, uniform float gamma, + const uniform float data[], const uniform float bias[], + uniform float beta, uniform float output[]) { + foreach (b = 0 ... 
len) { + float val = gamma * data[b] + bias[b] + beta; + output[b] = selu(val); + } +} From cab4c0415e280133ca6fe88de821f8ce807a3c3f Mon Sep 17 00:00:00 2001 From: borg323 Date: Thu, 9 Feb 2023 22:47:34 +0200 Subject: [PATCH 25/46] remove mha transpose --- src/neural/blas/encoder.h | 55 ------------------ src/neural/blas/network_blas.cc | 100 +++++++++++++++++++------------- 2 files changed, 59 insertions(+), 96 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 5314525507..567db1be40 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -18,12 +18,9 @@ #pragma once -#include #include -#include #include "neural/shared/activation.h" -#include "utils/exception.h" #ifdef USE_ISPC #include "layer_norm_ispc.h" @@ -31,18 +28,6 @@ namespace lczero { -namespace { - -template -using EigenMatrixMap = - Eigen::Map>; - -template -using ConstEigenMatrixMap = - Eigen::Map>; - -} // namespace - void LayerNorm2DWithSkipConnection(const size_t batch_size, const size_t channels, float* data, const float* skip, const float* gammas, @@ -79,44 +64,4 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, } } -template -void AttentionMatmul2D(const bool transpose_a, const bool transpose_b, - const size_t batch_size, const size_t M, const size_t N, - const size_t K, const float scaling, const float* input1, - const float* input2, float* output) { - for (auto batch = size_t{0}; batch < batch_size; batch++) { - const float* A = &input1[batch * M * K]; - const float* B = &input2[batch * N * K]; - float* C = &output[batch * M * N]; - if (use_eigen) { - auto C_mat = EigenMatrixMap(C, N, M); - - if (transpose_a && transpose_b) { - C_mat.noalias() = scaling * - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, M, K).transpose(); - } else if (transpose_a) { - C_mat.noalias() = scaling * ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, M, K).transpose(); - } else if (transpose_b) { - C_mat.noalias() = scaling * - ConstEigenMatrixMap(B, K, N).transpose() * - ConstEigenMatrixMap(A, K, M); - } else { - C_mat.noalias() = scaling * ConstEigenMatrixMap(B, N, K) * - ConstEigenMatrixMap(A, K, M); - } - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, transpose_a ? CblasTrans : CblasNoTrans, - transpose_b ? CblasTrans : CblasNoTrans, M, N, K, scaling, A, - transpose_a ? M : K, B, transpose_b ? K : N, 0.0f, C, N); -#else - // Should never get here. 
- throw Exception("Blas backend internal error"); -#endif - } - } -} - } // namespace lczero diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 3edc1f15b0..72117410fa 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -177,6 +177,14 @@ using EigenMatrixMap = template using ConstEigenMatrixMap = Eigen::Map>; +template +using EigenStridedMatrixMap = + Eigen::Map, 0, + Eigen::OuterStride<>>; +template +using ConstEigenStridedMatrixMap = + Eigen::Map, 0, + Eigen::OuterStride<>>; template void BlasComputation::ComputeBlocking() { @@ -327,9 +335,8 @@ void BlasComputation::ComputeBlocking() { if (weights_.pol_encoder.size() > 0) { std::vector head_buffer4(largest_batch_size * max_channel_size * kSquares); - std::vector temp_buffer1(policy_d_model * kSquares); - std::vector temp_buffer2(policy_d_model * kSquares); - std::vector temp_buffer3(policy_d_model * kSquares); + std::vector temp_buffer(weights_.pol_encoder_head_count * + kSquares * kSquares); for (auto layer : weights_.pol_encoder) { // Q @@ -358,51 +365,62 @@ void BlasComputation::ComputeBlocking() { for (auto batch = size_t{0}; batch < batch_size; batch++) { auto batchStart = batch * kSquares * d_model; - // Reshape and transpose for each head. - const float* Q = temp_buffer1.data(); - const float* K = temp_buffer2.data(); - const float* V = temp_buffer3.data(); - - for (int head = 0; head < heads; head++) { - for (int j = 0; j < kSquares; j++) { - auto channelStart = batchStart + j * d_model + head * depth; - auto transposeStart = head * kSquares * depth + j * depth; - std::copy(head_buffer2.begin() + channelStart, - head_buffer2.begin() + channelStart + depth, - temp_buffer1.begin() + transposeStart); - std::copy(head_buffer3.begin() + channelStart, - head_buffer3.begin() + channelStart + depth, - temp_buffer2.begin() + transposeStart); - std::copy(head_buffer4.begin() + channelStart, - head_buffer4.begin() + channelStart + depth, - temp_buffer3.begin() + transposeStart); - } - } + const float* Q = &head_buffer2[batchStart]; + const float* K = &head_buffer3[batchStart]; + const float* V = &head_buffer4[batchStart]; // matmul(Q, K) for all heads per batch. - float* QK = &head_buffer2[batchStart]; - AttentionMatmul2D(false, true, heads, kSquares, kSquares, - depth, scaling, Q, K, QK); - + float* QK = temp_buffer.data(); + for (auto h = 0; h < heads; h++) { + const float* A = &Q[h * depth]; + const float* B = &K[h * depth]; + float* C = &QK[h * kSquares * kSquares]; + if (use_eigen) { + auto C_mat = EigenMatrixMap(C, kSquares, kSquares); + C_mat.noalias() = + scaling * + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) + .transpose() * + ConstEigenStridedMatrixMap( + A, depth, kSquares, + Eigen::OuterStride<>(heads * depth)); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, + kSquares, depth, scaling, A, heads * depth, B, + heads * depth, 0.0f, C, kSquares); +#else + // Should never get here. + throw Exception("Blas backend internal error"); +#endif + } + } // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { SoftmaxActivation(kSquares, QK + h, QK + h); } // matmul(softmax(QK), V) for all heads per batch. - float* attn = &head_buffer3[batchStart]; - AttentionMatmul2D(false, false, heads, kSquares, depth, - kSquares, 1.0, QK, V, attn); - - // Transpose back into N x 64 x H x D. 
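The reshape/copy loops being deleted here existed only to make each head contiguous. The strided Eigen maps above, and the heads * depth leading dimensions handed to cblas_sgemm, let the per-head GEMMs address the interleaved [64 x heads*depth] layout in place. A tiny self-contained illustration of the equivalence (shapes made up for the example, not taken from the patch):

    #include <Eigen/Dense>
    #include <vector>

    int main() {
      const int rows = 64, heads = 4, depth = 8, h = 2;
      std::vector<float> buf(rows * heads * depth);
      for (size_t i = 0; i < buf.size(); i++) buf[i] = static_cast<float>(i);

      using Mat = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>;
      // depth x rows view of head h: one column per square, columns spaced
      // heads * depth floats apart in the interleaved buffer.
      Eigen::Map<const Mat, 0, Eigen::OuterStride<>> view(
          buf.data() + h * depth, depth, rows,
          Eigen::OuterStride<>(heads * depth));

      // Element (d, r) of the view is buf[r * heads * depth + h * depth + d],
      // exactly what the deleted per-head copy used to gather.
      return view(3, 5) == buf[5 * heads * depth + h * depth + 3] ? 0 : 1;
    }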
- for (int j = 0; j < kSquares; j++) { - for (int head = 0; head < heads; head++) { - auto transposeStart = - batchStart + head * kSquares * depth + j * depth; - std::copy(head_buffer3.begin() + transposeStart, - head_buffer3.begin() + transposeStart + depth, - head_buffer2.begin() + batchStart + j * d_model + - head * depth); + float* attn = &head_buffer2[batchStart]; + for (auto h = 0; h < heads; h++) { + const float* A = &QK[h * kSquares * kSquares]; + const float* B = &V[h * depth]; + float* C = &attn[h * depth]; + if (use_eigen) { + auto C_mat = EigenStridedMatrixMap( + C, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + C_mat.noalias() = + ConstEigenStridedMatrixMap( + B, depth, kSquares, + Eigen::OuterStride<>(heads * depth)) * + ConstEigenMatrixMap(A, kSquares, kSquares); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, kSquares, + depth, kSquares, 1.0f, A, kSquares, B, + heads * depth, 0.0f, C, heads * depth); +#endif } } } @@ -651,7 +669,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, : capabilities_{file.format().network_format().input(), file.format().network_format().moves_left()}, weights_(file.weights()) { - Numa::Init(); + Numa::Init(); max_batch_size_ = static_cast(options.GetOrDefault("batch_size", 256)); From ce1460846d9ff155cfdfb40c5cfb889adbc79a75 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 10 Feb 2023 01:19:28 +0200 Subject: [PATCH 26/46] ispc softmax --- src/neural/blas/network_blas.cc | 8 ++++++++ src/neural/shared/activation.ispc | 22 ++++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 72117410fa..9a92682b3b 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -41,6 +41,10 @@ #include #endif +#ifdef USE_ISPC +#include "activation_ispc.h" +#endif + namespace lczero { namespace { @@ -398,7 +402,11 @@ void BlasComputation::ComputeBlocking() { } // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { +#ifdef USE_ISPC SoftmaxActivation(kSquares, QK + h, QK + h); +#else + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); +#endif } // matmul(softmax(QK), V) for all heads per batch. diff --git a/src/neural/shared/activation.ispc b/src/neural/shared/activation.ispc index ef209e512d..6190198515 100644 --- a/src/neural/shared/activation.ispc +++ b/src/neural/shared/activation.ispc @@ -53,3 +53,25 @@ export void ActivateSelu(uniform const size_t len, uniform float gamma, output[b] = selu(val); } } + +export void SoftmaxActivation(uniform const size_t size, + const uniform float input[], + uniform float output[]) { + float vmax = -3.4e38f; + foreach (c = 0 ... size) { + if (input[c] > vmax) vmax = input[c]; + } + uniform float alpha = reduce_max(vmax); + + float t = 0.0f; + foreach (c = 0 ... size) { + float val = exp(input[c] - alpha); + output[c] = val; + t += val; + } + uniform float denom = 1.0f / reduce_add(t); + + foreach (c = 0 ... 
size) { + output[c] *= denom; + } +} From bd8aafed4a9f882e08e0484f6491e7301aa68996 Mon Sep 17 00:00:00 2001 From: borg323 Date: Fri, 10 Feb 2023 23:40:53 +0200 Subject: [PATCH 27/46] important typo fix --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9a92682b3b..a5e6976ee4 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -402,7 +402,7 @@ void BlasComputation::ComputeBlocking() { } // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { -#ifdef USE_ISPC +#ifndef USE_ISPC SoftmaxActivation(kSquares, QK + h, QK + h); #else ispc::SoftmaxActivation(kSquares, QK + h, QK + h); From 8d67b1f7109534f431192916e8898ead02498890 Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 11 Feb 2023 02:24:34 +0200 Subject: [PATCH 28/46] refactor blas encoder --- src/neural/blas/network_blas.cc | 277 +++++++++++++++++--------------- 1 file changed, 150 insertions(+), 127 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index a5e6976ee4..9a3e51ce7b 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -102,6 +102,12 @@ class BlasComputation : public NetworkComputation { private: void EncodePlanes(const InputPlanes& sample, float* buffer); + void MakeEncoderLayer(std::vector& head_buffer, + std::vector& head_buffer2, + std::vector& head_buffer3, size_t batch_size, + const LegacyWeights::EncoderLayer& layer, + int embedding_size, int heads, + ActivationFunction activation, float alpha = 1.0f); static constexpr auto kWidth = 8; static constexpr auto kHeight = 8; @@ -190,6 +196,146 @@ using ConstEigenStridedMatrixMap = Eigen::Map, 0, Eigen::OuterStride<>>; +template +void BlasComputation::MakeEncoderLayer( + std::vector& head_buffer, std::vector& head_buffer2, + std::vector& head_buffer3, size_t batch_size, + const LegacyWeights::EncoderLayer& layer, int embedding_size, int heads, + ActivationFunction activation, float alpha) { + const int d_model = layer.mha.q_b.size(); + static std::vector head_buffer4; + head_buffer4.clear(); + head_buffer4.resize(batch_size * d_model * kSquares); + static std::vector temp_buffer; + temp_buffer.clear(); + temp_buffer.resize(heads * kSquares * kSquares); + // Q + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.q_w.data(), layer.mha.q_b.data(), NONE, head_buffer2.data()); + // K + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.k_w.data(), layer.mha.k_b.data(), NONE, head_buffer3.data()); + // V + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.v_w.data(), layer.mha.v_b.data(), NONE, head_buffer4.data()); + + // MHA (Q, K, V) + const int depth = d_model / heads; + const float scaling = 1.0f / sqrtf(depth); + + // MHA is done per batch since there's a fourth dimension introduced. + for (auto batch = size_t{0}; batch < batch_size; batch++) { + auto batchStart = batch * kSquares * d_model; + + const float* Q = &head_buffer2[batchStart]; + const float* K = &head_buffer3[batchStart]; + const float* V = &head_buffer4[batchStart]; + + // matmul(Q, K) for all heads per batch. 
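+ // Each iteration computes one head's 64 x 64 attention-logit matrix. A
+ // and B point at head h's depth-wide column block inside the shared Q
+ // and K buffers; the heads * depth outer stride (leading dimension in
+ // the BLAS call) steps from one square to the next without any copies.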
+ float* QK = temp_buffer.data(); + for (auto h = 0; h < heads; h++) { + const float* A = &Q[h * depth]; + const float* B = &K[h * depth]; + float* C = &QK[h * kSquares * kSquares]; + if (use_eigen) { + auto C_mat = EigenMatrixMap(C, kSquares, kSquares); + C_mat.noalias() = + scaling * + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) + .transpose() * + ConstEigenStridedMatrixMap( + A, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, kSquares, + depth, scaling, A, heads * depth, B, heads * depth, 0.0f, C, + kSquares); +#else + // Should never get here. + throw Exception("Blas backend internal error"); +#endif + } + } + // Apply Softmax. + for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { +#ifndef USE_ISPC + SoftmaxActivation(kSquares, QK + h, QK + h); +#else + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); +#endif + } + + // matmul(softmax(QK), V) for all heads per batch. + float* attn = &head_buffer2[batchStart]; + for (auto h = 0; h < heads; h++) { + const float* A = &QK[h * kSquares * kSquares]; + const float* B = &V[h * depth]; + float* C = &attn[h * depth]; + if (use_eigen) { + auto C_mat = EigenStridedMatrixMap( + C, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + C_mat.noalias() = + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) * + ConstEigenMatrixMap(A, kSquares, kSquares); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, kSquares, depth, + kSquares, 1.0f, A, kSquares, B, heads * depth, 0.0f, C, + heads * depth); +#endif + } + } + } + + // Fully connected final MHA layer. + FullyConnectedLayer::Forward1D( + batch_size * kSquares, d_model, embedding_size, head_buffer2.data(), + layer.mha.dense_w.data(), layer.mha.dense_b.data(), NONE, + head_buffer3.data()); + + if (alpha != 1.0f) { + for (size_t i = 0; i < batch_size * kSquares * embedding_size; i++) { + head_buffer[i] *= alpha; + } + } + + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, + head_buffer.data(), head_buffer3.data(), + layer.ln1_gammas.data(), layer.ln1_betas.data(), + 1e-6); + + // FFN. + const size_t dff_size = layer.ffn.dense1_b.size(); + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, dff_size, head_buffer.data(), + layer.ffn.dense1_w.data(), layer.ffn.dense1_b.data(), activation, + head_buffer2.data()); + + FullyConnectedLayer::Forward1D( + batch_size * kSquares, dff_size, layer.ffn.dense2_b.size(), + head_buffer2.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), + NONE, head_buffer3.data()); + + if (alpha != 1.0f) { + for (size_t i = 0; i < batch_size * kSquares * layer.ffn.dense2_b.size(); + i++) { + head_buffer[i] *= alpha; + } + } + + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, + head_buffer.data(), head_buffer3.data(), + layer.ln2_gammas.data(), layer.ln2_betas.data(), + 1e-6); +} + template void BlasComputation::ComputeBlocking() { // Retrieve network key dimensions from the weights structure. 
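Taken together, MakeEncoderLayer is one post-layer-norm transformer block, shared by the policy encoders (called with alpha == 1) and, from the attention-body patch further below, by the body encoders, which pass the DeepNorm-style alpha == (2 * num_encoders)^0.25 to scale the skip input. A rough shape-level summary of the code above (pseudocode, not from the source), with x holding the [batch * 64, embedding] activations:

    // q = x Wq + bq;  k = x Wk + bk;  v = x Wv + bv      (d_model channels)
    // att = concat_h(softmax(q_h k_h' / sqrt(depth)) v_h) Wo + bo
    // x   = LayerNorm(alpha * x + att)                   (ln1 + skip)
    // ffn = act(x W1 + b1) W2 + b2                       (dff hidden units)
    // x   = LayerNorm(alpha * x + ffn)                   (ln2 + skip)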
@@ -336,133 +482,10 @@ void BlasComputation::ComputeBlocking() { std::vector head_buffer3(largest_batch_size * max_channel_size * kSquares); - if (weights_.pol_encoder.size() > 0) { - std::vector head_buffer4(largest_batch_size * max_channel_size * - kSquares); - std::vector temp_buffer(weights_.pol_encoder_head_count * - kSquares * kSquares); - - for (auto layer : weights_.pol_encoder) { - // Q - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, layer.mha.q_b.size(), - head_buffer.data(), layer.mha.q_w.data(), layer.mha.q_b.data(), - NONE, head_buffer2.data()); - // K - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, layer.mha.k_b.size(), - head_buffer.data(), layer.mha.k_w.data(), layer.mha.k_b.data(), - NONE, head_buffer3.data()); - // V - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, layer.mha.v_b.size(), - head_buffer.data(), layer.mha.v_w.data(), layer.mha.v_b.data(), - NONE, head_buffer4.data()); - - // MHA (Q, K, V) - const int d_model = layer.mha.q_b.size(); - const int heads = weights_.pol_encoder_head_count; - const int depth = d_model / heads; - const float scaling = 1.0f / sqrtf(depth); - - // MHA is done per batch since there's a fourth dimension introduced. - for (auto batch = size_t{0}; batch < batch_size; batch++) { - auto batchStart = batch * kSquares * d_model; - - const float* Q = &head_buffer2[batchStart]; - const float* K = &head_buffer3[batchStart]; - const float* V = &head_buffer4[batchStart]; - - // matmul(Q, K) for all heads per batch. - float* QK = temp_buffer.data(); - for (auto h = 0; h < heads; h++) { - const float* A = &Q[h * depth]; - const float* B = &K[h * depth]; - float* C = &QK[h * kSquares * kSquares]; - if (use_eigen) { - auto C_mat = EigenMatrixMap(C, kSquares, kSquares); - C_mat.noalias() = - scaling * - ConstEigenStridedMatrixMap( - B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) - .transpose() * - ConstEigenStridedMatrixMap( - A, depth, kSquares, - Eigen::OuterStride<>(heads * depth)); - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, - kSquares, depth, scaling, A, heads * depth, B, - heads * depth, 0.0f, C, kSquares); -#else - // Should never get here. - throw Exception("Blas backend internal error"); -#endif - } - } - // Apply Softmax. - for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { -#ifndef USE_ISPC - SoftmaxActivation(kSquares, QK + h, QK + h); -#else - ispc::SoftmaxActivation(kSquares, QK + h, QK + h); -#endif - } - - // matmul(softmax(QK), V) for all heads per batch. - float* attn = &head_buffer2[batchStart]; - for (auto h = 0; h < heads; h++) { - const float* A = &QK[h * kSquares * kSquares]; - const float* B = &V[h * depth]; - float* C = &attn[h * depth]; - if (use_eigen) { - auto C_mat = EigenStridedMatrixMap( - C, depth, kSquares, Eigen::OuterStride<>(heads * depth)); - C_mat.noalias() = - ConstEigenStridedMatrixMap( - B, depth, kSquares, - Eigen::OuterStride<>(heads * depth)) * - ConstEigenMatrixMap(A, kSquares, kSquares); - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, kSquares, - depth, kSquares, 1.0f, A, kSquares, B, - heads * depth, 0.0f, C, heads * depth); -#endif - } - } - } - - // Fully connected final MHA layer. - FullyConnectedLayer::Forward1D( - batch_size * kSquares, d_model, embedding_size, - head_buffer2.data(), layer.mha.dense_w.data(), - layer.mha.dense_b.data(), NONE, head_buffer3.data()); - - // Layer Norm + skip connection. 
- LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln1_gammas.data(), - layer.ln1_betas.data(), 1e-6); - - // FFN. - const size_t dff_size = layer.ffn.dense1_b.size(); - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, dff_size, - head_buffer.data(), layer.ffn.dense1_w.data(), - layer.ffn.dense1_b.data(), SELU, head_buffer2.data()); - - FullyConnectedLayer::Forward1D( - batch_size * kSquares, dff_size, layer.ffn.dense2_b.size(), - head_buffer2.data(), layer.ffn.dense2_w.data(), - layer.ffn.dense2_b.data(), NONE, head_buffer3.data()); - - // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln2_gammas.data(), - layer.ln2_betas.data(), 1e-6); - } + for (auto layer : weights_.pol_encoder) { + MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, + layer, embedding_size, weights_.pol_encoder_head_count, + SELU); } // Q From a96c4c048c6dbf7bb03a1f2f6212d028e011f9cb Mon Sep 17 00:00:00 2001 From: borg323 Date: Sat, 11 Feb 2023 02:32:03 +0200 Subject: [PATCH 29/46] blas performance fix --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9a3e51ce7b..48ea687ce7 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -482,7 +482,7 @@ void BlasComputation::ComputeBlocking() { std::vector head_buffer3(largest_batch_size * max_channel_size * kSquares); - for (auto layer : weights_.pol_encoder) { + for (auto& layer : weights_.pol_encoder) { MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, layer, embedding_size, weights_.pol_encoder_head_count, SELU); From ffd6bbaab1569236472c1abdd2b06ef7b9f35f69 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 01:49:30 +0100 Subject: [PATCH 30/46] Attention body support to blas. --- src/neural/blas/encoder.h | 13 +- src/neural/blas/fully_connected_layer.cc | 4 +- src/neural/blas/layer_norm.ispc | 36 ++- src/neural/blas/network_blas.cc | 331 +++++++++++++++++------ src/neural/shared/activation.cc | 2 + 5 files changed, 294 insertions(+), 92 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 567db1be40..b9940a4b14 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -36,9 +36,16 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, #ifndef USE_ISPC // Mean taken in dimension C. 
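// (skip may now be nullptr, meaning no residual input: the smolgen layers
// added further below call this layer norm that way, so both this path and
// the matching ispc kernel grow a null check.)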
float mean = 0; - for (size_t c = 0; c < channels; ++c) { - data[i * channels + c] += skip[i * channels + c]; - mean += data[i * channels + c]; + if (skip != nullptr) { + for (size_t c = 0; c < channels; ++c) { + data[i * channels + c] += skip[i * channels + c]; + mean += data[i * channels + c]; + } + } + else { + for (size_t c = 0; c < channels; ++c) { + mean += data[i * channels + c]; + } } mean /= channels; diff --git a/src/neural/blas/fully_connected_layer.cc b/src/neural/blas/fully_connected_layer.cc index 2465779ed1..b4879d3580 100644 --- a/src/neural/blas/fully_connected_layer.cc +++ b/src/neural/blas/fully_connected_layer.cc @@ -103,7 +103,9 @@ void FullyConnectedLayer::Forward1D( outputs, // C (int)output_size); // ldc, leading rank of C } - ApplyBias(batch_size, output_size, biases, activation, outputs); + if (biases != nullptr) { + ApplyBias(batch_size, output_size, biases, activation, outputs); + } } template <> diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index 7995c9b0e2..9d7255e49c 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -27,13 +27,23 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, // One pass mean and variance taken in dimension C. Uses shifted variance calculation. float imean = 0; float ivar = 0; - float k = data[0] + skip[0]; - foreach (c = 0 ... channels) { - float t = data[c] + skip[c]; - data[c] = t; - t -= k; - imean += t; - ivar += t * t; + if (skip != nullptr) { + float k = data[0] + skip[0]; + foreach (c = 0 ... channels) { + float t = data[c] + skip[c]; + data[c] = t; + t -= k; + imean += t; + ivar += t * t; + } + } else { + float k = data[0]; + foreach (c = 0 ... channels) { + float t = data[c]; + t -= k; + imean += t; + ivar += t * t; + } } float mean = reduce_add(imean) / channels; float var = (reduce_add(ivar) - channels * mean * mean) / channels; @@ -41,9 +51,15 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, #else // Mean taken in dimension C. float imean = 0; - foreach (c = 0 ... channels) { - data[c] += skip[c]; - imean += data[c]; + if (skip != nullptr) { + foreach (c = 0 ... channels) { + data[c] += skip[c]; + imean += data[c]; + } + } else { + foreach (c = 0 ... 
channels) { + imean += data[c]; + } } float mean = reduce_add(imean) / channels; diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 48ea687ce7..2cf050bc62 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -54,7 +54,10 @@ class BlasComputation : public NetworkComputation { BlasComputation(const LegacyWeights& weights, const size_t max_batch_size, const bool wdl, const bool moves_left, const bool conv_policy, const ActivationFunction default_activation, - const bool attn_policy); + const ActivationFunction smolgen_activation, + const ActivationFunction ffn_activation, + const bool attn_policy, + const bool attn_body); virtual ~BlasComputation() {} @@ -107,7 +110,9 @@ class BlasComputation : public NetworkComputation { std::vector& head_buffer3, size_t batch_size, const LegacyWeights::EncoderLayer& layer, int embedding_size, int heads, - ActivationFunction activation, float alpha = 1.0f); + ActivationFunction smolgen_activation = SWISH, + ActivationFunction ffn_activation = RELU_2, + float alpha = 1.0f); static constexpr auto kWidth = 8; static constexpr auto kHeight = 8; @@ -127,7 +132,10 @@ class BlasComputation : public NetworkComputation { bool moves_left_; bool conv_policy_; ActivationFunction default_activation_; + ActivationFunction smolgen_activation_; + ActivationFunction ffn_activation_; bool attn_policy_; + bool attn_body_; }; template @@ -139,7 +147,7 @@ class BlasNetwork : public Network { std::unique_ptr NewComputation() override { return std::make_unique>( weights_, max_batch_size_, wdl_, moves_left_, conv_policy_, - default_activation_, attn_policy_); + default_activation_, smolgen_activation_, ffn_activation_, attn_policy_, attn_body_); } const NetworkCapabilities& GetCapabilities() const override { @@ -159,14 +167,21 @@ class BlasNetwork : public Network { bool moves_left_; bool conv_policy_; ActivationFunction default_activation_; + ActivationFunction smolgen_activation_; + ActivationFunction ffn_activation_; bool attn_policy_; + bool attn_body_; }; template BlasComputation::BlasComputation( const LegacyWeights& weights, const size_t max_batch_size, const bool wdl, const bool moves_left, const bool conv_policy, - const ActivationFunction default_activation, const bool attn_policy) + const ActivationFunction default_activation, + const ActivationFunction smolgen_activation, + const ActivationFunction ffn_activation, + const bool attn_policy, + const bool attn_body) : weights_(weights), max_batch_size_(max_batch_size), policies_(0), @@ -175,7 +190,10 @@ BlasComputation::BlasComputation( moves_left_(moves_left), conv_policy_(conv_policy), default_activation_(default_activation), - attn_policy_(attn_policy) { + smolgen_activation_(smolgen_activation), + ffn_activation_(ffn_activation), + attn_policy_(attn_policy), + attn_body_(attn_body) { #ifdef USE_DNNL omp_set_num_threads(1); #endif @@ -201,11 +219,15 @@ void BlasComputation::MakeEncoderLayer( std::vector& head_buffer, std::vector& head_buffer2, std::vector& head_buffer3, size_t batch_size, const LegacyWeights::EncoderLayer& layer, int embedding_size, int heads, - ActivationFunction activation, float alpha) { + ActivationFunction smolgen_activation, ActivationFunction ffn_activation, + float alpha) { const int d_model = layer.mha.q_b.size(); + const int dff_size = layer.ffn.dense1_b.size(); + static std::vector head_buffer4; head_buffer4.clear(); - head_buffer4.resize(batch_size * d_model * kSquares); + head_buffer4.resize(batch_size * std::max(d_model, 
dff_size) * kSquares); + static std::vector temp_buffer; temp_buffer.clear(); temp_buffer.resize(heads * kSquares * kSquares); @@ -260,6 +282,57 @@ void BlasComputation::MakeEncoderLayer( #endif } } + + // Smolgen. + if (layer.mha.has_smolgen) { + float* input = &head_buffer[batchStart]; + float* temp1 = &head_buffer2[batchStart]; + float* temp2 = &head_buffer3[batchStart]; + + // Compress. + const auto hidden_channels = layer.mha.smolgen.compress.size() / d_model; + FullyConnectedLayer::Forward1D( + kSquares, d_model, hidden_channels, + input, layer.mha.smolgen.compress.data(), + (const float*)nullptr, NONE, temp1); + + // Dense 1. + const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); + FullyConnectedLayer::Forward1D( + 1, kSquares * hidden_channels, hidden_sz, + temp1, layer.mha.smolgen.dense1_w.data(), + layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, hidden_sz, + temp2, (const float*)nullptr, + layer.mha.smolgen.ln1_gammas.data(), + layer.mha.smolgen.ln1_betas.data(), + 1e-6); + + // Dense 2. + const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); + FullyConnectedLayer::Forward1D( + 1, hidden_sz, gen_sz_outputs, temp2, + layer.mha.smolgen.dense2_w.data(), + layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp1); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, + temp1, (const float*)nullptr, + layer.mha.smolgen.ln2_gammas.data(), + layer.mha.smolgen.ln2_betas.data(), + 1e-6); + + // Global smolgen weights (use bias to add already calculated attention weights). + FullyConnectedLayer::Forward1D( + heads, gen_sz_outputs / heads, 64 * 64, temp1, + weights_.smolgen_w.data(), (const float*)nullptr, NONE, temp2); + + // Add smolgen weights to QK. + for (auto i = 0; i < heads * kSquares * kSquares; i++) { + QK[i] += temp2[i]; + } + } + // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { #ifndef USE_ISPC @@ -311,15 +384,14 @@ void BlasComputation::MakeEncoderLayer( 1e-6); // FFN. - const size_t dff_size = layer.ffn.dense1_b.size(); FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, dff_size, head_buffer.data(), - layer.ffn.dense1_w.data(), layer.ffn.dense1_b.data(), activation, - head_buffer2.data()); + layer.ffn.dense1_w.data(), layer.ffn.dense1_b.data(), ffn_activation, + head_buffer4.data()); FullyConnectedLayer::Forward1D( batch_size * kSquares, dff_size, layer.ffn.dense2_b.size(), - head_buffer2.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), + head_buffer4.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), NONE, head_buffer3.data()); if (alpha != 1.0f) { @@ -341,17 +413,21 @@ void BlasComputation::ComputeBlocking() { // Retrieve network key dimensions from the weights structure. const auto num_value_channels = weights_.ip1_val_b.size(); const auto num_moves_channels = weights_.ip1_mov_b.size(); - const auto num_value_input_planes = weights_.value.biases.size(); + const auto num_value_input_planes = attn_body_ ? weights_.ip_val_b.size() + : weights_.value.biases.size(); const auto num_policy_input_planes = weights_.policy.biases.size(); - const auto num_moves_input_planes = weights_.moves_left.biases.size(); + const auto num_moves_input_planes = attn_body_ ? 
weights_.ip_mov_b.size() + : weights_.moves_left.biases.size(); const auto num_output_policy = static_cast(kPolicyOutputs); - const auto output_channels = weights_.input.biases.size(); + const auto output_channels = attn_body_ ? weights_.ip_emb_b.size() + : weights_.input.biases.size(); + const auto num_res_blocks = weights_.residual.size(); // max_channels is the maximum number of input channels of any // convolution. // Residual blocks are identical, but the first convolution might be bigger // when the network has very few filters - const auto input_channels = static_cast(kInputPlanes); + const auto input_channels = static_cast(kInputPlanes + (attn_body_ ? kNumPosEncodingChannels : 0)); const auto max_channels = std::max(output_channels, input_channels); // The policy head may increase convolution max output size. @@ -361,8 +437,8 @@ void BlasComputation::ComputeBlocking() { : output_channels; // Determine the largest batch for allocations. - const auto plane_count = planes_.size(); - const auto largest_batch_size = std::min(max_batch_size_, plane_count); + const auto total_batches = planes_.size(); + const auto largest_batch_size = std::min(max_batch_size_, total_batches); /* Typically input_channels = 112 @@ -380,10 +456,8 @@ void BlasComputation::ComputeBlocking() { std::vector output_fc(largest_batch_size * max_fc_channels); std::vector res_buffer1(largest_batch_size * max_channels * kSquares); - std::vector res_buffer2(largest_batch_size * output_channels * - kSquares); - std::vector res_buffer3(largest_batch_size * output_channels * - kSquares); + std::vector res_buffer2(largest_batch_size * max_channels * kSquares); + std::vector res_buffer3(largest_batch_size * max_channels * kSquares); WinogradConvolution3 convolve3(largest_batch_size, max_channels, max_output_channels); @@ -403,73 +477,136 @@ void BlasComputation::ComputeBlocking() { float* conv_out = res_buffer2.data(); float* res = res_buffer3.data(); - for (size_t i = 0; i < plane_count; i += largest_batch_size) { - const auto batch_size = std::min(plane_count - i, largest_batch_size); + for (size_t i = 0; i < total_batches; i += largest_batch_size) { + const auto batch_size = std::min(total_batches - i, largest_batch_size); for (size_t j = 0; j < batch_size; j++) { EncodePlanes(planes_[i + j], &conv_in[j * kSquares * kInputPlanes]); } - // Input convolution + if (num_res_blocks > 0) { + // Input convolution - convolve3.Forward(batch_size, kInputPlanes, output_channels, conv_in, - weights_.input.weights.data(), conv_out); + convolve3.Forward(batch_size, kInputPlanes, output_channels, conv_in, + weights_.input.weights.data(), conv_out); - BiasActivate(batch_size, output_channels, conv_out, - weights_.input.biases.data(), default_activation_); + BiasActivate(batch_size, output_channels, conv_out, + weights_.input.biases.data(), default_activation_); - // Residual tower + // Residual tower - for (auto& residual : weights_.residual) { - const auto& conv1 = residual.conv1; - const auto& conv2 = residual.conv2; - const auto& se = residual.se; + for (auto& residual : weights_.residual) { + const auto& conv1 = residual.conv1; + const auto& conv2 = residual.conv2; + const auto& se = residual.se; - std::swap(conv_out, conv_in); + std::swap(conv_out, conv_in); - convolve3.Forward(batch_size, output_channels, output_channels, conv_in, - conv1.weights.data(), conv_out); + convolve3.Forward(batch_size, output_channels, output_channels, conv_in, + conv1.weights.data(), conv_out); - BiasActivate(batch_size, output_channels, &conv_out[0], - 
conv1.biases.data(), default_activation_); + BiasActivate(batch_size, output_channels, &conv_out[0], + conv1.biases.data(), default_activation_); - std::swap(conv_in, res); - std::swap(conv_out, conv_in); + std::swap(conv_in, res); + std::swap(conv_out, conv_in); - convolve3.Forward(batch_size, output_channels, output_channels, conv_in, - conv2.weights.data(), conv_out); + convolve3.Forward(batch_size, output_channels, output_channels, conv_in, + conv2.weights.data(), conv_out); - if (residual.has_se) { - // No relu if followed by SE-unit and residual/bias is added later - std::swap(conv_out, conv_in); + if (residual.has_se) { + // No relu if followed by SE-unit and residual/bias is added later + std::swap(conv_out, conv_in); - auto se_fc_outputs = se.b1.size(); - ApplySEUnit(batch_size, output_channels, se_fc_outputs, - conv_in, conv2.biases.data(), res, se.w1.data(), - se.b1.data(), se.w2.data(), se.b2.data(), - conv_out, default_activation_); - } else { - BiasResidual(batch_size, output_channels, &conv_out[0], - conv2.biases.data(), res, default_activation_); + auto se_fc_outputs = se.b1.size(); + ApplySEUnit(batch_size, output_channels, se_fc_outputs, + conv_in, conv2.biases.data(), res, se.w1.data(), + se.b1.data(), se.w2.data(), se.b2.data(), + conv_out, default_activation_); + } else { + BiasResidual(batch_size, output_channels, &conv_out[0], + conv2.biases.data(), res, default_activation_); + } } } + if (attn_body_) { + const auto embedding_size = weights_.ip_emb_b.size(); + assert(embedding_size > 0); + const auto input_size = num_res_blocks == 0 ? input_channels : weights_.input.biases.size(); + + if (num_res_blocks == 0) { + // No residual means pure transformer, so process input position encoding. + // Preprocess for attention body. + for (auto batch = size_t{0}; batch < batch_size; batch++) { + for (auto i = 0; i < kSquares; i++) { + // NCHW to NHWC conversion. + for (size_t j = 0; j < kInputPlanes; j++) { + res[batch * kSquares * input_size + i * input_size + j] = + conv_in[batch * kSquares * kInputPlanes + j * kSquares + i]; + } + // Position encoding. + for (size_t j = kInputPlanes; j < input_size; j++) { + res[batch * kSquares * input_size + i * input_size + j] = + kPosEncoding[i][j - kInputPlanes]; + } + } + } + } + + // Input embedding. + FullyConnectedLayer::Forward1D( + batch_size * kSquares, input_size, embedding_size, + res_buffer3.data(), weights_.ip_emb_w.data(), + weights_.ip_emb_b.data(), default_activation_, res_buffer1.data()); + + // Input gating + if (weights_.ip_mult_gate.size() > 0) { + int idx; + for (auto batch = size_t{0}; batch < batch_size; batch++) { + for (auto i = 0; i < kSquares; i++) { + for (auto j = 0; j < embedding_size; j++) { + idx = batch * kSquares * embedding_size + i * embedding_size + j; + res_buffer1[idx] = res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] + + weights_.ip_add_gate[j * kSquares + i]; + } + } + }; + } + + // Attention body encoders. + + float alpha = (float) pow(2.0 * weights_.encoder.size(), 0.25); + for (auto& layer : weights_.encoder) { + MakeEncoderLayer(res_buffer1, res_buffer2, res_buffer3, batch_size, + layer, embedding_size, weights_.encoder_head_count, + smolgen_activation_, ffn_activation_, alpha); + } + + // for (auto i=0; i::Forward1D( - batch_size * kSquares, output_channels, embedding_size, res, + batch_size * kSquares, output_channels, policy_embedding_size, res, weights_.ip_pol_w.data(), weights_.ip_pol_b.data(), - SELU, // SELU activation for attention head. + attn_body_ ? 
default_activation_ : SELU, // SELU activation hardcoded for apmish nets. head_buffer.data()); const size_t policy_d_model = weights_.ip2_pol_b.size(); @@ -484,18 +621,20 @@ void BlasComputation::ComputeBlocking() { for (auto& layer : weights_.pol_encoder) { MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, - layer, embedding_size, weights_.pol_encoder_head_count, - SELU); + layer, policy_embedding_size, weights_.pol_encoder_head_count, + attn_body_ ? smolgen_activation_ : NONE, + attn_body_ ? ffn_activation_ : SELU, + 1.0f); } // Q FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, policy_d_model, + batch_size * kSquares, policy_embedding_size, policy_d_model, head_buffer.data(), weights_.ip2_pol_w.data(), weights_.ip2_pol_b.data(), NONE, head_buffer2.data()); // K FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, policy_d_model, + batch_size * kSquares, policy_embedding_size, policy_d_model, head_buffer.data(), weights_.ip3_pol_w.data(), weights_.ip3_pol_b.data(), NONE, head_buffer3.data()); const float scaling = 1.0f / sqrtf(policy_d_model); @@ -564,6 +703,7 @@ void BlasComputation::ComputeBlocking() { } } } else if (conv_policy_) { + assert(!attn_body_); // not supported with attention body convolve3.Forward(batch_size, output_channels, output_channels, conv_out, weights_.policy1.weights.data(), res); @@ -589,6 +729,7 @@ void BlasComputation::ComputeBlocking() { } } else { + assert(!attn_body_); // not supported with attention body Convolution1::Forward( batch_size, output_channels, num_policy_input_planes, conv_out, weights_.policy.weights.data(), head_buffer.data()); @@ -614,12 +755,22 @@ void BlasComputation::ComputeBlocking() { } // Value head - Convolution1::Forward( - batch_size, output_channels, num_value_input_planes, conv_out, - weights_.value.weights.data(), head_buffer.data()); + if (attn_body_) { + FullyConnectedLayer::Forward1D( + batch_size * kSquares, weights_.ip_emb_b.size(), num_value_input_planes, + res, weights_.ip_val_w.data(), + weights_.ip_val_b.data(), + default_activation_, + head_buffer.data()); + } + else { + Convolution1::Forward( + batch_size, output_channels, num_value_input_planes, conv_out, + weights_.value.weights.data(), head_buffer.data()); - BiasActivate(batch_size, num_value_input_planes, &head_buffer[0], - weights_.value.biases.data(), default_activation_); + BiasActivate(batch_size, num_value_input_planes, &head_buffer[0], + weights_.value.biases.data(), default_activation_); + } FullyConnectedLayer::Forward1D( batch_size, num_value_input_planes * kSquares, num_value_channels, @@ -656,12 +807,22 @@ void BlasComputation::ComputeBlocking() { } } if (moves_left_) { - Convolution1::Forward( - batch_size, output_channels, num_moves_input_planes, conv_out, - weights_.moves_left.weights.data(), head_buffer.data()); + if (attn_body_) { + FullyConnectedLayer::Forward1D( + batch_size * kSquares, weights_.ip_emb_b.size(), num_moves_input_planes, + res, weights_.ip_mov_w.data(), + weights_.ip_mov_b.data(), + default_activation_, + head_buffer.data()); + } + else { + Convolution1::Forward( + batch_size, output_channels, num_moves_input_planes, conv_out, + weights_.moves_left.weights.data(), head_buffer.data()); - BiasActivate(batch_size, num_moves_input_planes, &head_buffer[0], - weights_.moves_left.biases.data(), default_activation_); + BiasActivate(batch_size, num_moves_input_planes, &head_buffer[0], + weights_.moves_left.biases.data(), default_activation_); + } FullyConnectedLayer::Forward1D( 
batch_size, num_moves_input_planes * kSquares, num_moves_channels, @@ -718,11 +879,23 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, attn_policy_ = file.format().network_format().policy() == pblczero::NetworkFormat::POLICY_ATTENTION; + attn_body_ = file.format().network_format().network() == + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT; + default_activation_ = file.format().network_format().default_activation() == pblczero::NetworkFormat::DEFAULT_ACTIVATION_MISH ? MISH : RELU; + if (attn_body_) { + const auto smol_act = file.format().network_format().smolgen_activation(); + smolgen_activation_ = smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT + ? default_activation_ : static_cast(smol_act); + const auto ffn_act = file.format().network_format().ffn_activation(); + ffn_activation_ = ffn_act == pblczero::NetworkFormat::FFN_ACTIVATION_INHERIT + ? default_activation_ : static_cast(ffn_act); + } + if (max_batch_size_ > kHardMaxBatchSize) { max_batch_size_ = kHardMaxBatchSize; } @@ -804,7 +977,9 @@ std::unique_ptr MakeBlasNetwork(const std::optional& w, if (weights.format().network_format().network() != pblczero::NetworkFormat::NETWORK_CLASSICAL_WITH_HEADFORMAT && weights.format().network_format().network() != - pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT) { + pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT && + weights.format().network_format().network() != + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT) { throw Exception("Network format " + pblczero::NetworkFormat::NetworkStructure_Name( weights.format().network_format().network()) + diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 1a2dd34ead..9c90936c00 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -72,6 +72,8 @@ float Activate(const float val, const ActivationFunction activation) { switch (activation) { case RELU: return val > 0 ? val : 0; + case RELU_2: + return val > 0 ? val * val : 0; case MISH: return mish(val); case TANH: From 919ef8e558bbaed23a022de7bcecb5092eebd102 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 11:51:28 +0100 Subject: [PATCH 31/46] Minor bug fixes (thanks @borg). --- src/neural/blas/fully_connected_layer.cc | 4 +++- src/neural/blas/layer_norm.ispc | 4 ++-- src/neural/blas/network_blas.cc | 14 ++++---------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/neural/blas/fully_connected_layer.cc b/src/neural/blas/fully_connected_layer.cc index b4879d3580..d228d2523f 100644 --- a/src/neural/blas/fully_connected_layer.cc +++ b/src/neural/blas/fully_connected_layer.cc @@ -136,7 +136,9 @@ void FullyConnectedLayer::Forward1D( .transpose() * ConstEigenMatrixMap(inputs, input_size, batch_size); } - ApplyBias(batch_size, output_size, biases, activation, outputs); + if (biases != nullptr) { + ApplyBias(batch_size, output_size, biases, activation, outputs); + } } template <> diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index 9d7255e49c..43fa3fe12a 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -27,7 +27,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, // One pass mean and variance taken in dimension C. Uses shifted variance calculation. float imean = 0; float ivar = 0; - if (skip != nullptr) { + if (skip != NULL) { float k = data[0] + skip[0]; foreach (c = 0 ... 
channels) { float t = data[c] + skip[c]; @@ -51,7 +51,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, #else // Mean taken in dimension C. float imean = 0; - if (skip != nullptr) { + if (skip != NULL) { foreach (c = 0 ... channels) { data[c] += skip[c]; imean += data[c]; diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 2cf050bc62..b15903a771 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -223,14 +223,8 @@ void BlasComputation::MakeEncoderLayer( float alpha) { const int d_model = layer.mha.q_b.size(); const int dff_size = layer.ffn.dense1_b.size(); - - static std::vector head_buffer4; - head_buffer4.clear(); - head_buffer4.resize(batch_size * std::max(d_model, dff_size) * kSquares); - - static std::vector temp_buffer; - temp_buffer.clear(); - temp_buffer.resize(heads * kSquares * kSquares); + std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * kSquares); + std::vector temp_buffer(heads * kSquares * kSquares); // Q FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, d_model, head_buffer.data(), @@ -303,7 +297,7 @@ void BlasComputation::MakeEncoderLayer( temp1, layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(batch_size, hidden_sz, + LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), layer.mha.smolgen.ln1_betas.data(), @@ -316,7 +310,7 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp1); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, + LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), layer.mha.smolgen.ln2_betas.data(), From 8ebb2226f370001f90b479783951f0fce5ee5d28 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:20:36 +0100 Subject: [PATCH 32/46] Comment fix. --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index b15903a771..d36fd88d4c 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -316,7 +316,7 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.ln2_betas.data(), 1e-6); - // Global smolgen weights (use bias to add already calculated attention weights). + // Global smolgen weights. FullyConnectedLayer::Forward1D( heads, gen_sz_outputs / heads, 64 * 64, temp1, weights_.smolgen_w.data(), (const float*)nullptr, NONE, temp2); From ce2c2e79286543b381b8b69cde4c7dddedc0c99a Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:29:08 +0100 Subject: [PATCH 33/46] Add check for add_gating. 
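// (Context: the input gating computes, per square and embedding channel,
// x = x * ip_mult_gate + ip_add_gate; the tightened check in this patch
// requires both gate vectors to be present before applying it.)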
--- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index d36fd88d4c..25a5b22c56 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -554,7 +554,7 @@ void BlasComputation::ComputeBlocking() { weights_.ip_emb_b.data(), default_activation_, res_buffer1.data()); // Input gating - if (weights_.ip_mult_gate.size() > 0) { + if (weights_.ip_mult_gate.size() > 0 && weights_.ip_add_gate.size() > 0) { int idx; for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { From d299d01262e6b1a566d327f0afb58abc1f1ed9ed Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:30:30 +0100 Subject: [PATCH 34/46] Comment fix. --- src/neural/blas/network_blas.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 25a5b22c56..2c6b0d0881 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -568,7 +568,6 @@ void BlasComputation::ComputeBlocking() { } // Attention body encoders. - float alpha = (float) pow(2.0 * weights_.encoder.size(), 0.25); for (auto& layer : weights_.encoder) { MakeEncoderLayer(res_buffer1, res_buffer2, res_buffer3, batch_size, From def48b5c91a701f7b2637106703814db746f950c Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Mon, 20 Feb 2023 12:30:50 +0100 Subject: [PATCH 35/46] Comment fix. --- src/neural/blas/network_blas.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 2c6b0d0881..9a86e58b18 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -575,7 +575,6 @@ void BlasComputation::ComputeBlocking() { smolgen_activation_, ffn_activation_, alpha); } - // for (auto i=0; i Date: Mon, 20 Feb 2023 18:12:34 +0200 Subject: [PATCH 36/46] code formatting --- src/neural/blas/network_blas.cc | 138 ++++++++++++++++---------------- 1 file changed, 69 insertions(+), 69 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9a86e58b18..60e61d0447 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -56,8 +56,7 @@ class BlasComputation : public NetworkComputation { const ActivationFunction default_activation, const ActivationFunction smolgen_activation, const ActivationFunction ffn_activation, - const bool attn_policy, - const bool attn_body); + const bool attn_policy, const bool attn_body); virtual ~BlasComputation() {} @@ -147,7 +146,8 @@ class BlasNetwork : public Network { std::unique_ptr NewComputation() override { return std::make_unique>( weights_, max_batch_size_, wdl_, moves_left_, conv_policy_, - default_activation_, smolgen_activation_, ffn_activation_, attn_policy_, attn_body_); + default_activation_, smolgen_activation_, ffn_activation_, attn_policy_, + attn_body_); } const NetworkCapabilities& GetCapabilities() const override { @@ -179,8 +179,7 @@ BlasComputation::BlasComputation( const bool moves_left, const bool conv_policy, const ActivationFunction default_activation, const ActivationFunction smolgen_activation, - const ActivationFunction ffn_activation, - const bool attn_policy, + const ActivationFunction ffn_activation, const bool attn_policy, const bool attn_body) : weights_(weights), max_batch_size_(max_batch_size), @@ -223,7 +222,8 @@ void BlasComputation::MakeEncoderLayer( float alpha) { const int d_model 
= layer.mha.q_b.size(); const int dff_size = layer.ffn.dense1_b.size(); - std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * kSquares); + std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * + kSquares); std::vector temp_buffer(heads * kSquares * kSquares); // Q FullyConnectedLayer::Forward1D( @@ -286,35 +286,32 @@ void BlasComputation::MakeEncoderLayer( // Compress. const auto hidden_channels = layer.mha.smolgen.compress.size() / d_model; FullyConnectedLayer::Forward1D( - kSquares, d_model, hidden_channels, - input, layer.mha.smolgen.compress.data(), - (const float*)nullptr, NONE, temp1); + kSquares, d_model, hidden_channels, input, + layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, + temp1); // Dense 1. const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); FullyConnectedLayer::Forward1D( - 1, kSquares * hidden_channels, hidden_sz, - temp1, layer.mha.smolgen.dense1_w.data(), - layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2); + 1, kSquares * hidden_channels, hidden_sz, temp1, + layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), + smolgen_activation, temp2); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, hidden_sz, - temp2, (const float*)nullptr, + LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), - layer.mha.smolgen.ln1_betas.data(), - 1e-6); + layer.mha.smolgen.ln1_betas.data(), 1e-6); // Dense 2. const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); FullyConnectedLayer::Forward1D( 1, hidden_sz, gen_sz_outputs, temp2, - layer.mha.smolgen.dense2_w.data(), - layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp1); + layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), + smolgen_activation, temp1); // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, gen_sz_outputs, - temp1, (const float*)nullptr, + LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, + (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), - layer.mha.smolgen.ln2_betas.data(), - 1e-6); + layer.mha.smolgen.ln2_betas.data(), 1e-6); // Global smolgen weights. FullyConnectedLayer::Forward1D( @@ -407,21 +404,22 @@ void BlasComputation::ComputeBlocking() { // Retrieve network key dimensions from the weights structure. const auto num_value_channels = weights_.ip1_val_b.size(); const auto num_moves_channels = weights_.ip1_mov_b.size(); - const auto num_value_input_planes = attn_body_ ? weights_.ip_val_b.size() - : weights_.value.biases.size(); + const auto num_value_input_planes = + attn_body_ ? weights_.ip_val_b.size() : weights_.value.biases.size(); const auto num_policy_input_planes = weights_.policy.biases.size(); - const auto num_moves_input_planes = attn_body_ ? weights_.ip_mov_b.size() - : weights_.moves_left.biases.size(); + const auto num_moves_input_planes = + attn_body_ ? weights_.ip_mov_b.size() : weights_.moves_left.biases.size(); const auto num_output_policy = static_cast(kPolicyOutputs); - const auto output_channels = attn_body_ ? weights_.ip_emb_b.size() - : weights_.input.biases.size(); + const auto output_channels = + attn_body_ ? weights_.ip_emb_b.size() : weights_.input.biases.size(); const auto num_res_blocks = weights_.residual.size(); // max_channels is the maximum number of input channels of any // convolution. 
// Residual blocks are identical, but the first convolution might be bigger // when the network has very few filters - const auto input_channels = static_cast(kInputPlanes + (attn_body_ ? kNumPosEncodingChannels : 0)); + const auto input_channels = static_cast( + kInputPlanes + (attn_body_ ? kNumPosEncodingChannels : 0)); const auto max_channels = std::max(output_channels, input_channels); // The policy head may increase convolution max output size. @@ -484,7 +482,7 @@ void BlasComputation::ComputeBlocking() { weights_.input.weights.data(), conv_out); BiasActivate(batch_size, output_channels, conv_out, - weights_.input.biases.data(), default_activation_); + weights_.input.biases.data(), default_activation_); // Residual tower @@ -499,7 +497,7 @@ void BlasComputation::ComputeBlocking() { conv1.weights.data(), conv_out); BiasActivate(batch_size, output_channels, &conv_out[0], - conv1.biases.data(), default_activation_); + conv1.biases.data(), default_activation_); std::swap(conv_in, res); std::swap(conv_out, conv_in); @@ -513,12 +511,12 @@ void BlasComputation::ComputeBlocking() { auto se_fc_outputs = se.b1.size(); ApplySEUnit(batch_size, output_channels, se_fc_outputs, - conv_in, conv2.biases.data(), res, se.w1.data(), - se.b1.data(), se.w2.data(), se.b2.data(), - conv_out, default_activation_); + conv_in, conv2.biases.data(), res, + se.w1.data(), se.b1.data(), se.w2.data(), + se.b2.data(), conv_out, default_activation_); } else { BiasResidual(batch_size, output_channels, &conv_out[0], - conv2.biases.data(), res, default_activation_); + conv2.biases.data(), res, default_activation_); } } } @@ -526,10 +524,12 @@ void BlasComputation::ComputeBlocking() { if (attn_body_) { const auto embedding_size = weights_.ip_emb_b.size(); assert(embedding_size > 0); - const auto input_size = num_res_blocks == 0 ? input_channels : weights_.input.biases.size(); + const auto input_size = + num_res_blocks == 0 ? input_channels : weights_.input.biases.size(); if (num_res_blocks == 0) { - // No residual means pure transformer, so process input position encoding. + // No residual means pure transformer, so process input position + // encoding. // Preprocess for attention body. for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { @@ -549,9 +549,9 @@ void BlasComputation::ComputeBlocking() { // Input embedding. FullyConnectedLayer::Forward1D( - batch_size * kSquares, input_size, embedding_size, - res_buffer3.data(), weights_.ip_emb_w.data(), - weights_.ip_emb_b.data(), default_activation_, res_buffer1.data()); + batch_size * kSquares, input_size, embedding_size, res_buffer3.data(), + weights_.ip_emb_w.data(), weights_.ip_emb_b.data(), + default_activation_, res_buffer1.data()); // Input gating if (weights_.ip_mult_gate.size() > 0 && weights_.ip_add_gate.size() > 0) { @@ -560,15 +560,16 @@ void BlasComputation::ComputeBlocking() { for (auto i = 0; i < kSquares; i++) { for (auto j = 0; j < embedding_size; j++) { idx = batch * kSquares * embedding_size + i * embedding_size + j; - res_buffer1[idx] = res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] - + weights_.ip_add_gate[j * kSquares + i]; + res_buffer1[idx] = + res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] + + weights_.ip_add_gate[j * kSquares + i]; } } }; } // Attention body encoders. 
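// (A note on alpha, computed just below as (2 * encoder_count)^(1/4): this
// matches the DeepNorm-style residual scaling for encoder-only transformers,
// under which each encoder layer computes
//   x = LayerNorm(alpha * x + MHA(x))
//   x = LayerNorm(alpha * x + FFN(x))
// to keep deep stacks of encoder layers numerically stable.)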
- float alpha = (float) pow(2.0 * weights_.encoder.size(), 0.25); + float alpha = (float)pow(2.0 * weights_.encoder.size(), 0.25); for (auto& layer : weights_.encoder) { MakeEncoderLayer(res_buffer1, res_buffer2, res_buffer3, batch_size, layer, embedding_size, weights_.encoder_head_count, @@ -587,8 +588,9 @@ void BlasComputation::ComputeBlocking() { for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { for (size_t j = 0; j < output_channels; j++) { - res[batch * kSquares * output_channels + i * output_channels + j] = - conv_out[batch * kSquares * output_channels + j * kSquares + i]; + res[batch * kSquares * output_channels + i * output_channels + + j] = conv_out[batch * kSquares * output_channels + + j * kSquares + i]; } } } @@ -598,7 +600,8 @@ void BlasComputation::ComputeBlocking() { FullyConnectedLayer::Forward1D( batch_size * kSquares, output_channels, policy_embedding_size, res, weights_.ip_pol_w.data(), weights_.ip_pol_b.data(), - attn_body_ ? default_activation_ : SELU, // SELU activation hardcoded for apmish nets. + attn_body_ ? default_activation_ + : SELU, // SELU activation hardcoded for apmish nets. head_buffer.data()); const size_t policy_d_model = weights_.ip2_pol_b.size(); @@ -613,10 +616,10 @@ void BlasComputation::ComputeBlocking() { for (auto& layer : weights_.pol_encoder) { MakeEncoderLayer(head_buffer, head_buffer2, head_buffer3, batch_size, - layer, policy_embedding_size, weights_.pol_encoder_head_count, + layer, policy_embedding_size, + weights_.pol_encoder_head_count, attn_body_ ? smolgen_activation_ : NONE, - attn_body_ ? ffn_activation_ : SELU, - 1.0f); + attn_body_ ? ffn_activation_ : SELU, 1.0f); } // Q @@ -749,19 +752,16 @@ void BlasComputation::ComputeBlocking() { // Value head if (attn_body_) { FullyConnectedLayer::Forward1D( - batch_size * kSquares, weights_.ip_emb_b.size(), num_value_input_planes, - res, weights_.ip_val_w.data(), - weights_.ip_val_b.data(), - default_activation_, - head_buffer.data()); - } - else { + batch_size * kSquares, weights_.ip_emb_b.size(), + num_value_input_planes, res, weights_.ip_val_w.data(), + weights_.ip_val_b.data(), default_activation_, head_buffer.data()); + } else { Convolution1::Forward( batch_size, output_channels, num_value_input_planes, conv_out, weights_.value.weights.data(), head_buffer.data()); BiasActivate(batch_size, num_value_input_planes, &head_buffer[0], - weights_.value.biases.data(), default_activation_); + weights_.value.biases.data(), default_activation_); } FullyConnectedLayer::Forward1D( @@ -801,19 +801,16 @@ void BlasComputation::ComputeBlocking() { if (moves_left_) { if (attn_body_) { FullyConnectedLayer::Forward1D( - batch_size * kSquares, weights_.ip_emb_b.size(), num_moves_input_planes, - res, weights_.ip_mov_w.data(), - weights_.ip_mov_b.data(), - default_activation_, - head_buffer.data()); - } - else { + batch_size * kSquares, weights_.ip_emb_b.size(), + num_moves_input_planes, res, weights_.ip_mov_w.data(), + weights_.ip_mov_b.data(), default_activation_, head_buffer.data()); + } else { Convolution1::Forward( batch_size, output_channels, num_moves_input_planes, conv_out, weights_.moves_left.weights.data(), head_buffer.data()); BiasActivate(batch_size, num_moves_input_planes, &head_buffer[0], - weights_.moves_left.biases.data(), default_activation_); + weights_.moves_left.biases.data(), default_activation_); } FullyConnectedLayer::Forward1D( @@ -872,7 +869,7 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, pblczero::NetworkFormat::POLICY_ATTENTION; 
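// (The constructor hunks below gate the new code paths: attn_body_ is set
// from the NETWORK_ATTENTIONBODY_WITH_HEADFORMAT network format, and the
// smolgen/FFN activations resolve per this illustrative sketch:
//   ActivationFunction Resolve(int from_file, int inherit_tag,
//                              ActivationFunction fallback) {
//     return from_file == inherit_tag
//                ? fallback
//                : static_cast<ActivationFunction>(from_file);
//   }
// i.e. a net file may pin an activation or defer to the backend default.)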
attn_body_ = file.format().network_format().network() == - pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT; + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT; default_activation_ = file.format().network_format().default_activation() == pblczero::NetworkFormat::DEFAULT_ACTIVATION_MISH @@ -881,11 +878,14 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, if (attn_body_) { const auto smol_act = file.format().network_format().smolgen_activation(); - smolgen_activation_ = smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT - ? default_activation_ : static_cast(smol_act); + smolgen_activation_ = + smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT + ? default_activation_ + : static_cast(smol_act); const auto ffn_act = file.format().network_format().ffn_activation(); ffn_activation_ = ffn_act == pblczero::NetworkFormat::FFN_ACTIVATION_INHERIT - ? default_activation_ : static_cast(ffn_act); + ? default_activation_ + : static_cast(ffn_act); } if (max_batch_size_ > kHardMaxBatchSize) { From add3c458386238e8372f215d723c68dbfd8e35f4 Mon Sep 17 00:00:00 2001 From: borg323 Date: Tue, 21 Feb 2023 02:05:27 +0200 Subject: [PATCH 37/46] assorted fixes --- src/neural/blas/layer_norm.ispc | 4 ++-- src/neural/blas/network_blas.cc | 16 +++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index 43fa3fe12a..fb42814ffa 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -27,8 +27,9 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, // One pass mean and variance taken in dimension C. Uses shifted variance calculation. float imean = 0; float ivar = 0; + float k = data[0]; if (skip != NULL) { - float k = data[0] + skip[0]; + k += skip[0]; foreach (c = 0 ... channels) { float t = data[c] + skip[c]; data[c] = t; @@ -37,7 +38,6 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, ivar += t * t; } } else { - float k = data[0]; foreach (c = 0 ... channels) { float t = data[c]; t -= k; diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 60e61d0447..9091cd6c46 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -299,7 +299,7 @@ void BlasComputation::MakeEncoderLayer( // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), - layer.mha.smolgen.ln1_betas.data(), 1e-6); + layer.mha.smolgen.ln1_betas.data(), 1e-3); // Dense 2. const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); @@ -311,7 +311,7 @@ void BlasComputation::MakeEncoderLayer( LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), - layer.mha.smolgen.ln2_betas.data(), 1e-6); + layer.mha.smolgen.ln2_betas.data(), 1e-3); // Global smolgen weights. FullyConnectedLayer::Forward1D( @@ -326,11 +326,13 @@ void BlasComputation::MakeEncoderLayer( // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { -#ifndef USE_ISPC - SoftmaxActivation(kSquares, QK + h, QK + h); -#else - ispc::SoftmaxActivation(kSquares, QK + h, QK + h); +#if defined(USE_ISPC) + if (!use_eigen) { + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); + continue; + } #endif + SoftmaxActivation(kSquares, QK + h, QK + h); } // matmul(softmax(QK), V) for all heads per batch. 
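// For reference, a minimal scalar sketch of the per-head attention that the
// GEMM + softmax code above implements (row-major [64 x depth] Q/K/V slices;
// illustrative only, and assuming no smolgen logits are added):
#include <algorithm>
#include <cmath>
#include <vector>

void AttentionOneHeadSketch(const float* Q, const float* K, const float* V,
                            int depth, float* out /* [64 x depth] */) {
  const float scale = 1.0f / std::sqrt(static_cast<float>(depth));
  std::vector<float> w(64);
  for (int i = 0; i < 64; i++) {
    // Scaled dot products of square i's query against every key.
    float maxv = -1e30f;
    for (int j = 0; j < 64; j++) {
      float dot = 0.0f;
      for (int d = 0; d < depth; d++) dot += Q[i * depth + d] * K[j * depth + d];
      w[j] = dot * scale;
      maxv = std::max(maxv, w[j]);
    }
    // Softmax over the 64 keys (max-shifted for numerical stability).
    float sum = 0.0f;
    for (int j = 0; j < 64; j++) {
      w[j] = std::exp(w[j] - maxv);
      sum += w[j];
    }
    // Attention-weighted sum of values.
    for (int d = 0; d < depth; d++) {
      float acc = 0.0f;
      for (int j = 0; j < 64; j++) acc += w[j] * V[j * depth + d];
      out[i * depth + d] = acc / sum;
    }
  }
}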
@@ -558,7 +560,7 @@ void BlasComputation::ComputeBlocking() { int idx; for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { - for (auto j = 0; j < embedding_size; j++) { + for (size_t j = 0; j < embedding_size; j++) { idx = batch * kSquares * embedding_size + i * embedding_size + j; res_buffer1[idx] = res_buffer1[idx] * weights_.ip_mult_gate[j * kSquares + i] + From e70462d281c865fcc243e4ecd09aea669386be78 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 21 Feb 2023 13:38:17 +0100 Subject: [PATCH 38/46] Minor cs fix --- src/neural/blas/network_blas.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 9091cd6c46..ce029c35bc 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -590,9 +590,8 @@ void BlasComputation::ComputeBlocking() { for (auto batch = size_t{0}; batch < batch_size; batch++) { for (auto i = 0; i < kSquares; i++) { for (size_t j = 0; j < output_channels; j++) { - res[batch * kSquares * output_channels + i * output_channels + - j] = conv_out[batch * kSquares * output_channels + - j * kSquares + i]; + res[batch * kSquares * output_channels + i * output_channels + j] = + conv_out[batch * kSquares * output_channels + j * kSquares + i]; } } } From 53f73de9e3f113659f3a04e1e61ff47e7502bb75 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Sat, 11 Mar 2023 10:46:11 +0100 Subject: [PATCH 39/46] Add hack to fix nets with wrong proto. --- src/neural/blas/network_blas.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index ce029c35bc..6703d62760 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -1008,6 +1008,25 @@ std::unique_ptr MakeBlasNetwork(const std::optional& w, weights.format().network_format().default_activation()) + " is not supported by BLAS backend."); } + + // @todo Hack for old encoding compatibility. REMOVE BEFORE MERGING. + if (w->format().network_format().network() == + pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT && + w->weights().encoder().size() > 0) { + CERR << "Attention body detected, hacking network format."; + WeightsFile x = *w; + x.mutable_format()->mutable_network_format()->set_network( + pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT); + if (w->weights().has_smolgen_w()) { + CERR << "BT2 detected, hacking activations."; + x.mutable_format()->mutable_network_format()->set_ffn_activation( + pblczero::NetworkFormat::FFN_ACTIVATION_RELU_2); + x.mutable_format()->mutable_network_format()->set_smolgen_activation( + pblczero::NetworkFormat::SMOLGEN_ACTIVATION_SWISH); + } + return std::make_unique>(x, options); + } + return std::make_unique>(weights, options); } From 971e33d7ddfc012d678ee8269a1e01613aa1024e Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 21 Mar 2023 23:54:07 +0100 Subject: [PATCH 40/46] Fix bug in smolgen for small nets. --- src/neural/blas/network_blas.cc | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 6703d62760..a8201457d3 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -279,14 +279,15 @@ void BlasComputation::MakeEncoderLayer( // Smolgen. 
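// (Overview of the smolgen block below: from the layer input it derives,
// per head, an extra 64x64 grid of attention logits that is added to QK
// before the softmax. Pipeline: biasless "compress" projection per square,
// dense1 + layernorm, dense2 + layernorm, then a shared final projection
// producing the per-head 64x64 maps.)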
if (layer.mha.has_smolgen) { - float* input = &head_buffer[batchStart]; + float* input = &head_buffer[batch * kSquares * embedding_size]; float* temp1 = &head_buffer2[batchStart]; float* temp2 = &head_buffer3[batchStart]; // Compress. - const auto hidden_channels = layer.mha.smolgen.compress.size() / d_model; + const auto hidden_channels = + layer.mha.smolgen.compress.size() / embedding_size; FullyConnectedLayer::Forward1D( - kSquares, d_model, hidden_channels, input, + kSquares, embedding_size, hidden_channels, input, layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, temp1); @@ -314,13 +315,21 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.ln2_betas.data(), 1e-3); // Global smolgen weights. - FullyConnectedLayer::Forward1D( - heads, gen_sz_outputs / heads, 64 * 64, temp1, - weights_.smolgen_w.data(), (const float*)nullptr, NONE, temp2); - - // Add smolgen weights to QK. - for (auto i = 0; i < heads * kSquares * kSquares; i++) { - QK[i] += temp2[i]; + const float* A = temp1; + const float* B = weights_.smolgen_w.data(); + float* C = QK; + if (use_eigen) { + auto C_mat = EigenMatrixMap(C, 64 * 64, heads); + C_mat.noalias() += + ConstEigenMatrixMap(B, gen_sz_outputs / heads, 64 * 64) + .transpose() * + ConstEigenMatrixMap(A, gen_sz_outputs / heads, heads); + } else { +#ifdef USE_BLAS + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, heads, 64 * 64, + gen_sz_outputs / heads, 1.0f, A, gen_sz_outputs / heads, B, + gen_sz_outputs / heads, 1.0f, C, 64 * 64); +#endif } } From 124386106dacaf611ed607d677c3dd8a651eb431 Mon Sep 17 00:00:00 2001 From: borg323 Date: Tue, 21 Feb 2023 02:44:11 +0200 Subject: [PATCH 41/46] batch blas smolgen --- src/neural/blas/encoder.h | 15 ++-- src/neural/blas/network_blas.cc | 136 ++++++++++++++++---------------- 2 files changed, 77 insertions(+), 74 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index b9940a4b14..2e3a3f9a7b 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -41,8 +41,7 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, data[i * channels + c] += skip[i * channels + c]; mean += data[i * channels + c]; } - } - else { + } else { for (size_t c = 0; c < channels; ++c) { mean += data[i * channels + c]; } @@ -64,9 +63,15 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, betas[c] + gammas[c] * (data[i * channels + c] - mean) * den; } #else - ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, - skip + i * channels, gammas, betas, - epsilon); + if (skip != nullptr) { + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + skip + i * channels, gammas, betas, + epsilon); + } else { + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + nullptr, gammas, betas, epsilon); + } + #endif } } diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index a8201457d3..cd6d08d546 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -224,7 +224,52 @@ void BlasComputation::MakeEncoderLayer( const int dff_size = layer.ffn.dense1_b.size(); std::vector head_buffer4(batch_size * std::max(d_model, dff_size) * kSquares); - std::vector temp_buffer(heads * kSquares * kSquares); + + // Smolgen. + if (layer.mha.has_smolgen) { + float* input = &head_buffer[0]; + float* QK = &head_buffer4[0]; + // Compress. 
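// Shape flow for this batched smolgen path, per batch entry (dimensions
// read off the weight vector sizes; names as in the code):
//   input     [64 x embedding_size]   (the encoder layer input)
//   compress  -> [64 x hidden_channels], biasless per-square matmul
//   dense1    -> [hidden_sz] on the flattened 64*hidden_channels, + layernorm
//   dense2    -> [gen_sz_outputs] = heads * (gen_sz_outputs / heads), + layernorm
//   smolgen_w -> per head, 64*64 attention logits written into QK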
+ const auto hidden_channels = + layer.mha.smolgen.compress.size() / embedding_size; + std::vector temp1(batch_size * kSquares * hidden_channels); + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, hidden_channels, input, + layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, + temp1.data()); + + // Dense 1. + const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); + std::vector temp2(batch_size * hidden_sz); + FullyConnectedLayer::Forward1D( + batch_size, kSquares * hidden_channels, hidden_sz, &temp1[0], + layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), + smolgen_activation, temp2.data()); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, hidden_sz, temp2.data(), + (const float*)nullptr, + layer.mha.smolgen.ln1_gammas.data(), + layer.mha.smolgen.ln1_betas.data(), 1e-3); + + // Dense 2. + const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); + std::vector temp3(batch_size * gen_sz_outputs); + FullyConnectedLayer::Forward1D( + batch_size, hidden_sz, gen_sz_outputs, &temp2[0], + layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), + smolgen_activation, temp3.data()); + // Layer Norm + skip connection. + LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, temp3.data(), + (const float*)nullptr, + layer.mha.smolgen.ln2_gammas.data(), + layer.mha.smolgen.ln2_betas.data(), 1e-3); + + // Global smolgen weights. + FullyConnectedLayer::Forward1D( + batch_size * heads, gen_sz_outputs / heads, 64 * 64, temp3.data(), + weights_.smolgen_w.data(), (const float*)nullptr, NONE, &QK[0]); + } + // Q FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, d_model, head_buffer.data(), @@ -233,10 +278,6 @@ void BlasComputation::MakeEncoderLayer( FullyConnectedLayer::Forward1D( batch_size * kSquares, embedding_size, d_model, head_buffer.data(), layer.mha.k_w.data(), layer.mha.k_b.data(), NONE, head_buffer3.data()); - // V - FullyConnectedLayer::Forward1D( - batch_size * kSquares, embedding_size, d_model, head_buffer.data(), - layer.mha.v_w.data(), layer.mha.v_b.data(), NONE, head_buffer4.data()); // MHA (Q, K, V) const int depth = d_model / heads; @@ -246,29 +287,32 @@ void BlasComputation::MakeEncoderLayer( for (auto batch = size_t{0}; batch < batch_size; batch++) { auto batchStart = batch * kSquares * d_model; + float* QK = &head_buffer4[batch * kSquares * kSquares * heads]; + const float* Q = &head_buffer2[batchStart]; const float* K = &head_buffer3[batchStart]; - const float* V = &head_buffer4[batchStart]; // matmul(Q, K) for all heads per batch. - float* QK = temp_buffer.data(); + for (auto h = 0; h < heads; h++) { const float* A = &Q[h * depth]; const float* B = &K[h * depth]; float* C = &QK[h * kSquares * kSquares]; + const float beta = layer.mha.has_smolgen ? 
1.0f : 0.0f; if (use_eigen) { auto C_mat = EigenMatrixMap(C, kSquares, kSquares); C_mat.noalias() = + beta * C_mat + scaling * - ConstEigenStridedMatrixMap( - B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) - .transpose() * - ConstEigenStridedMatrixMap( - A, depth, kSquares, Eigen::OuterStride<>(heads * depth)); + ConstEigenStridedMatrixMap( + B, depth, kSquares, Eigen::OuterStride<>(heads * depth)) + .transpose() * + ConstEigenStridedMatrixMap( + A, depth, kSquares, Eigen::OuterStride<>(heads * depth)); } else { #ifdef USE_BLAS cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, kSquares, kSquares, - depth, scaling, A, heads * depth, B, heads * depth, 0.0f, C, + depth, scaling, A, heads * depth, B, heads * depth, beta, C, kSquares); #else // Should never get here. @@ -277,62 +321,6 @@ void BlasComputation::MakeEncoderLayer( } } - // Smolgen. - if (layer.mha.has_smolgen) { - float* input = &head_buffer[batch * kSquares * embedding_size]; - float* temp1 = &head_buffer2[batchStart]; - float* temp2 = &head_buffer3[batchStart]; - - // Compress. - const auto hidden_channels = - layer.mha.smolgen.compress.size() / embedding_size; - FullyConnectedLayer::Forward1D( - kSquares, embedding_size, hidden_channels, input, - layer.mha.smolgen.compress.data(), (const float*)nullptr, NONE, - temp1); - - // Dense 1. - const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); - FullyConnectedLayer::Forward1D( - 1, kSquares * hidden_channels, hidden_sz, temp1, - layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), - smolgen_activation, temp2); - // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, hidden_sz, temp2, (const float*)nullptr, - layer.mha.smolgen.ln1_gammas.data(), - layer.mha.smolgen.ln1_betas.data(), 1e-3); - - // Dense 2. - const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); - FullyConnectedLayer::Forward1D( - 1, hidden_sz, gen_sz_outputs, temp2, - layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), - smolgen_activation, temp1); - // Layer Norm + skip connection. - LayerNorm2DWithSkipConnection(1, gen_sz_outputs, temp1, - (const float*)nullptr, - layer.mha.smolgen.ln2_gammas.data(), - layer.mha.smolgen.ln2_betas.data(), 1e-3); - - // Global smolgen weights. - const float* A = temp1; - const float* B = weights_.smolgen_w.data(); - float* C = QK; - if (use_eigen) { - auto C_mat = EigenMatrixMap(C, 64 * 64, heads); - C_mat.noalias() += - ConstEigenMatrixMap(B, gen_sz_outputs / heads, 64 * 64) - .transpose() * - ConstEigenMatrixMap(A, gen_sz_outputs / heads, heads); - } else { -#ifdef USE_BLAS - cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, heads, 64 * 64, - gen_sz_outputs / heads, 1.0f, A, gen_sz_outputs / heads, B, - gen_sz_outputs / heads, 1.0f, C, 64 * 64); -#endif - } - } - // Apply Softmax. for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { #if defined(USE_ISPC) @@ -343,9 +331,19 @@ void BlasComputation::MakeEncoderLayer( #endif SoftmaxActivation(kSquares, QK + h, QK + h); } + } + + // V + FullyConnectedLayer::Forward1D( + batch_size * kSquares, embedding_size, d_model, head_buffer.data(), + layer.mha.v_w.data(), layer.mha.v_b.data(), NONE, head_buffer3.data()); + for (auto batch = size_t{0}; batch < batch_size; batch++) { + auto batchStart = batch * kSquares * d_model; // matmul(softmax(QK), V) for all heads per batch. 
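// (Notes on this restructuring: the smolgen logits are computed up front
// into head_buffer4 and the Q*K^T GEMM accumulates onto them through its
// beta term; the softmax now runs once over the whole batch instead of
// inside the per-entry loop; and the V projection is deferred until after
// the softmax, which lets head_buffer3, first used for K, be reused for V.)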
float* attn = &head_buffer2[batchStart]; + const float* V = &head_buffer3[batchStart]; + const float* QK = &head_buffer4[batch * kSquares * kSquares * heads]; for (auto h = 0; h < heads; h++) { const float* A = &QK[h * kSquares * kSquares]; const float* B = &V[h * depth]; From d011548717163f0e3cc6d9e262464df9e95896cc Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Wed, 22 Mar 2023 12:26:25 +0100 Subject: [PATCH 42/46] Minor nits. --- src/neural/blas/network_blas.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index cd6d08d546..8bc85f9f2d 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -227,8 +227,9 @@ void BlasComputation::MakeEncoderLayer( // Smolgen. if (layer.mha.has_smolgen) { - float* input = &head_buffer[0]; + const float* input = &head_buffer[0]; float* QK = &head_buffer4[0]; + // Compress. const auto hidden_channels = layer.mha.smolgen.compress.size() / embedding_size; @@ -242,7 +243,7 @@ void BlasComputation::MakeEncoderLayer( const auto hidden_sz = layer.mha.smolgen.dense1_b.size(); std::vector temp2(batch_size * hidden_sz); FullyConnectedLayer::Forward1D( - batch_size, kSquares * hidden_channels, hidden_sz, &temp1[0], + batch_size, kSquares * hidden_channels, hidden_sz, temp1.data(), layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2.data()); // Layer Norm + skip connection. @@ -255,7 +256,7 @@ void BlasComputation::MakeEncoderLayer( const auto gen_sz_outputs = layer.mha.smolgen.dense2_b.size(); std::vector temp3(batch_size * gen_sz_outputs); FullyConnectedLayer::Forward1D( - batch_size, hidden_sz, gen_sz_outputs, &temp2[0], + batch_size, hidden_sz, gen_sz_outputs, temp2.data(), layer.mha.smolgen.dense2_w.data(), layer.mha.smolgen.dense2_b.data(), smolgen_activation, temp3.data()); // Layer Norm + skip connection. @@ -267,7 +268,7 @@ void BlasComputation::MakeEncoderLayer( // Global smolgen weights. FullyConnectedLayer::Forward1D( batch_size * heads, gen_sz_outputs / heads, 64 * 64, temp3.data(), - weights_.smolgen_w.data(), (const float*)nullptr, NONE, &QK[0]); + weights_.smolgen_w.data(), (const float*)nullptr, NONE, QK); } // Q From c73e5d22c15a8361a0482bba5d2937df1118ba49 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Wed, 22 Mar 2023 12:37:23 +0100 Subject: [PATCH 43/46] Replace 64 with kSquares --- src/neural/blas/network_blas.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 8bc85f9f2d..0dd2711c70 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -267,7 +267,7 @@ void BlasComputation::MakeEncoderLayer( // Global smolgen weights. 
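// (Note: smolgen_w is stored at network scope in weights_, i.e. a single
// projection matrix shared by every head of every encoder layer; it expands
// each head's gen_sz_outputs / heads generated features into a full
// kSquares * kSquares grid of attention logits.)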
FullyConnectedLayer::Forward1D( - batch_size * heads, gen_sz_outputs / heads, 64 * 64, temp3.data(), + batch_size * heads, gen_sz_outputs / heads, kSquares * kSquares, temp3.data(), weights_.smolgen_w.data(), (const float*)nullptr, NONE, QK); } From 143b9cc3d4d635c0f429159cc98030abd47fe116 Mon Sep 17 00:00:00 2001 From: borg323 <39573933+borg323@users.noreply.github.com> Date: Thu, 23 Mar 2023 17:43:29 +0200 Subject: [PATCH 44/46] Blas performance improvements (#6) * vectorize activation functions * minor speedups --------- Co-authored-by: borg323 --- src/neural/blas/encoder.h | 11 ++++---- src/neural/blas/layer_norm.ispc | 7 +++-- src/neural/blas/network_blas.cc | 47 ++++++++++++------------------- src/neural/shared/activation.cc | 33 +++++++++++++++++++++- src/neural/shared/activation.ispc | 32 ++++++++++++++++++--- 5 files changed, 88 insertions(+), 42 deletions(-) diff --git a/src/neural/blas/encoder.h b/src/neural/blas/encoder.h index 2e3a3f9a7b..9a1e4e9f2d 100644 --- a/src/neural/blas/encoder.h +++ b/src/neural/blas/encoder.h @@ -30,15 +30,16 @@ namespace lczero { void LayerNorm2DWithSkipConnection(const size_t batch_size, const size_t channels, float* data, - const float* skip, const float* gammas, - const float* betas, float epsilon) { + const float alpha, const float* skip, + const float* gammas, const float* betas, + float epsilon) { for (size_t i = 0; i < batch_size; i++) { #ifndef USE_ISPC // Mean taken in dimension C. float mean = 0; if (skip != nullptr) { for (size_t c = 0; c < channels; ++c) { - data[i * channels + c] += skip[i * channels + c]; + data[i * channels + c] += alpha * skip[i * channels + c]; mean += data[i * channels + c]; } } else { @@ -64,11 +65,11 @@ void LayerNorm2DWithSkipConnection(const size_t batch_size, } #else if (skip != nullptr) { - ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, alpha, skip + i * channels, gammas, betas, epsilon); } else { - ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, + ispc::LayerNorm2DWithSkipConnection(channels, data + i * channels, 0.0f, nullptr, gammas, betas, epsilon); } diff --git a/src/neural/blas/layer_norm.ispc b/src/neural/blas/layer_norm.ispc index fb42814ffa..063bb74476 100644 --- a/src/neural/blas/layer_norm.ispc +++ b/src/neural/blas/layer_norm.ispc @@ -18,6 +18,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, uniform float data[], + const uniform float alpha, const uniform float skip[], const uniform float gammas[], const uniform float betas[], @@ -29,9 +30,9 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, float ivar = 0; float k = data[0]; if (skip != NULL) { - k += skip[0]; + k += alpha * skip[0]; foreach (c = 0 ... channels) { - float t = data[c] + skip[c]; + float t = data[c] + alpha * skip[c]; data[c] = t; t -= k; imean += t; @@ -53,7 +54,7 @@ export void LayerNorm2DWithSkipConnection(uniform const size_t channels, float imean = 0; if (skip != NULL) { foreach (c = 0 ... channels) { - data[c] += skip[c]; + data[c] += alpha * skip[c]; imean += data[c]; } } else { diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 0dd2711c70..6c1cb17782 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -247,7 +247,7 @@ void BlasComputation::MakeEncoderLayer( layer.mha.smolgen.dense1_w.data(), layer.mha.smolgen.dense1_b.data(), smolgen_activation, temp2.data()); // Layer Norm + skip connection. 
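// (The hunks below thread a new skip-scale parameter through
// LayerNorm2DWithSkipConnection. For the smolgen-internal norms it is simply
// 0.0f, since they have no skip input. For the encoder residual norms,
// passing 1.0f/alpha is sound because layer normalization is invariant, up
// to the epsilon term, to a positive rescaling of its input: LN(c*v) ==
// LN(v) for c > 0, hence LN(x + (1/alpha)*f(x)) == LN(alpha*x + f(x)).
// This reproduces the old behaviour while dropping the separate
// "head_buffer[i] *= alpha" passes over the buffer.)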
- LayerNorm2DWithSkipConnection(batch_size, hidden_sz, temp2.data(), + LayerNorm2DWithSkipConnection(batch_size, hidden_sz, temp2.data(), 0.0f, (const float*)nullptr, layer.mha.smolgen.ln1_gammas.data(), layer.mha.smolgen.ln1_betas.data(), 1e-3); @@ -261,7 +261,7 @@ void BlasComputation::MakeEncoderLayer( smolgen_activation, temp3.data()); // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(batch_size, gen_sz_outputs, temp3.data(), - (const float*)nullptr, + 0.0f, (const float*)nullptr, layer.mha.smolgen.ln2_gammas.data(), layer.mha.smolgen.ln2_betas.data(), 1e-3); @@ -321,17 +321,19 @@ void BlasComputation::MakeEncoderLayer( #endif } } + } - // Apply Softmax. - for (int h = 0; h < heads * kSquares * kSquares; h += kSquares) { + // Apply Softmax. + float* QK = &head_buffer4[0]; + for (size_t h = 0; h < batch_size * heads * kSquares * kSquares; + h += kSquares) { #if defined(USE_ISPC) - if (!use_eigen) { - ispc::SoftmaxActivation(kSquares, QK + h, QK + h); - continue; - } -#endif - SoftmaxActivation(kSquares, QK + h, QK + h); + if (!use_eigen) { + ispc::SoftmaxActivation(kSquares, QK + h, QK + h); + continue; } +#endif + SoftmaxActivation(kSquares, QK + h, QK + h); } // V @@ -372,17 +374,11 @@ void BlasComputation::MakeEncoderLayer( layer.mha.dense_w.data(), layer.mha.dense_b.data(), NONE, head_buffer3.data()); - if (alpha != 1.0f) { - for (size_t i = 0; i < batch_size * kSquares * embedding_size; i++) { - head_buffer[i] *= alpha; - } - } - // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln1_gammas.data(), layer.ln1_betas.data(), - 1e-6); + head_buffer.data(), 1.0f / alpha, + head_buffer3.data(), layer.ln1_gammas.data(), + layer.ln1_betas.data(), 1e-6); // FFN. FullyConnectedLayer::Forward1D( @@ -395,18 +391,11 @@ void BlasComputation::MakeEncoderLayer( head_buffer4.data(), layer.ffn.dense2_w.data(), layer.ffn.dense2_b.data(), NONE, head_buffer3.data()); - if (alpha != 1.0f) { - for (size_t i = 0; i < batch_size * kSquares * layer.ffn.dense2_b.size(); - i++) { - head_buffer[i] *= alpha; - } - } - // Layer Norm + skip connection. LayerNorm2DWithSkipConnection(batch_size * kSquares, embedding_size, - head_buffer.data(), head_buffer3.data(), - layer.ln2_gammas.data(), layer.ln2_betas.data(), - 1e-6); + head_buffer.data(), 1.0f / alpha, + head_buffer3.data(), layer.ln2_gammas.data(), + layer.ln2_betas.data(), 1e-6); } template diff --git a/src/neural/shared/activation.cc b/src/neural/shared/activation.cc index 9c90936c00..f7d97ecca0 100644 --- a/src/neural/shared/activation.cc +++ b/src/neural/shared/activation.cc @@ -100,10 +100,14 @@ void Activate(const size_t len, const float* data, const float* bias, output[b] = data[b] + bias[b]; } } else if (activation == RELU) { +#ifndef USE_ISPC for (size_t b = 0; b < len; b++) { float val = data[b] + bias[b]; output[b] = val > 0 ? val : 0; } +#else + ispc::ActivateRelu(len, 1.0f, data, bias, 0.0f, output); +#endif } else if (activation == MISH) { #ifndef USE_ISPC for (size_t b = 0; b < len; b++) { @@ -112,6 +116,25 @@ void Activate(const size_t len, const float* data, const float* bias, } #else ispc::ActivateMish(len, 1.0f, data, bias, 0.0f, output); +#endif + } else if (activation == RELU_2) { +#ifndef USE_ISPC + for (size_t b = 0; b < len; b++) { + float val = data[b] + bias[b]; + output[b] = val > 0 ? 
val * val : 0; + } +#else + ispc::ActivateRelu_2(len, data, bias, output); +#endif + } else if (activation == SWISH) { +#ifndef USE_ISPC + for (size_t b = 0; b < len; b++) { + float val = data[b] + bias[b]; + output[b] = val / (1.0f + exp(-val)); + ; + } +#else + ispc::ActivateSwish(len, data, bias, output); #endif } else if (activation == SELU) { #ifndef USE_ISPC @@ -120,7 +143,7 @@ void Activate(const size_t len, const float* data, const float* bias, output[b] = selu(val); } #else - ispc::ActivateSelu(len, 1.0f, data, bias, 0.0f, output); + ispc::ActivateSelu(len, data, bias, output); #endif } else { for (size_t b = 0; b < len; b++) { @@ -139,10 +162,14 @@ void Activate(const size_t len, float gamma, const float* data, output[b] = val; } } else if (activation == RELU) { +#ifndef USE_ISPC for (size_t b = 0; b < len; b++) { float val = gamma * data[b] + bias[b] + beta; output[b] = val > 0 ? val : 0; } +#else + ispc::ActivateRelu(len, gamma, data, bias, beta, output); +#endif } else if (activation == MISH) { #ifndef USE_ISPC for (size_t b = 0; b < len; b++) { @@ -174,10 +201,14 @@ void BiasResidual(const size_t batch_size, const size_t channels, float* data, arr[b] = val; } } else if (activation == RELU) { +#ifndef USE_ISPC for (size_t b = 0; b < kSquares; b++) { float val = res[b] + arr[b] + bias; arr[b] = val > 0 ? val : 0; } +#else + ispc::ActivateRelu(kSquares, 1.0f, res, arr, bias, arr); +#endif } else if (activation == MISH) { #ifndef USE_ISPC for (size_t b = 0; b < kSquares; b++) { diff --git a/src/neural/shared/activation.ispc b/src/neural/shared/activation.ispc index 6190198515..987dc3e689 100644 --- a/src/neural/shared/activation.ispc +++ b/src/neural/shared/activation.ispc @@ -36,6 +36,31 @@ export void ActivateMish(uniform const size_t len, uniform float gamma, } } +export void ActivateRelu(uniform const size_t len, uniform float gamma, + const uniform float data[], const uniform float bias[], + uniform float beta, uniform float output[]) { + foreach (b = 0 ... len) { + float val = gamma * data[b] + bias[b] + beta; + output[b] = val > 0 ? val : 0; + } +} + +export void ActivateSwish(uniform const size_t len, const uniform float data[], + const uniform float bias[], uniform float output[]) { + foreach (b = 0 ... len) { + float val = data[b] + bias[b]; + output[b] = val / (1.0f + exp(-val)); + } +} + +export void ActivateRelu_2(uniform const size_t len, const uniform float data[], + const uniform float bias[], uniform float output[]) { + foreach (b = 0 ... len) { + float val = data[b] + bias[b]; + output[b] = val > 0 ? val * val : 0; + } +} + static inline float selu(float val) { float alpha = 1.67326324f, scale = 1.05070098f; if (val > 0) { @@ -45,11 +70,10 @@ static inline float selu(float val) { } } -export void ActivateSelu(uniform const size_t len, uniform float gamma, - const uniform float data[], const uniform float bias[], - uniform float beta, uniform float output[]) { +export void ActivateSelu(uniform const size_t len, const uniform float data[], + const uniform float bias[], uniform float output[]) { foreach (b = 0 ... len) { - float val = gamma * data[b] + bias[b] + beta; + float val = data[b] + bias[b]; output[b] = selu(val); } } From 9a718518e42716ba0be39c5900a49052d9dd5833 Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 28 Mar 2023 11:43:39 +0100 Subject: [PATCH 45/46] Remove duplicate code, fix proto changes. 
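// (Context: this patch drops the temporary format-detection hack introduced
// in patch 39, now that net files carry the proper network format, and
// follows a proto rename that merges the smolgen/FFN "inherit" markers into
// a single ACTIVATION_DEFAULT value.)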
--- src/neural/blas/network_blas.cc | 23 ++--------------------- 1 file changed, 2 insertions(+), 21 deletions(-) diff --git a/src/neural/blas/network_blas.cc b/src/neural/blas/network_blas.cc index 6c1cb17782..8a5400ffe7 100644 --- a/src/neural/blas/network_blas.cc +++ b/src/neural/blas/network_blas.cc @@ -877,11 +877,11 @@ BlasNetwork::BlasNetwork(const WeightsFile& file, if (attn_body_) { const auto smol_act = file.format().network_format().smolgen_activation(); smolgen_activation_ = - smol_act == pblczero::NetworkFormat::SMOLGEN_ACTIVATION_INHERIT + smol_act == pblczero::NetworkFormat::ACTIVATION_DEFAULT ? default_activation_ : static_cast(smol_act); const auto ffn_act = file.format().network_format().ffn_activation(); - ffn_activation_ = ffn_act == pblczero::NetworkFormat::FFN_ACTIVATION_INHERIT + ffn_activation_ = ffn_act == pblczero::NetworkFormat::ACTIVATION_DEFAULT ? default_activation_ : static_cast(ffn_act); } @@ -1005,25 +1005,6 @@ std::unique_ptr MakeBlasNetwork(const std::optional& w, weights.format().network_format().default_activation()) + " is not supported by BLAS backend."); } - - // @todo Hack for old encoding compatibility. REMOVE BEFORE MERGING. - if (w->format().network_format().network() == - pblczero::NetworkFormat::NETWORK_SE_WITH_HEADFORMAT && - w->weights().encoder().size() > 0) { - CERR << "Attention body detected, hacking network format."; - WeightsFile x = *w; - x.mutable_format()->mutable_network_format()->set_network( - pblczero::NetworkFormat::NETWORK_ATTENTIONBODY_WITH_HEADFORMAT); - if (w->weights().has_smolgen_w()) { - CERR << "BT2 detected, hacking activations."; - x.mutable_format()->mutable_network_format()->set_ffn_activation( - pblczero::NetworkFormat::FFN_ACTIVATION_RELU_2); - x.mutable_format()->mutable_network_format()->set_smolgen_activation( - pblczero::NetworkFormat::SMOLGEN_ACTIVATION_SWISH); - } - return std::make_unique>(x, options); - } - return std::make_unique>(weights, options); } From 1d327af3c210341d1dfe509ac8f5c6d5b15eb93a Mon Sep 17 00:00:00 2001 From: Aniebiet Udoh Date: Tue, 28 Mar 2023 11:51:20 +0100 Subject: [PATCH 46/46] Remove missed import --- src/neural/shared/activation.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/neural/shared/activation.h b/src/neural/shared/activation.h index fba71a5229..3d1140765a 100644 --- a/src/neural/shared/activation.h +++ b/src/neural/shared/activation.h @@ -21,8 +21,6 @@ #include #include -#include "proto/net.pb.h" - namespace lczero { // The following list matches the one in net.proto. Ideally this would be done // by including proto/net.pb.h, but this is incompatible with nvcc.
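// For reference, scalar definitions of the activations this series adds to
// the BLAS backend, matching the ispc kernels above (a sketch for
// illustration, not the lc0 source):
#include <cmath>

inline float Relu2(float x) { return x > 0.0f ? x * x : 0.0f; }  // squared ReLU

inline float Swish(float x) {  // x * sigmoid(x)
  return x / (1.0f + std::exp(-x));
}

inline float Selu(float x) {
  const float alpha = 1.67326324f, scale = 1.05070098f;
  return scale * (x > 0.0f ? x : alpha * (std::exp(x) - 1.0f));
}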