
Merge pull request LeelaChessZero#8 from LeelaChessZero/master
get latest
ankan-ban authored Dec 21, 2018
2 parents beed96e + 1a5f95f commit 80ac4a1
Showing 9 changed files with 104 additions and 17 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -17,11 +17,11 @@ Versioning follows the Semantic Versioning guidelines, with major, minor and pat
Download using git:

```
git clone -b release --recurse-submodules https://github.com/LeelaChessZero/lc0.git
git clone -b release/0.19 --recurse-submodules https://github.com/LeelaChessZero/lc0.git
```

If downloading an archive, you need to also download and place the submodule:
* Download https://github.com/LeelaChessZero/lc0/archive/release.zip ([.tar.gz](https://github.com/LeelaChessZero/lc0/archive/release.tar.gz) archive is also available)
* Download https://github.com/LeelaChessZero/lc0/archive/release/0.19.zip ([.tar.gz](https://github.com/LeelaChessZero/lc0/archive/release/0.19.tar.gz) archive is also available)
* Extract
* Download https://github.com/LeelaChessZero/lczero-common/archive/master.zip (also available as [.tar.gz](https://github.com/LeelaChessZero/lczero-common/archive/master.tar.gz))
* Move the second archive into the first archive's `libs/lczero-common/` folder and extract
12 changes: 11 additions & 1 deletion appveyor.yml
@@ -39,6 +39,7 @@ install:
- cmd: IF NOT EXIST c:\cache\protobuf\ cmake -G "Visual Studio 15 2017 Win64" -Dprotobuf_BUILD_SHARED_LIBS=NO -Dprotobuf_MSVC_STATIC_RUNTIME=NO -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_INSTALL_PREFIX=c:/cache/protobuf ../cmake
- cmd: IF NOT EXIST c:\cache\protobuf\ msbuild INSTALL.vcxproj /p:Configuration=Release /p:Platform=x64 /m
- cmd: set PATH=c:\cache\protobuf\bin;%PATH%
- cmd: IF NOT EXIST c:\cache\testnet appveyor DownloadFile http://lczero.org/get_network?sha=7170f639ba1cdc407283b8e52377283e36845b954788c6ada8897937637ef032 -Filename c:\cache\testnet
- cmd: IF %GTEST%==true IF NOT EXIST C:\cache\syzygy mkdir C:\cache\syzygy
- cmd: IF %GTEST%==true cd C:\cache\syzygy
- cmd: IF %GTEST%==true IF NOT EXIST KQvK.rtbz curl --remote-name-all https://tablebase.lichess.ovh/tables/standard/3-4-5/K{P,N,R,B,Q}vK.rtb{w,z}
@@ -53,7 +54,16 @@ before_build:
- cmd: git submodule update --init --recursive
- cmd: meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BLAS% -Dcudnn=%CUDA% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%PKG_FOLDER%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%PKG_FOLDER%\cuda\lib\x64" -Dprotobuf_include="%PKG_FOLDER%\protobuf\include" -Dprotobuf_libdir="%PKG_FOLDER%\protobuf\lib" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\lib\x64" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\lib\x64" -Ddefault_library=static
build_script:
- cmd: msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
- cmd: IF %APPVEYOR_REPO_TAG%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
- cmd: IF %APPVEYOR_REPO_TAG%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGInstrument /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
- cmd: cd build
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %BLAS%==true copy C:\cache\OpenBLAS.0.2.14.1\lib\native\bin\x64\*.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %OPENCL%==true copy C:\cache\opencl-nug.0.777.12\build\native\bin\OpenCL.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy "%CUDA_PATH%"\bin\*.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy %PKG_FOLDER%\cuda\bin\cudnn64_7.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true lc0 benchmark --weights=c:\cache\testnet --backend=random --movetime=10000
- cmd: cd ..
- cmd: IF %APPVEYOR_REPO_TAG%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGOptimize /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
after_build:
- cmd: IF %APPVEYOR_REPO_TAG%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip %APPVEYOR_BUILD_FOLDER%\build\lc0.exe
- cmd: IF %APPVEYOR_REPO_TAG%==true appveyor DownloadFile "https://ci.appveyor.com/api/projects/LeelaChessZero/lczero-client/artifacts/client.exe?branch=release&pr=false&job=Environment%%3A%%20NAME%%3D.exe%%2C%%20GOOS%%3Dwindows"
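For tagged builds, the `build_script` section above implements profile-guided optimization with MSVC: the first `msbuild` pass links with `/p:WholeProgramOptimization=PGInstrument`, the `copy` steps stage the OpenBLAS/OpenCL/CUDA/cuDNN DLLs next to the instrumented binary so it can run, the `lc0 benchmark` invocation exercises that binary against the cached test network to record a profile, and the final pass relinks with `PGOptimize` using the recorded profile. Untagged builds keep the single `WholeProgramOptimization=true` pass.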
8 changes: 5 additions & 3 deletions build.sh
@@ -16,18 +16,20 @@ BUILDDIR=build/${BUILDTYPE}

if [ -f ${BUILDDIR}/build.ninja ]
then
meson configure ${BUILDDIR} --buildtype ${BUILDTYPE} --prefix ${INSTALL_PREFIX:-/usr/local} "$@"
meson configure ${BUILDDIR} -Dbuildtype=${BUILDTYPE} -Dprefix=${INSTALL_PREFIX:-/usr/local} "$@"
else
meson ${BUILDDIR} --buildtype ${BUILDTYPE} --prefix ${INSTALL_PREFIX:-/usr/local} "$@"
fi

pushd ${BUILDDIR}

NINJA=$(awk '/ninja/ {ninja=$4} END {print ninja}' meson-logs/meson-log.txt)

if [ -n "${INSTALL_PREFIX}" ]
then
ninja install
${NINJA} install
else
ninja
${NINJA}
fi

popd
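Two things change here: `meson configure` on an existing build directory now uses the `-Dbuildtype=`/`-Dprefix=` option syntax, and the `ninja` binary is taken from meson's own log (apparently the fourth field of the "Found ninja … at …" log line) so the install step runs the same ninja that meson detected, even when it is not on `PATH`.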
1 change: 1 addition & 0 deletions meson.build
@@ -26,6 +26,7 @@ endif
if cc.get_id() == 'clang' or cc.get_id() == 'gcc'
add_project_arguments('-Wextra', language : 'cpp')
add_project_arguments('-pedantic', language : 'cpp')
add_project_arguments('-ffast-math', language : 'cpp')

if get_option('buildtype') == 'release'
add_project_arguments('-march=native', language : 'cpp')
12 changes: 9 additions & 3 deletions src/mcts/search.cc
@@ -39,6 +39,7 @@
#include "mcts/node.h"
#include "neural/cache.h"
#include "neural/encoder.h"
#include "utils/fastmath.h"
#include "utils/random.h"

namespace lczero {
@@ -198,7 +199,7 @@ inline float ComputeCpuct(const SearchParams& params, uint32_t N) {
const float init = params.GetCpuct();
const float k = params.GetCpuctFactor();
const float base = params.GetCpuctBase();
return init + (k ? k * std::log((N + base) / base) : 0.0f);
return init + (k ? k * FastLog((N + base) / base) : 0.0f);
}
} // namespace
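For context, this function computes the schedule cpuct(N) = init + k * ln((N + base) / base), so exploration grows logarithmically with the parent visit count N; only the log implementation changes. A standalone sketch (parameter values are illustrative, not taken from this commit) for eyeballing the error FastLog introduces:

```
#include <cmath>
#include <cstdint>
#include <cstdio>

#include "utils/fastmath.h"

int main() {
  // Illustrative placeholders for GetCpuct()/GetCpuctFactor()/GetCpuctBase().
  const float init = 3.0f, k = 2.0f, base = 19652.0f;
  for (uint32_t n : {0u, 1000u, 100000u, 10000000u}) {
    const float arg = (n + base) / base;
    std::printf("N=%8u  exact=%.5f  fast=%.5f\n", n,
                init + k * std::log(arg), init + k * lczero::FastLog(arg));
  }
}
```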

@@ -837,7 +838,9 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
// n_in_flight_ is incremented. If the method returns false, then there is
// a search collision, and this node is already being expanded.
if (!node->TryStartScoreUpdate()) {
IncrementNInFlight(node, search_->root_node_, collision_limit - 1);
if (!is_root_node) {
IncrementNInFlight(node->GetParent(), search_->root_node_, collision_limit - 1);
}
return NodeToProcess::Collision(node, depth, collision_limit);
}
// Either terminal or unexamined leaf node -- the end of this playout.
@@ -1137,7 +1140,10 @@ void SearchWorker::FetchSingleNodeResult(NodeToProcess* node_to_process,
float p =
computation_->GetPVal(idx_in_computation, edge.GetMove().as_nn_index());
if (params_.GetPolicySoftmaxTemp() != 1.0f) {
p = pow(p, 1 / params_.GetPolicySoftmaxTemp());
// Flush denormals to zero.
p = p < 1.17549435E-38
? 0.0
: FastPow2(FastLog2(p) / params_.GetPolicySoftmaxTemp());
}
edge.edge()->SetP(p);
// Edge::SetP does some rounding, so only add to the total after rounding.
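The rewrite uses the identity p^(1/T) = 2^(log2(p) / T), and 1.17549435E-38 is FLT_MIN, the smallest normal float: FastLog2's bit tricks misread denormals, so anything smaller is flushed to zero first. A minimal sketch of the same transform, with standard-library calls standing in for the approximations:

```
#include <cfloat>
#include <cmath>

// Hypothetical helper, not part of the commit: flush denormal priors to
// zero, otherwise raise to the power 1/temp via the exp2/log2 identity.
inline float PolicyTempSketch(float p, float temp) {
  return p < FLT_MIN ? 0.0f : std::exp2(std::log2(p) / temp);
}
```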
2 changes: 0 additions & 2 deletions src/neural/cuda/layers.cc
@@ -297,7 +297,6 @@ void SELayer<float>::LoadWeights(float* w1, float* b1, float* w2, float* b2,
size_t num_weights1 = C * numFc1Out_;
size_t weight_size1 = sizeof(float) * num_weights1;

size_t num_weights2 = 2 * num_weights1;
size_t weight_size2 = 2 * weight_size1;

// Weight for the first FC layer.
@@ -385,7 +384,6 @@ void SELayer<float>::Eval(int N, float* output, const float* input,
const float* /*input2*/, void* scratch,
size_t scratch_size, cudnnHandle_t /*cudnn*/,
cublasHandle_t cublas) {
assert(output == input2);
// Ping-pong between 'op1' and 'op2' (parts of scratch memory).
float* op1 = (float*)scratch;
float* op2 = (float*)scratch + scratch_size / sizeof(float) / 2;
4 changes: 4 additions & 0 deletions src/neural/loader.cc
@@ -60,6 +60,10 @@ std::string DecompressGzip(const std::string& filename) {
if (!file) throw Exception("Cannot read weights from " + filename);
while (true) {
int sz = gzread(file, &buffer[bytes_read], buffer.size() - bytes_read);
if (sz < 0) {
int errnum;
throw Exception(gzerror(file, &errnum));
}
if (sz == static_cast<int>(buffer.size()) - bytes_read) {
bytes_read = buffer.size();
buffer.resize(buffer.size() * 2);
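Pieced together from the fragments above, a consolidated standalone sketch of the decompression loop after this fix (`std::runtime_error` stands in for lczero's `Exception`, and the starting buffer size is an assumption):

```
#include <stdexcept>
#include <string>

#include <zlib.h>

std::string DecompressGzipSketch(const std::string& filename) {
  std::string buffer(64 * 1024, '\0');  // starting size is an assumption
  int bytes_read = 0;
  gzFile file = gzopen(filename.c_str(), "rb");
  if (!file) throw std::runtime_error("Cannot read weights from " + filename);
  while (true) {
    const int sz =
        gzread(file, &buffer[bytes_read], buffer.size() - bytes_read);
    if (sz < 0) {
      // New in this commit: surface zlib's error text instead of silently
      // accepting a failed read.
      int errnum;
      std::string msg = gzerror(file, &errnum);
      gzclose(file);
      throw std::runtime_error(msg);
    }
    if (sz == static_cast<int>(buffer.size()) - bytes_read) {
      // Buffer filled exactly: double it and keep reading.
      bytes_read = buffer.size();
      buffer.resize(buffer.size() * 2);
    } else {
      bytes_read += sz;
      break;
    }
  }
  gzclose(file);
  buffer.resize(bytes_read);
  return buffer;
}
```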
10 changes: 4 additions & 6 deletions src/neural/opencl/OpenCLTuner.cc
@@ -44,8 +44,6 @@ static void sgemmBatched_ref(const std::vector<float>& a,
auto offset_v = batch * n * k;
auto offset_m = batch * m * n;

// cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans, m, n, k, 1.0f,
// &a[offset_u], m, &b[offset_v], n, 0.0f, &c[offset_m], n);
// Calculates C = transpose(transpose(A) * B) in row major, or
// C = A * transpose(B) in column major.
for (auto i = 0; i < m; i++) {
@@ -169,16 +167,16 @@ static float compare_ref(std::vector<float>& x, std::vector<float>& ref,
const int m_ceil, const int n_ceil) {
auto sum = 0.0f;
for (auto batch = 0; batch < batch_size; batch++) {
for (auto i = 0; i < n; i++) {
for (auto j = 0; j < m; j++) {
auto r = ref[batch * n * m + i * m + j];
for (auto j = 0; j < m; j++) {
for (auto i = 0; i < n; i++) {
auto r = ref[batch * n * m + j * n + i];
auto y = x[batch * n_ceil * m_ceil + j * n_ceil + i];

sum += (r - y) * (r - y);
}
}
}
return sum / (m * n);
return sum / (m * n * batch_size);
}

std::string Tuner::tune_sgemm(const int m, const int n, const int k,
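The compare_ref change fixes two things: the reference array is now indexed with the same column-major layout as the tuner output (j * n + i rather than i * m + j), and the sum is divided by the full element count m * n * batch_size, making it a true mean rather than a sum of per-batch means. A standalone sketch of the corrected metric (hypothetical name, simplified signature):

```
#include <vector>

// Mean squared error between tuner output x (padded to m_ceil x n_ceil per
// batch) and the reference, both read column-major.
float MeanSquaredErrorSketch(const std::vector<float>& x,
                             const std::vector<float>& ref, int batch_size,
                             int m, int n, int m_ceil, int n_ceil) {
  float sum = 0.0f;
  for (int batch = 0; batch < batch_size; batch++) {
    for (int j = 0; j < m; j++) {
      for (int i = 0; i < n; i++) {
        const float r = ref[batch * n * m + j * n + i];
        const float y = x[batch * n_ceil * m_ceil + j * n_ceil + i];
        sum += (r - y) * (r - y);
      }
    }
  }
  return sum / (m * n * batch_size);
}
```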
68 changes: 68 additions & 0 deletions src/utils/fastmath.h
@@ -0,0 +1,68 @@
/*
This file is part of Leela Chess Zero.
Copyright (C) 2018 The LCZero Authors
Leela Chess is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Leela Chess is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
Additional permission under GNU GPL version 3 section 7
If you modify this Program, or any covered work, by linking or
combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
modified version of those libraries), containing parts covered by the
terms of the respective license agreement, the licensors of this
Program grant you additional permission to convey the resulting work.
*/

#pragma once

#include <cmath>
#include <cstring>

namespace lczero {
// These stunts are performed by trained professionals, do not try this at home.

// Fast approximate log2(x). Does no range checking.
// The approximation used here is log2(2^N*(1+f)) ~ N+f*(1.342671-0.342671*f)
// where N is the integer and f the fractional part, f>=0.
inline float FastLog2(const float a) {
int32_t tmp;
std::memcpy(&tmp, &a, sizeof(float));
int expb = (tmp >> 23);
tmp = (tmp & 0x7fffff) | (0x7f << 23);
float out;
std::memcpy(&out, &tmp, sizeof(float));
return out * (2.028011f - 0.342671f * out) - 128.68534f + expb;
}

// Fast approximate 2^x. Does only limited range checking.
// The approximation used here is 2^(N+f) ~ 2^N*(1+f*(0.656366+0.343634*f))
// where N is the integer and f the fractional part, f>=0.
inline float FastPow2(const float a) {
if (a < -126) return 0.0;
int exp = floor(a);
float out = a - exp;
out = 1.0f + out * (0.656366f + 0.343634f * out);
int32_t tmp;
std::memcpy(&tmp, &out, sizeof(float));
tmp += exp << 23;
std::memcpy(&out, &tmp, sizeof(float));
return out;
}

// Fast approximate ln(x). Does no range checking.
inline float FastLog(const float a) {
return 0.6931471805599453f * FastLog2(a);
}

} // namespace lczero
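A quick standalone accuracy probe for the new header (not part of the commit); the quadratic fits keep errors around 1e-3 or better (absolute for the logs, relative for FastPow2), which is ample for the cpuct and policy-temperature uses above:

```
#include <cmath>
#include <cstdio>

#include "utils/fastmath.h"

int main() {
  for (float x : {1e-6f, 0.25f, 1.0f, 3.14159f, 1e6f}) {
    std::printf("x=%-8g log2 %.6f~%.6f  ln %.6f~%.6f\n", x, std::log2(x),
                lczero::FastLog2(x), std::log(x), lczero::FastLog(x));
  }
  // -130 exercises the a < -126 early-out (true exp2 would be denormal).
  for (float x : {-130.0f, -0.5f, 0.0f, 0.5f, 10.0f}) {
    std::printf("x=%-6g exp2 %.6f~%.6f\n", x, std::exp2(x),
                lczero::FastPow2(x));
  }
}
```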
