diff --git a/appveyor.yml b/appveyor.yml
index b8afe0fb1a..f773ea3859 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -17,7 +17,8 @@ install:
- cmd: IF %NAME%==opencl set OPENCL=true
- cmd: IF %NAME%==blas set BLAS=true
- cmd: IF %NAME%==blas set GTEST=true
-- cmd: IF %BLAS%==true nuget install OpenBLAS -Version 0.2.14.1 -OutputDirectory C:\cache
+- cmd: IF %BLAS%==true IF NOT EXIST C:\cache\OpenBLAS appveyor DownloadFile https://sjeng.org/ftp/OpenBLAS-0.3.3-win-oldthread.zip
+- cmd: IF %BLAS%==true IF NOT EXIST C:\cache\OpenBLAS 7z x OpenBLAS-0.3.3-win-oldthread.zip -oC:\cache\OpenBLAS
- cmd: IF %OPENCL%==true nuget install opencl-nug -Version 0.777.12 -OutputDirectory C:\cache
- cmd: IF %BLAS%==true IF NOT EXIST C:\cache\ispc-v1.9.2-windows appveyor DownloadFile https://sourceforge.net/projects/ispcmirror/files/v1.9.2/ispc-v1.9.2-windows.zip
- cmd: IF %BLAS%==true IF NOT EXIST C:\cache\ispc-v1.9.2-windows 7z x ispc-v1.9.2-windows.zip -oC:\cache
@@ -52,12 +53,12 @@ cache:
- C:\projects\lc0\subprojects\packagecache
before_build:
- cmd: git submodule update --init --recursive
-- cmd: meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BLAS% -Dcudnn=%CUDA% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%PKG_FOLDER%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%PKG_FOLDER%\cuda\lib\x64" -Dprotobuf_include="%PKG_FOLDER%\protobuf\include" -Dprotobuf_libdir="%PKG_FOLDER%\protobuf\lib" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS.0.2.14.1\lib\native\lib\x64" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\lib\x64" -Ddefault_library=static
+- cmd: meson build --backend vs2017 --buildtype release -Dgtest=%GTEST% -Dopencl=%OPENCL% -Dblas=%BLAS% -Dcudnn=%CUDA% -Dispc_native_only=false -Dpopcnt=false -Dcudnn_include="%CUDA_PATH%\include","%PKG_FOLDER%\cuda\include" -Dcudnn_libdirs="%CUDA_PATH%\lib\x64","%PKG_FOLDER%\cuda\lib\x64" -Dprotobuf_include="%PKG_FOLDER%\protobuf\include" -Dprotobuf_libdir="%PKG_FOLDER%\protobuf\lib" -Dopenblas_include="%PKG_FOLDER%\OpenBLAS\dist64\include" -Dopenblas_libdirs="%PKG_FOLDER%\OpenBLAS\dist64\lib" -Dopencl_include="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\include" -Dopencl_libdirs="%PKG_FOLDER%\opencl-nug.0.777.12\build\native\lib\x64" -Ddefault_library=static
build_script:
- cmd: IF %APPVEYOR_REPO_TAG%==false msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=true /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
- cmd: IF %APPVEYOR_REPO_TAG%==true msbuild "C:\projects\lc0\build\lc0.sln" /m /p:WholeProgramOptimization=PGInstrument /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
- cmd: cd build
-- cmd: IF %APPVEYOR_REPO_TAG%==true IF %BLAS%==true copy C:\cache\OpenBLAS.0.2.14.1\lib\native\bin\x64\*.dll
+- cmd: IF %APPVEYOR_REPO_TAG%==true IF %BLAS%==true copy C:\cache\OpenBLAS\dist64\bin\libopenblas.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %OPENCL%==true copy C:\cache\opencl-nug.0.777.12\build\native\bin\OpenCL.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy "%CUDA_PATH%"\bin\*.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true copy %PKG_FOLDER%\cuda\bin\cudnn64_7.dll
@@ -68,7 +69,7 @@ after_build:
- cmd: IF %APPVEYOR_REPO_TAG%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip %APPVEYOR_BUILD_FOLDER%\build\lc0.exe
- cmd: IF %APPVEYOR_REPO_TAG%==true appveyor DownloadFile "https://ci.appveyor.com/api/projects/LeelaChessZero/lczero-client/artifacts/client.exe?branch=release&pr=false&job=Environment%%3A%%20NAME%%3D.exe%%2C%%20GOOS%%3Dwindows"
- cmd: IF %APPVEYOR_REPO_TAG%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip client.exe
-- cmd: IF %APPVEYOR_REPO_TAG%==true IF %BLAS%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\OpenBLAS.0.2.14.1\lib\native\bin\x64\*.dll
+- cmd: IF %APPVEYOR_REPO_TAG%==true IF %BLAS%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\OpenBLAS\dist64\bin\libopenblas.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %OPENCL%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip C:\cache\opencl-nug.0.777.12\build\native\bin\OpenCL.dll
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%CUDA_PATH%\bin\cudart64_100.dll" "%CUDA_PATH%\bin\cublas64_100.dll"
- cmd: IF %APPVEYOR_REPO_TAG%==true IF %CUDA%==true 7z a lc0-%APPVEYOR_REPO_TAG_NAME%-windows-%NAME%.zip "%PKG_FOLDER%\cuda\bin\cudnn64_7.dll"
@@ -91,7 +92,7 @@ deploy:
appveyor_repo_tag: true
test_script:
- cmd: cd build
-- cmd: IF %GTEST%==true copy C:\cache\OpenBLAS.0.2.14.1\lib\native\bin\x64\*.dll
+- cmd: IF %GTEST%==true copy C:\cache\OpenBLAS\dist64\bin\libopenblas.dll
- cmd: IF %GTEST%==true xcopy /s /i C:\cache\syzygy syzygy
- cmd: IF %GTEST%==true meson test --print-errorlogs
- cmd: cd ..
diff --git a/changelog.txt b/changelog.txt
index bbf4629767..fbb902a7bb 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,3 +1,72 @@
+v0.20.0-rc1 (2018-12-22)
+~~~~~~~~~~~
+
+* Squeeze-and-Excitation Networks are now supported! (lc0.org/se)
+* Older text network files are no longer supported.
+* Various performance fixes (the most significant being fast approximate math
+  functions).
+* For systems with multiple GPUs, in addition to the "multiplexing" backend
+  there are now also "demux" and "roundrobin" backends.
+* Compiler settings tweaks (VS2017 is used for Windows builds, LTO is always
+  enabled, and Windows releases have PGO enabled).
+* Benchmark mode has more options now (e.g. movetime) and saner defaults.
+* Added an option to prevent the engine from resigning too early (used in
+  training).
+* Fixed a bug where the number of visits could be too high in collision nodes.
+  The fix is pretty hacky; a better fix will come later.
+* 32-bit version compiles again.
+
+v0.19.1 (2018-12-10)
+~~~~~~~
+
+(no changes relative to v0.19.1-rc2)
+
+v0.19.1-rc2 (2018-12-07)
+~~~~~~~~~~~
+
+* Temperature and FPU related params. (#568)
+* Rework Cpuct related params. (#567)
+
+v0.19.1-rc1 (2018-12-06)
+~~~~~~~~~~~
+
+* Updated cpuct formula from alphazero paper. (#563)
+* remove UpdateFromUciOptions() from EnsureReady() (#558)
+* revert IsSearchActive() and better fix for one of #500 crashes (#555)
+
+v0.19.0 (2018-11-19)
+~~~~~~~
+
+* remove Wait() from EngineController::Stop() (#522)
+
+v0.19.0-rc5 (2018-11-17)
+~~~~~~~~~~~
+
+* OpenCL: replace thread_local with a resource pool. (#516)
+* optional wtime and btime (#515)
+* Make convolve1 work with workgroup size of 128 (#514)
+* adjust average depth calculation for multivisits (#510)
+
+v0.19.0-rc4 (2018-11-12)
+~~~~~~~~~~~
+
+* Microseconds have 6 digits, not 3! (#505)
+* use bestmove_is_sent_ for Search::IsSearchActive() (#502)
+
+v0.19.0-rc3 (2018-11-07)
+~~~~~~~~~~~
+
+* Fix OpenCL tuner always loading the first saved tuning (#491)
+* Do not show warning when ComputeBlocking() takes too much time. (#494)
+* Output microseconds in log rather than milliseconds. (#495)
+* Add benchmark features (#483)
+* Fix EncodePositionForNN test failure (#490)
+
+v0.19.0-rc2 (2018-11-03)
+~~~~~~~~~~~
+
+* Version v0.19.0-rc1 reported its version as v0.19.0-dev.
+  Therefore v0.19.0-rc2 is released with this issue fixed.
+
v0.19.0-rc1 (2018-11-03)
~~~~~~~~~~~
diff --git a/meson.build b/meson.build
index c30d4592b6..9bbebb794c 100644
--- a/meson.build
+++ b/meson.build
@@ -15,7 +15,7 @@
# along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
project('lc0', 'cpp',
- default_options : ['cpp_std=c++14', 'b_ndebug=if-release', 'b_lto=true'],
+ default_options : ['cpp_std=c++14', 'b_ndebug=if-release'],
meson_version: '>=0.45')
cc = meson.get_compiler('cpp')
@@ -26,7 +26,6 @@ endif
if cc.get_id() == 'clang' or cc.get_id() == 'gcc'
add_project_arguments('-Wextra', language : 'cpp')
add_project_arguments('-pedantic', language : 'cpp')
- add_project_arguments('-ffast-math', language : 'cpp')
if get_option('buildtype') == 'release'
add_project_arguments('-march=native', language : 'cpp')
@@ -51,7 +50,10 @@ else
endif
protoc = find_program('protoc', required : false)
# For tensorflow skip system protobuf, chances are it will not work.
-if not protobuf_dep.found() or not protoc.found() or get_option('tensorflow')
+if get_option('protobuf-3-6-0')
+ deps += subproject('protobuf-3.6.0').get_variable('protobuf_dep')
+ protoc = subproject('protobuf-3.6.0').get_variable('protoc')
+elif not protobuf_dep.found() or not protoc.found() or get_option('tensorflow')
deps += subproject('protobuf').get_variable('protobuf_dep')
protoc = subproject('protobuf').get_variable('protoc')
else
@@ -98,9 +100,11 @@ files += [
'src/neural/factory.cc',
'src/neural/loader.cc',
'src/neural/network_check.cc',
+ 'src/neural/network_demux.cc',
'src/neural/network_legacy.cc',
'src/neural/network_mux.cc',
'src/neural/network_random.cc',
+ 'src/neural/network_rr.cc',
'src/neural/network_st_batch.cc',
'src/neural/writer.cc',
'src/selfplay/game.cc',
@@ -155,6 +159,7 @@ if get_option('build_backends')
tensorflow_include,
tensorflow_include[0] + '/bazel-genfiles',
tensorflow_include[0] + '/tensorflow/contrib/makefile/downloads',
+ tensorflow_include[0] + '/tensorflow/contrib/makefile/downloads/absl',
tensorflow_include[0] + '/tensorflow/contrib/makefile/downloads/eigen',
tensorflow_include[0] + '/tensorflow/contrib/makefile/downloads/gemmlowp',
tensorflow_include[0] + '/tensorflow/contrib/makefile/downloads/nsync/public',
@@ -179,6 +184,9 @@ if get_option('build_backends')
mkl_libdirs = get_option('mkl_libdirs')
mkl_lib = cc.find_library('mkl_rt', dirs: mkl_libdirs, required: false)
+ if not mkl_lib.found()
+ mkl_lib = cc.find_library('mklml', dirs: mkl_libdirs, required: false)
+ endif
openblas_libdirs = get_option('openblas_libdirs')
openblas_lib = cc.find_library('openblas.dll', dirs: openblas_libdirs, required: false)
@@ -394,9 +402,9 @@ endif # if get_option('build_backends')
if not has_backends and get_option('build_backends')
error('''
- No usable computation backends (cudnn/tensorflow/etc) are found.
- If you want to build it with random only backend, pass
- -D build_backends=false to a meson build.''')
+ No usable computation backends (cudnn/opencl/blas/etc) enabled.
+ If you want to build with the random backend only, add
+ -Dbuild_backends=false to the build command line.''')
endif
diff --git a/meson_options.txt b/meson_options.txt
index 5d8bd7d012..04993f626f 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -117,3 +117,8 @@ option('gtest',
type: 'boolean',
value: true,
description: 'Build gtest tests')
+
+option('protobuf-3-6-0',
+ type: 'boolean',
+ value: false,
+ description: 'Use the protobuf 3.6.0 subproject')
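
The new option is a standard meson boolean, so a build that wants the bundled
protobuf 3.6.0 subproject would opt in on the configure line; a usage sketch
(flags other than -Dprotobuf-3-6-0 are illustrative):

    meson build --buildtype release -Dprotobuf-3-6-0=true
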
diff --git a/src/benchmark/benchmark.cc b/src/benchmark/benchmark.cc
index eb83368fea..234f0b2b5c 100644
--- a/src/benchmark/benchmark.cc
+++ b/src/benchmark/benchmark.cc
@@ -56,17 +56,6 @@ void Benchmark::Run() {
options.Add<IntOption>(kNNCacheSizeId, 0, 999999999) = 200000;
options.Add<IntOption>(kThreadsOptionId, 1, 128) = kDefaultThreads;
- auto defaults = options.GetMutableDefaultsOptions();
-
- defaults->Set<int>(SearchParams::kMiniBatchSizeId.GetId(), 256);
- defaults->Set<float>(SearchParams::kFpuReductionId.GetId(), 1.2f);
- defaults->Set<float>(SearchParams::kCpuctId.GetId(), 3.4f);
- defaults->Set<float>(SearchParams::kPolicySoftmaxTempId.GetId(), 2.2f);
- defaults->Set<int>(SearchParams::kMaxCollisionVisitsId.GetId(), 9999);
- defaults->Set<int>(SearchParams::kMaxCollisionEventsId.GetId(), 32);
- defaults->Set<int>(SearchParams::kCacheHistoryLengthId.GetId(), 0);
- defaults->Set<bool>(SearchParams::kOutOfOrderEvalId.GetId(), true);
-
if (!options.ProcessAllFlags()) return;
try {
diff --git a/src/chess/board.cc b/src/chess/board.cc
index 609ae57ff2..d9689ab26c 100644
--- a/src/chess/board.cc
+++ b/src/chess/board.cc
@@ -188,6 +188,7 @@ BitBoard ChessBoard::en_passant() const { return pawns_ - pawns(); }
MoveList ChessBoard::GeneratePseudolegalMoves() const {
MoveList result;
+ result.reserve(60);
for (auto source : our_pieces_) {
// King
if (source == our_king_) {
@@ -336,8 +337,8 @@ MoveList ChessBoard::GeneratePseudolegalMoves() const {
}
// Knight.
{
- for (const auto destination : kKnightAttacks[source.as_int()]) {
- if (our_pieces_.get(destination)) continue;
+ for (const auto destination :
+ kKnightAttacks[source.as_int()] - our_pieces_) {
result.emplace_back(source, destination);
}
}
@@ -405,9 +406,6 @@ bool ChessBoard::ApplyMove(Move move) {
return reset_50_moves;
}
- // Now destination square for our piece is known.
- our_pieces_.set(to);
-
// Promotion
if (move.promotion() != Move::Promotion::None) {
switch (move.promotion()) {
@@ -456,13 +454,13 @@ bool ChessBoard::ApplyMove(Move move) {
bool ChessBoard::IsUnderAttack(BoardSquare square) const {
const int row = square.row();
const int col = square.col();
- // Check king
+ // Check king.
{
const int krow = their_king_.row();
const int kcol = their_king_.col();
if (std::abs(krow - row) <= 1 && std::abs(kcol - col) <= 1) return true;
}
- // Check Rooks (and queen)
+ // Check rooks (and queens).
if (kRookAttacks[square.as_int()].intersects(their_pieces_ * rooks_)) {
for (const auto& direction : kRookDirections) {
auto dst_row = row;
@@ -480,7 +478,7 @@ bool ChessBoard::IsUnderAttack(BoardSquare square) const {
}
}
}
- // Check Bishops
+ // Check bishops.
if (kBishopAttacks[square.as_int()].intersects(their_pieces_ * bishops_)) {
for (const auto& direction : kBishopDirections) {
auto dst_row = row;
@@ -498,11 +496,11 @@ bool ChessBoard::IsUnderAttack(BoardSquare square) const {
}
}
}
- // Check pawns
+ // Check pawns.
if (kPawnAttacks[square.as_int()].intersects(their_pieces_ * pawns_)) {
return true;
}
- // Check knights
+ // Check knights.
{
if (kKnightAttacks[square.as_int()].intersects(their_pieces_ - their_king_ -
rooks_ - bishops_ -
@@ -513,18 +511,135 @@ bool ChessBoard::IsUnderAttack(BoardSquare square) const {
return false;
}
-bool ChessBoard::IsLegalMove(Move move, bool was_under_check) const {
- const auto& from = move.from();
- const auto& to = move.to();
+KingAttackInfo ChessBoard::GenerateKingAttackInfo() const {
+ KingAttackInfo king_attack_info;
- // If we are already under check, also apply move and check if valid.
- // TODO(mooskagh) Optimize this case
- if (was_under_check) {
- ChessBoard board(*this);
- board.ApplyMove(move);
- return !board.IsUnderCheck();
+ // Number of attackers that give check (used for double check detection).
+ unsigned num_king_attackers = 0;
+
+ const int row = our_king_.row();
+ const int col = our_king_.col();
+ // King checks are unnecessary, as kings cannot give check.
+ // Check rooks (and queens).
+ if (kRookAttacks[our_king_.as_int()].intersects(their_pieces_ * rooks_)) {
+ for (const auto& direction : kRookDirections) {
+ auto dst_row = row;
+ auto dst_col = col;
+ BitBoard attack_line(0);
+ bool possible_pinned_piece_found = false;
+ BoardSquare possible_pinned_piece;
+ while (true) {
+ dst_row += direction.first;
+ dst_col += direction.second;
+ if (!BoardSquare::IsValid(dst_row, dst_col)) break;
+ const BoardSquare destination(dst_row, dst_col);
+ if (our_pieces_.get(destination)) {
+ if (possible_pinned_piece_found) {
+ // No pieces pinned.
+ break;
+ } else {
+ // This is a possible pinned piece.
+ possible_pinned_piece_found = true;
+ possible_pinned_piece = destination;
+ }
+ }
+ if (!possible_pinned_piece_found) {
+ attack_line.set(destination);
+ }
+ if (their_pieces_.get(destination)) {
+ if (rooks_.get(destination)) {
+ if (possible_pinned_piece_found) {
+ // Store the pinned piece.
+ king_attack_info.pinned_pieces_.set(possible_pinned_piece);
+ } else {
+ // Update attacking lines.
+ king_attack_info.attacking_lines_ =
+ king_attack_info.attacking_lines_ + attack_line;
+ num_king_attackers++;
+ }
+ }
+ break;
+ }
+ }
+ }
+ }
+ // Check bishops.
+ if (kBishopAttacks[our_king_.as_int()].intersects(their_pieces_ * bishops_)) {
+ for (const auto& direction : kBishopDirections) {
+ auto dst_row = row;
+ auto dst_col = col;
+ BitBoard attack_line(0);
+ bool possible_pinned_piece_found = false;
+ BoardSquare possible_pinned_piece;
+ while (true) {
+ dst_row += direction.first;
+ dst_col += direction.second;
+ if (!BoardSquare::IsValid(dst_row, dst_col)) break;
+ const BoardSquare destination(dst_row, dst_col);
+ if (our_pieces_.get(destination)) {
+ if (possible_pinned_piece_found) {
+ // No pieces pinned.
+ break;
+ } else {
+ // This is a possible pinned piece.
+ possible_pinned_piece_found = true;
+ possible_pinned_piece = destination;
+ }
+ }
+ if (!possible_pinned_piece_found) {
+ attack_line.set(destination);
+ }
+ if (their_pieces_.get(destination)) {
+ if (bishops_.get(destination)) {
+ if (possible_pinned_piece_found) {
+ // Store the pinned piece.
+ king_attack_info.pinned_pieces_.set(possible_pinned_piece);
+ } else {
+ // Update attacking lines.
+ king_attack_info.attacking_lines_ =
+ king_attack_info.attacking_lines_ + attack_line;
+ num_king_attackers++;
+ }
+ }
+ break;
+ }
+ }
+ }
+ }
+ // Check pawns.
+ const BitBoard attacking_pawns =
+ kPawnAttacks[our_king_.as_int()] * their_pieces_ * pawns_;
+ king_attack_info.attacking_lines_ =
+ king_attack_info.attacking_lines_ + attacking_pawns;
+
+ if (attacking_pawns.as_int()) {
+ // No more than one pawn can give check.
+ num_king_attackers++;
}
+ // Check knights.
+ const BitBoard attacking_knights =
+ kKnightAttacks[our_king_.as_int()] *
+ (their_pieces_ - their_king_ - rooks_ - bishops_ - (pawns_ * kPawnMask));
+ king_attack_info.attacking_lines_ =
+ king_attack_info.attacking_lines_ + attacking_knights;
+
+ if (attacking_knights.as_int()) {
+ // No more than one knight can give check.
+ num_king_attackers++;
+ }
+
+ assert(num_king_attackers <= 2);
+ king_attack_info.double_check_ = (num_king_attackers == 2);
+
+ return king_attack_info;
+}
+
+bool ChessBoard::IsLegalMove(Move move,
+ const KingAttackInfo& king_attack_info) const {
+ const auto& from = move.from();
+ const auto& to = move.to();
+
// En passant. Complex but rare. Just apply
// and check that we are not under check.
if (from.row() == 4 && pawns_.get(from) && from.col() != to.col() &&
@@ -534,83 +649,72 @@ bool ChessBoard::IsLegalMove(Move move, bool was_under_check) const {
return !board.IsUnderCheck();
}
- // If it's kings move, check that destination
- // is not under attack.
+ // Check if we are already under check.
+ if (king_attack_info.in_check()) {
+ // King move.
+ if (from == our_king_) {
+ // Just apply and check that we are not under check.
+ ChessBoard board(*this);
+ board.ApplyMove(move);
+ return !board.IsUnderCheck();
+ }
+
+ // Pinned pieces can never resolve a check.
+ if (king_attack_info.is_pinned(from)) {
+ return false;
+ }
+
+ // The piece to move is not a king and is not pinned.
+ if (king_attack_info.in_double_check()) {
+ // Only a king move can resolve the double check.
+ return false;
+ } else {
+ // Only one attacking piece gives check.
+ // Our piece is free to move (not pinned). Check if the attacker is
+ // captured or interposed after the piece has moved to its destination
+ // square.
+ return king_attack_info.is_on_attack_line(to);
+ }
+ }
+
+ // Castlings were checked earlier.
+ // Moreover, no pseudolegal king moves to an attacked square are generated.
+ // If it's a king move at this point, it's certainly legal.
if (from == our_king_) {
- // Castlings were checked earlier.
- if (std::abs(static_cast<int>(from.col()) - static_cast<int>(to.col())) > 1)
- return true;
- return !IsUnderAttack(to);
+ return true;
}
- // Not check that piece was pinned. And it was, check that after the move
- // it is still on like of attack.
- int dx = from.col() - our_king_.col();
- int dy = from.row() - our_king_.row();
-
- // If it's not on the same file/rank/diagonal as our king, cannot be pinned.
- if (dx != 0 && dy != 0 && std::abs(dx) != std::abs(dy)) return true;
- dx = (dx > 0) - (dx < 0); // Sign.
- dy = (dy > 0) - (dy < 0);
- auto col = our_king_.col();
- auto row = our_king_.row();
- while (true) {
- col += dx;
- row += dy;
- // Attacking line left board, good.
- if (!BoardSquare::IsValid(row, col)) return true;
- const BoardSquare square(row, col);
- // The source square of the move is now free.
- if (square == from) continue;
- // The destination square if the move is our piece. King is not under
- // attack.
- if (square == to) return true;
- // Our piece on the line. Not under attack.
- if (our_pieces_.get(square)) return true;
- if (their_pieces_.get(square)) {
- if (dx == 0 || dy == 0) {
- // Have to be afraid of rook-like piece.
- return !rooks_.get(square);
- } else {
- // Have to be afraid of bishop-like piece.
- return !bishops_.get(square);
- }
- return true;
- }
+ // If we get here, we are not under check.
+ // If the piece is not pinned, it is free to move anywhere.
+ if (!king_attack_info.is_pinned(from)) return true;
+
+ // The piece is pinned. Now check that it stays on the same line w.r.t. the
+ // king.
+ int dx_from = from.col() - our_king_.col();
+ int dy_from = from.row() - our_king_.row();
+ int dx_to = to.col() - our_king_.col();
+ int dy_to = to.row() - our_king_.row();
+
+ if (dx_from == 0 || dx_to == 0) {
+ return (dx_from == dx_to);
+ } else {
+ return (dx_from * dy_to == dx_to * dy_from);
}
}
MoveList ChessBoard::GenerateLegalMoves() const {
- const bool was_under_check = IsUnderCheck();
+ const KingAttackInfo king_attack_info = GenerateKingAttackInfo();
MoveList move_list = GeneratePseudolegalMoves();
MoveList result;
result.reserve(move_list.size());
for (Move m : move_list) {
- if (IsLegalMove(m, was_under_check)) result.emplace_back(m);
+ if (IsLegalMove(m, king_attack_info)) result.emplace_back(m);
}
return result;
}
-std::vector<MoveExecution> ChessBoard::GenerateLegalMovesAndPositions() const {
- MoveList move_list = GeneratePseudolegalMoves();
- std::vector<MoveExecution> result;
-
- for (const auto& move : move_list) {
- result.emplace_back();
- auto& newboard = result.back().board;
- newboard = *this;
- result.back().reset_50_moves = newboard.ApplyMove(move);
- if (newboard.IsUnderCheck()) {
- result.pop_back();
- continue;
- }
- result.back().move = move;
- }
- return result;
-}
-
void ChessBoard::SetFromFen(const std::string& fen, int* no_capture_ply,
int* moves) {
Clear();
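
The rewritten pinned-piece branch of IsLegalMove above is a plain 2-D
collinearity test: a pinned piece may only move along the ray through the
king. A minimal standalone sketch of the same arithmetic, with hypothetical
coordinates rather than the engine's BoardSquare type:

    #include <cassert>

    // Returns true if (to_col, to_row) stays on the king->from line.
    // Mirrors the dx/dy cross-product test in ChessBoard::IsLegalMove.
    bool StaysOnPinLine(int king_col, int king_row, int from_col, int from_row,
                        int to_col, int to_row) {
      const int dx_from = from_col - king_col;
      const int dy_from = from_row - king_row;
      const int dx_to = to_col - king_col;
      const int dy_to = to_row - king_row;
      if (dx_from == 0 || dx_to == 0) return dx_from == dx_to;
      return dx_from * dy_to == dx_to * dy_from;  // Collinear direction vectors.
    }

    int main() {
      // A rook pinned on the e-file may slide along the file...
      assert(StaysOnPinLine(4, 0, 4, 3, 4, 6));
      // ...but may not leave it.
      assert(!StaysOnPinLine(4, 0, 4, 3, 6, 3));
    }
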
diff --git a/src/chess/board.h b/src/chess/board.h
index 830e4c35cf..9188c5f6f3 100644
--- a/src/chess/board.h
+++ b/src/chess/board.h
@@ -33,7 +33,22 @@
namespace lczero {
-struct MoveExecution;
+// Represents king attack info used during legal move detection.
+class KingAttackInfo {
+ public:
+ bool in_check() const { return attacking_lines_.as_int(); }
+ bool in_double_check() const { return double_check_; }
+ bool is_pinned(const BoardSquare square) const {
+ return pinned_pieces_.get(square);
+ }
+ bool is_on_attack_line(const BoardSquare square) const {
+ return attacking_lines_.get(square);
+ }
+
+ bool double_check_ = false;
+ BitBoard pinned_pieces_ = {0};
+ BitBoard attacking_lines_ = {0};
+};
// Represents a board position.
// Unlike most chess engines, the board is mirrored for black.
@@ -66,23 +81,25 @@ class ChessBoard {
bool ApplyMove(Move move);
// Checks if the square is under attack from "theirs" (black).
bool IsUnderAttack(BoardSquare square) const;
+ // Generates the king attack info used for legal move detection.
+ KingAttackInfo GenerateKingAttackInfo() const;
// Checks if "our" (white) king is under check.
bool IsUnderCheck() const { return IsUnderAttack(our_king_); }
- // Checks whether at least one of the sides has mating material.
+ // Checks whether at least one of the sides has mating material.
bool HasMatingMaterial() const;
// Generates legal moves.
MoveList GenerateLegalMoves() const;
// Check whether pseudolegal move is legal.
- bool IsLegalMove(Move move, bool was_under_check) const;
- // Returns a list of legal moves and board positions after the move is made.
- std::vector<MoveExecution> GenerateLegalMovesAndPositions() const;
+ bool IsLegalMove(Move move, const KingAttackInfo& king_attack_info) const;
uint64_t Hash() const {
return HashCat({our_pieces_.as_int(), their_pieces_.as_int(),
rooks_.as_int(), bishops_.as_int(), pawns_.as_int(),
- our_king_.as_int(), their_king_.as_int(),
- castlings_.as_int(), flipped_});
+ (static_cast<uint32_t>(our_king_.as_int()) << 24) |
+ (static_cast<uint32_t>(their_king_.as_int()) << 16) |
+ (static_cast<uint32_t>(castlings_.as_int()) << 8) |
+ static_cast<uint32_t>(flipped_)});
}
class Castlings {
@@ -168,8 +185,8 @@ class ChessBoard {
// Pawns.
// Ranks 1 and 8 have special meaning. Pawn at rank 1 means that
// corresponding white pawn on rank 4 can be taken en passant. Rank 8 is the
- // same for black pawns. Those "fake" pawns are not present in white_ and
- // black_ bitboards.
+ // same for black pawns. Those "fake" pawns are not present in our_pieces_ and
+ // their_pieces_ bitboards.
BitBoard pawns_;
BoardSquare our_king_;
BoardSquare their_king_;
@@ -177,11 +194,4 @@ class ChessBoard {
bool flipped_ = false; // aka "Black to move".
};
-// Stores the move and state of the board after the move is done.
-struct MoveExecution {
- Move move;
- ChessBoard board;
- bool reset_50_moves;
-};
-
} // namespace lczero
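
The reworked Hash() packs the two 6-bit king squares, the 4 castling bits,
and the flipped flag into one word, so HashCat mixes six 64-bit values
instead of nine. A quick standalone check that the chosen shifts (24/16/8)
keep the fields from overlapping, assuming the field widths stated in
board.h:

    #include <cassert>
    #include <cstdint>

    int main() {
      const std::uint32_t our_king = 63;    // 6-bit square index (max value).
      const std::uint32_t their_king = 63;  // 6-bit square index.
      const std::uint32_t castlings = 0xf;  // 4 castling-rights bits.
      const std::uint32_t flipped = 1;      // 1 bit.
      const std::uint32_t packed =
          (our_king << 24) | (their_king << 16) | (castlings << 8) | flipped;
      // Even with every field at its maximum, nothing bleeds into a neighbor.
      assert(packed == 0x3f3f0f01u);
    }
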
diff --git a/src/engine.cc b/src/engine.cc
index ec80370dbb..6d160f8fcd 100644
--- a/src/engine.cc
+++ b/src/engine.cc
@@ -58,19 +58,17 @@ const OptionId kMoveOverheadId{
"Amount of time, in milliseconds, that the engine subtracts from it's "
"total available time (to compensate for slow connection, interprocess "
"communication, etc)."};
-const OptionId kTimePeakPlyId{"time-peak-halfmove", "TimePeakHalfmove",
- "For which halfmove the time budgeting algorithm "
- "should allocate the maximum amount of time."};
-const OptionId kTimeLeftWidthId{
- "time-left-width", "TimeLeftWidth",
- "\"Width\" of time budget graph to the left of the peak value. For small "
- "values, moves far from the peak will get little time; for larger values, "
- "they will get almost the same time as the peak move."};
-const OptionId kTimeRightWidthId{
- "time-right-width", "TimeRightWidth",
- "\"Width\" of time budget graph to the right of the peak value. For small "
- "values, moves far from the peak will get little time; for larger values, "
- "they will get almost the same time as the peak move."};
+const OptionId kTimeMidpointMoveId{
+ "time-midpoint-move", "TimeMidpointMove",
+ "The move where the time budgeting algorithm guesses half of all "
+ "games to be completed by. Half of the time allocated for the first move "
+ "is allocated at approximately this move."};
+const OptionId kTimeSteepnessId{
+ "time-steepness", "TimeSteepness",
+ "\"Steepness\" of the function the time budgeting algorithm uses to "
+ "consider when games are completed. Lower values leave more time for "
+ "the endgame, higher values use more time for each move before the "
+ "midpoint."};
const OptionId kSyzygyTablebaseId{
"syzygy-paths", "SyzygyPath",
"List of Syzygy tablebase directories, list entries separated by system "
@@ -99,13 +97,22 @@ const size_t kAvgCacheItemSize =
NNCache::GetItemStructSize() + sizeof(CachedNNRequest) +
sizeof(CachedNNRequest::IdxAndProb) * kAvgMovesPerPosition;
-float ComputeMoveWeight(int ply, float peak, float left_width,
- float right_width) {
- // Inflection points of the function are at ply = peak +/- width.
- // At these points the function is at 2/3 of its max value.
- const float width = ply > peak ? right_width : left_width;
- constexpr float width_scaler = 1.518651485f; // 2 / log(2 + sqrt(3))
- return std::pow(std::cosh((ply - peak) / width / width_scaler), -2.0f);
+float ComputeEstimatedMovesToGo(int ply, float midpoint, float steepness) {
+ // An analysis of chess games shows that the distribution of game lengths
+ // looks like a log-logistic distribution. The mean residual time function
+ // calculates how many more moves are expected in the game given that we are
+ // at the current ply. Given that this function can be expensive to compute,
+ // we calculate the median residual time function instead. This is derived and
+ // shown to be similar to the mean residual time in "Some Useful Properties of
+ // Log-Logistic Random Variables for Health Care Simulations" (Clark &
+ // El-Taha, 2015).
+ // midpoint: The median length of games.
+ // steepness: How quickly the function drops off from its maximum value,
+ // around the midpoint.
+ float move = ply / 2.0f;
+ return midpoint * std::pow(1 + 2 * std::pow(move / midpoint, steepness),
+ 1 / steepness) -
+ move;
}
} // namespace
@@ -125,36 +132,22 @@ void EngineController::PopulateOptions(OptionsParser* options) {
options->Add<IntOption>(kNNCacheSizeId, 0, 999999999) = 200000;
options->Add<FloatOption>(kSlowMoverId, 0.0f, 100.0f) = 1.0f;
options->Add<IntOption>(kMoveOverheadId, 0, 100000000) = 200;
- options->Add<FloatOption>(kTimePeakPlyId, -1000.0f, 1000.0f) = 26.2f;
- options->Add<FloatOption>(kTimeLeftWidthId, 0.0f, 1000.0f) = 82.0f;
- options->Add<FloatOption>(kTimeRightWidthId, 0.0f, 1000.0f) = 74.0f;
+ options->Add<FloatOption>(kTimeMidpointMoveId, 1.0f, 100.0f) = 51.5f;
+ options->Add<FloatOption>(kTimeSteepnessId, 1.0f, 100.0f) = 7.0f;
options->Add<StringOption>(kSyzygyTablebaseId);
// Add "Ponder" option to signal to GUIs that we support pondering.
// This option is currently not used by lc0 in any way.
options->Add<BoolOption>(kPonderId) = true;
- options->Add<FloatOption>(kSpendSavedTimeId, 0.0f, 1.0f) = 0.6f;
+ options->Add<FloatOption>(kSpendSavedTimeId, 0.0f, 1.0f) = 1.0f;
options->Add<IntOption>(kRamLimitMbId, 0, 100000000) = 0;
// Hide time curve options.
- options->HideOption(kTimePeakPlyId);
- options->HideOption(kTimeLeftWidthId);
- options->HideOption(kTimeRightWidthId);
+ options->HideOption(kTimeMidpointMoveId);
+ options->HideOption(kTimeSteepnessId);
NetworkFactory::PopulateOptions(options);
SearchParams::Populate(options);
ConfigFile::PopulateOptions(options);
-
- auto defaults = options->GetMutableDefaultsOptions();
-
- defaults->Set<int>(SearchParams::kMiniBatchSizeId.GetId(), 256);
- defaults->Set<float>(SearchParams::kFpuReductionId.GetId(), 1.2f);
- defaults->Set<float>(SearchParams::kCpuctId.GetId(), 3.0f);
- defaults->Set<float>(SearchParams::kCpuctFactorId.GetId(), 2.0f);
- defaults->Set<float>(SearchParams::kPolicySoftmaxTempId.GetId(), 2.2f);
- defaults->Set<int>(SearchParams::kMaxCollisionVisitsId.GetId(), 9999);
- defaults->Set<int>(SearchParams::kMaxCollisionEventsId.GetId(), 32);
- defaults->Set<int>(SearchParams::kCacheHistoryLengthId.GetId(), 0);
- defaults->Set<bool>(SearchParams::kOutOfOrderEvalId.GetId(), true);
}
SearchLimits EngineController::PopulateSearchLimits(
@@ -162,11 +155,6 @@ SearchLimits EngineController::PopulateSearchLimits(
std::chrono::steady_clock::time_point start_time) {
SearchLimits limits;
int64_t move_overhead = options_.Get<int>(kMoveOverheadId.GetId());
- if (params.movetime) {
- limits.search_deadline = start_time + std::chrono::milliseconds(
- *params.movetime - move_overhead);
- }
-
const optional<int64_t>& time = (is_black ? params.btime : params.wtime);
if (!params.searchmoves.empty()) {
limits.searchmoves.reserve(params.searchmoves.size());
@@ -175,6 +163,10 @@ SearchLimits EngineController::PopulateSearchLimits(
}
}
limits.infinite = params.infinite || params.ponder;
+ if (params.movetime && !limits.infinite) {
+ limits.search_deadline = start_time + std::chrono::milliseconds(
+ *params.movetime - move_overhead);
+ }
if (params.nodes) limits.visits = *params.nodes;
int ram_limit = options_.Get<int>(kRamLimitMbId.GetId());
if (ram_limit) {
@@ -192,19 +184,26 @@ SearchLimits EngineController::PopulateSearchLimits(
const optional<int64_t>& inc = is_black ? params.binc : params.winc;
int increment = inc ? std::max(int64_t(0), *inc) : 0;
- int movestogo = params.movestogo.value_or(50);
- // Fix non-standard uci command.
- if (movestogo == 0) movestogo = 1;
-
// How to scale moves time.
float slowmover = options_.Get<float>(kSlowMoverId.GetId());
- float time_curve_peak = options_.Get<float>(kTimePeakPlyId.GetId());
- float time_curve_left_width = options_.Get<float>(kTimeLeftWidthId.GetId());
- float time_curve_right_width = options_.Get<float>(kTimeRightWidthId.GetId());
+ float time_curve_midpoint = options_.Get<float>(kTimeMidpointMoveId.GetId());
+ float time_curve_steepness = options_.Get<float>(kTimeSteepnessId.GetId());
+
+ float movestogo =
+ ComputeEstimatedMovesToGo(ply, time_curve_midpoint, time_curve_steepness);
+
+ // If the number of moves remaining until the time control is less than
+ // the estimated number of moves left in the game, then use the number of
+ // moves until the time control instead.
+ if (params.movestogo &&
+ *params.movestogo > 0 && // Ignore non-standard uci command.
+ *params.movestogo < movestogo) {
+ movestogo = *params.movestogo;
+ }
- // Total time till control including increments.
+ // Total time, including increments, until time control.
auto total_moves_time =
- std::max(int64_t{0}, *time + increment * (movestogo - 1) - move_overhead);
+ std::max(0.0f, *time + increment * (movestogo - 1) - move_overhead);
// If there is time spared from previous searches, the `time_to_squander` part
// of it will be used immediately, remove that from planning.
@@ -216,20 +215,12 @@ SearchLimits EngineController::PopulateSearchLimits(
total_moves_time -= time_to_squander;
}
- constexpr int kSmartPruningToleranceMs = 200;
- float this_move_weight = ComputeMoveWeight(
- ply, time_curve_peak, time_curve_left_width, time_curve_right_width);
- float other_move_weights = 0.0f;
- for (int i = 1; i < movestogo; ++i)
- other_move_weights +=
- ComputeMoveWeight(ply + 2 * i, time_curve_peak, time_curve_left_width,
- time_curve_right_width);
- // Compute the move time without slowmover.
- float this_move_time = total_moves_time * this_move_weight /
- (this_move_weight + other_move_weights);
+ // Evenly split total time between all moves.
+ float this_move_time = total_moves_time / movestogo;
// Only extend thinking time with slowmover if smart pruning can potentially
// reduce it.
+ constexpr int kSmartPruningToleranceMs = 200;
if (slowmover < 1.0 ||
this_move_time * slowmover > kSmartPruningToleranceMs) {
this_move_time *= slowmover;
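
For intuition about the new time manager: ComputeEstimatedMovesToGo is the
median residual move count of a log-logistic game-length distribution, so it
starts at the midpoint, decreases toward a minimum of a few moves, and never
reaches zero, always leaving some time for the endgame. A self-contained
sketch reproducing the formula with the new defaults (midpoint 51.5,
steepness 7.0); the printed values are approximate:

    #include <cmath>
    #include <cstdio>

    // Same formula as ComputeEstimatedMovesToGo above: the median residual
    // move count of a log-logistic game-length distribution.
    float EstimatedMovesToGo(int ply, float midpoint, float steepness) {
      const float move = ply / 2.0f;
      return midpoint * std::pow(1 + 2 * std::pow(move / midpoint, steepness),
                                 1 / steepness) -
             move;
    }

    int main() {
      std::printf("%.1f\n", EstimatedMovesToGo(0, 51.5f, 7.0f));    // ~51.5
      std::printf("%.1f\n", EstimatedMovesToGo(60, 51.5f, 7.0f));   // ~21.8
      std::printf("%.1f\n", EstimatedMovesToGo(103, 51.5f, 7.0f));  // ~8.8
    }
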
diff --git a/src/mcts/node.cc b/src/mcts/node.cc
index 2d754dd161..d968377be2 100644
--- a/src/mcts/node.cc
+++ b/src/mcts/node.cc
@@ -233,7 +233,10 @@ bool Node::TryStartScoreUpdate() {
return true;
}
-void Node::CancelScoreUpdate(int multivisit) { n_in_flight_ -= multivisit; }
+void Node::CancelScoreUpdate(int multivisit) {
+ n_in_flight_ -= multivisit;
+ best_child_cached_ = nullptr;
+}
void Node::FinalizeScoreUpdate(float v, int multivisit) {
// Recompute Q.
@@ -246,6 +249,18 @@ void Node::FinalizeScoreUpdate(float v, int multivisit) {
n_ += multivisit;
// Decrement virtual loss.
n_in_flight_ -= multivisit;
+ // Best child is potentially no longer valid.
+ best_child_cached_ = nullptr;
+}
+
+void Node::UpdateBestChild(const Iterator& best_edge, int visits_allowed) {
+ best_child_cached_ = best_edge.node();
+ // An edge can point to an unexpanded node with n==0. These nodes don't
+ // increment their n_in_flight_ the same way and thus are not safe to cache.
+ if (best_child_cached_ && best_child_cached_->GetN() == 0) {
+ best_child_cached_ = nullptr;
+ }
+ best_child_cache_in_flight_limit_ = visits_allowed + n_in_flight_;
}
Node::NodeRange Node::ChildNodes() const { return child_.get(); }
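
The caching contract in node.cc above is: UpdateBestChild records, at cache
time, how many additional in-flight visits the chosen child can absorb before
the PUCT ranking might change, and FinalizeScoreUpdate/CancelScoreUpdate
invalidate the cache. A toy model of that invariant (plain ints, not the real
Node):

    #include <cassert>

    // Toy model of the Node cache invariant: a best child recorded when
    // n_in_flight == n0 with `visits_allowed` more visits permitted stays
    // valid while n_in_flight < n0 + visits_allowed.
    struct BestChildCache {
      int limit = 0;        // best_child_cache_in_flight_limit_
      bool cached = false;  // best_child_cached_ != nullptr

      void Update(int n_in_flight, int visits_allowed) {
        cached = true;
        limit = n_in_flight + visits_allowed;
      }
      bool Valid(int n_in_flight) const { return cached && n_in_flight < limit; }
      void Invalidate() { cached = false; }  // Finalize/CancelScoreUpdate.
    };

    int main() {
      BestChildCache c;
      c.Update(/*n_in_flight=*/3, /*visits_allowed=*/5);
      assert(c.Valid(7));   // 7 < 3 + 5: still the best child.
      assert(!c.Valid(8));  // Budget exhausted: a full rescan is needed.
    }
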
diff --git a/src/mcts/node.h b/src/mcts/node.h
index de7f87d0df..26d020b2ef 100644
--- a/src/mcts/node.h
+++ b/src/mcts/node.h
@@ -183,6 +183,23 @@ class Node {
// Updates max depth, if new depth is larger.
void UpdateMaxDepth(int depth);
+ // Caches the best child if possible.
+ void UpdateBestChild(const Iterator& best_edge, int collisions_allowed);
+
+ // Gets a cached best child if it is still valid.
+ Node* GetCachedBestChild() {
+ if (n_in_flight_ < best_child_cache_in_flight_limit_) {
+ return best_child_cached_;
+ }
+ return nullptr;
+ }
+
+ // Gets how many more visits the cached value is valid for. Only valid if
+ // GetCachedBestChild returns a value.
+ int GetRemainingCacheVisits() {
+ return best_child_cache_in_flight_limit_ - n_in_flight_;
+ }
+
// Calculates the full depth if new depth is larger, updates it, returns
// in depth parameter, and returns true if it was indeed updated.
bool UpdateFullDepth(uint16_t* depth);
@@ -216,6 +233,13 @@ class Node {
std::string DebugString() const;
private:
+ // Re-initializes the node as if it had just been constructed. For use
+ // only with a node that has not been used beyond its construction.
+ void Reinit(Node* parent, uint16_t index) {
+ parent_ = parent;
+ index_ = index;
+ }
+
// To minimize the number of padding bytes and to avoid having unnecessary
// padding when new fields are added, we arrange the fields by size, largest
// to smallest.
@@ -231,6 +255,9 @@ class Node {
std::unique_ptr<Node> child_;
// Pointer to a next sibling. nullptr if there are no further siblings.
std::unique_ptr<Node> sibling_;
+ // Cached pointer to the best child, valid while n_in_flight_ <
+ // best_child_cache_in_flight_limit_.
+ Node* best_child_cached_ = nullptr;
// 4 byte fields.
// Average value (from value head of neural network) of all visited nodes in
@@ -246,6 +273,9 @@ class Node {
// but not finished). This value is added to n during selection which node
// to pick in MCTS, and also when selecting the best move.
uint32_t n_in_flight_ = 0;
+ // If best_child_cached_ is non-null, and n_in_flight_ < this,
+ // best_child_cached_ is still the best child.
+ uint32_t best_child_cache_in_flight_limit_ = 0;
// 2 byte fields.
// Index of this node is parent's edge list.
@@ -273,9 +303,9 @@ class Node {
// A basic sanity check. This must be adjusted when Node members are adjusted.
#if defined(__i386__) || (defined(__arm__) && !defined(__aarch64__))
-static_assert(sizeof(Node) == 40, "Unexpected size of Node for 32bit compile");
+static_assert(sizeof(Node) == 48, "Unexpected size of Node for 32bit compile");
#else
-static_assert(sizeof(Node) == 64, "Unexpected size of Node");
+static_assert(sizeof(Node) == 72, "Unexpected size of Node");
#endif
// Contains Edge and Node pair and set of proxy functions to simplify access
@@ -392,7 +422,8 @@ class Edge_Iterator : public EdgeAndNode {
Edge_Iterator& operator*() { return *this; }
// If there is node, return it. Otherwise spawn a new one and return it.
- Node* GetOrSpawnNode(Node* parent) {
+ Node* GetOrSpawnNode(Node* parent,
+ std::unique_ptr<Node>* node_source = nullptr) {
if (node_) return node_; // If there is already a node, return it.
Actualize(); // But maybe other thread already did that.
if (node_) return node_; // If it did, return.
@@ -408,7 +439,12 @@ class Edge_Iterator : public EdgeAndNode {
// 2. Create fresh Node(idx_.5):
// node_ptr_ -> &Node(idx_.3).sibling_ -> Node(idx_.5)
// tmp -> Node(idx_.7)
- *node_ptr_ = std::make_unique<Node>(parent, current_idx_);
+ if (node_source && *node_source) {
+ (*node_source)->Reinit(parent, current_idx_);
+ *node_ptr_ = std::move(*node_source);
+ } else {
+ *node_ptr_ = std::make_unique<Node>(parent, current_idx_);
+ }
// 3. Attach stored pointer back to a list:
// node_ptr_ ->
// &Node(idx_.3).sibling_ -> Node(idx_.5).sibling_ -> Node(idx_.7)
diff --git a/src/mcts/params.cc b/src/mcts/params.cc
index 6aa7d5ae8b..85dcb5ff5e 100644
--- a/src/mcts/params.cc
+++ b/src/mcts/params.cc
@@ -162,14 +162,13 @@ const OptionId SearchParams::kHistoryFillId{
"synthesize them (always, never, or only at non-standard fen position)."};
void SearchParams::Populate(OptionsParser* options) {
- // Here the "safe defaults" are listed.
- // Many of them are overridden with optimized defaults in engine.cc and
- // tournament.cc
- options->Add<IntOption>(kMiniBatchSizeId, 1, 1024) = 1;
+ // Here the "uci optimized" defaults are set.
+ // Many of them are overridden with training-specific values in tournament.cc.
+ options->Add<IntOption>(kMiniBatchSizeId, 1, 1024) = 256;
options->Add<IntOption>(kMaxPrefetchBatchId, 0, 1024) = 32;
- options->Add<FloatOption>(kCpuctId, 0.0f, 100.0f) = 1.2f;
+ options->Add<FloatOption>(kCpuctId, 0.0f, 100.0f) = 3.0f;
options->Add<FloatOption>(kCpuctBaseId, 1.0f, 1000000000.0f) = 19652.0f;
- options->Add<FloatOption>(kCpuctFactorId, 0.0f, 1000.0f) = 0.0f;
+ options->Add<FloatOption>(kCpuctFactorId, 0.0f, 1000.0f) = 2.0f;
options->Add<FloatOption>(kTemperatureId, 0.0f, 100.0f) = 0.0f;
options->Add<IntOption>(kTempDecayMovesId, 0, 100) = 0;
options->Add<IntOption>(kTemperatureCutoffMoveId, 0, 1000) = 0;
@@ -182,13 +181,13 @@ void SearchParams::Populate(OptionsParser* options) {
options->Add<FloatOption>(kSmartPruningFactorId, 0.0f, 10.0f) = 1.33f;
std::vector<std::string> fpu_strategy = {"reduction", "absolute"};
options->Add<ChoiceOption>(kFpuStrategyId, fpu_strategy) = "reduction";
- options->Add<FloatOption>(kFpuReductionId, -100.0f, 100.0f) = 0.0f;
+ options->Add<FloatOption>(kFpuReductionId, -100.0f, 100.0f) = 1.2f;
options->Add<FloatOption>(kFpuValueId, -1.0f, 1.0f) = -1.0f;
- options->Add<IntOption>(kCacheHistoryLengthId, 0, 7) = 7;
- options->Add<FloatOption>(kPolicySoftmaxTempId, 0.1f, 10.0f) = 1.0f;
- options->Add<IntOption>(kMaxCollisionEventsId, 1, 1024) = 1;
- options->Add<IntOption>(kMaxCollisionVisitsId, 1, 1000000) = 1;
- options->Add<BoolOption>(kOutOfOrderEvalId) = false;
+ options->Add<IntOption>(kCacheHistoryLengthId, 0, 7) = 0;
+ options->Add<FloatOption>(kPolicySoftmaxTempId, 0.1f, 10.0f) = 2.2f;
+ options->Add<IntOption>(kMaxCollisionEventsId, 1, 1024) = 32;
+ options->Add<IntOption>(kMaxCollisionVisitsId, 1, 1000000) = 9999;
+ options->Add<BoolOption>(kOutOfOrderEvalId) = true;
options->Add<IntOption>(kMultiPvId, 1, 500) = 1;
std::vector<std::string> score_type = {"centipawn", "win_percentage", "Q"};
options->Add<ChoiceOption>(kScoreTypeId, score_type) = "centipawn";
@@ -213,7 +212,8 @@ SearchParams::SearchParams(const OptionsDict& options)
kMaxCollisionVisits(options.Get<int>(kMaxCollisionVisitsId.GetId())),
kOutOfOrderEval(options.Get<bool>(kOutOfOrderEvalId.GetId())),
kHistoryFill(
- EncodeHistoryFill(options.Get<std::string>(kHistoryFillId.GetId()))) {
+ EncodeHistoryFill(options.Get<std::string>(kHistoryFillId.GetId()))),
+ kMiniBatchSize(options.Get<int>(kMiniBatchSizeId.GetId())) {
}
} // namespace lczero
diff --git a/src/mcts/params.h b/src/mcts/params.h
index 1218416ccb..bcbe780f46 100644
--- a/src/mcts/params.h
+++ b/src/mcts/params.h
@@ -43,7 +43,7 @@ class SearchParams {
// Parameter getters.
int GetMiniBatchSize() const {
- return options_.Get<int>(kMiniBatchSizeId.GetId());
+ return kMiniBatchSize;
}
int GetMaxPrefetchBatch() const {
return options_.Get<int>(kMaxPrefetchBatchId.GetId());
@@ -138,6 +138,7 @@ class SearchParams {
const int kMaxCollisionVisits;
const bool kOutOfOrderEval;
const FillEmptyHistory kHistoryFill;
+ const int kMiniBatchSize;
};
} // namespace lczero
diff --git a/src/mcts/search.cc b/src/mcts/search.cc
index b4b9ea28b4..d946741f08 100644
--- a/src/mcts/search.cc
+++ b/src/mcts/search.cc
@@ -411,6 +411,10 @@ void Search::UpdateRemainingMoves() {
}
// Even if we exceeded limits, don't go crazy by not allowing any playouts.
if (remaining_playouts_ <= 1) remaining_playouts_ = 1;
+ // Since remaining_playouts_ has changed, the logic for selecting visited root
+ // nodes may also change. Use a 0-visit CancelScoreUpdate to clear out any
+ // cached best edge.
+ root_node_->CancelScoreUpdate(0);
}
// Return the evaluation of the actual best child, regardless of temperature
@@ -656,7 +660,7 @@ void Search::Stop() {
void Search::Abort() {
Mutex::Lock lock(counters_mutex_);
- if (!stop_.load(std::memory_order_acquire)) {
+ if (!stop_.load(std::memory_order_acquire) || !bestmove_is_sent_) {
bestmove_is_sent_ = true;
FireStopInternal();
}
@@ -812,8 +816,12 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
Node* node = search_->root_node_;
Node::Iterator best_edge;
Node::Iterator second_best_edge;
- // Initialize position sequence with pre-move position.
- history_.Trim(search_->played_history_.GetLength());
+
+ // Precache a newly constructed node to avoid memory allocations being
+ // performed while the mutex is held.
+ if (!precached_node_) {
+ precached_node_ = std::make_unique<Node>(nullptr, 0);
+ }
SharedMutex::Lock lock(search_->nodes_mutex_);
@@ -823,6 +831,7 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
// True on first iteration, false as we dive deeper.
bool is_root_node = true;
uint16_t depth = 0;
+ bool node_already_updated = true;
while (true) {
// First, terminate if we find collisions or leaf nodes.
@@ -832,7 +841,9 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
// in the beginning (and there would be no need for "if
// (!is_root_node)"), but that would mean extra mutex lock.
// Will revisit that after rethinking locking strategy.
- if (!is_root_node) node = best_edge.GetOrSpawnNode(/* parent */ node);
+ if (!node_already_updated) {
+ node = best_edge.GetOrSpawnNode(/* parent */ node, &precached_node_);
+ }
best_edge.Reset();
depth++;
// n_in_flight_ is incremented. If the method returns false, then there is
@@ -852,6 +863,18 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
return NodeToProcess::Extension(node, depth);
}
}
+ Node* possible_shortcut_child = node->GetCachedBestChild();
+ if (possible_shortcut_child) {
+ // Add two here to reverse the conservatism that goes into calculating the
+ // remaining cache visits.
+ collision_limit =
+ std::min(collision_limit, node->GetRemainingCacheVisits() + 2);
+ is_root_node = false;
+ node = possible_shortcut_child;
+ node_already_updated = true;
+ continue;
+ }
+ node_already_updated = false;
// If we fall through, then n_in_flight_ has been incremented but this
// playout remains incomplete; we must go deeper.
@@ -895,14 +918,19 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
}
if (second_best_edge) {
+ int estimated_visits_to_change_best =
+ best_edge.GetVisitsToReachU(second_best, puct_mult, fpu);
+ // Only cache for n-2 steps as the estimate created by GetVisitsToReachU
+ // has potential rounding errors and some conservative logic that can push
+ // it up to 2 away from the real value.
+ node->UpdateBestChild(best_edge,
+ std::max(0, estimated_visits_to_change_best - 2));
collision_limit =
- std::min(collision_limit,
- best_edge.GetVisitsToReachU(second_best, puct_mult, fpu));
+ std::min(collision_limit, estimated_visits_to_change_best);
assert(collision_limit >= 1);
second_best_edge.Reset();
}
- history_.Append(best_edge.GetMove());
if (is_root_node && possible_moves <= 1 && !search_->limits_.infinite) {
// If there is only one move theoretically possible within remaining time,
// output it.
@@ -914,6 +942,22 @@ SearchWorker::NodeToProcess SearchWorker::PickNodeToExtend(
}
void SearchWorker::ExtendNode(Node* node) {
+ // Initialize position sequence with pre-move position.
+ history_.Trim(search_->played_history_.GetLength());
+ std::vector<Move> to_add;
+ // Could instead reserve one more than the difference between history_.size()
+ // and history_.capacity().
+ to_add.reserve(60);
+ Node* cur = node;
+ while (cur != search_->root_node_) {
+ Node* prev = cur->GetParent();
+ to_add.push_back(prev->GetEdgeToNode(cur)->GetMove());
+ cur = prev;
+ }
+ for (int i = to_add.size() - 1; i >= 0; i--) {
+ history_.Append(to_add[i]);
+ }
+
// We don't need the mutex because other threads will see that N=0 and
// N-in-flight=1 and will not touch this node.
const auto& board = history_.Last().GetBoard();
@@ -992,7 +1036,8 @@ bool SearchWorker::AddNodeToComputation(Node* node, bool add_if_cached) {
if (node && node->HasChildren()) {
// Legal moves are known, use them.
- for (auto edge : node->Edges()) {
+ moves.reserve(node->GetNumEdges());
+ for (const auto& edge : node->Edges()) {
moves.emplace_back(edge.GetMove().as_nn_index());
}
} else {
diff --git a/src/mcts/search.h b/src/mcts/search.h
index 128c7d8109..14b5bcd51f 100644
--- a/src/mcts/search.h
+++ b/src/mcts/search.h
@@ -294,6 +294,7 @@ class SearchWorker {
bool root_move_filter_populated_ = false;
int number_out_of_order_ = 0;
const SearchParams& params_;
+ std::unique_ptr<Node> precached_node_;
};
} // namespace lczero
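
precached_node_ exists so that the std::make_unique allocation happens before
nodes_mutex_ is taken; GetOrSpawnNode then consumes the ready-made node inside
the critical section (and keeps it for next time if another thread won the
race). A generic sketch of this allocate-outside-the-lock pattern (a
hypothetical helper, not lc0 code):

    #include <memory>
    #include <mutex>
    #include <utility>

    // Allocate outside the lock; consume (or keep the spare) inside it.
    template <typename T, typename... Args>
    void InsertUnderLock(std::mutex& m, std::unique_ptr<T>& spare,
                         std::unique_ptr<T>& slot, Args&&... args) {
      if (!spare) spare = std::make_unique<T>(std::forward<Args>(args)...);
      std::lock_guard<std::mutex> lock(m);
      if (!slot) slot = std::move(spare);  // The spare survives for next time.
    }

    int main() {
      std::mutex m;
      std::unique_ptr<int> spare, slot;
      InsertUnderLock(m, spare, slot, 42);  // slot now owns an int(42).
    }
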
diff --git a/src/neural/cuda/layers.cc b/src/neural/cuda/layers.cc
index 53a3cce8d1..25ff7a2fae 100644
--- a/src/neural/cuda/layers.cc
+++ b/src/neural/cuda/layers.cc
@@ -117,7 +117,7 @@ ConvLayer<DataType>::ConvLayer(BaseLayer<DataType>* ip, int C, int H, int W,
cudnnSetConvolutionMathType(conv_desc_, CUDNN_TENSOR_OP_MATH));
// TODO: dynamic selection of algorithm!
- if ((C > 32) && (!fp16)) {
+ if ((C > 32) && (!fp16) && (filter_size_ > 1)) {
conv_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED;
} else {
conv_algo_ = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
diff --git a/src/neural/cuda/network_cudnn.cc b/src/neural/cuda/network_cudnn.cc
index 013ac2200a..69f4d2e833 100644
--- a/src/neural/cuda/network_cudnn.cc
+++ b/src/neural/cuda/network_cudnn.cc
@@ -59,11 +59,15 @@ struct InputsOutputs {
ReportCUDAErrors(
cudaHostGetDevicePointer(&input_val_mem_gpu_, input_val_mem_, 0));
+
ReportCUDAErrors(cudaHostAlloc(
- &op_policy_mem_, maxBatchSize * kNumOutputPolicy * sizeof(float),
- cudaHostAllocMapped));
- ReportCUDAErrors(
- cudaHostGetDevicePointer(&op_policy_mem_gpu_, op_policy_mem_, 0));
+ &op_policy_mem_, maxBatchSize * kNumOutputPolicy * sizeof(float), 0));
+
+ // Separate device memory copy for policy output.
+ // It's faster to write to device memory and then copy to host memory
+ // than having the kernel write directly to it.
+ ReportCUDAErrors(cudaMalloc(&op_policy_mem_gpu_,
+ maxBatchSize * kNumOutputPolicy * sizeof(float)));
ReportCUDAErrors(cudaHostAlloc(&op_value_mem_, maxBatchSize * sizeof(float),
cudaHostAllocMapped));
@@ -74,6 +78,7 @@ struct InputsOutputs {
ReportCUDAErrors(cudaFreeHost(input_masks_mem_));
ReportCUDAErrors(cudaFreeHost(input_val_mem_));
ReportCUDAErrors(cudaFreeHost(op_policy_mem_));
+ ReportCUDAErrors(cudaFree(op_policy_mem_gpu_));
ReportCUDAErrors(cudaFreeHost(op_value_mem_));
}
uint64_t* input_masks_mem_;
@@ -81,11 +86,13 @@ struct InputsOutputs {
float* op_policy_mem_;
float* op_value_mem_;
- // GPU pointers for the above allocations
+ // GPU pointers for the above allocations.
uint64_t* input_masks_mem_gpu_;
float* input_val_mem_gpu_;
- float* op_policy_mem_gpu_;
float* op_value_mem_gpu_;
+
+ // This is a separate copy.
+ float* op_policy_mem_gpu_;
};
template <typename DataType>
@@ -173,16 +180,21 @@ class CudnnNetwork : public Network {
has_se_ = false;
// 0. Process weights.
- processConvBlock(weights.input, true);
+
+ // TODO: Get filter sizes from proto file?
+ // Hardcoded right now:
+ // 3 for input and residual block convolutions.
+ // 1 for policy and value head convolutions.
+ processConvBlock(weights.input, true, 3);
for (int i = 0; i < numBlocks_; i++) {
if (weights.residual[i].has_se) {
has_se_ = true;
}
- processConvBlock(weights.residual[i].conv1, true);
- processConvBlock(weights.residual[i].conv2, true);
+ processConvBlock(weights.residual[i].conv1, true, 3);
+ processConvBlock(weights.residual[i].conv2, true, 3);
}
- processConvBlock(weights.policy);
- processConvBlock(weights.value);
+ processConvBlock(weights.policy, true, 1);
+ processConvBlock(weights.value, true, 1);
// 1. Allocate scratch space (used internally by cudnn to run convolutions,
// and also for format/layout conversion for weights).
@@ -283,15 +295,11 @@ class CudnnNetwork : public Network {
// Policy head.
{
auto convPol = std::make_unique<ConvLayer<DataType>>(
- resi_last_, weights.policy.bn_means.size(), 8, 8, 1, kNumFilters);
- convPol->LoadWeights(&weights.policy.weights[0], nullptr, scratch_mem_);
+ resi_last_, weights.policy.bn_means.size(), 8, 8, 1, kNumFilters, true, true);
+ convPol->LoadWeights(&weights.policy.weights[0],
+ &weights.policy.biases[0], scratch_mem_);
network_.emplace_back(std::move(convPol));
- auto BNPol = std::make_unique<BNLayer<DataType>>(getLastLayer(), true);
- BNPol->LoadWeights(&weights.policy.bn_means[0],
- &weights.policy.bn_stddivs[0]);
- network_.emplace_back(std::move(BNPol));
-
auto FCPol = std::make_unique<FCLayer<DataType>>(
getLastLayer(), weights.ip_pol_b.size(), 1, 1, false, true);
FCPol->LoadWeights(&weights.ip_pol_w[0], &weights.ip_pol_b[0],
@@ -307,15 +315,11 @@ class CudnnNetwork : public Network {
// Value head.
{
auto convVal = std::make_unique<ConvLayer<DataType>>(
- resi_last_, weights.value.bn_means.size(), 8, 8, 1, kNumFilters);
- convVal->LoadWeights(&weights.value.weights[0], nullptr, scratch_mem_);
+ resi_last_, weights.value.biases.size(), 8, 8, 1, kNumFilters, true, true);
+ convVal->LoadWeights(&weights.value.weights[0], &weights.value.biases[0],
+ scratch_mem_);
network_.emplace_back(std::move(convVal));
- auto BNVal = std::make_unique<BNLayer<DataType>>(getLastLayer(), true);
- BNVal->LoadWeights(&weights.value.bn_means[0],
- &weights.value.bn_stddivs[0]);
- network_.emplace_back(std::move(BNVal));
-
auto FCVal1 = std::make_unique<FCLayer<DataType>>(
getLastLayer(), weights.ip1_val_b.size(), 1, 1, true, true);
FCVal1->LoadWeights(&weights.ip1_val_w[0], &weights.ip1_val_b[0],
@@ -403,43 +407,44 @@ class CudnnNetwork : public Network {
scratch_mem_, scratch_size_, cudnn_,
cublas_); // pol conv
network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
- scratch_mem_, scratch_size_, cudnn_,
- cublas_); // pol BN
- network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr,
scratch_mem_, scratch_size_, cudnn_,
cublas_); // pol FC
if (std::is_same<half, DataType>::value) {
// TODO: consider softmax layer that writes directly to fp32.
- network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
+ network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[1], nullptr,
scratch_mem_, scratch_size_, cudnn_,
cublas_); // pol softmax
- copyTypeConverted(opPol, (half*)(tensor_mem_[1]),
+ copyTypeConverted(opPol, (half*)(tensor_mem_[0]),
batchSize * kNumOutputPolicy); // POLICY
} else {
- network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[0], nullptr,
+ network_[l++]->Eval(batchSize, (DataType*)opPol, tensor_mem_[1], nullptr,
scratch_mem_, scratch_size_, cudnn_,
cublas_); // pol softmax // POLICY
}
+ // Copy policy output from device memory to host memory.
+ ReportCUDAErrors(cudaMemcpyAsync(io->op_policy_mem_,
+ io->op_policy_mem_gpu_,
+ sizeof(float) * kNumOutputPolicy *
+ batchSize, cudaMemcpyDeviceToHost));
+
// value head
network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr,
scratch_mem_, scratch_size_, cudnn_,
cublas_); // value conv
- network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0], nullptr,
- scratch_mem_, scratch_size_, cudnn_,
- cublas_); // value BN
- network_[l++]->Eval(batchSize, tensor_mem_[0], tensor_mem_[2], nullptr,
+
+ network_[l++]->Eval(batchSize, tensor_mem_[1], tensor_mem_[0], nullptr,
scratch_mem_, scratch_size_, cudnn_,
cublas_); // value FC1
if (std::is_same<half, DataType>::value) {
// TODO: consider fusing the bias-add of FC2 with format conversion.
- network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[0], nullptr,
+ network_[l++]->Eval(batchSize, tensor_mem_[2], tensor_mem_[1], nullptr,
scratch_mem_, scratch_size_, cudnn_,
cublas_); // value FC2
copyTypeConverted(opVal, (half*)(tensor_mem_[2]), batchSize); // VALUE
} else {
- network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[0], nullptr,
+ network_[l++]->Eval(batchSize, (DataType*)opVal, tensor_mem_[1], nullptr,
scratch_mem_, scratch_size_, cudnn_,
cublas_); // value FC2 // VALUE
}
@@ -535,7 +540,8 @@ class CudnnNetwork : public Network {
mutable std::mutex inputs_outputs_lock_;
std::list<std::unique_ptr<InputsOutputs>> free_inputs_outputs_;
- void processConvBlock(LegacyWeights::ConvBlock& block, bool foldBNLayer = false) {
+ void processConvBlock(LegacyWeights::ConvBlock& block, bool foldBNLayer,
+ int filterSize) {
const float epsilon = 1e-5f;
// Compute reciprocal of std-dev from the variances (so that it can be
@@ -557,13 +563,15 @@ class CudnnNetwork : public Network {
// convolution idea proposed by Henrik Forstén and first implemented in
// leela go zero.
if (foldBNLayer) {
+ const int spatialSize = filterSize * filterSize;
const int outputs = block.biases.size();
- const int channels = block.weights.size() / (outputs * 3 * 3);
-
+ const int channels = block.weights.size() / (outputs * spatialSize);
+
for (auto o = 0; o < outputs; o++) {
for (auto c = 0; c < channels; c++) {
- for (auto i = 0; i < 9; i++) {
- block.weights[o * channels * 9 + c * 9 + i] *= block.bn_stddivs[o];
+ for (auto i = 0; i < spatialSize; i++) {
+ block.weights[o * channels * spatialSize + c * spatialSize + i] *=
+ block.bn_stddivs[o];
}
}
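
The generalized fold above relies on bn_stddivs already holding the
reciprocal standard deviation (computed with epsilon earlier in
processConvBlock); every weight feeding output channel o is then scaled by
that factor, now for any square filter size. A standalone sketch of the
scaling over an O x C x K x K weight layout:

    #include <vector>

    // Scale conv weights by the per-output-channel BN 1/stddev, as in
    // processConvBlock, for an arbitrary square filter size.
    void FoldBatchNorm(std::vector<float>* weights,
                       const std::vector<float>& bn_stddivs, int filter_size) {
      const int spatial = filter_size * filter_size;
      const int outputs = static_cast<int>(bn_stddivs.size());
      const int channels =
          static_cast<int>(weights->size()) / (outputs * spatial);
      for (int o = 0; o < outputs; ++o)
        for (int c = 0; c < channels; ++c)
          for (int i = 0; i < spatial; ++i)
            (*weights)[(o * channels + c) * spatial + i] *= bn_stddivs[o];
    }

    int main() {
      std::vector<float> w(2 * 1 * 9, 1.0f);  // O=2, C=1, K=3.
      FoldBatchNorm(&w, {0.5f, 2.0f}, 3);
      // First output channel scaled by 0.5, second by 2.0.
    }
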
diff --git a/src/neural/loader.cc b/src/neural/loader.cc
index c9a2b076e7..9921babee0 100644
--- a/src/neural/loader.cc
+++ b/src/neural/loader.cc
@@ -164,7 +164,7 @@ std::string DiscoverWeightsFile() {
gzFile file = gzopen(candidate.second.c_str(), "rb");
if (!file) continue;
- char buf[256];
+ unsigned char buf[256];
int sz = gzread(file, buf, 256);
gzclose(file);
if (sz < 0) continue;
@@ -180,8 +180,10 @@ std::string DiscoverWeightsFile() {
// First byte of the protobuf stream is 0x0d for fixed32, so we ignore it as
// our own magic should suffice.
- auto magic = reinterpret_cast<uint32_t*>(buf + 1);
- if (*magic == kWeightMagic) {
+ auto magic = buf[1] | (static_cast<uint32_t>(buf[2]) << 8) |
+ (static_cast<uint32_t>(buf[3]) << 16) |
+ (static_cast<uint32_t>(buf[4]) << 24);
+ if (magic == kWeightMagic) {
CERR << "Found pb network file: " << candidate.second;
return candidate.second;
}
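
The loader change swaps a type-punned uint32 read (misaligned,
aliasing-unsafe, and wrong on big-endian hosts) for explicit little-endian
byte assembly. The same idiom in isolation:

    #include <cassert>
    #include <cstdint>

    // Assemble a little-endian uint32 from bytes, independent of host
    // endianness and alignment, mirroring the loader.cc fix.
    std::uint32_t ReadLE32(const unsigned char* p) {
      return static_cast<std::uint32_t>(p[0]) |
             (static_cast<std::uint32_t>(p[1]) << 8) |
             (static_cast<std::uint32_t>(p[2]) << 16) |
             (static_cast<std::uint32_t>(p[3]) << 24);
    }

    int main() {
      const unsigned char buf[] = {0x78, 0x56, 0x34, 0x12};
      assert(ReadLE32(buf) == 0x12345678u);
    }
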
diff --git a/src/neural/network_demux.cc b/src/neural/network_demux.cc
new file mode 100644
index 0000000000..c791d1f11b
--- /dev/null
+++ b/src/neural/network_demux.cc
@@ -0,0 +1,223 @@
+/*
+ This file is part of Leela Chess Zero.
+ Copyright (C) 2018 The LCZero Authors
+
+ Leela Chess is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Leela Chess is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
+
+ Additional permission under GNU GPL version 3 section 7
+
+ If you modify this Program, or any covered work, by linking or
+ combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+ modified version of those libraries), containing parts covered by the
+ terms of the respective license agreement, the licensors of this
+ Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/factory.h"
+
+#include <condition_variable>
+#include <queue>
+#include <thread>
+#include "utils/exception.h"
+
+namespace lczero {
+namespace {
+
+class DemuxingNetwork;
+class DemuxingComputation : public NetworkComputation {
+ public:
+ DemuxingComputation(DemuxingNetwork* network) : network_(network) {}
+
+  void AddInput(InputPlanes&& input) override {
+    planes_.emplace_back(std::move(input));
+  }
+
+ void ComputeBlocking() override;
+
+ int GetBatchSize() const override { return planes_.size(); }
+
+ float GetQVal(int sample) const override {
+ int idx = sample / partial_size_;
+ int offset = sample % partial_size_;
+ return parents_[idx]->GetQVal(offset);
+ }
+
+ float GetPVal(int sample, int move_id) const override {
+ int idx = sample / partial_size_;
+ int offset = sample % partial_size_;
+ return parents_[idx]->GetPVal(offset, move_id);
+ }
+
+ void NotifyComplete() {
+    std::unique_lock<std::mutex> lock(mutex_);
+ dataready_--;
+ if (dataready_ == 0) {
+ dataready_cv_.notify_one();
+ }
+ }
+
+ NetworkComputation* AddParentFromNetwork(Network* network) {
+    std::unique_lock<std::mutex> lock(mutex_);
+ parents_.emplace_back(network->NewComputation());
+ int cur_idx = (parents_.size() - 1) * partial_size_;
+ for (int i = cur_idx; i < std::min(GetBatchSize(), cur_idx + partial_size_);
+ i++) {
+ parents_.back()->AddInput(std::move(planes_[i]));
+ }
+ return parents_.back().get();
+ }
+
+ private:
+  std::vector<InputPlanes> planes_;
+ DemuxingNetwork* network_;
+  std::vector<std::unique_ptr<NetworkComputation>> parents_;
+
+ std::mutex mutex_;
+ std::condition_variable dataready_cv_;
+ int dataready_ = 0;
+ int partial_size_ = 0;
+};
+
+class DemuxingNetwork : public Network {
+ public:
+ DemuxingNetwork(const WeightsFile& weights, const OptionsDict& options) {
+    minimum_split_size_ = options.GetOrDefault<int>("minimum-split-size", 0);
+ const auto parents = options.ListSubdicts();
+ if (parents.empty()) {
+      // If no subdicts are given (or the demuxer is configured on the root
+      // object), initialize a single backend from the root object using the
+      // default backend.
+ auto backends = NetworkFactory::Get()->GetBackendsList();
+ AddBackend(backends[0], weights, options);
+ }
+
+ for (const auto& name : parents) {
+ AddBackend(name, weights, options.GetSubdict(name));
+ }
+ }
+
+ void AddBackend(const std::string& name, const WeightsFile& weights,
+ const OptionsDict& opts) {
+    const int nn_threads = opts.GetOrDefault<int>("threads", 1);
+    const std::string backend = opts.GetOrDefault<std::string>("backend", name);
+
+ networks_.emplace_back(
+ NetworkFactory::Get()->Create(backend, weights, opts));
+
+ for (int i = 0; i < nn_threads; ++i) {
+ threads_.emplace_back([this]() { Worker(); });
+ }
+ }
+
+  std::unique_ptr<NetworkComputation> NewComputation() override {
+    return std::make_unique<DemuxingComputation>(this);
+ }
+
+ void Enqueue(DemuxingComputation* computation) {
+    std::lock_guard<std::mutex> lock(mutex_);
+ queue_.push(computation);
+ cv_.notify_one();
+ }
+
+ ~DemuxingNetwork() {
+ Abort();
+ Wait();
+    // Unblock any computations still waiting on results.
+ while (!queue_.empty()) {
+ queue_.front()->NotifyComplete();
+ queue_.pop();
+ }
+ }
+
+ void Worker() {
+    // Loop until Abort() is called (it can only be called from the destructor).
+ while (!abort_) {
+      {
+        std::unique_lock<std::mutex> lock(mutex_);
+        // Wait until there's some work to compute.
+ cv_.wait(lock, [&] { return abort_ || !queue_.empty(); });
+ if (abort_) break;
+ }
+
+      // While there is work in the queue, process it.
+      while (true) {
+        DemuxingComputation* to_notify;
+ {
+          std::unique_lock<std::mutex> lock(mutex_);
+ if (queue_.empty()) break;
+ to_notify = queue_.front();
+ queue_.pop();
+ }
+        long long net_idx = ++counter_ % networks_.size();
+        NetworkComputation* to_compute =
+            to_notify->AddParentFromNetwork(networks_[net_idx].get());
+ to_compute->ComputeBlocking();
+ to_notify->NotifyComplete();
+ }
+ }
+ }
+
+ void Abort() {
+ {
+      std::lock_guard<std::mutex> lock(mutex_);
+ abort_ = true;
+ }
+ cv_.notify_all();
+ }
+
+ void Wait() {
+ while (!threads_.empty()) {
+ threads_.back().join();
+ threads_.pop_back();
+ }
+ }
+
+  std::vector<std::unique_ptr<Network>> networks_;
+  std::queue<DemuxingComputation*> queue_;
+  int minimum_split_size_ = 0;
+  std::atomic<long long> counter_{0};
+ bool abort_ = false;
+
+ std::mutex mutex_;
+ std::condition_variable cv_;
+
+  std::vector<std::thread> threads_;
+};
+
+void DemuxingComputation::ComputeBlocking() {
+ if (GetBatchSize() == 0) return;
+ partial_size_ = (GetBatchSize() + network_->networks_.size() - 1) /
+ network_->networks_.size();
+ if (partial_size_ < network_->minimum_split_size_) {
+ partial_size_ = std::min(GetBatchSize(), network_->minimum_split_size_);
+ }
+ int splits = (GetBatchSize() + partial_size_ - 1) / partial_size_;
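+  // For example, a batch of 100 over 3 backends gives partial_size_ = 34 and
+  // splits = 3 (two splits of 34 positions plus a final one of 32).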
+
+  std::unique_lock<std::mutex> lock(mutex_);
+ dataready_ = splits;
+  for (int j = 0; j < splits; j++) {
+ network_->Enqueue(this);
+ }
+ dataready_cv_.wait(lock, [this]() { return dataready_ == 0; });
+}
+
+std::unique_ptr<Network> MakeDemuxingNetwork(const WeightsFile& weights,
+                                             const OptionsDict& options) {
+  return std::make_unique<DemuxingNetwork>(weights, options);
+}
+
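+// Usage sketch (hypothetical command line; the exact --backend-opts syntax
+// may differ):
+//   lc0 --backend=demux --backend-opts=minimum-split-size=32,a(backend=cudnn),b(backend=cudnn)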
+REGISTER_NETWORK("demux", MakeDemuxingNetwork, -1001)
+
+} // namespace
+} // namespace lczero
diff --git a/src/neural/network_rr.cc b/src/neural/network_rr.cc
new file mode 100644
index 0000000000..5979b9b2c7
--- /dev/null
+++ b/src/neural/network_rr.cc
@@ -0,0 +1,82 @@
+/*
+ This file is part of Leela Chess Zero.
+ Copyright (C) 2018 The LCZero Authors
+
+ Leela Chess is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Leela Chess is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with Leela Chess. If not, see <http://www.gnu.org/licenses/>.
+
+ Additional permission under GNU GPL version 3 section 7
+
+ If you modify this Program, or any covered work, by linking or
+ combining it with NVIDIA Corporation's libraries from the NVIDIA CUDA
+ Toolkit and the NVIDIA CUDA Deep Neural Network library (or a
+ modified version of those libraries), containing parts covered by the
+ terms of the respective license agreement, the licensors of this
+ Program grant you additional permission to convey the resulting work.
+*/
+
+#include "neural/factory.h"
+
+#include <atomic>
+#include <memory>
+#include <vector>
+#include "utils/exception.h"
+
+namespace lczero {
+namespace {
+
+class RoundRobinNetwork : public Network {
+ public:
+ RoundRobinNetwork(const WeightsFile& weights, const OptionsDict& options) {
+ const auto parents = options.ListSubdicts();
+ if (parents.empty()) {
+      // If no subdicts are given (or the round-robin dispatcher is configured
+      // on the root object), initialize a single backend from the root object
+      // using the default backend.
+ auto backends = NetworkFactory::Get()->GetBackendsList();
+ AddBackend(backends[0], weights, options);
+ }
+
+ for (const auto& name : parents) {
+ AddBackend(name, weights, options.GetSubdict(name));
+ }
+ }
+
+ void AddBackend(const std::string& name, const WeightsFile& weights,
+ const OptionsDict& opts) {
+    const std::string backend = opts.GetOrDefault<std::string>("backend", name);
+
+ networks_.emplace_back(
+ NetworkFactory::Get()->Create(backend, weights, opts));
+ }
+
+  std::unique_ptr<NetworkComputation> NewComputation() override {
+ long long val = ++counter_;
+ return networks_[val % networks_.size()]->NewComputation();
+ }
+
+ ~RoundRobinNetwork() {}
+
+ private:
+  std::vector<std::unique_ptr<Network>> networks_;
+  std::atomic<long long> counter_{0};
+};
+
+std::unique_ptr<Network> MakeRoundRobinNetwork(const WeightsFile& weights,
+                                               const OptionsDict& options) {
+  return std::make_unique<RoundRobinNetwork>(weights, options);
+}
+
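+// Usage sketch (hypothetical command line; the exact --backend-opts syntax
+// may differ):
+//   lc0 --backend=roundrobin --backend-opts=a(backend=cudnn,gpu=0),b(backend=cudnn,gpu=1)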
+REGISTER_NETWORK("roundrobin", MakeRoundRobinNetwork, -999)
+
+} // namespace
+} // namespace lczero
diff --git a/src/selfplay/game.cc b/src/selfplay/game.cc
index 2b64db9bf5..ae199ec023 100644
--- a/src/selfplay/game.cc
+++ b/src/selfplay/game.cc
@@ -38,11 +38,15 @@ const OptionId kReuseTreeId{"reuse-tree", "ReuseTree",
const OptionId kResignPercentageId{
"resign-percentage", "ResignPercentage",
"Resign when win percentage drops below specified value."};
+const OptionId kResignEarliestMoveId{"resign-earliest-move",
+ "ResignEarliestMove",
+ "Earliest move that resign is allowed."};
} // namespace
void SelfPlayGame::PopulateUciParams(OptionsParser* options) {
  options->Add<BoolOption>(kReuseTreeId) = false;
  options->Add<FloatOption>(kResignPercentageId, 0.0f, 100.0f) = 0.0f;
+  options->Add<IntOption>(kResignEarliestMoveId, 0, 1000) = 0;
}
SelfPlayGame::SelfPlayGame(PlayerOptions player1, PlayerOptions player2,
@@ -104,7 +108,9 @@ void SelfPlayGame::Play(int white_threads, int black_threads, bool training,
float eval = search_->GetBestEval();
eval = (eval + 1) / 2;
if (eval < min_eval_[idx]) min_eval_[idx] = eval;
- if (enable_resign) {
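+      // The position history length is in plies (half-moves); convert it to a
+      // full-move number before comparing with the option.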
+ int move_number = tree_[0]->GetPositionHistory().GetLength() / 2 + 1;
+      if (enable_resign &&
+          move_number >= options_[idx].uci_options->Get<int>(
+                             kResignEarliestMoveId.GetId())) {
const float resignpct =
            options_[idx].uci_options->Get<float>(kResignPercentageId.GetId()) /
100;
diff --git a/src/selfplay/tournament.cc b/src/selfplay/tournament.cc
index 81dee9f5c5..8e319cb4b4 100644
--- a/src/selfplay/tournament.cc
+++ b/src/selfplay/tournament.cc
@@ -86,6 +86,13 @@ void SelfPlayTournament::PopulateOptions(OptionsParser* options) {
SelfPlayGame::PopulateUciParams(options);
auto defaults = options->GetMutableDefaultsOptions();
  defaults->Set<int>(SearchParams::kMiniBatchSizeId.GetId(), 32);
+  defaults->Set<float>(SearchParams::kCpuctId.GetId(), 1.2f);
+  defaults->Set<float>(SearchParams::kCpuctFactorId.GetId(), 0.0f);
+  defaults->Set<float>(SearchParams::kPolicySoftmaxTempId.GetId(), 1.0f);
+  defaults->Set<int>(SearchParams::kMaxCollisionVisitsId.GetId(), 1);
+  defaults->Set<int>(SearchParams::kMaxCollisionEventsId.GetId(), 1);
+  defaults->Set<int>(SearchParams::kCacheHistoryLengthId.GetId(), 7);
+  defaults->Set<bool>(SearchParams::kOutOfOrderEvalId.GetId(), false);
  defaults->Set<float>(SearchParams::kSmartPruningFactorId.GetId(), 0.0f);
  defaults->Set<float>(SearchParams::kTemperatureId.GetId(), 1.0f);
  defaults->Set<bool>(SearchParams::kNoiseId.GetId(), true);
diff --git a/src/utils/cache.h b/src/utils/cache.h
index 2e2556e2d9..e820055c63 100644
--- a/src/utils/cache.h
+++ b/src/utils/cache.h
@@ -57,11 +57,10 @@ class LruCache {
}
// Inserts the element under key @key with value @val.
- // If the element is pinned, old value is still kept (until fully unpinned),
- // but new lookups will return updated value.
- // If @pinned, pins inserted element, Unpin has to be called to unpin.
- // In any case, puts element to front of the queue (makes it last to evict).
-  V* Insert(K key, std::unique_ptr<V> val, bool pinned = false) {
+ // Puts element to front of the queue (makes it last to evict).
+  void Insert(K key, std::unique_ptr<V> val) {
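+    // A capacity of 0 disables the cache; the relaxed atomic load lets this
+    // fast path avoid taking the mutex.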
+ if (capacity_.load(std::memory_order_relaxed) == 0) return;
+
Mutex::Lock lock(mutex_);
auto hash = hasher_(key) % hash_.size();
@@ -76,16 +75,17 @@ class LruCache {
ShrinkToCapacity(capacity_ - 1);
++size_;
++allocated_;
- Item* new_item = new Item(key, std::move(val), pinned ? 1 : 0);
+ Item* new_item = new Item(key, std::move(val));
new_item->next_in_hash = hash_head;
hash_head = new_item;
InsertIntoLru(new_item);
- return new_item->value.get();
}
// Checks whether a key exists. Doesn't pin the entry, so of course the next
// moment the key may be evicted.
bool ContainsKey(K key) {
+ if (capacity_.load(std::memory_order_relaxed) == 0) return false;
+
Mutex::Lock lock(mutex_);
auto hash = hasher_(key) % hash_.size();
for (Item* iter = hash_[hash]; iter; iter = iter->next_in_hash) {
@@ -99,6 +99,8 @@ class LruCache {
// evict); furthermore, a call to Unpin must be made for each such element.
// Use of LruCacheLock is recommended to automate this pin management.
V* LookupAndPin(K key) {
+ if (capacity_.load(std::memory_order_relaxed) == 0) return nullptr;
+
Mutex::Lock lock(mutex_);
auto hash = hasher_(key) % hash_.size();
@@ -149,9 +151,9 @@ class LruCache {
void SetCapacity(int capacity) {
Mutex::Lock lock(mutex_);
- if (capacity_ == capacity) return;
+ if (capacity_.load(std::memory_order_relaxed) == capacity) return;
ShrinkToCapacity(capacity);
- capacity_ = capacity;
+ capacity_.store(capacity);
    std::vector<Item*> new_hash(
        static_cast<size_t>(capacity * kLoadFactor + 1));
@@ -179,16 +181,15 @@ class LruCache {
Mutex::Lock lock(mutex_);
return size_;
}
- int GetCapacity() const {
- Mutex::Lock lock(mutex_);
- return capacity_;
+ int GetCapacity() const {
+ return capacity_.load(std::memory_order_relaxed);
}
static constexpr size_t GetItemStructSize() { return sizeof(Item); }
private:
struct Item {
-    Item(K key, std::unique_ptr<V> value, int pins)
-        : key(key), value(std::move(value)), pins(pins) {}
+    Item(K key, std::unique_ptr<V> value)
+        : key(key), value(std::move(value)) {}
K key;
    std::unique_ptr<V> value;
int pins = 0;
@@ -268,7 +269,7 @@ class LruCache {
}
// Fresh in front, stale on back.
- int capacity_ GUARDED_BY(mutex_);
+  std::atomic<int> capacity_;
int size_ GUARDED_BY(mutex_) = 0;
int allocated_ GUARDED_BY(mutex_) = 0;
Item* lru_head_ GUARDED_BY(mutex_) = nullptr; // Newest elements.
diff --git a/src/utils/fastmath.h b/src/utils/fastmath.h
index 9f182e423a..ba0855a5eb 100644
--- a/src/utils/fastmath.h
+++ b/src/utils/fastmath.h
@@ -36,9 +36,9 @@ namespace lczero {
// The approximation used here is log2(2^N*(1+f)) ~ N+f*(1.342671-0.342671*f)
// where N is the integer and f the fractional part, f>=0.
inline float FastLog2(const float a) {
- int32_t tmp;
+ uint32_t tmp;
std::memcpy(&tmp, &a, sizeof(float));
- int expb = (tmp >> 23);
+ uint32_t expb = tmp >> 23;
tmp = (tmp & 0x7fffff) | (0x7f << 23);
float out;
std::memcpy(&out, &tmp, sizeof(float));
@@ -50,12 +50,12 @@ inline float FastLog2(const float a) {
// where N is the integer and f the fractional part, f>=0.
inline float FastPow2(const float a) {
if (a < -126) return 0.0;
- int exp = floor(a);
+ int32_t exp = floor(a);
float out = a - exp;
out = 1.0f + out * (0.656366f + 0.343634f * out);
int32_t tmp;
std::memcpy(&tmp, &out, sizeof(float));
- tmp += exp << 23;
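+  // Do the shift in unsigned arithmetic: left-shifting a negative exponent
+  // would be undefined behaviour.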
+  tmp += static_cast<int32_t>(static_cast<uint32_t>(exp) << 23);
std::memcpy(&out, &tmp, sizeof(float));
return out;
}
diff --git a/src/version.inc b/src/version.inc
index 02885e82d8..5662d344ef 100644
--- a/src/version.inc
+++ b/src/version.inc
@@ -1,4 +1,4 @@
#define LC0_VERSION_MAJOR 0
-#define LC0_VERSION_MINOR 20
+#define LC0_VERSION_MINOR 21
#define LC0_VERSION_PATCH 0
#define LC0_VERSION_POSTFIX "dev"
diff --git a/subprojects/protobuf-3.6.0.wrap b/subprojects/protobuf-3.6.0.wrap
new file mode 100644
index 0000000000..5a10b720b8
--- /dev/null
+++ b/subprojects/protobuf-3.6.0.wrap
@@ -0,0 +1,10 @@
+[wrap-file]
+directory = protobuf-3.6.0
+
+source_url = https://github.com/protocolbuffers/protobuf/releases/download/v3.6.0/protobuf-all-3.6.0.tar.gz
+source_filename = protobuf-all-3.6.0.tar.gz
+source_hash = 1532154addf85080330fdd037949d4653dfce16550df5c70ea0cd212d8aff3af
+
+patch_url = https://github.com/borg323/protobuf/releases/download/3.6.0/protobuf-3.6.0-wrap.zip
+patch_filename = protobuf-3.6.0-wrap.zip
+patch_hash = a14730d2e3702c4a0d7b3f05a380ec6b2c0b138a5b00539705b5c3a8df9885e3
diff --git a/tensorflow.md b/tensorflow.md
new file mode 100644
index 0000000000..d5dd9831f3
--- /dev/null
+++ b/tensorflow.md
@@ -0,0 +1,11 @@
+To build with tensorflow under Linux you need to install Tensorflow_cc from
+<https://github.com/FloopCZ/tensorflow_cc>; either release v1.9.0 or v1.12.0
+will work. Tensorflow_cc requires a specific version of protobuf, which
+constrains the build. Release v1.9.0 works out of the box, since the default
+protobuf subproject (v3.5.1) is compatible and is used instead of a system
+installed version. In contrast, release v1.12.0 needs protobuf v3.6.0, which
+can be selected by adding `-Dprotobuf-3-6-0=true` to the build command line.
+Note that this protobuf version has issues with static builds and crashes, so
+it is not recommended for normal use. The crashes look very similar to:
+*
+*
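+
+A minimal build sketch (assuming the standard meson workflow and that the
+`-Dtensorflow=true` option enables the Tensorflow_cc backend):
+
+    meson build --buildtype release -Dtensorflow=true -Dprotobuf-3-6-0=true
+    ninja -C build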