sync public and internal master

marian-nmt · Mar 18, 2021 · 272096c · 272096c
2 parents 77c3e35 + 8f73923
commit 272096c
Show file tree

Hide file tree

Showing 87 changed files with 2,879 additions and 1,189 deletions.
diff --git a/.clang-format b/.clang-format
@@ -3,7 +3,7 @@ Language:        Cpp
 # BasedOnStyle:  Google
 AccessModifierOffset: -2
 AlignAfterOpenBracket: Align
-AlignConsecutiveAssignments: false
+AlignConsecutiveAssignments: true
 AlignConsecutiveDeclarations: false
 AlignEscapedNewlinesLeft: true
 AlignOperands:   true
@@ -71,7 +71,7 @@ PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 200
 PointerAlignment: Left
-ReflowComments:  true
+ReflowComments:  false
 SortIncludes:    true
 SpaceAfterCStyleCast: false
 SpaceBeforeAssignmentOperators: true

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
 ## [Unreleased]
 
 ### Added
+- Support for MS-internal binary shortlist
 - Local/global sharding with MPI training via `--sharding local`
 - fp16 support for factors.
 - Correct training with fp16 via `--fp16`. 

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -327,7 +327,7 @@ if(CUDA_FOUND)
   if(USE_STATIC_LIBS)
     set(EXT_LIBS ${EXT_LIBS} ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusparse_LIBRARY})
     set(CUDA_LIBS ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_cusparse_LIBRARY})
-    
+
     find_library(CUDA_culibos_LIBRARY NAMES culibos PATHS ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64)
     # The cuLIBOS library does not seem to exist in Windows CUDA toolkit installs
     if(CUDA_culibos_LIBRARY)
@@ -504,8 +504,8 @@ if(USE_STATIC_LIBS)
 endif()
 
 # Find MPI
-if(USE_MPI)  
-  # 2.0 refers to MPI2 standard. OpenMPI is an implementation of that standard regardless of the specific OpenMPI version 
+if(USE_MPI)
+  # 2.0 refers to MPI2 standard. OpenMPI is an implementation of that standard regardless of the specific OpenMPI version
   # e.g. OpenMPI 1.10 implements MPI2 and will be found correctly.
   find_package(MPI 2.0 REQUIRED)
   if(MPI_FOUND)
@@ -518,19 +518,22 @@ if(USE_MPI)
   endif(MPI_FOUND)
 endif(USE_MPI)
 
-# TODO: move inside if(BOOST_COMPONENTS) 
-if(USE_STATIC_LIBS)
-  set(Boost_USE_STATIC_LIBS ON)
-endif()
 
 ###############################################################################
 # Find Boost if required
 if(BOOST_COMPONENTS)
+  if(USE_STATIC_LIBS)
+    set(Boost_USE_STATIC_LIBS ON)
+  endif()
+
   find_package(Boost COMPONENTS ${BOOST_COMPONENTS})
   if(Boost_FOUND)
     include_directories(${Boost_INCLUDE_DIRS})
     set(EXT_LIBS ${EXT_LIBS} ${Boost_LIBRARIES})
     set(EXT_LIBS ${EXT_LIBS} ${ZLIB_LIBRARIES}) # hack for static compilation
+    if(MSVC)
+      add_definitions(-DBOOST_ALL_NO_LIB=1) # hack for missing date-time stub
+    endif()
   else(Boost_FOUND)
     message(SEND_ERROR "Cannot find Boost libraries. Terminating.")
   endif(Boost_FOUND)

diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-v1.10.6
+v1.10.7
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -15,6 +15,7 @@ pool:
   name: Azure Pipelines
 
 variables:
+  BOOST_ROOT_WINDOWS: "C:/hostedtoolcache/windows/Boost/1.72.0/x86_64"
   CUDA_PATH_WINDOWS: "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA"
   MKL_DIR: "$(Build.SourcesDirectory)/mkl"
   MKL_URL: "https://romang.blob.core.windows.net/mariandev/ci/mkl-2020.1-windows-static.zip"
@@ -69,6 +70,14 @@ stages:
     #    key: 'v0 | "$(VCPKG_PACKAGES)" | vcpkg | "$(Agent.OS)"'
     #    path: $(VCPKG_DIR)
 
+    # Boost is no longer pre-installed on Azure/GitHub-hosted Windows runners
+    - pwsh: |
+        Write-Host "Downloading Boost to $(BOOST_ROOT_WINDOWS)"
+        $Url = "https://sourceforge.net/projects/boost/files/boost-binaries/1.72.0/boost_1_72_0-msvc-14.2-64.exe"
+        C:\msys64\usr\bin\wget.exe -nv $Url -O "$(Pipeline.Workspace)/boost.exe"
+        Start-Process -Wait -FilePath "$(Pipeline.Workspace)/boost.exe" "/SILENT","/SP-","/SUPPRESSMSGBOXES","/DIR=$(BOOST_ROOT_WINDOWS)"
+      displayName: Download Boost
+
     - pwsh: |
         git clone https://github.com/Microsoft/vcpkg.git $(VCPKG_DIR)
         cd $(VCPKG_DIR)
@@ -121,9 +130,7 @@ stages:
         # Set envvars so that CMake can find the installed packages
         MKLROOT: $(MKL_DIR)
         CUDA_PATH: $(CUDA_PATH_WINDOWS)/v$(cuda_version)
-        # Boost is pre-installed on Azure/GitHub-hosted Windows runners
-        # https://github.com/actions/virtual-environments/blob/main/images/win/Windows2019-Readme.md#boost
-        BOOST_ROOT: $(BOOST_ROOT_1_72_0)
+        BOOST_ROOT: $(BOOST_ROOT_WINDOWS)
 
     - script: |
         call "$(VS_PATH)/VC/Auxiliary/Build/vcvarsall.bat" x64
@@ -226,12 +233,18 @@ stages:
     - checkout: self
       submodules: true
 
-      # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev
-      # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev
+    # The following packages are already installed on Azure-hosted runners: build-essential openssl libssl-dev
+    # No need to install libprotobuf{17,10,9v5} on Ubuntu {20,18,16}.04 because it is installed together with libprotobuf-dev
     - bash: sudo apt-get install -y libgoogle-perftools-dev libprotobuf-dev protobuf-compiler
       displayName: Install packages
 
-      # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
+    # Boost is no longer pre-installed on Azure/GitHub-hosted runners
+    # TODO: check which Boost components are really needed and update the list
+    - bash: sudo apt-get install -y libboost-system-dev
+      displayName: Install Boost
+      condition: eq(variables.boost, true)
+
+    # https://software.intel.com/content/www/us/en/develop/articles/installing-intel-free-libs-and-python-apt-repo.html
     - bash: |
         wget -qO- "https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2019.PUB" | sudo apt-key add -
         sudo sh -c "echo deb https://apt.repos.intel.com/mkl all main > /etc/apt/sources.list.d/intel-mkl.list"
@@ -240,13 +253,11 @@ stages:
       displayName: Install MKL
       condition: eq(variables.cpu, true)
 
-      # The script simplifies installation of different versions of CUDA
+    # The script simplifies installation of different versions of CUDA
     - bash: ./scripts/ci/install_cuda_ubuntu.sh $(cuda)
       displayName: Install CUDA
       condition: eq(variables.gpu, true)
 
-    # Boost is already installed on Azure-hosted runners in a non-standard location
-    # https://github.com/actions/virtual-environments/issues/687#issuecomment-610471671
     - bash: |
         mkdir -p build
         cd build
@@ -260,9 +271,6 @@ stages:
           -DUSE_FBGEMM=$(cpu) \
           -DUSE_SENTENCEPIECE=on \
           -DUSE_STATIC_LIBS=$(static) \
-          -DBOOST_ROOT=$BOOST_ROOT_1_72_0 \
-          -DBOOST_INCLUDEDIR=$BOOST_ROOT_1_72_0/include \
-          -DBOOST_LIBRARYDIR=$BOOST_ROOT_1_72_0/lib \
           -DBoost_ARCHITECTURE=-x64 \
           -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-$(cuda)
       displayName: Configure CMake
@@ -346,7 +354,7 @@ stages:
     - checkout: self
       submodules: true
 
-    - bash: brew install openblas protobuf
+    - bash: brew install boost openblas openssl protobuf
       displayName: Install packages
 
     # Openblas location is exported explicitly because openblas is keg-only, which means it was not symlinked into /usr/local/.

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -40,6 +40,7 @@ set(MARIAN_SOURCES
   data/corpus_sqlite.cpp
   data/corpus_nbest.cpp
   data/text_input.cpp
+  data/shortlist.cpp
 
   3rd_party/cnpy/cnpy.cpp
   3rd_party/ExceptionWithCallStack.cpp
@@ -72,6 +73,9 @@ set(MARIAN_SOURCES
   layers/loss.cpp
   layers/weight.cpp
   layers/lsh.cpp
+  layers/embedding.cpp
+  layers/output.cpp
+  layers/logits.cpp
 
   rnn/cells.cpp
   rnn/attention.cpp
@@ -84,6 +88,7 @@ set(MARIAN_SOURCES
   models/model_factory.cpp
   models/encoder_decoder.cpp
   models/transformer_stub.cpp
+  models/costs.cpp
 
   rescorer/score_collector.cpp
   embedder/vector_collector.cpp
@@ -103,10 +108,15 @@ set(MARIAN_SOURCES
   training/validator.cpp
   training/communicator.cpp
 
-  # this is only compiled to catch build errors, but not linked
+  # this is only compiled to catch build errors
   microsoft/quicksand.cpp
   microsoft/cosmos.cpp
 
+  # copied from quicksand to be able to read binary shortlist
+  microsoft/shortlist/utils/Converter.cpp
+  microsoft/shortlist/utils/StringUtils.cpp
+  microsoft/shortlist/utils/ParameterTree.cpp
+
   $<TARGET_OBJECTS:libyaml-cpp>
   $<TARGET_OBJECTS:SQLiteCpp>
   $<TARGET_OBJECTS:pathie-cpp>

diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
diff --git a/src/common/definitions.h b/src/common/definitions.h
diff --git a/src/common/file_stream.cpp b/src/common/file_stream.cpp
diff --git a/src/common/io_item.h b/src/common/io_item.h
diff --git a/src/common/options.h b/src/common/options.h
diff --git a/src/common/timer.cpp b/src/common/timer.cpp
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
diff --git a/src/data/batch.h b/src/data/batch.h
diff --git a/src/data/corpus.cpp b/src/data/corpus.cpp
diff --git a/src/data/corpus_base.cpp b/src/data/corpus_base.cpp
diff --git a/src/data/factored_vocab.cpp b/src/data/factored_vocab.cpp
@@ -546,7 +546,6 @@ void FactoredVocab::constructNormalizationInfoForVocab() {
 /*virtual*/ void FactoredVocab::transcodeToShortlistInPlace(WordIndex* ptr, size_t num) const {
   for (; num-- > 0; ptr++) {
     auto word = Word::fromWordIndex(*ptr);
-    auto wordString = word2string(word);
     auto lemmaIndex = getFactor(word, 0) + groupRanges_[0].first;
     *ptr = (WordIndex)lemmaIndex;
   }

diff --git a/src/data/factored_vocab.h b/src/data/factored_vocab.h
diff --git a/src/data/shortlist.cpp b/src/data/shortlist.cpp
@@ -0,0 +1,153 @@
+#include "data/shortlist.h"
+#include "microsoft/shortlist/utils/ParameterTree.h"
+
+namespace marian {
+namespace data {
+
+// Views `current` as a pointer to const T, hands that pointer back to the caller and
+// advances `current` by `num` elements of T (one element when `num` is omitted).
+template <typename T>
+const T* get(const void*& current, size_t num = 1) {
+  const T* const head = static_cast<const T*>(current);
+  current = head + num;  // converts implicitly back to const void*
+  return head;
+}
+
+// Memory-maps the binary Quicksand shortlist named by the first value of the "shortlist"
+// option and stores pointers to its tables (default ids, per-source lengths and offsets,
+// and the packed shortlist ids). Values after the path are parsed as firstNum, bestNum and
+// threshold but have no effect; a warning is logged if any of them is non-zero.
+// Aborts when no path is given, when the mapping is too small for the magic number, when
+// the magic number does not match, when an element count is negative, or when the tables
+// described by the counts extend past the end of the mapping.
+QuicksandShortlistGenerator::QuicksandShortlistGenerator(Ptr<Options> options,
+                                                         Ptr<const Vocab> srcVocab,
+                                                         Ptr<const Vocab> trgVocab,
+                                                         size_t srcIdx,
+                                                         size_t /*trgIdx*/,
+                                                         bool /*shared*/)
+    : options_(options),
+      srcVocab_(srcVocab),
+      trgVocab_(trgVocab),
+      srcIdx_(srcIdx) {
+  std::vector<std::string> vals = options_->get<std::vector<std::string>>("shortlist");
+
+  ABORT_IF(vals.empty(), "No path to shortlist given");
+  std::string fname = vals[0];
+
+  auto firstNum   = vals.size() > 1 ? std::stoi(vals[1]) : 0;
+  auto bestNum    = vals.size() > 2 ? std::stoi(vals[2]) : 0;
+  float threshold = vals.size() > 3 ? std::stof(vals[3]) : 0;
+
+  if(firstNum != 0 || bestNum != 0 || threshold != 0) {
+    LOG(warn, "You have provided additional parameters for the Quicksand shortlist, but they are ignored.");
+  }
+
+  mmap_ = mio::mmap_source(fname); // memory-map the binary file once
+
+  // the magic number alone occupies one int32_t; do not dereference a shorter mapping
+  ABORT_IF(mmap_.size() < sizeof(int32_t), "Quicksand shortlist is too small to contain a header");
+
+  const void* current = mmap_.data(); // pointer iterator over binary file
+  const char* const mappedEnd = static_cast<const char*>(current) + mmap_.size();
+
+  // compare magic number in binary file to make sure we are reading the right thing
+  const int32_t MAGIC_NUMBER = 1234567890;
+  int32_t header_magic_number = *get<int32_t>(current);
+  ABORT_IF(header_magic_number != MAGIC_NUMBER, "Trying to mmap Quicksand shortlist but encountered wrong magic number");
+
+  auto config = ::quicksand::ParameterTree::FromBinaryReader(current);
+  use16bit_ = config->GetBoolReq("use_16_bit");
+
+  LOG(info, "[data] Mapping Quicksand shortlist from {}", fname);
+
+  // shortlist ids are packed either as uint16_t or as int32_t, depending on the file's config
+  idSize_ = use16bit_ ? sizeof(uint16_t) : sizeof(int32_t);
+
+  // reads one int32_t element count and rejects negative values before they are used as lengths
+  auto readCount = [&current]() -> int32_t {
+    const int32_t count = *get<int32_t>(current);
+    ABORT_IF(count < 0, "Quicksand shortlist is corrupt: negative element count");
+    return count;
+  };
+
+  // mmap the binary shortlist pieces (the read order defines the file layout, do not reorder)
+  numDefaultIds_        = readCount();
+  defaultIds_           = get<int32_t>(current, numDefaultIds_);
+  numSourceIds_         = readCount();
+  sourceLengths_        = get<int32_t>(current, numSourceIds_);
+  sourceOffsets_        = get<int32_t>(current, numSourceIds_);
+  numShortlistIds_      = readCount();
+  sourceToShortlistIds_ = get<uint8_t>(current, idSize_ * numShortlistIds_);
+
+  // `current` now points just past the last table; it must not lie beyond the mapping,
+  // otherwise generate() would later read outside the file
+  ABORT_IF(static_cast<const char*>(current) > mappedEnd, "Quicksand shortlist is truncated");
+
+  // display parameters
+  LOG(info,
+      "[data] Quicksand shortlist has {} source ids, {} default ids and {} shortlist ids",
+      numSourceIds_,
+      numDefaultIds_,
+      numShortlistIds_);
+}
+
+// Builds the shortlist for one batch.
+// The id set is seeded with the leading default ids (at most trgVocab_->size() of them).
+// It is then extended with the ids stored for every word of sub-batch `srcIdx_` that has a
+// row in the source table. Ids are taken rank by rank (rank 0 of every word, then rank 1,
+// ...) until the set holds trgVocab_->size() ids or all rows are exhausted.
+// Returns a new Shortlist built from the collected ids in ascending order.
+Ptr<Shortlist> QuicksandShortlistGenerator::generate(Ptr<data::CorpusBatch> batch) const {
+  auto srcBatch = (*batch)[srcIdx_];
+  const size_t maxShortlistSize = trgVocab_->size();
+
+  std::unordered_set<int32_t> indexSet;
+  for(int32_t i = 0; i < numDefaultIds_ && (size_t)i < maxShortlistSize; ++i)
+    indexSet.insert(defaultIds_[i]);
+
+  // One (row start, row length) pair per batch word that has a row in the source table.
+  // The vector grows with the batch: the number of such words is unrelated to the target
+  // vocabulary size, so a buffer pre-sized by maxShortlistSize could be overrun.
+  std::vector<std::pair<const uint8_t*, int32_t>> curShortlists;
+
+  // Because we might fill up our shortlist before reaching max_shortlist_size, we fill the shortlist in order of rank.
+  // E.g., first rank of word 0, first rank of word 1, ... second rank of word 0, ...
+  int32_t maxLength = 0;
+  for(Word word : srcBatch->data()) {
+    // transcode through a real WordIndex instead of aliasing an int32_t through a cast pointer
+    WordIndex wordIndex = word.toWordIndex();
+    srcVocab_->transcodeToShortlistInPlace(&wordIndex, 1);
+    const int32_t sourceId = (int32_t)wordIndex;
+
+    // ids that turn negative after the cast must not be used to index the tables
+    if(sourceId >= 0 && sourceId < numSourceIds_) { // if it's a valid source id
+      const uint8_t* rowStart = sourceToShortlistIds_ + idSize_ * sourceOffsets_[sourceId]; // start position for mapping
+      const int32_t length = sourceLengths_[sourceId]; // how many mappings are there
+      curShortlists.emplace_back(rowStart, length);
+      maxLength = std::max(maxLength, length);
+    }
+  }
+
+  // collect the actual shortlist mappings, one rank at a time across all rows
+  for(int32_t rank = 0; rank < maxLength && indexSet.size() < maxShortlistSize; ++rank) {
+    for(const auto& row : curShortlists) {
+      if(indexSet.size() >= maxShortlistSize)
+        break;
+      if(rank >= row.second) // this row has no entry at the current rank
+        continue;
+
+      int32_t id = 0;
+      if(use16bit_)
+        id = (int32_t)reinterpret_cast<const uint16_t*>(row.first)[rank];
+      else
+        id = reinterpret_cast<const int32_t*>(row.first)[rank];
+      indexSet.insert(id);
+    }
+  }
+
+  // turn into vector and sort (selected indices)
+  std::vector<WordIndex> indices;
+  indices.reserve(indexSet.size());
+  for(auto i : indexSet)
+    indices.push_back((WordIndex)i);
+
+  std::sort(indices.begin(), indices.end());
+  return New<Shortlist>(indices);
+}
+
+// Factory for shortlist generators. Reads the "shortlist" option, aborts if it holds no
+// values, and picks the implementation from the extension of the first value:
+// ".bin" selects QuicksandShortlistGenerator, anything else LexicalShortlistGenerator.
+// All arguments are forwarded unchanged to the selected constructor.
+Ptr<ShortlistGenerator> createShortlistGenerator(Ptr<Options> options,
+                                                 Ptr<const Vocab> srcVocab,
+                                                 Ptr<const Vocab> trgVocab,
+                                                 size_t srcIdx,
+                                                 size_t trgIdx,
+                                                 bool shared) {
+  const auto shortlistArgs = options->get<std::vector<std::string>>("shortlist");
+  ABORT_IF(shortlistArgs.empty(), "No path to shortlist given");
+
+  const std::string& path = shortlistArgs[0];
+  const bool isBinary = filesystem::Path(path).extension().string() == ".bin";
+
+  if(!isBinary)
+    return New<LexicalShortlistGenerator>(options, srcVocab, trgVocab, srcIdx, trgIdx, shared);
+
+  return New<QuicksandShortlistGenerator>(options, srcVocab, trgVocab, srcIdx, trgIdx, shared);
+}
+
+}  // namespace data
+}  // namespace marian