Reduced the number of times indices need to be copied to the GPU

marian-nmt · Oct 9, 2020 · 201dc0a · 201dc0a
1 parent 0ff56ea
commit 201dc0a
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 8 deletions.
diff --git a/src/layers/generic.cpp b/src/layers/generic.cpp
@@ -88,8 +88,10 @@ namespace marian {
     }
 
     // if selIdx are given, then we must reshuffle accordingly
-    if (!hypIndices.empty()) // use the same function that shuffles decoder state
-      sel = rnn::State::select(sel, hypIndices, (int)beamSize, /*isBatchMajor=*/false);
+    if (!hypIndices.empty()) { // use the same function that shuffles decoder state
+      auto indices = graph()->indices(hypIndices);
+      sel = rnn::State::select(sel, indices, (int)beamSize, /*isBatchMajor=*/false);
+    }
     return sel;
   }
 

diff --git a/src/models/states.h b/src/models/states.h
@@ -30,7 +30,8 @@ class EncoderState {
   // Sub-select active batch entries from encoder context and context mask
   Ptr<EncoderState> select(const std::vector<IndexType>& batchIndices) { // [batchIndex] indices of active batch entries
     // Dimension -2 is OK for both, RNN and Transformer models as the encoder context in Transformer gets transposed to the same dimension layout
-    return New<EncoderState>(index_select(context_, -2, batchIndices), index_select(mask_, -2, batchIndices), batch_);
+    auto indices = context_->graph()->indices(batchIndices);
+    return New<EncoderState>(index_select(context_, -2, indices), index_select(mask_, -2, indices), batch_);
   }
 };
 

diff --git a/src/rnn/types.h b/src/rnn/types.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "common/definitions.h"
+#include "common/shape.h"
 #include "marian.h"
 
 #include <iostream>
@@ -12,23 +14,22 @@ struct State {
   Expr output;
   Expr cell;
 
-  State select(const std::vector<IndexType>& selIdx, // [beamIndex * activeBatchSize + batchIndex]
+  State select(Expr selIdx, // [beamIndex * activeBatchSize + batchIndex]
                int beamSize, bool isBatchMajor) const {
     return{ select(output, selIdx, beamSize, isBatchMajor),
             select(cell,   selIdx, beamSize, isBatchMajor) };
   }
 
   // this function is also called by Logits
   static Expr select(Expr sel, // [beamSize, dimTime, dimBatch, dimDepth] or [beamSize, dimBatch, dimTime, dimDepth] (dimTime = 1 for RNN)
-                     const std::vector<IndexType>& selIdx, // [beamIndex * activeBatchSize + batchIndex]
+                     Expr selIdx, // [beamIndex * activeBatchSize + batchIndex]
                      int beamSize, bool isBatchMajor)
   {
     if (!sel)
       return sel; // keep nullptr untouched
 
     sel = atleast_4d(sel);
-
-    int dimBatch = (int)selIdx.size() / beamSize;
+    int dimBatch =(int) selIdx->shape().elements()/beamSize;
     int dimDepth = sel->shape()[-1];
     int dimTime  = isBatchMajor ? sel->shape()[-2] : sel->shape()[-3];
 
@@ -83,8 +84,24 @@ class States {
   States select(const std::vector<IndexType>& selIdx, // [beamIndex * activeBatchSize + batchIndex]
                 int beamSize, bool isBatchMajor) const {
     States selected;
+    Expr indices;
+    // NOTE: this may not work if the model is split across multiple GPUs; it is unclear whether that matters here
+
+    for (auto& state : states_) {
+      if (state.cell) {
+        indices = state.cell->graph()->indices(selIdx);
+        break;
+      }
+
+      if (state.output) {
+        indices = state.output->graph()->indices(selIdx);
+        break;
+      }
+    }
+
+    // GPU OPT: Implement kernel to batch these on GPU
     for(auto& state : states_)
-      selected.push_back(state.select(selIdx, beamSize, isBatchMajor));
+      selected.push_back(state.select(indices, beamSize, isBatchMajor));
     return selected;
   }