castorini · lintool · May 2, 2022 · Apr 26, 2022 · Apr 26, 2022 · May 1, 2022
diff --git a/src/main/java/io/anserini/search/SimpleSearcher.java b/src/main/java/io/anserini/search/SimpleSearcher.java
@@ -681,6 +681,44 @@ public Document document(String docid) {
     return IndexReaderUtils.document(reader, docid);
   }
 
+  /**
+   * Returns a map of collection docid to Lucene {@link Document}.
+   * Batch version of {@link #document(String)}; docids that yield no document are omitted.
+   *
+   * @param docids list of docids
+   * @param threads number of threads in the fixed-size pool that runs the lookups
+   * @return a map of docid to corresponding Lucene {@link Document}
+   */
+  public Map<String, Document> batchGetDocument(List<String> docids, int threads) {
+    ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(threads);
+    ConcurrentHashMap<String, Document> results = new ConcurrentHashMap<>();
+    for (String docid: docids) {
+      executor.execute(() -> {
+        try {
+          Document result = IndexReaderUtils.document(reader, docid);
+          if (result != null) { // ConcurrentHashMap does not accept null values
+            results.put(docid, result);
+          }
+        } catch (Exception e) {
+          LOG.warn("Unable to fetch document for docid " + docid, e);
+        }
+      });
+    }
+    executor.shutdown();
+
+    try {
+      // Log progress once a minute until all submitted lookups have finished.
+      while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
+        LOG.info(String.format("%.2f percent completed",
+                (double) executor.getCompletedTaskCount() / docids.size() * 100.0d));
+      }
+    } catch (InterruptedException ie) {
+      executor.shutdownNow(); // attempt to cancel lookups that have not finished
+      Thread.currentThread().interrupt(); // preserve interrupt status for the caller
+    }
+    return results;
+  }
+
   /**
    * Fetches the Lucene {@link Document} based on some field other than its unique collection docid.
    * For example, scientific articles might have DOIs.

diff --git a/src/test/java/io/anserini/search/SimpleSearcherTest.java b/src/test/java/io/anserini/search/SimpleSearcherTest.java
@@ -20,6 +20,7 @@
 import io.anserini.index.IndexArgs;
 import io.anserini.search.SimpleSearcher.Result;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.document.Document;
 import org.apache.lucene.search.TermQuery;
 import org.junit.Test;
 
@@ -54,6 +55,25 @@ public void testGetDoc() throws Exception {
     searcher.close();
   }
 
+  @Test
+  public void testBatchGetDoc() throws Exception {
+    ArrayList<String> docIds = new ArrayList<>();
+    docIds.add("doc1");
+    docIds.add("doc2");
+    docIds.add("doc3");
+    docIds.add("fake_doc"); // expected to be missing from the returned map
+    SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString());
+    try {
+      Map<String, Document> results = searcher.batchGetDocument(docIds, 2);
+      assertEquals("here is some text here is some more text. city.", results.get("doc1").get("contents"));
+      assertEquals("more texts", results.get("doc2").get("contents"));
+      assertEquals("here is a test", results.get("doc3").get("contents"));
+      assertNull(results.get("fake_doc"));
+    } finally {
+      searcher.close(); // close the searcher even when an assertion above fails
+    }
+  }
+
   @Test
   public void testGetDocByField() throws Exception {
     SimpleSearcher searcher = new SimpleSearcher(super.tempDir1.toString());