castorini · thongnt99 · Mar 23, 2023 · Mar 26, 2023 · Mar 26, 2023 · Mar 27, 2023
diff --git a/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java b/src/main/java/io/anserini/collection/JsonSparseVectorCollection.java
@@ -0,0 +1,84 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import com.fasterxml.jackson.databind.JsonNode;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.nio.file.Path;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * A collection of JSON documents that carry a sparse {@code "vector"} of term weights (learned sparse retrieval).
+ */
+public class JsonSparseVectorCollection extends DocumentCollection<JsonSparseVectorCollection.Document> {
+  public JsonSparseVectorCollection(Path path) {
+    this.path = path;
+  }
+
+  @Override
+  public FileSegment<JsonSparseVectorCollection.Document> createFileSegment(BufferedReader bufferedReader) throws IOException {
+    return new JsonSparseVectorCollection.Segment<>(bufferedReader);
+  }
+
+  @Override
+  public FileSegment<JsonSparseVectorCollection.Document> createFileSegment(Path path) throws IOException {
+    return new JsonSparseVectorCollection.Segment<>(path);
+  }
+
+  /** File segment that turns each JSON record into a sparse-vector {@link Document}. */
+  public static class Segment<T extends JsonSparseVectorCollection.Document> extends JsonCollection.Segment<T> {
+    public Segment(Path path) throws IOException {
+      super(path);
+    }
+
+    public Segment(BufferedReader bufferedReader) throws IOException {
+      super(bufferedReader);
+    }
+
+    @Override
+    protected Document createNewDocument(JsonNode json) {
+      return new Document(json);
+    }
+  }
+
+  /** JSON document that additionally exposes the term weights found under its {@code "vector"} field. */
+  public static class Document extends JsonCollection.Document implements SourceSparseVectorDocument {
+    private final Map<String, Float> vector = new HashMap<>();
+
+    /**
+     * Copies every (term, weight) entry of the {@code "vector"} object into this document's vector.
+     * @param json parsed JSON record; when it has no "vector" field the vector stays empty
+     */
+    public Document(JsonNode json) {
+      super(json);
+      JsonNode vectorNode = json.get("vector");
+      // get() returns null for an absent field; skip it rather than fail with a NullPointerException.
+      if (vectorNode != null) {
+        vectorNode.fields().forEachRemaining(e -> this.vector.put(e.getKey(), e.getValue().floatValue()));
+      }
+    }
+
+    /** Returns the term-to-weight map built by the constructor (the live map, not a copy). */
+    @Override
+    public Map<String, Float> vector() {
+      return this.vector;
+    }
+  }
+}
diff --git a/src/main/java/io/anserini/collection/SourceSparseVectorDocument.java b/src/main/java/io/anserini/collection/SourceSparseVectorDocument.java
@@ -0,0 +1,32 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.collection;
+
+import java.util.Map;
+
+/**
+ * A source document that is represented as a sparse vector of weighted terms. Implemented by
+ * collection documents such as {@link JsonSparseVectorCollection.Document}; the weights are read
+ * by {@code SparseVectorDocumentGenerator} when it builds the Lucene document to index.
+ */
+public interface SourceSparseVectorDocument {
+  /**
+   * Returns the sparse vector of this document.
+   * @return a map from each term to its weight
+   */
+  Map<String, Float> vector();
+}
diff --git a/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/SparseVectorDocumentGenerator.java
@@ -0,0 +1,80 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index.generator;
+
+import io.anserini.collection.InvalidContentsException;
+import io.anserini.collection.SourceDocument;
+import io.anserini.collection.SourceSparseVectorDocument;
+import io.anserini.index.Constants;
+import io.anserini.index.IndexCollection;
+import org.apache.lucene.document.*;
+import org.apache.lucene.util.BytesRef;
+
+import java.util.Map;
+
+/**
+ * Converts a sparse-vector source document into a Lucene {@link Document} in which every term of the
+ * vector is a {@link FeatureField} of {@link Constants#CONTENTS} carrying the term's weight.
+ *
+ * @param <T> type of the source document
+ */
+public class SparseVectorDocumentGenerator<T extends SourceSparseVectorDocument & SourceDocument> implements LuceneDocumentGenerator<T> {
+  protected IndexCollection.Args args;
+
+  protected SparseVectorDocumentGenerator() {
+    // Leaves args null; createDocument reads args.storeRaw, so args must be assigned before it is called.
+  }
+  /**
+   * Creates a generator driven by the given indexing arguments.
+   * @param args configuration arguments
+   */
+  public SparseVectorDocumentGenerator(IndexCollection.Args args) {
+    this.args = args;
+  }
+
+  /**
+   * Creates a document with the docid (stored and as doc values), optionally the raw source, and one feature per term.
+   *
+   * @param src source document supplying the id, the raw text and the sparse vector
+   * @return the Lucene document, ready to be indexed
+   * @throws GeneratorException if reading the vector raises InvalidContentsException or the vector is empty
+   */
+  @Override
+  public Document createDocument(T src) throws GeneratorException {
+    final String docid = src.id();
+    final Map<String, Float> weights;
+    try {
+      weights = src.vector();
+    } catch (InvalidContentsException e) {
+      // Rethrown as a generator exception; the indexer handles it at top level and updates its counters.
+      throw new InvalidDocumentException();
+    }
+    if (weights.isEmpty()) {
+      throw new EmptyDocumentException();
+    }
+
+    final Document doc = new Document();
+    // Stored docid, plus doc values of the same id (needed to break score ties by docid).
+    doc.add(new StringField(Constants.ID, docid, Field.Store.YES));
+    doc.add(new BinaryDocValuesField(Constants.ID, new BytesRef(docid)));
+    if (args.storeRaw) {
+      doc.add(new StoredField(Constants.RAW, src.raw()));
+    }
+    weights.forEach((term, weight) -> doc.add(new FeatureField(Constants.CONTENTS, term, weight)));
+    return doc;
+  }
+}
diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java
@@ -35,6 +35,7 @@
 import io.anserini.rerank.lib.Rm3Reranker;
 import io.anserini.rerank.lib.RocchioReranker;
 import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;
+import io.anserini.search.query.BagOfWordsQueryGenerator;
 import io.anserini.search.query.QueryGenerator;
 import io.anserini.search.query.SdmQueryGenerator;
 import io.anserini.search.similarity.AccurateBM25Similarity;
@@ -1250,6 +1251,8 @@ public <K> ScoredDocuments search(IndexSearcher searcher, K qid, String queryStr
 
     if (args.sdm) {
       query = new SdmQueryGenerator(args.sdm_tw, args.sdm_ow, args.sdm_uw).buildQuery(Constants.CONTENTS, analyzer, queryString);
+    } else if (args.impact){
+        query = new BagOfWordsQueryGenerator().buildFeatureQuery(Constants.CONTENTS, analyzer, queryString);
     } else {
       QueryGenerator generator;
       try {

diff --git a/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java b/src/main/java/io/anserini/search/query/BagOfWordsQueryGenerator.java
@@ -17,14 +17,17 @@
 package io.anserini.search.query;
 
 import io.anserini.analysis.AnalyzerUtils;
+import io.anserini.index.Constants;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.FeatureField;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.function.Function;
@@ -33,7 +36,7 @@
 /*
  * Bag of Terms query builder
  */
-public class BagOfWordsQueryGenerator extends QueryGenerator {
+public class BagOfWordsQueryGenerator extends QueryGenerator implements  FeatureGenerator {
   @Override
   public Query buildQuery(String field, Analyzer analyzer, String queryText) {
     List<String> tokens = AnalyzerUtils.analyze(analyzer, queryText);
@@ -47,6 +50,34 @@ public Query buildQuery(String field, Analyzer analyzer, String queryText) {
     return builder.build();
   }
 
+  /**
+   * Builds an impact query with one {@link FeatureField} linear clause per distinct analyzed token, weighted
+   * by how many times that token occurs in the query.
+   *
+   * @param field name of the feature field to query
+   * @param analyzer analyzer used to tokenize {@code queryText}
+   * @param queryText query text
+   * @return a {@link BooleanQuery} of {@code SHOULD} clauses; it has no clauses if the analyzer yields no tokens
+   */
+  public Query buildFeatureQuery(String field, Analyzer analyzer, String queryText) {
+    Map<String, Long> counts = AnalyzerUtils.analyze(analyzer, queryText).stream()
+            .collect(Collectors.groupingBy(Function.identity(), Collectors.counting()));
+    float maxCount = 0;
+    for (long count : counts.values()) {
+      maxCount = Math.max(maxCount, count);
+    }
+    // Linear feature queries accept weights of at most 64. When a count exceeds that, rescale all weights so
+    // the largest is exactly 64. Note: this makes scores of different queries not comparable.
+    final boolean rescale = maxCount > 64;
+    BooleanQuery.Builder builder = new BooleanQuery.Builder();
+    for (Map.Entry<String, Long> entry : counts.entrySet()) {
+      float count = entry.getValue();
+      float weight = rescale ? count / maxCount * 64.0f : count;
+      builder.add(FeatureField.newLinearQuery(field, entry.getKey(), weight), BooleanClause.Occur.SHOULD);
+    }
+    return builder.build();
+  }
+
   @Override
   public Query buildQuery(Map<String, Float> fields, Analyzer analyzer, String queryText) {
     BooleanQuery.Builder builder = new BooleanQuery.Builder();

diff --git a/src/main/java/io/anserini/search/query/FeatureGenerator.java b/src/main/java/io/anserini/search/query/FeatureGenerator.java
@@ -0,0 +1,15 @@
+package io.anserini.search.query;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.search.Query;
+
+public interface FeatureGenerator{
+  /**
+   * Generates a query that treats the terms of the query text as features.
+   * @param field name of the index field to query
+   * @param analyzer analyzer used to tokenize the query text
+   * @param queryText the raw query text
+   * @return the generated query
+   */
+  Query buildFeatureQuery(String field, Analyzer analyzer, String queryText);
+}