From 61f6f20ff6872484966ea1badcdcdcebf1eea852 Mon Sep 17 00:00:00 2001 From: Chris Kamphuis Date: Fri, 6 Sep 2019 19:48:26 +0200 Subject: [PATCH] Script to extract doc lengths (#791) Script that produces tsv file which contains the length of all documents, the unique term count and the lossy unique term count as used in the BM25similarity class. --- pom.xml | 4 + .../io/anserini/search/SearchCollection.java | 4 +- .../anserini/util/ExtractDocumentLengths.java | 78 +++++++++++++++++++ 3 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 src/main/java/io/anserini/util/ExtractDocumentLengths.java diff --git a/pom.xml b/pom.xml index 54f424f964..ddbc07727f 100644 --- a/pom.xml +++ b/pom.xml @@ -117,6 +117,10 @@ io.anserini.search.SearchMsmarco SearchMsmarco + + io.anserini.util.ExtractDocumentLengths + DocLen + io.anserini.eval.Eval Eval diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 557003f97c..9199ba92f2 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -237,7 +237,7 @@ public void close() throws IOException { reader.close(); } - public List constructSimiliries() { + public List constructSimilarities() { // Figure out which scoring model to use. List similarities = new ArrayList<>(); if (args.ql || args.qld) { @@ -361,7 +361,7 @@ public void runTopics() throws IOException { final String runTag = args.runtag == null ? 
"Anserini" : args.runtag; final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(args.threads); - this.similarities = constructSimiliries(); + this.similarities = constructSimilarities(); Map cascades = constructRerankerCascades(); for (TaggedSimilarity taggedSimilarity : this.similarities) { for (Map.Entry cascade : cascades.entrySet()) { diff --git a/src/main/java/io/anserini/util/ExtractDocumentLengths.java b/src/main/java/io/anserini/util/ExtractDocumentLengths.java new file mode 100644 index 0000000000..93883f0f68 --- /dev/null +++ b/src/main/java/io/anserini/util/ExtractDocumentLengths.java @@ -0,0 +1,78 @@ +/** + * Anserini: A Lucene toolkit for replicable information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+package io.anserini.util;
+
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.SmallFloat;
+import org.kohsuke.args4j.*;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.PrintStream;
+import java.nio.file.Paths;
+
+/**
+ * Command-line tool that dumps per-document length statistics from a Lucene index to a
+ * tab-separated file.
+ *
+ * <p>The output starts with the header {@code luceneID, count, uniquecount, lossycount} and then
+ * holds one row per live document: the Lucene document ID, the sum of the term frequencies in the
+ * term vector of the {@code contents} field, the number of distinct terms in that term vector, and
+ * the value of {@link SmallFloat#longToInt4(long)} applied to the distinct-term count. Documents
+ * without a term vector for {@code contents} are written with all three counts set to zero.
+ */
+public class ExtractDocumentLengths {
+
+  /** Command-line options, populated by args4j. */
+  public static class Args {
+    @Option(name = "-index", metaVar = "[path]", required = true, usage = "Lucene index")
+    String index;
+
+    @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file")
+    String output;
+  }
+
+  /**
+   * Parses the command line, opens the index and writes the statistics file.
+   *
+   * <p>If the arguments cannot be parsed, the error and the usage text are printed to standard
+   * error and the method returns without touching the index or the output file.
+   *
+   * @param args command-line arguments; {@code -index} and {@code -output} are required
+   * @throws Exception if the index cannot be opened or read, or the output cannot be written
+   */
+  public static void main(String[] args) throws Exception {
+    Args myArgs = new Args();
+    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));
+
+    try {
+      parser.parseArgument(args);
+    } catch (CmdLineException e) {
+      System.err.println(e.getMessage());
+      parser.printUsage(System.err);
+      return;
+    }
+
+    // try-with-resources so the directory, reader and output file are released on every path,
+    // including when reading a term vector throws part-way through the index.
+    try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
+         IndexReader reader = DirectoryReader.open(dir);
+         PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)))) {
+
+      out.println("luceneID\tcount\tuniquecount\tlossycount");
+
+      // Document IDs span [0, maxDoc()), not [0, numDocs()): numDocs() excludes deleted documents,
+      // so using it as the upper bound skips live documents whenever the index has deletions.
+      // Walking the leaves gives access to each segment's live-docs bitset.
+      for (LeafReaderContext leaf : reader.leaves()) {
+        LeafReader leafReader = leaf.reader();
+        Bits liveDocs = leafReader.getLiveDocs(); // null means the segment has no deletions
+        int segmentMaxDoc = leafReader.maxDoc();
+
+        for (int segmentDoc = 0; segmentDoc < segmentMaxDoc; segmentDoc++) {
+          if (liveDocs != null && !liveDocs.get(segmentDoc)) {
+            continue;
+          }
+
+          // Translate the segment-local ID back to the index-wide Lucene document ID.
+          int docId = leaf.docBase + segmentDoc;
+
+          Terms terms = leafReader.getTermVector(segmentDoc, "contents");
+          if (terms == null) {
+            out.println(docId + "\t" + 0 + "\t" + 0 + "\t" + 0);
+            continue;
+          }
+
+          // totalTermFreq() returns a long; accumulate in a long so "+=" cannot silently narrow.
+          long total = 0;
+          TermsEnum termsEnum = terms.iterator();
+          while (termsEnum.next() != null) {
+            total += termsEnum.totalTermFreq();
+          }
+
+          // NOTE(review): this is the raw SmallFloat.longToInt4 encoding of the *unique* term
+          // count. BM25Similarity appears to store norms via SmallFloat.intToByte4 and read them
+          // back with byte4ToInt - confirm this column is what downstream consumers expect.
+          long length = SmallFloat.longToInt4(terms.size());
+          out.println(docId + "\t" + total + "\t" + terms.size() + "\t" + length);
+        }
+      }
+
+      // PrintStream never throws on write failure; surface it instead of leaving a truncated file.
+      if (out.checkError()) {
+        throw new IOException("Failed writing document lengths to " + myArgs.output);
+      }
+    }
+  }
+}