apache · gerlowskija · Jul 9, 2024 · Jun 5, 2024 · Jun 5, 2024 · Jun 5, 2024
diff --git a/solr/core/src/java/org/apache/solr/core/SolrConfig.java b/solr/core/src/java/org/apache/solr/core/SolrConfig.java
@@ -103,6 +103,8 @@ public class SolrConfig implements MapSerializable {
   private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
 
   public static final String DEFAULT_CONF_FILE = "solrconfig.xml";
+  public static final String MIN_PREFIX_LENGTH = "minPrefixLength";
+  public static final int DEFAULT_MIN_PREFIX_LENGTH = 0;
   private final String resourceName;
 
   private int znodeVersion;
@@ -291,6 +293,8 @@ private SolrConfig(
             BooleanQuery.getMaxClauseCount(),
             "set 'maxBooleanClauses' in solr.xml to increase global limit");
       }
+      prefixQueryMinPrefixLength =
+          get("query").get(MIN_PREFIX_LENGTH).intVal(DEFAULT_MIN_PREFIX_LENGTH);
 
       // Warn about deprecated / discontinued parameters
       // boolToFilterOptimizer has had no effect since 3.1
@@ -667,6 +671,7 @@ public SolrRequestParsers getRequestParsers() {
 
   /* The set of materialized parameters: */
   public final int booleanQueryMaxClauseCount;
+  public final int prefixQueryMinPrefixLength;
   // SolrIndexSearcher - nutch optimizer -- Disabled since 3.1
   //  public final boolean filtOptEnabled;
   //  public final int filtOptCacheSize;
@@ -1018,6 +1023,7 @@ public Map<String, Object> toMap(Map<String, Object> result) {
     m.put("queryResultMaxDocsCached", queryResultMaxDocsCached);
     m.put("enableLazyFieldLoading", enableLazyFieldLoading);
     m.put("maxBooleanClauses", booleanQueryMaxClauseCount);
+    m.put(MIN_PREFIX_LENGTH, prefixQueryMinPrefixLength);
 
     for (SolrPluginInfo plugin : plugins) {
       List<PluginInfo> infos = getPluginInfos(plugin.clazz.getName());

diff --git a/solr/core/src/java/org/apache/solr/schema/StrField.java b/solr/core/src/java/org/apache/solr/schema/StrField.java
@@ -26,13 +26,16 @@
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.queries.function.valuesource.SortedSetFieldSource;
+import org.apache.lucene.search.PrefixQuery;
+import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortedSetSelector;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.util.ByteArrayUtf8CharSequence;
 import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.QParser;
+import org.apache.solr.search.QueryUtils;
 import org.apache.solr.uninverting.UninvertingReader.Type;
 
 public class StrField extends PrimitiveFieldType {
@@ -68,6 +71,20 @@ public List<IndexableField> createFields(SchemaField field, Object value) {
     return Collections.singletonList(fval);
   }
 
+  @Override
+  public Query getPrefixQuery(QParser parser, SchemaField sf, String termStr) {
+    final var query = super.getPrefixQuery(parser, sf, termStr);
+
+    // Some internal usage (e.g. faceting) creates PrefixQueries without a surrounding QParser, so
+    // check for null here before using QParser to access the limit value
+    if (query instanceof PrefixQuery && parser != null) {
+      final var minPrefixLength =
+          parser.getReq().getCore().getSolrConfig().prefixQueryMinPrefixLength;
+      QueryUtils.ensurePrefixQueryObeysMinimumPrefixLength(query, termStr, minPrefixLength);
+    }
+    return query;
+  }
+
   public static BytesRef getBytesRef(Object value) {
     if (value instanceof ByteArrayUtf8CharSequence) {
       ByteArrayUtf8CharSequence utf8 = (ByteArrayUtf8CharSequence) value;

diff --git a/solr/core/src/java/org/apache/solr/schema/TextField.java b/solr/core/src/java/org/apache/solr/schema/TextField.java
@@ -26,6 +26,7 @@
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.function.ValueSource;
 import org.apache.lucene.queries.function.valuesource.SortedSetFieldSource;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.SortField;
 import org.apache.lucene.search.SortedSetSelector;
@@ -38,6 +39,7 @@
 import org.apache.solr.query.SolrRangeQuery;
 import org.apache.solr.response.TextResponseWriter;
 import org.apache.solr.search.QParser;
+import org.apache.solr.search.QueryUtils;
 import org.apache.solr.uninverting.UninvertingReader.Type;
 
 /**
@@ -165,6 +167,20 @@ public Query getFieldTermQuery(QParser parser, SchemaField field, String externa
     return new TermQuery(new Term(field.getName(), br));
   }
 
+  @Override
+  public Query getPrefixQuery(QParser parser, SchemaField sf, String termStr) {
+    final var query = super.getPrefixQuery(parser, sf, termStr);
+
+    // Some internal usage (e.g. faceting) creates PrefixQueries without a surrounding QParser, so
+    // check for null here before using QParser to access the limit value
+    if (query instanceof PrefixQuery && parser != null) {
+      final var minPrefixLength =
+          parser.getReq().getCore().getSolrConfig().prefixQueryMinPrefixLength;
+      QueryUtils.ensurePrefixQueryObeysMinimumPrefixLength(query, termStr, minPrefixLength);
+    }
+    return query;
+  }
+
   @Override
   public Object toObject(SchemaField sf, BytesRef term) {
     return term.utf8ToString();

diff --git a/solr/core/src/java/org/apache/solr/search/ComplexPhraseQParserPlugin.java b/solr/core/src/java/org/apache/solr/search/ComplexPhraseQParserPlugin.java
@@ -19,6 +19,7 @@
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
 import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.solr.common.params.CommonParams;
 import org.apache.solr.common.params.SolrParams;
@@ -134,6 +135,19 @@ protected Query newWildcardQuery(org.apache.lucene.index.Term t) {
               }
             }
 
+            @Override
+            protected Query getPrefixQuery(String field, String termStr) throws ParseException {
+              // TODO check the field type and call QueryUtils.ensureBlah here
+              final var query = super.getPrefixQuery(field, termStr);
+              if (query instanceof PrefixQuery) {
+                final var minPrefixLength =
+                    getReq().getCore().getSolrConfig().prefixQueryMinPrefixLength;
+                QueryUtils.ensurePrefixQueryObeysMinimumPrefixLength(
+                    query, termStr, minPrefixLength);
+              }
+              return query;
+            }
+
             private Query setRewriteMethod(org.apache.lucene.search.Query query) {
               if (query instanceof MultiTermQuery) {
                 ((MultiTermQuery) query)

diff --git a/solr/core/src/java/org/apache/solr/search/QueryUtils.java b/solr/core/src/java/org/apache/solr/search/QueryUtils.java
@@ -21,6 +21,7 @@
 import java.util.Collections;
 import java.util.IdentityHashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 import org.apache.lucene.search.BooleanClause;
@@ -34,6 +35,7 @@
 import org.apache.lucene.search.Query;
 import org.apache.solr.common.SolrException;
 import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.core.SolrConfig;
 import org.apache.solr.request.SolrQueryRequest;
 
 /** */
@@ -83,6 +85,25 @@ public static boolean isConstantScoreQuery(Query q) {
     }
   }
 
+  public static void ensurePrefixQueryObeysMinimumPrefixLength(
+      Query query, String prefix, int minPrefixLength) {
+    // TODO Should we provide a query-param to disable the limit on a request-by-request basis?  I
+    // can imagine scenarios where advanced users may want to enforce the limit on most fields,
+    // but ignore it for a few fields that they know to be low-cardinality and therefore "less
+    // risky"
+    if (prefix.length() < minPrefixLength) {
+      final var message =
+          String.format(
+              Locale.ROOT,
+              "Query [%s] does not meet the minimum prefix length [%d] (actual=[%d]).  Please try with a larger prefix, or adjust %s in your solrconfig.xml",
+              query,
+              minPrefixLength,
+              prefix.length(),
+              SolrConfig.MIN_PREFIX_LENGTH);
+      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, message);
+    }
+  }
+
   /**
    * Returns the original query if it was already a positive query, otherwise return the negative of
    * the query (i.e., a positive query).

diff --git a/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java b/solr/core/src/java/org/apache/solr/search/facet/FacetFieldProcessor.java
@@ -544,6 +544,10 @@ protected Query makeBucketQuery(final String bucketValue) {
   private void calculateNumBuckets(SimpleOrderedMap<Object> target) throws IOException {
     DocSet domain = fcontext.base;
     if (freq.prefix != null) {
+      // TODO - Should we enforce minPrefixLength here in the case of 'string' fields, or omit
+      //  since this is an "internal" request? If we want to enforce the limit in this case,
+      //  we should have StrField read the configured limit and cache it in 'init' so that it can
+      //  be read at 'getPrefixQuery' call-time without a QParser.
       Query prefixFilter = sf.getType().getPrefixQuery(null, sf, freq.prefix);
       domain = fcontext.searcher.getDocSet(prefixFilter, domain);
     }

diff --git a/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml b/solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
@@ -89,6 +89,18 @@
     -->
     <maxBooleanClauses>${solr.max.booleanClauses:1024}</maxBooleanClauses>
 
+    <!-- Minimum acceptable prefix-size for prefix-based queries on string fields.
+
+         Prefix-based queries consume memory in proportion to the number of terms in the index
+         that start with that prefix.  Short prefixes tend to match many many more indexed-terms
+         and consume more memory as a result, sometimes causing stability issues on the node.
+
+         This setting allows administrators to require that prefixes meet or exceed a specified
+         minimum length requirement.  Prefix queries that don't meet this requirement return an
+         error to users.
+    -->
+    <minPrefixLength>${solr.min.prefixLength:2}</minPrefixLength>
+
     <!-- Cache specification for Filters or DocSets - unordered set of *all* documents
          that match a particular query.
       -->

diff --git a/solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java b/solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java
@@ -637,8 +637,8 @@ public void testABunchOfConvertedStuff() {
         "<add><doc><field name=\"id\">107</field><field name=\"val_s\">port</field></doc></add>");
     assertU("<commit/>");
 
-    assertQ(req("val_s:a*"), "//*[@numFound='3']");
-    assertQ(req("val_s:p*"), "//*[@numFound='4']");
+    assertQ(req("val_s:ap*"), "//*[@numFound='3']");
+    assertQ(req("val_s:pe*"), "//*[@numFound='3']");
     // val_s:* %//*[@numFound="8"]
 
     // test wildcard query

diff --git a/solr/core/src/test/org/apache/solr/core/SolrCoreTest.java b/solr/core/src/test/org/apache/solr/core/SolrCoreTest.java
@@ -298,6 +298,7 @@ public void testConfiguration() {
     assertEquals(
         "wrong config for slowQueryThresholdMillis", 2000, solrConfig.slowQueryThresholdMillis);
     assertEquals("wrong config for maxBooleanClauses", 1024, solrConfig.booleanQueryMaxClauseCount);
+    assertEquals("wrong config for minPrefixLength", 2, solrConfig.prefixQueryMinPrefixLength);
     assertTrue("wrong config for enableLazyFieldLoading", solrConfig.enableLazyFieldLoading);
     assertEquals("wrong config for queryResultWindowSize", 10, solrConfig.queryResultWindowSize);
   }

diff --git a/solr/core/src/test/org/apache/solr/search/PrefixQueryTest.java b/solr/core/src/test/org/apache/solr/search/PrefixQueryTest.java
@@ -0,0 +1,106 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.search;
+
+import org.apache.solr.SolrTestCaseJ4;
+import org.apache.solr.common.SolrException;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+/**
+ * Unit tests for prefix-query functionality - mostly testing the 'minPrefixLength' setting
+ * available in solrconfig.xml
+ */
+public class PrefixQueryTest extends SolrTestCaseJ4 {
+
+  private static final String[] MIN_PREFIX_SUPPORTING_FIELDS = new String[] {"val_s", "t_val"};
+
+  @BeforeClass
+  public static void beforeTests() throws Exception {
+    initCore("solrconfig.xml", "schema.xml");
+
+    assertU(createDocWithFieldVal("1", "aaa"));
+    assertU(createDocWithFieldVal("2", "aab"));
+    assertU(createDocWithFieldVal("3", "aac"));
+    assertU(createDocWithFieldVal("4", "abc"));
+
+    assertU(createDocWithFieldVal("5", "bbb"));
+    assertU(createDocWithFieldVal("6", "bbc"));
+
+    assertU("<commit/>");
+  }
+
+  // Sanity-check of a few queries we'll use in other tests
+  @Test
+  public void testPrefixQueryMatchesExpectedDocuments() {
+    for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
+      assertQ(req(fieldName + ":*"), "//*[@numFound='6']");
+      assertQ(req(fieldName + ":aa*"), "//*[@numFound='3']");
+      assertQ(req(fieldName + ":bb*"), "//*[@numFound='2']");
+    }
+  }
+
+  @Test
+  public void testPrefixQueryObeysMinPrefixLimit() {
+    for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
+      assertQEx(
+          "Prefix query didn't obey limit",
+          "does not meet the minimum prefix length [2] (actual=[1])",
+          req(fieldName + ":a*"),
+          SolrException.ErrorCode.BAD_REQUEST);
+    }
+  }
+
+  @Test
+  public void testPrefixQParserObeysMinPrefixLimit() {
+    for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
+      assertQEx(
+          "Prefix query didn't obey limit",
+          "does not meet the minimum prefix length [2] (actual=[1])",
+          req("q", "{!prefix f=" + fieldName + "}a"),
+          SolrException.ErrorCode.BAD_REQUEST);
+    }
+  }
+
+  @Test
+  public void testComplexPhraseQParserObeysMinPrefixLimit() {
+    for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
+      assertQEx(
+          "{!complex} query didn't obey min-prefix limit",
+          "does not meet the minimum prefix length [2] (actual=[1])",
+          req("q", "{!complexphrase inOrder=true}" + fieldName + ":\"a*\""),
+          SolrException.ErrorCode.BAD_REQUEST);
+    }
+  }
+
+  @Test
+  public void testQuestionMarkWildcardsCountTowardsMinimumPrefix() {
+    // Both of these queries succeed since the '?' wildcard is counted as a part of the prefix
+    assertQ(req("val_s:a?c*"), "//*[@numFound='2']"); // Matches 'aac' and 'abc'
+    assertQ(req("val_s:a??*"), "//*[@numFound='4']"); // Matches all documents starting with 'a'
+  }
+
+  private static String createDocWithFieldVal(String id, String fieldVal) {
+    return "<add><doc><field name=\"id\">"
+        + id
+        + "</field><field name=\"val_s\">"
+        + fieldVal
+        + "</field><field name=\"t_val\">"
+        + fieldVal
+        + "</field></doc></add>";
+  }
+}
diff --git a/solr/core/src/test/org/apache/solr/search/TestComplexPhraseQParserPlugin.java b/solr/core/src/test/org/apache/solr/search/TestComplexPhraseQParserPlugin.java
@@ -57,7 +57,7 @@ public void testDefaultField() {
         "//doc[./str[@name='id']='1']");
 
     assertQ(
-        req("q", "{!complexphrase} \"j* smyth~\""),
+        req("q", "{!complexphrase} \"jo* smyth~\""),
         "//result[@numFound='2']",
         "//doc[./str[@name='id']='1']",
         "//doc[./str[@name='id']='2']");
@@ -121,7 +121,7 @@ public void test() {
 
     assertQ(
         "wildcards and fuzzies are OK in phrases",
-        sumLRF.makeRequest("name:\"j* smyth~\""),
+        sumLRF.makeRequest("name:\"jo* smyth~\""),
         "//doc[./str[@name='id']='1']",
         "//doc[./str[@name='id']='2']",
         "//result[@numFound='2']");
@@ -252,7 +252,7 @@ public void testMultipleFields() {
         "//doc[./str[@name='id']='3']");
 
     assertQ(
-        req("q", "{!complexphrase} name:\"prot* dige*\" AND text:\"d* r*\""),
+        req("q", "{!complexphrase} name:\"prot* dige*\" AND text:\"dn* ru*\""),
         "//result[@numFound='1']",
         "//doc[./str[@name='id']='3']");
 

diff --git a/solr/core/src/test/org/apache/solr/search/TestFiltering.java b/solr/core/src/test/org/apache/solr/search/TestFiltering.java
@@ -64,7 +64,7 @@ public void testLiveDocsSharing() throws Exception {
 
     String[] queries = {
       "foo_s:foo",
-      "foo_s:f*",
+      "foo_s:fo*",
       "*:*",
       "id:[* TO *]",
       "id:[0 TO 99]",

diff --git a/solr/core/src/test/org/apache/solr/search/join/BJQParserTest.java b/solr/core/src/test/org/apache/solr/search/join/BJQParserTest.java
@@ -55,6 +55,7 @@ public class BJQParserTest extends SolrTestCaseJ4 {
 
   @BeforeClass
   public static void beforeClass() throws Exception {
+    System.setProperty("solr.min.prefixLength", String.valueOf(1));
     initCore("solrconfig.xml", "schema15.xml");
     createIndex();
   }

diff --git a/solr/server/solr/configsets/_default/conf/solrconfig.xml b/solr/server/solr/configsets/_default/conf/solrconfig.xml
@@ -360,6 +360,19 @@
       -->
     <maxBooleanClauses>${solr.max.booleanClauses:1024}</maxBooleanClauses>
 
+
+    <!-- Minimum acceptable prefix-size for prefix-based queries on string fields.
+
+         Prefix-based queries consume memory in proportion to the number of terms in the index
+         that start with that prefix.  Short prefixes tend to match many many more indexed-terms
+         and consume more memory as a result, sometimes causing stability issues on the node.
+
+         This setting allows administrators to require that prefixes meet or exceed a specified
+         minimum length requirement.  Prefix queries that don't meet this requirement return an
+         error to users.
+    -->
+    <minPrefixLength>${solr.min.prefixLength:2}</minPrefixLength>
+
     <!-- Solr Internal Query Caches
          Starting with Solr 9.0 the default cache implementation used is CaffeineCache.
     -->