Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SOLR-17195: Add 'minPrefixLength' soft limit #2499

Merged
merged 12 commits into from
Jul 9, 2024
6 changes: 6 additions & 0 deletions solr/core/src/java/org/apache/solr/core/SolrConfig.java
Original file line number Diff line number Diff line change
@@ -103,6 +103,8 @@ public class SolrConfig implements MapSerializable {
private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

public static final String DEFAULT_CONF_FILE = "solrconfig.xml";
public static final String MIN_PREFIX_LENGTH = "minPrefixLength";
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

from a naming standpoint, this is very ambiguous. Prefix of what? And should this be here at all vs. a request parameter, interpreted by some (not all) query parsers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Naming-wise, I was very much following the model of maxBooleanClauses. But in hindsight, I think you're right that this is too ambiguous.

For now I'll change this to minPrefixQueryTermLength, but if there's another option there you like better, lmk.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the "Term" in that add anything? I suggest removing that component. Not a strong opinion though.

public static final int DEFAULT_MIN_PREFIX_LENGTH = 0;
private final String resourceName;

private int znodeVersion;
@@ -291,6 +293,8 @@ private SolrConfig(
BooleanQuery.getMaxClauseCount(),
"set 'maxBooleanClauses' in solr.xml to increase global limit");
}
prefixQueryMinPrefixLength =
get("query").get(MIN_PREFIX_LENGTH).intVal(DEFAULT_MIN_PREFIX_LENGTH);

// Warn about deprecated / discontinued parameters
// boolToFilterOptimizer has had no effect since 3.1
@@ -667,6 +671,7 @@ public SolrRequestParsers getRequestParsers() {

/* The set of materialized parameters: */
public final int booleanQueryMaxClauseCount;
public final int prefixQueryMinPrefixLength;
// SolrIndexSearcher - nutch optimizer -- Disabled since 3.1
// public final boolean filtOptEnabled;
// public final int filtOptCacheSize;
@@ -1018,6 +1023,7 @@ public Map<String, Object> toMap(Map<String, Object> result) {
m.put("queryResultMaxDocsCached", queryResultMaxDocsCached);
m.put("enableLazyFieldLoading", enableLazyFieldLoading);
m.put("maxBooleanClauses", booleanQueryMaxClauseCount);
m.put(MIN_PREFIX_LENGTH, prefixQueryMinPrefixLength);

for (SolrPluginInfo plugin : plugins) {
List<PluginInfo> infos = getPluginInfos(plugin.clazz.getName());
17 changes: 17 additions & 0 deletions solr/core/src/java/org/apache/solr/schema/StrField.java
Original file line number Diff line number Diff line change
@@ -26,13 +26,16 @@
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.SortedSetFieldSource;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSelector;
import org.apache.lucene.util.BytesRef;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.util.ByteArrayUtf8CharSequence;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QueryUtils;
import org.apache.solr.uninverting.UninvertingReader.Type;

public class StrField extends PrimitiveFieldType {
@@ -68,6 +71,20 @@ public List<IndexableField> createFields(SchemaField field, Object value) {
return Collections.singletonList(fval);
}

@Override
public Query getPrefixQuery(QParser parser, SchemaField sf, String termStr) {
final var query = super.getPrefixQuery(parser, sf, termStr);

// Some internal usage (e.g. faceting) creates PrefixQueries without a surrounding QParser, so
// check for null here before using QParser to access the limit value
if (query instanceof PrefixQuery && parser != null) {
final var minPrefixLength =
parser.getReq().getCore().getSolrConfig().prefixQueryMinPrefixLength;
QueryUtils.ensurePrefixQueryObeysMinimumPrefixLength(query, termStr, minPrefixLength);
dsmiley marked this conversation as resolved.
Show resolved Hide resolved
}
return query;
}

public static BytesRef getBytesRef(Object value) {
if (value instanceof ByteArrayUtf8CharSequence) {
ByteArrayUtf8CharSequence utf8 = (ByteArrayUtf8CharSequence) value;
16 changes: 16 additions & 0 deletions solr/core/src/java/org/apache/solr/schema/TextField.java
Original file line number Diff line number Diff line change
@@ -26,6 +26,7 @@
import org.apache.lucene.index.Term;
import org.apache.lucene.queries.function.ValueSource;
import org.apache.lucene.queries.function.valuesource.SortedSetFieldSource;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSelector;
@@ -38,6 +39,7 @@
import org.apache.solr.query.SolrRangeQuery;
import org.apache.solr.response.TextResponseWriter;
import org.apache.solr.search.QParser;
import org.apache.solr.search.QueryUtils;
import org.apache.solr.uninverting.UninvertingReader.Type;

/**
@@ -165,6 +167,20 @@ public Query getFieldTermQuery(QParser parser, SchemaField field, String externa
return new TermQuery(new Term(field.getName(), br));
}

@Override
public Query getPrefixQuery(QParser parser, SchemaField sf, String termStr) {
final var query = super.getPrefixQuery(parser, sf, termStr);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why override these methods in TextField & StrField with duplicative code when you could modify FieldType.getPrefixQuery instead? Doing it there also allows the cool existence query thing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why override these methods in TextField & StrField with duplicative code when you could modify FieldType.getPrefixQuery instead?

Moving the logic into FieldType.getPrefixQuery would have it apply to all field types, which I'm not sure we want. The limit will probably always be unhelpful for "boolean" and "enum" type fields for instance, since they're generally so low cardinality as to be immune to the "runaway prefix query" problem.

For now I've acquiesced and moved the logic to FieldType, but curious to hear your thoughts on some of those exceptional cases.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see that the limit doesn't make much sense for those two yet it's also harmless (I think). I prefer a consistent approach with no duplication. Thanks for doing this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well there is some harm, albeit small. A prefix query on a "known safe" field type like Enum might trigger the limit, and cause the "Admin aggravation/annoyance" we've discussed and tried to minimize throughout this PR.

That'd require a user to submit a query like my_enum_field:i*, which is presumably a pretty rare case. But you can imagine a user maybe doing that as a shorthand way to get docs matching a few target enum values


// Some internal usage (e.g. faceting) creates PrefixQueries without a surrounding QParser, so
// check for null here before using QParser to access the limit value
if (query instanceof PrefixQuery && parser != null) {
final var minPrefixLength =
parser.getReq().getCore().getSolrConfig().prefixQueryMinPrefixLength;
QueryUtils.ensurePrefixQueryObeysMinimumPrefixLength(query, termStr, minPrefixLength);
}
return query;
}

@Override
public Object toObject(SchemaField sf, BytesRef term) {
return term.utf8ToString();
Original file line number Diff line number Diff line change
@@ -19,6 +19,7 @@
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.complexPhrase.ComplexPhraseQueryParser;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.SolrParams;
@@ -134,6 +135,19 @@ protected Query newWildcardQuery(org.apache.lucene.index.Term t) {
}
}

@Override
protected Query getPrefixQuery(String field, String termStr) throws ParseException {
// TODO check the field type and call QueryUtils.ensureBlah here
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not clear why this parser needs modifications like this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I noticed this in testing initially and was surprised as well.

It looks like ComplexPhraseQParserPlugin assumes "text" and doesn't rely on FieldType (or its subclasses) to do query-construction in a schema-aware manner. I'd be curious to see what other QParsers act similarly, but I'm not familiar enough with our query-parsing code generally to say whether it makes sense or might be problematic. But definitely orthogonal to this PR either way.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hopefully a simple fix to this parser to consult the field type will work. That's the right thing to do. Perhaps whoever wrote this didn't know better or it didn't exist at the time.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did find a way to make this change, but ended up backing off of it. (I pushed that change to this PR but then reverted it, if you're curious to what was involved.)

A few reasons for that back off:

  1. It ended up causing some complications on the Lucene-side. ComplexPhraseQueryParser (the Lucene class) makes certain type assumptions about the Query instances it handles. Injecting our own FieldType/schema aware query-creation logic into that class breaks those expectations in a pretty big way and causes a lot of test failures 😦
  2. It's orthogonal to my main goal in this PR, and I'm leery of scope creep.
  3. If we do want to address this, it's probably something that merits its own ticket to investigate and fix. After all, this probably isn't the only place in Solr that calls directly into a Lucene QueryParser without somehow creating sub-queries in a FieldType-aware way.

final var query = super.getPrefixQuery(field, termStr);
if (query instanceof PrefixQuery) {
final var minPrefixLength =
getReq().getCore().getSolrConfig().prefixQueryMinPrefixLength;
QueryUtils.ensurePrefixQueryObeysMinimumPrefixLength(
query, termStr, minPrefixLength);
}
return query;
}

private Query setRewriteMethod(org.apache.lucene.search.Query query) {
if (query instanceof MultiTermQuery) {
((MultiTermQuery) query)
21 changes: 21 additions & 0 deletions solr/core/src/java/org/apache/solr/search/QueryUtils.java
Original file line number Diff line number Diff line change
@@ -21,6 +21,7 @@
import java.util.Collections;
import java.util.IdentityHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.search.BooleanClause;
@@ -34,6 +35,7 @@
import org.apache.lucene.search.Query;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.core.SolrConfig;
import org.apache.solr.request.SolrQueryRequest;

/** */
@@ -83,6 +85,25 @@ public static boolean isConstantScoreQuery(Query q) {
}
}

public static void ensurePrefixQueryObeysMinimumPrefixLength(
Query query, String prefix, int minPrefixLength) {
// TODO Should we provide a query-param to disable the limit on a request-by-request basis? I
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes indeed; in fact the QParser default algorithm could lookup a local-param (not request level) like prefixQueryMinimumLength. This will be straightforward once you create the QParser default method as I suggested.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done!

// can imagine scenarios where advanced users may want to enforce the limit on most fields,
// but ignore it for a few fields that they know to be low-cardinality and therefore "less
dsmiley marked this conversation as resolved.
Show resolved Hide resolved
// risky"
if (prefix.length() < minPrefixLength) {
final var message =
String.format(
Locale.ROOT,
"Query [%s] does not meet the minimum prefix length [%d] (actual=[%d]). Please try with a larger prefix, or adjust %s in your solrconfig.xml",
query,
minPrefixLength,
prefix.length(),
SolrConfig.MIN_PREFIX_LENGTH);
throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, message);
}
}

/**
* Returns the original query if it was already a positive query, otherwise return the negative of
* the query (i.e., a positive query).
Original file line number Diff line number Diff line change
@@ -544,6 +544,10 @@ protected Query makeBucketQuery(final String bucketValue) {
private void calculateNumBuckets(SimpleOrderedMap<Object> target) throws IOException {
DocSet domain = fcontext.base;
if (freq.prefix != null) {
// TODO - Should we enforce minPrefixLength here in the case of 'string' fields, or omit
// since this is an "internal" request? If we want to enforce the limit in this case,
// we should have StrField read the configured limit and cache it in 'init' so that it can
// be read at 'getPrefixQuery' call-time without a QParser.
Query prefixFilter = sf.getType().getPrefixQuery(null, sf, freq.prefix);
domain = fcontext.searcher.getDocSet(prefixFilter, domain);
}
12 changes: 12 additions & 0 deletions solr/core/src/test-files/solr/collection1/conf/solrconfig.xml
Original file line number Diff line number Diff line change
@@ -89,6 +89,18 @@
-->
<maxBooleanClauses>${solr.max.booleanClauses:1024}</maxBooleanClauses>

<!-- Minimum acceptable prefix-size for prefix-based queries on string fields.

Prefix-based queries consume memory in proportion to the number of terms in the index
that start with that prefix. Short prefixes tend to match many many more indexed-terms
and consume more memory as a result, sometimes causing stability issues on the node.

This setting allows administrators to require that prefixes meet or exceed a specified
minimum length requirement. Prefix queries that don't meet this requirement return an
error to users.
-->
<minPrefixLength>${solr.min.prefixLength:2}</minPrefixLength>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again; name is confusing and I disagree with the choice of solr.min.prefixLength. The second component should be a category like "query", so could be "solr.query.minPrefixLength" for example.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To avoid fragmentation, let's continue XML-tag naming discussion in this thread here.

For the governing sys prop, I've changed that to 'solr.query.minPrefixLength' as suggested.


<!-- Cache specification for Filters or DocSets - unordered set of *all* documents
that match a particular query.
-->
4 changes: 2 additions & 2 deletions solr/core/src/test/org/apache/solr/ConvertedLegacyTest.java
Original file line number Diff line number Diff line change
@@ -637,8 +637,8 @@ public void testABunchOfConvertedStuff() {
"<add><doc><field name=\"id\">107</field><field name=\"val_s\">port</field></doc></add>");
assertU("<commit/>");

assertQ(req("val_s:a*"), "//*[@numFound='3']");
assertQ(req("val_s:p*"), "//*[@numFound='4']");
assertQ(req("val_s:ap*"), "//*[@numFound='3']");
assertQ(req("val_s:pe*"), "//*[@numFound='3']");
// val_s:* %//*[@numFound="8"]

// test wildcard query
1 change: 1 addition & 0 deletions solr/core/src/test/org/apache/solr/core/SolrCoreTest.java
Original file line number Diff line number Diff line change
@@ -298,6 +298,7 @@ public void testConfiguration() {
assertEquals(
"wrong config for slowQueryThresholdMillis", 2000, solrConfig.slowQueryThresholdMillis);
assertEquals("wrong config for maxBooleanClauses", 1024, solrConfig.booleanQueryMaxClauseCount);
assertEquals("wrong config for minPrefixLength", 2, solrConfig.prefixQueryMinPrefixLength);
assertTrue("wrong config for enableLazyFieldLoading", solrConfig.enableLazyFieldLoading);
assertEquals("wrong config for queryResultWindowSize", 10, solrConfig.queryResultWindowSize);
}
106 changes: 106 additions & 0 deletions solr/core/src/test/org/apache/solr/search/PrefixQueryTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search;

import org.apache.solr.SolrTestCaseJ4;
import org.apache.solr.common.SolrException;
import org.junit.BeforeClass;
import org.junit.Test;

/**
* Unit tests for prefix-query functionality - mostly testing the 'minPrefixLength' setting
* available in solrconfig.xml
*/
public class PrefixQueryTest extends SolrTestCaseJ4 {

private static final String[] MIN_PREFIX_SUPPORTING_FIELDS = new String[] {"val_s", "t_val"};

@BeforeClass
public static void beforeTests() throws Exception {
initCore("solrconfig.xml", "schema.xml");

assertU(createDocWithFieldVal("1", "aaa"));
assertU(createDocWithFieldVal("2", "aab"));
assertU(createDocWithFieldVal("3", "aac"));
assertU(createDocWithFieldVal("4", "abc"));

assertU(createDocWithFieldVal("5", "bbb"));
assertU(createDocWithFieldVal("6", "bbc"));

assertU("<commit/>");
}

// Sanity-check of a few queries we'll use in other tests
@Test
public void testPrefixQueryMatchesExpectedDocuments() {
for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
assertQ(req(fieldName + ":*"), "//*[@numFound='6']");
assertQ(req(fieldName + ":aa*"), "//*[@numFound='3']");
assertQ(req(fieldName + ":bb*"), "//*[@numFound='2']");
}
}

@Test
public void testPrefixQueryObeysMinPrefixLimit() {
for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
assertQEx(
"Prefix query didn't obey limit",
"does not meet the minimum prefix length [2] (actual=[1])",
req(fieldName + ":a*"),
SolrException.ErrorCode.BAD_REQUEST);
}
}

@Test
public void testPrefixQParserObeysMinPrefixLimit() {
for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
assertQEx(
"Prefix query didn't obey limit",
"does not meet the minimum prefix length [2] (actual=[1])",
req("q", "{!prefix f=" + fieldName + "}a"),
SolrException.ErrorCode.BAD_REQUEST);
}
}

@Test
public void testComplexPhraseQParserObeysMinPrefixLimit() {
for (String fieldName : MIN_PREFIX_SUPPORTING_FIELDS) {
assertQEx(
"{!complex} query didn't obey min-prefix limit",
"does not meet the minimum prefix length [2] (actual=[1])",
req("q", "{!complexphrase inOrder=true}" + fieldName + ":\"a*\""),
SolrException.ErrorCode.BAD_REQUEST);
}
}

@Test
public void testQuestionMarkWildcardsCountTowardsMinimumPrefix() {
// Both of these queries succeed since the '?' wildcard is counted as a part of the prefix
assertQ(req("val_s:a?c*"), "//*[@numFound='2']"); // Matches 'aac' and 'abc'
assertQ(req("val_s:a??*"), "//*[@numFound='4']"); // Matches all documents starting with 'a'
}

private static String createDocWithFieldVal(String id, String fieldVal) {
return "<add><doc><field name=\"id\">"
+ id
+ "</field><field name=\"val_s\">"
+ fieldVal
+ "</field><field name=\"t_val\">"
+ fieldVal
+ "</field></doc></add>";
}
}
Original file line number Diff line number Diff line change
@@ -57,7 +57,7 @@ public void testDefaultField() {
"//doc[./str[@name='id']='1']");

assertQ(
req("q", "{!complexphrase} \"j* smyth~\""),
req("q", "{!complexphrase} \"jo* smyth~\""),
"//result[@numFound='2']",
"//doc[./str[@name='id']='1']",
"//doc[./str[@name='id']='2']");
@@ -121,7 +121,7 @@ public void test() {

assertQ(
"wildcards and fuzzies are OK in phrases",
sumLRF.makeRequest("name:\"j* smyth~\""),
sumLRF.makeRequest("name:\"jo* smyth~\""),
"//doc[./str[@name='id']='1']",
"//doc[./str[@name='id']='2']",
"//result[@numFound='2']");
@@ -252,7 +252,7 @@ public void testMultipleFields() {
"//doc[./str[@name='id']='3']");

assertQ(
req("q", "{!complexphrase} name:\"prot* dige*\" AND text:\"d* r*\""),
req("q", "{!complexphrase} name:\"prot* dige*\" AND text:\"dn* ru*\""),
"//result[@numFound='1']",
"//doc[./str[@name='id']='3']");

Original file line number Diff line number Diff line change
@@ -64,7 +64,7 @@ public void testLiveDocsSharing() throws Exception {

String[] queries = {
"foo_s:foo",
"foo_s:f*",
"foo_s:fo*",
"*:*",
"id:[* TO *]",
"id:[0 TO 99]",
Original file line number Diff line number Diff line change
@@ -55,6 +55,7 @@ public class BJQParserTest extends SolrTestCaseJ4 {

@BeforeClass
public static void beforeClass() throws Exception {
System.setProperty("solr.min.prefixLength", String.valueOf(1));
initCore("solrconfig.xml", "schema15.xml");
createIndex();
}
13 changes: 13 additions & 0 deletions solr/server/solr/configsets/_default/conf/solrconfig.xml
Original file line number Diff line number Diff line change
@@ -360,6 +360,19 @@
-->
<maxBooleanClauses>${solr.max.booleanClauses:1024}</maxBooleanClauses>


<!-- Minimum acceptable prefix-size for prefix-based queries on string fields.

Prefix-based queries consume memory in proportion to the number of terms in the index
that start with that prefix. Short prefixes tend to match many many more indexed-terms
and consume more memory as a result, sometimes causing stability issues on the node.

This setting allows administrators to require that prefixes meet or exceed a specified
minimum length requirement. Prefix queries that don't meet this requirement return an
error to users.
-->
<minPrefixLength>${solr.min.prefixLength:2}</minPrefixLength>

<!-- Solr Internal Query Caches
Starting with Solr 9.0 the default cache implementation used is CaffeineCache.
-->
Loading