-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
WIP - Semantic text query #1
Changes from 11 commits
20932c6
50ad0d6
9c964dd
5e80ba5
ea0bfb2
83d4835
63e46cc
8916e82
562027d
a4c0fc0
79487c2
271e0e2
fb5a913
86adada
ad5478c
13edac3
e3a7590
daaf34b
8750cd4
f4b6440
288592c
b57dbe0
3b6e850
a24d121
207a8fd
b3e04a8
d000196
46eb1ee
f135d5e
ae600d9
922d479
a662e09
e25267f
c4651d3
647005f
b7476f5
d7ea7ae
42cad64
99e4fce
b9418bd
12d7728
f4f8414
7486dc4
e1b74aa
47ca7ae
2846aa7
9a7de59
38709e2
6deb746
f121372
c572ca1
a923934
e562d3b
e4e63f8
e6f2e70
4661023
ad2b95e
739072c
844e75f
69c674e
8c911a1
95c42fb
03fe8c0
1be3e9a
0315982
42b6585
6d70cbc
84dd66c
a55ce08
a04712b
950d46a
98ceb06
b8dc5c3
410efb6
7fb4b74
0d0b319
5920c91
2dc9e89
ac08fe6
f4702fa
065158e
b3fc714
35df385
991c21f
3870bc8
fa67339
f65f8ec
4c21c96
3cde13c
7cbdb6c
f06a580
16bdbe4
954c428
0dca71e
d1ec0d2
76cf927
550b5bb
5d4bb6e
48ceed0
d4263c2
9c72157
ffa4d40
1f1636e
d693fc8
2d4a49a
edf96a5
5f508a1
86f4b18
0da5220
280fa40
6b50b6d
80dae50
ce8402f
31f9b71
a9f4b64
fb7b1bc
40af275
011693a
c53a8e1
4da7a12
3a1d7e3
be55e4b
b1fcedd
50c1dcb
09bdb16
d842ce9
c8a35d3
dee0be5
d26214f
d7fbabc
a9783c4
268ba12
8daa5eb
c998970
d046a5c
98a7ca5
6073e74
b650051
c5b705d
f396321
b2d25b7
5b82681
39dd09b
b752169
1cfa86e
a13f52f
dbf72e1
c60706c
ad65def
8fd6e30
89c61cd
8090c61
f0e4317
92c2b36
b95cb8c
2ba37ff
ae4e57d
5671669
21b64ba
10ec23a
d67af19
39a4ddb
1dd2712
d37d93a
f86532b
e568f70
93988f9
e0d2616
3b2b9b3
bc47d18
2e9e8f8
229dba3
bc2a77f
528f194
35b2dbe
138bc6a
4b8b09f
5f3e4ae
50090f1
2c0d472
3587bc8
d507072
acfb500
5d0296a
b4b32aa
8df3a30
e390edb
73a170b
1df32c9
136b1a1
067aba9
0fb3a6e
4bea4a7
52b027e
6bb6cab
07ae23a
0a7c88c
8cc438e
5d809f3
c888d39
5d1d75a
416de3c
42786aa
1978b1f
987b778
b37e6e9
0b664dd
cbb09d2
294fa4d
3b8177f
c97160a
9e5fbf6
3addbed
3a21ce6
35ff280
7ddf519
e55bdba
b96f8f0
9c1a079
fc8e2b7
c57d96a
6c11826
fdfc08a
0732628
6ae9dbf
89786f5
6f87bd3
6ab69e5
93fd12d
ca10472
0dd92b0
1c03489
30828a5
d65461d
3f7ccde
139c94b
e9ff896
1e76b18
7191758
4f2c8ca
61b3d98
fbbfbd5
7c6120b
335afe5
fe13a04
3e5c3c5
881c394
b1a3ee8
2039fb3
e9c5d72
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,8 +7,16 @@ | |
|
||
package org.elasticsearch.xpack.inference.mapper; | ||
|
||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.search.BooleanClause; | ||
import org.apache.lucene.search.BooleanQuery; | ||
import org.apache.lucene.search.BoostQuery; | ||
import org.apache.lucene.search.Query; | ||
import org.apache.lucene.search.TermQuery; | ||
import org.apache.lucene.search.join.BitSetProducer; | ||
import org.apache.lucene.search.join.ScoreMode; | ||
import org.elasticsearch.common.Strings; | ||
import org.elasticsearch.common.lucene.search.Queries; | ||
import org.elasticsearch.index.fielddata.FieldDataContext; | ||
import org.elasticsearch.index.fielddata.IndexFieldData; | ||
import org.elasticsearch.index.mapper.DocumentParserContext; | ||
|
@@ -21,10 +29,15 @@ | |
import org.elasticsearch.index.mapper.TextSearchInfo; | ||
import org.elasticsearch.index.mapper.ValueFetcher; | ||
import org.elasticsearch.index.query.SearchExecutionContext; | ||
import org.elasticsearch.index.search.ESToParentBlockJoinQuery; | ||
import org.elasticsearch.inference.InferenceResults; | ||
import org.elasticsearch.xpack.core.ml.inference.results.TextExpansionResults; | ||
|
||
import java.io.IOException; | ||
import java.util.Map; | ||
|
||
import static org.elasticsearch.action.bulk.BulkShardRequestInferenceProvider.INFERENCE_CHUNKS_RESULTS; | ||
|
||
/** | ||
* A {@link FieldMapper} for semantic text fields. These fields have a model id reference, that is used for performing inference | ||
* at ingestion and query time. | ||
|
@@ -126,5 +139,38 @@ public ValueFetcher valueFetcher(SearchExecutionContext context, String format) | |
public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) { | ||
throw new IllegalArgumentException("[semantic_text] fields do not support sorting, scripting or aggregating"); | ||
} | ||
|
||
public Query semanticQuery( | ||
InferenceResults inferenceResults, | ||
SearchExecutionContext context, | ||
float boost, | ||
String queryName | ||
) { | ||
// Can't use QueryBuilders.boolQuery() because a mapper is not registered for <field>.inference, causing | ||
// TermQueryBuilder#doToQuery to fail (at TermQueryBuilder:202) | ||
// TODO: Handle boost and queryName | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. TextExpansionQueryBuilder uses … I see two ways to resolve this:
WDYT? |
||
String fieldName = name() + "." + INFERENCE_CHUNKS_RESULTS; | ||
BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder().setMinimumNumberShouldMatch(1); | ||
|
||
// TODO: Support dense vectors | ||
if (inferenceResults instanceof TextExpansionResults textExpansionResults) { | ||
for (TextExpansionResults.WeightedToken weightedToken : textExpansionResults.getWeightedTokens()) { | ||
queryBuilder.add( | ||
new BoostQuery( | ||
new TermQuery( | ||
new Term(fieldName, weightedToken.token()) | ||
), | ||
weightedToken.weight() | ||
), | ||
BooleanClause.Occur.SHOULD | ||
); | ||
} | ||
} else { | ||
throw new IllegalArgumentException("Unsupported inference results type [" + inferenceResults.getWriteableName() + "]"); | ||
} | ||
|
||
BitSetProducer parentFilter = context.bitsetFilter(Queries.newNonNestedFilter(context.indexVersionCreated())); | ||
return new ESToParentBlockJoinQuery(queryBuilder.build(), parentFilter, ScoreMode.Total, name()); | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rebuilding the `modelsForFields` map like this every time is obviously inefficient, but it should be fine for a small number of `semantic_text` fields.

I have some ideas about how to cache `modelsForFields` per index that should make this more performant, but it requires larger changes that would bloat this PR and increase its scope considerably. Probably better to iterate on this in a separate PR.