forked from manticoresoftware/manticoresearch
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sphinx.h
1448 lines (1159 loc) · 50.5 KB
/
sphinx.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//
// Copyright (c) 2017-2023, Manticore Software LTD (https://manticoresearch.com)
// Copyright (c) 2001-2016, Andrew Aksyonoff
// Copyright (c) 2008-2016, Sphinx Technologies Inc
// All rights reserved
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License. You should have
// received a copy of the GPL license along with this program; if you
// did not, you can find it at http://www.gnu.org/
//
#ifndef _sphinx_
#define _sphinx_
/////////////////////////////////////////////////////////////////////////////
#include "sphinxstd.h"
#include "indexsettings.h"
#include "fileutils.h"
#include "collation.h"
#include "binlog_defs.h"
#include "task_dispatcher.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <limits.h>
#if _WIN32
#include <winsock2.h>
#include <WS2tcpip.h>
#else
#include <sys/types.h>
#include <unistd.h>
#endif
#if _WIN32
#define STDOUT_FILENO fileno(stdout)
#define STDERR_FILENO fileno(stderr)
#endif
#include "sphinxdefs.h"
#include "schema/locator.h"
#include "schema/schema.h"
#include "indexfilebase.h"
/////////////////////////////////////////////////////////////////////////////
// Version/build identification strings (declarations only).
// defined in sphinxversion.cpp in order to isolate from total rebuild on minor changes
extern const char * szMANTICORE_VERSION;
extern const char * szMANTICORE_NAME;
extern const char * szMANTICORE_BANNER;
extern const char * szMANTICORE_BANNER_TEXT;
extern const char * szGIT_COMMIT_ID;
extern const char * szGIT_BRANCH_ID;
extern const char * szGDB_SOURCE_DIR;
// NOTE(review): presumably the searchd wire protocol version and the client API version - confirm against users
#define SPHINX_SEARCHD_PROTO 1
#define SPHINX_CLIENT_VERSION 1
/////////////////////////////////////////////////////////////////////////////
// Global indexer state counters; only declared here, the definitions live in another translation unit.
// NOTE(review): presumably kept for progress/crash reporting while indexing - confirm against the users of these globals
extern int64_t g_iIndexerCurrentDocID;
extern int64_t g_iIndexerCurrentHits;
extern int64_t g_iIndexerCurrentRangeMin;
extern int64_t g_iIndexerCurrentRangeMax;
extern int64_t g_iIndexerPoolStartDocID;
extern int64_t g_iIndexerPoolStartHit;
/////////////////////////////////////////////////////////////////////////////
/// millisecond-precision sleep
/// @param iMsec	how long to sleep, in milliseconds
void sphSleepMsec ( int iMsec );
/// immediately interrupt current query
void sphInterruptNow();
/// check if we got interrupted
/// @return true if an interruption was requested
bool sphInterrupted();
//////////////////////////////////////////////////////////////////////////
struct CSphMultiformContainer;
class CSphWriter;

/// wordforms container
struct CSphWordforms;

/// Interface that converts stopword/wordform/exception file names
/// into full paths (used in configless mode).
class FilenameBuilder_i
{
public:
	virtual ~FilenameBuilder_i() = default;

	/// @param sName	file name to convert
	/// @return the full path for the given name
	virtual CSphString GetFullPath ( const CSphString & sName ) const = 0;
};
/////////////////////////////////////////////////////////////////////////////
// DATASOURCES
/////////////////////////////////////////////////////////////////////////////
/// hit processing tools
/// A Hitpos_t is a packed value made of three parts:
/// 1) the high FIELD_BITS bits store the field number
/// 2) the bit right below them is the field end marker
/// 3) the remaining low bits store the hit position in the field
template < int FIELD_BITS >
class Hitman_c
{
protected:
	// bit layout; the sample values on the right correspond to FIELD_BITS==8
	enum
	{
		FIELD_OFF = 32 - FIELD_BITS, // 24
		POS_BITS = FIELD_OFF - 1, // 23
		FIELDEND_OFF = POS_BITS, // 23
		FIELDEND_MASK = (1UL << POS_BITS), // 0x00800000
		POS_MASK = FIELDEND_MASK - 1, // 0x007FFFFF
		FIELD_MASK = ~(FIELDEND_MASK|POS_MASK),// 0xFF000000
	};

public:
	/// pack a field number and an in-field position; the end marker stays clear
	static Hitpos_t Create ( int iField, int iPos )
	{
		const auto tFieldPart = iField << FIELD_OFF;
		const auto tPosPart = iPos & POS_MASK;
		return tFieldPart + tPosPart;
	}

	/// pack a field number, an in-field position and the field end marker
	static Hitpos_t Create ( int iField, int iPos, bool bEnd )
	{
		const auto tFieldPart = iField << FIELD_OFF;
		const auto tEndPart = static_cast<int> ( bEnd ) << FIELDEND_OFF;
		const auto tPosPart = iPos & POS_MASK;
		return tFieldPart + tEndPart + tPosPart;
	}

	/// extract the field number
	static inline int GetField ( Hitpos_t uHitpos )
	{
		return uHitpos >> FIELD_OFF;
	}

	/// decrease the stored field number by one; the field part must be non-zero
	static inline void DecrementField ( Hitpos_t& uHitpos )
	{
		assert ( uHitpos & FIELD_MASK );
		uHitpos -= (1UL << FIELD_OFF);
	}

	/// extract the position within the field
	static inline int GetPos ( Hitpos_t uHitpos )
	{
		return uHitpos & POS_MASK;
	}

	/// check whether the field end marker is set
	static inline bool IsEnd ( Hitpos_t uHitpos )
	{
		const auto tMarker = uHitpos & FIELDEND_MASK;
		return tMarker!=0;
	}

	/// field number and position together, with the end marker cleared
	static inline DWORD GetPosWithField ( Hitpos_t uHitpos )
	{
		return uHitpos & ~FIELDEND_MASK;
	}

	/// add iAdd to the packed value in place
	static void AddPos ( Hitpos_t * pHitpos, int iAdd )
	{
		// FIXME! add range checks (eg. so that 0:0-1 does not overflow)
		*pHitpos += iAdd;
	}

	/// return the packed value plus iAdd, with the end marker cleared
	static Hitpos_t CreateSum ( Hitpos_t uHitpos, int iAdd )
	{
		// FIXME! add range checks (eg. so that 0:0-1 does not overflow)
		const auto tSum = uHitpos+iAdd;
		return tSum & ~FIELDEND_MASK;
	}

	/// raise the field end marker in place
	static void SetEndMarker ( Hitpos_t * pHitpos )
	{
		*pHitpos |= FIELDEND_MASK;
	}
};
// this could be just DWORD[] but its methods are very handy
// used to store field information e.g. which fields do we need to search in
//
// The mask holds SPH_MAX_FIELDS bits in SIZE 32-bit words; bit N lives in
// word N/32 at position N%32. All single-bit masks are built from an unsigned
// literal (1U), so that addressing bit 31 of a word never shifts into the sign
// bit of a signed int.
struct FieldMask_t
{
	static const int SIZE = SPH_MAX_FIELDS/32;
	STATIC_ASSERT ( ( SPH_MAX_FIELDS%32 )==0, ASSUME_MAX_FIELDS_ARE_REPRESENTABLE_BY_DWORD );
	DWORD m_dMask [ SIZE ];

	// no custom cstr and d-tor - to be usable from inside unions
	// deep copy for it is ok - so, no explicit copying constructor and operator=

	// old-fashion layer to work with DWORD (32-bit) mask.
	// all bits above 32 assumed to be unset.
	void Assign32 ( DWORD uMask )
	{
		UnsetAll();
		m_dMask[0] = uMask;
	}

	// the lowest 32 bits of the mask
	DWORD GetMask32 () const
	{
		return m_dMask[0];
	}

	// word-wise access; iIdx is a word index (not a bit index)
	DWORD operator[] ( int iIdx ) const
	{
		assert ( 0<=iIdx && iIdx<SIZE );
		return m_dMask [ iIdx ];
	}

	DWORD & operator[] ( int iIdx )
	{
		assert ( 0<=iIdx && iIdx<SIZE );
		return m_dMask [ iIdx ];
	}

	// set n-th bit
	void Set ( int iIdx )
	{
		assert ( 0<=iIdx && iIdx<(int)sizeof(m_dMask)*8 );
		m_dMask [ iIdx/32 ] |= 1U << ( iIdx%32 );
	}

	// set all bits
	void SetAll()
	{
		memset ( m_dMask, 0xff, sizeof(m_dMask) );
	}

	// unset n-th bit, or all
	void Unset ( int iIdx )
	{
		assert ( 0<=iIdx && iIdx<(int)sizeof(m_dMask)*8 );
		m_dMask [ iIdx/32 ] &= ~( 1U << ( iIdx%32 ) );
	}

	void UnsetAll()
	{
		memset ( m_dMask, 0, sizeof(m_dMask) );
	}

	// test if n-th bit is set
	bool Test ( int iIdx ) const
	{
		assert ( iIdx>=0 && iIdx<(int)sizeof(m_dMask)*8 );
		return ( m_dMask [ iIdx/32 ] & ( 1U << ( iIdx%32 ) ) )!=0;
	}

	// test if all bits are set or unset
	bool TestAll ( bool bSet ) const
	{
		DWORD uTest = bSet ? 0xffffffff : 0;
		for ( auto uMask : m_dMask )
			if ( uMask!=uTest )
				return false;
		return true;
	}

	// invert every bit of the mask
	void Negate()
	{
		for ( auto& uMask : m_dMask )
			uMask = ~uMask;
	}

	// keep bits up to iIdx; shift bits over iIdx right by 1
	// (bit iIdx itself is dropped, the topmost bit of the mask becomes 0)
	void DeleteBit ( int iIdx )
	{
		// same range contract as Set/Unset/Test
		assert ( 0<=iIdx && iIdx<(int)sizeof(m_dMask)*8 );

		const auto iDwordIdx = iIdx / 32;
		const auto iDwordBitPos = iIdx % 32;

		// words above the affected one: plain right shift by 1, where the bit
		// falling out of a word's bottom becomes the top bit of the word below
		DWORD uCarryBit = 0;
		for ( int i = SIZE-1; i>iDwordIdx; --i )
		{
			bool bNextCarry = ( m_dMask[i] & 1U )!=0;
			m_dMask[i] = uCarryBit | ( m_dMask[i] >> 1 );
			uCarryBit = bNextCarry ? 0x80000000 : 0;
		}

		// the affected word: low part stays, high part shifts, carry comes on top
		DWORD uShiftBit = 1U << ( iDwordBitPos ); // like: 00000000 00000000 00000100 00000000
		DWORD uKeepMask = uShiftBit-1; // like: 00000000 00000000 00000011 11111111
		DWORD uMoveMask = ~(uShiftBit | uKeepMask); // like: 11111111 11111111 11111000 00000000
		DWORD uKept = m_dMask[iDwordIdx] & uKeepMask;
		m_dMask[iDwordIdx] = uCarryBit | ( ( m_dMask[iDwordIdx] & uMoveMask ) >> 1 ) | uKept;
	}
};
/// A row id paired with an index tag.
struct RowTagged_t
{
	RowID_t	m_tID = INVALID_ROWID;	///< document ID
	int		m_iTag = 0;				///< index tag

	RowTagged_t() = default;
	RowTagged_t ( const CSphMatch & tMatch );
	RowTagged_t ( RowID_t tRowID, int iTag );

	bool operator== ( const RowTagged_t & tRow ) const;
	bool operator!= ( const RowTagged_t & tRow ) const;
};
//////////////////////////////////////////////////////////////////////////
// defined in stripper/html_stripper.h
class CSphHTMLStripper;
/// field filter
class ISphFieldFilter
{
public:
virtual ~ISphFieldFilter() = default;
virtual int Apply ( const BYTE * sField, int iLength, CSphVector<BYTE> & dStorage, bool bQuery ) = 0;
int Apply ( const void* szField, CSphVector<BYTE>& dStorage, bool bQuery )
{
return Apply ( (const BYTE*)szField, (int) strlen ( (const char*)szField ), dStorage, bQuery );
}
int Apply ( ByteBlob_t sField, CSphVector<BYTE>& dStorage, bool bQuery )
{
return Apply ( sField.first, sField.second, dStorage, bQuery );
}
virtual void GetSettings ( CSphFieldFilterSettings & tSettings ) const = 0;
virtual std::unique_ptr<ISphFieldFilter> Clone() const = 0;
};
/// create a regexp field filter
/// @param tFilterSettings	settings to build the filter from
/// @param sError			receives the error message (NOTE(review): presumably set when creation fails - confirm in the implementation)
std::unique_ptr<ISphFieldFilter> sphCreateRegexpFilter ( const CSphFieldFilterSettings & tFilterSettings, CSphString & sError );
/// create an ICU field filter
/// @param pParent			parent filter; ownership is passed in (taken by value as unique_ptr)
/// @param szBlendChars		blend chars, as a C string
/// @param sError			receives the error message (NOTE(review): presumably set when creation fails - confirm in the implementation)
std::unique_ptr<ISphFieldFilter> sphCreateFilterICU ( std::unique_ptr<ISphFieldFilter> pParent, const char * szBlendChars, CSphString & sError );
/////////////////////////////////////////////////////////////////////////////
// SEARCH QUERIES
/////////////////////////////////////////////////////////////////////////////
/// search query filter
/// Common part of a filter: its type plus the range bounds. Each bound is an
/// anonymous union, so the integer and the float flavour of a bound share the
/// same storage; the integer member carries the default initializer.
struct CommonFilterSettings_t
{
ESphFilter m_eType = SPH_FILTER_VALUES; ///< filter type
union
{
SphAttr_t m_iMinValue = LLONG_MIN; ///< range min (integer flavour; defaults to LLONG_MIN)
float m_fMinValue; ///< range min (float flavour)
};
union
{
SphAttr_t m_iMaxValue = LLONG_MAX; ///< range max (integer flavour; defaults to LLONG_MAX)
float m_fMaxValue; ///< range max (float flavour)
};
};
/// Full description of a single query filter: the filtered attribute, the
/// include/exclude and bound flags, and the set of values to match. Integer
/// values are either owned (m_dValues) or borrowed from outside (m_dExtValues).
class CSphFilterSettings : public CommonFilterSettings_t
{
public:
	CSphString				m_sAttrName = "";						///< filtered attribute name
	bool					m_bExclude { false };					///< whether this is "include" or "exclude" filter (default is "include")
	bool					m_bHasEqualMin { true };				///< has filter "equal" component or pure greater/less (for min)
	bool					m_bHasEqualMax { true };				///< has filter "equal" component or pure greater/less (for max)
	bool					m_bOpenLeft { false };
	bool					m_bOpenRight { false };
	bool					m_bIsNull { false };					///< for NULL or NOT NULL
	ESphMvaFunc				m_eMvaFunc { SPH_MVAFUNC_NONE };		///< MVA and stringlist folding function
	CSphVector<SphAttr_t>	m_dValues;								///< integer values set
	StrVec_t				m_dStrings;								///< string values

public:
	CSphFilterSettings () = default;

	// fixme! Dependency from external values implies, that CsphFilterSettings is NOT standalone,
	// and it's state is no way 'undependent'. It would be good to capture external values, at least
	// with ref-counted technique, exactly here, to locate all usecases near each other.
	void SetExternalValues ( const VecTraits_T<SphAttr_t>& dValues );

	/// number of integer values in effect
	int GetNumValues () const
	{
		return GetValues().GetLength();
	}

	/// integer values in effect: the external ones when present, otherwise the own ones
	const VecTraits_T<SphAttr_t>& GetValues () const
	{
		if ( m_dExtValues.IsEmpty() )
			return m_dValues;
		return m_dExtValues;
	}

	bool operator == ( const CSphFilterSettings & rhs ) const;
	bool operator != ( const CSphFilterSettings & rhs ) const
	{
		return !operator== ( rhs );
	}

	uint64_t GetHash() const;

private:
	VecTraits_T<SphAttr_t> m_dExtValues;	// external (not owned) values, set via SetExternalValues()
};
// Information about a single keyword: its tokenized and normalized forms
// plus three integer counters, all starting at zero.
struct CSphKeywordInfo
{
	CSphString	m_sTokenized;
	CSphString	m_sNormalized;
	int			m_iDocs { 0 };
	int			m_iHits { 0 };
	int			m_iQpos { 0 };
};
/// Exchange two keyword infos member by member.
inline void Swap ( CSphKeywordInfo & v1, CSphKeywordInfo & v2 )
{
	// counters first
	::Swap ( v1.m_iDocs, v2.m_iDocs );
	::Swap ( v1.m_iHits, v2.m_iHits );
	::Swap ( v1.m_iQpos, v2.m_iQpos );

	// strings are exchanged via their own Swap()
	v1.m_sTokenized.Swap ( v2.m_sTokenized );
	v1.m_sNormalized.Swap ( v2.m_sNormalized );
}
/// One item of the query select list.
struct CSphQueryItem
{
	CSphString		m_sExpr;						///< expression to compute
	CSphString		m_sAlias;						///< alias to return
	ESphAggrFunc	m_eAggrFunc = SPH_AGGR_NONE;	///< aggregate function, none by default
};
/// Node of the complex filter tree of a search query.
/// All three indexes default to -1.
struct FilterTreeItem_t
{
	int		m_iLeft { -1 };			// left node at parser filter operations
	int		m_iRight { -1 };		// right node at parser filter operations
	int		m_iFilterItem { -1 };	// index into query filters
	bool	m_bOr { false };

	bool operator == ( const FilterTreeItem_t & rhs ) const;
	bool operator != ( const FilterTreeItem_t & rhs ) const
	{
		return !operator== ( rhs );
	}

	uint64_t GetHash() const;
};
struct CSphQuery;
struct AggrResult_t;

/// table function interface
class ISphTableFunc
{
public:
	virtual ~ISphTableFunc() = default;

	/// check the arguments against the query; sError is the error message output
	virtual bool ValidateArgs ( const StrVec_t & dArgs, const CSphQuery & tQuery, CSphString & sError ) = 0;

	/// process the aggregated result in place; sError is the error message output
	virtual bool Process ( AggrResult_t * pResult, CSphString & sError ) = 0;

	/// the default implementation ignores both arguments and returns false
	virtual bool LimitPushdown ( int, int ) // FIXME! implement this
	{
		return false;
	}
};
class QueryParser_i;
/// Kind of a secondary index hint (see IndexHint_t::m_eType).
/// NOTE(review): the meaning of each value is not visible in this header; TOTAL is presumably a count sentinel - confirm
enum class SecondaryIndexType_e
{
NONE,
FILTER,
LOOKUP,
INDEX,
ANALYZER,
TOTAL
};
/// A single secondary index hint of a query (stored in CSphQuery::m_dIndexHints).
struct IndexHint_t
{
	CSphString				m_sIndex;
	SecondaryIndexType_e	m_eType { SecondaryIndexType_e::NONE };
	bool					m_bForce { true };
};
/// default value of CSphQuery::m_iMaxMatches
const int DEFAULT_MAX_MATCHES = 1000;
/// search query. Pure struct, no member functions
struct CSphQuery
{
CSphString m_sIndexes {"*"}; ///< indexes to search; defaults to "*"
CSphString m_sQuery; ///< cooked query string for the engine (possibly transformed during legacy matching modes fixup)
CSphString m_sRawQuery; ///< raw query string from the client for searchd log, agents, etc
int m_iOffset=0; ///< offset into result set (as X in MySQL LIMIT X,Y clause)
int m_iLimit=20; ///< limit into result set (as Y in MySQL LIMIT X,Y clause)
CSphVector<DWORD> m_dWeights; ///< user-supplied per-field weights. may be empty. default is empty
ESphMatchMode m_eMode = SPH_MATCH_EXTENDED; ///< match mode. default is SPH_MATCH_EXTENDED
ESphRankMode m_eRanker = SPH_RANK_DEFAULT; ///< ranking mode, default is proximity+BM25
CSphString m_sRankerExpr; ///< ranking expression for SPH_RANK_EXPR
CSphString m_sUDRanker; ///< user-defined ranker name
CSphString m_sUDRankerOpts; ///< user-defined ranker options
ESphSortOrder m_eSort = SPH_SORT_RELEVANCE; ///< sort mode
CSphString m_sSortBy; ///< attribute to sort by
int64_t m_iRandSeed = -1; ///< random seed for ORDER BY RAND(), -1 means do not set
int m_iMaxMatches = DEFAULT_MAX_MATCHES; ///< max matches to retrieve, default is DEFAULT_MAX_MATCHES (1000). more matches use more memory and CPU time to hold and sort them
bool m_bExplicitMaxMatches = false; ///< did we specify the max_matches explicitly?
bool m_bSortKbuffer = false; ///< whether to use PQ or K-buffer sorting algorithm
bool m_bZSlist = false; ///< whether the ranker has to fetch the zonespanlist with this query
bool m_bSimplify = false; ///< whether to apply boolean simplification
bool m_bPlainIDF = false; ///< whether to use PlainIDF=log(N/n) or NormalizedIDF=log((N-n+1)/n)
bool m_bGlobalIDF = false; ///< whether to use local indexes or a global idf file
bool m_bNormalizedTFIDF = true; ///< whether to scale IDFs by query word count, so that TF*IDF is normalized
bool m_bLocalDF = false; ///< whether to calculate DF among local indexes
bool m_bLowPriority = false; ///< set low thread priority for this query
DWORD m_uDebugFlags = 0; ///< debug flags, none set by default
QueryOption_e m_eExpandKeywords = QUERY_OPT_DEFAULT; ///< control automatic query-time keyword expansion
bool m_bAccurateAggregation = false; ///< setting via options
bool m_bExplicitAccurateAggregation = false; ///< whether anything was set via options
int m_iDistinctThresh = 3500; ///< distinct accuracy thresh
bool m_bExplicitDistinctThresh = false; ///< whether thresh was set via options
int m_iMaxMatchThresh = 16384; ///< NOTE(review): purpose is not visible in this header - confirm against users
CSphVector<CSphFilterSettings> m_dFilters; ///< filters
CSphVector<FilterTreeItem_t> m_dFilterTree; ///< complex filter tree; items refer to m_dFilters by index (FilterTreeItem_t::m_iFilterItem)
CSphVector<IndexHint_t> m_dIndexHints; ///< secondary index hints
CSphString m_sGroupBy; ///< group-by attribute name(s)
CSphString m_sFacetBy; ///< facet-by attribute name(s)
ESphGroupBy m_eGroupFunc = SPH_GROUPBY_ATTR; ///< function to pre-process group-by attribute value with
CSphString m_sGroupSortBy { "@groupby desc" }; ///< sorting clause for groups in group-by mode
CSphString m_sGroupDistinct; ///< count distinct values for this attribute
int m_iCutoff = -1; ///< matches count threshold to stop searching at (<=0 means to search until all matches are found)
int m_iRetryCount = -1; ///< retry count, for distributed queries. (-1 means 'use default')
int m_iRetryDelay = -1; ///< retry delay, for distributed queries. (-1 means 'use default')
int m_iAgentQueryTimeoutMs = 0; ///< agent query timeout override, for distributed queries
bool m_bGeoAnchor = false; ///< do we have an anchor
CSphString m_sGeoLatAttr; ///< latitude attr name
CSphString m_sGeoLongAttr; ///< longitude attr name
float m_fGeoLatitude = 0.0f; ///< anchor latitude
float m_fGeoLongitude = 0.0f; ///< anchor longitude
CSphVector<CSphNamedInt> m_dIndexWeights; ///< per-index weights
CSphVector<CSphNamedInt> m_dFieldWeights; ///< per-field weights
DWORD m_uMaxQueryMsec = 0; ///< max local index search time, in milliseconds (default is 0; means no limit)
int m_iMaxPredictedMsec = 0; ///< max predicted (!) search time limit, in milliseconds (0 means no limit)
CSphString m_sComment; ///< comment to pass verbatim in the log file
CSphString m_sSelect; ///< select-list (attributes and/or expressions)
CSphString m_sOrderBy; ///< order-by clause
CSphString m_sOuterOrderBy; ///< temporary (?) subselect hack
int m_iOuterOffset = 0; ///< keep and apply outer offset at master
int m_iOuterLimit = 0; ///< NOTE(review): presumably the outer limit, counterpart of m_iOuterOffset - confirm
bool m_bHasOuter = false; ///< NOTE(review): presumably whether an outer (subselect) clause is present - confirm
bool m_bIgnoreNonexistent = false; ///< whether to warn or not about non-existent columns in select list
bool m_bIgnoreNonexistentIndexes = false; ///< whether to error or not about non-existent indexes in index list
bool m_bStrict = false; ///< whether to warn or not about incompatible types
bool m_bSync = false; ///< whether or not use synchronous operations (optimize, etc.)
bool m_bNotOnlyAllowed = false; ///< whether to allow single full-text not operator
CSphString m_sStore; ///< don't delete result, just store in given uservar by name
CSphFilterSettings m_tHaving; ///< post aggregate filtering (got applied only on master)
int m_iSQLSelectStart = -1; ///< SQL parser helper
int m_iSQLSelectEnd = -1; ///< SQL parser helper
int m_iGroupbyLimit = 1; ///< number of elems within group
CSphVector<CSphQueryItem> m_dItems; ///< parsed select-list
CSphVector<CSphQueryItem> m_dRefItems; ///< select-list prior replacing by facet
ESphCollation m_eCollation = SPH_COLLATION_DEFAULT; ///< ORDER BY collation
bool m_bAgent = false; ///< agent mode (may need extra cols on output)
CSphString m_sQueryTokenFilterLib; ///< token filter library name
CSphString m_sQueryTokenFilterName; ///< token filter name
CSphString m_sQueryTokenFilterOpts; ///< token filter options
bool m_bFacet = false; ///< whether this a facet query
bool m_bFacetHead = false; ///< NOTE(review): presumably marks the head query of a facet batch - confirm
QueryType_e m_eQueryType {QUERY_API}; ///< queries from sphinxql require special handling
const QueryParser_i * m_pQueryParser = nullptr; ///< queries do not own this parser
// JSON output
StrVec_t m_dIncludeItems;
StrVec_t m_dExcludeItems;
const void* m_pCookie = nullptr; ///< opaque mark, used to manage lifetime of the vec of queries
int m_iCouncurrency = 0; ///< limit N of threads to run query with. 0 means 'no limit' (sic: the member name is misspelled; renaming it would touch every user)
CSphVector<CSphString> m_dStringSubkeys;
CSphVector<int64_t> m_dIntSubkeys;
Dispatcher::Template_t m_tMainDispatcher;
Dispatcher::Template_t m_tPseudoShardingDispatcher;
};
/// parse select list string into items
/// @param sError	error message output
/// @param pResult	the query to work on; despite the 'p' prefix it is taken by reference
///					(NOTE(review): presumably reads m_sSelect and fills m_dItems - confirm in the implementation)
bool ParseSelectList ( CSphString &sError, CSphQuery &pResult );
/// Low-level per-query counters.
struct CSphQueryStats
{
	int64_t *	m_pNanoBudget { nullptr };	///< pointer to max_predicted_time budget (counted in nanosec)
	DWORD		m_iFetchedDocs { 0 };		///< processed documents
	DWORD		m_iFetchedHits { 0 };		///< processed hits (aka positions)
	DWORD		m_iSkips { 0 };				///< number of Skip() calls

	void Add ( const CSphQueryStats & tStats );
};
/// Description of one iterator: two strings and a counter that starts at 1.
struct IteratorDesc_t
{
	CSphString	m_sAttr;
	CSphString	m_sType;
	int			m_iUsed { 1 };
};
/// A set of iterator descriptions plus a total counter.
struct IteratorStats_t
{
	CSphVector<IteratorDesc_t>	m_dIterators;
	int							m_iTotal { 0 };

	/// merge the stats from tSrc into this object
	void Merge ( const IteratorStats_t & tSrc );
};
/// search query meta-info
/// Timings, match counters, per-word statistics, i/o and prediction stats,
/// and the error/warning messages collected for one query.
class CSphQueryResultMeta
{
public:
int m_iQueryTime = 0; ///< query time, milliseconds
int m_iRealQueryTime = 0; ///< query time, measured just from start to finish of the query. In milliseconds
int64_t m_iCpuTime = 0; ///< user time, microseconds
int m_iMultiplier = 1; ///< multi-query multiplier, -1 to indicate error
using WordStat_t = std::pair<int64_t, int64_t>; ///< per-word pair of counters (NOTE(review): presumably docs and hits, as passed to AddStat() - confirm)
SmallStringHash_T<WordStat_t> m_hWordStats; ///< hash of i-th search term (normalized word form)
int m_iMatches = 0; ///< total matches returned (upto MAX_MATCHES)
int64_t m_iTotalMatches = 0; ///< total matches found (unlimited)
bool m_bTotalMatchesApprox = false; ///< whether m_iTotalMatches shows exact or approximate numbers
CSphIOStats m_tIOStats; ///< i/o stats for the query
int64_t m_iAgentCpuTime = 0; ///< agent cpu time (for distributed searches)
CSphIOStats m_tAgentIOStats; ///< agent IO stats (for distributed searches)
int64_t m_iPredictedTime = 0; ///< local predicted time
int64_t m_iAgentPredictedTime = 0; ///< distributed predicted time
DWORD m_iAgentFetchedDocs = 0; ///< distributed fetched docs
DWORD m_iAgentFetchedHits = 0; ///< distributed fetched hits
DWORD m_iAgentFetchedSkips = 0; ///< distributed fetched skips
CSphQueryStats m_tStats; ///< query prediction counters
bool m_bHasPrediction = false; ///< are prediction counters set?
CSphString m_sError; ///< error message
CSphString m_sWarning; ///< warning message
QueryProfile_c * m_pProfile = nullptr; ///< filled when query profiling is enabled; NULL otherwise
IteratorStats_t m_tIteratorStats; ///< iterators used while calculating the query
bool m_bBigram = false; ///< whether to remove bigram symbol on adding word to stat
virtual ~CSphQueryResultMeta () {} ///< dtor
/// account a word with its docs and hits counters in the word stats
void AddStat ( const CSphString & sWord, int64_t iDocs, int64_t iHits );
/// merge the word stats of tOther into this object
void MergeWordStats ( const CSphQueryResultMeta& tOther );
// sort wordstat to achieve reproducible result over different runs
CSphFixedVector<SmallStringHash_T<CSphQueryResultMeta::WordStat_t>::KeyValue_t *> MakeSortedWordStat () const;
};
class QueryProfile_c;
class DocstoreReader_i;

/// search query result (meta-info)
/// A bundle of raw pointers, all null by default.
class CSphQueryResult
{
public:
	CSphQueryResultMeta *		m_pMeta { nullptr };		///< not owned
	const BYTE *				m_pBlobPool { nullptr };	///< pointer to blob attr storage. Used only during calculations.
	const DocstoreReader_i*		m_pDocstore { nullptr };	///< pointer to docstore reader fixme! not need in aggr
	columnar::Columnar_i *		m_pColumnar { nullptr };
};
/////////////////////////////////////////////////////////////////////////////
// ATTRIBUTE UPDATE QUERY
/////////////////////////////////////////////////////////////////////////////
/// Name and type of a single attribute taking part in an update
/// (an element of CSphAttrUpdate::m_dAttributes).
struct TypedAttribute_t
{
	CSphString	m_sName;		///< attribute name
	ESphAttr	m_eType {};		///< attribute type; value-initialized, so that a default-constructed
								///< object never carries an indeterminate enum value
};
/// Attribute update request: the update schema, the value pools,
/// the target document ids and a few behaviour flags.
struct CSphAttrUpdate
{
	CSphVector<TypedAttribute_t>	m_dAttributes;					///< update schema, attributes to update
	CSphVector<DWORD>				m_dPool;						///< update values pool
	CSphVector<BYTE>				m_dBlobs;						///< update pool for blob attrs
	CSphVector<DocID_t>				m_dDocids;						///< document IDs vector
	CSphVector<int>					m_dRowOffset;					///< document row offsets in the pool (1 per doc, or empty, means 0 always)
	bool							m_bIgnoreNonexistent { false };	///< whether to warn about non-existent attrs, or just silently ignore them
	bool							m_bStrict { false };			///< whether to check for incompatible types first, or just ignore them
	bool							m_bReusable { true };			///< whether update is standalone and never rewritten, or need deep-copy

	/// row offset in the pool for the i-th document; 0 when no offsets are stored
	inline int GetRowOffset (int i) const
	{
		if ( m_dRowOffset.IsEmpty() )
			return 0;
		return m_dRowOffset[i];
	}
};
using AttrUpdateSharedPtr_t = SharedPtr_t<CSphAttrUpdate>;

/// Return an update that is flagged reusable: the passed one when it already
/// is, otherwise a freshly allocated copy with the flag raised.
inline AttrUpdateSharedPtr_t MakeReusableUpdate ( AttrUpdateSharedPtr_t& pUpdate )
{
	if ( !pUpdate->m_bReusable )
	{
		AttrUpdateSharedPtr_t pCopy { new CSphAttrUpdate };
		*pCopy = *pUpdate;
		pCopy->m_bReusable = true;
		return pCopy;
	}

	return pUpdate;
}
/// State of a cascade (incremental) update: the shared update itself,
/// one bit per target document, and the count of documents already marked.
struct AttrUpdateInc_t // for cascade (incremental) update
{
	AttrUpdateSharedPtr_t	m_pUpdate;			///< the unchangeable update pool
	CSphBitvec				m_dUpdated;			///< bitmask of updated rows
	int						m_iAffected { 0 };	///< num of updated rows.

	/// the bitmask is sized by the number of document ids in the update
	explicit AttrUpdateInc_t ( AttrUpdateSharedPtr_t pUpd )
		: m_pUpdate ( std::move(pUpd) )
		, m_dUpdated ( m_pUpdate->m_dDocids.GetLength() )
	{}

	/// mark the iUpd-th document as updated; every document is counted only once
	void MarkUpdated ( int iUpd )
	{
		if ( !m_dUpdated.BitGet ( iUpd ) )
		{
			++m_iAffected;
			m_dUpdated.BitSet ( iUpd );
		}
	}

	/// true when the number of marked documents equals the bitmask size
	bool AllApplied () const
	{
		assert ( m_dUpdated.GetSize() >= m_iAffected );
		const bool bAll = ( m_dUpdated.GetSize()==m_iAffected );
		return bAll;
	}
};
/////////////////////////////////////////////////////////////////////////////
// FULLTEXT INDICES
/////////////////////////////////////////////////////////////////////////////
/// Merge callback: receives merge events and tells whether the work has to stop.
/// The stop flag is an optional external atomic, which is not owned.
class MergeCb_c
{
	std::atomic<bool> * m_pStop = nullptr;

public:
	enum Event_e : BYTE
	{
		E_IDLE,
		E_COLLECT_START,		// begin collecting alive docs on merge; payload is chunk ID
		E_COLLECT_FINISHED,		// collecting alive docs on merge is finished; payload is chunk ID
		E_MERGEATTRS_START,
		E_MERGEATTRS_FINISHED,
		E_KEYWORDS,
		E_FINISHED,
	};

	explicit MergeCb_c ( std::atomic<bool>* pStop = nullptr )
		: m_pStop ( pStop )
	{}

	virtual ~MergeCb_c() = default;

	/// event notification; the default implementation does nothing
	virtual void SetEvent ( Event_e /*eEvent*/, int64_t /*iPayload*/ ) {}

	/// true if the current query got interrupted, or the external stop flag (if any) is raised
	inline bool NeedStop () const
	{
		if ( sphInterrupted() )
			return true;

		return m_pStop && m_pStop->load ( std::memory_order_relaxed );
	}
};
/// Indexing progress tracker: the current phase plus two counters whose
/// meaning depends on that phase. It also hands out a merge callback, either
/// the hook passed to the constructor or its own (private) MergeCb_c base.
class CSphIndexProgress : private MergeCb_c
{
	MergeCb_c * m_pMergeHook;

private:
	/// reporting hook; does nothing by default
	virtual void ShowImpl ( bool bPhaseEnd ) const {}

public:
	enum Phase_e
	{
		PHASE_COLLECT,		///< document collection phase
		PHASE_SORT,			///< final sorting phase
		PHASE_LOOKUP,		///< docid lookup construction
		PHASE_MERGE,		///< index merging
		PHASE_SI_BUILD,		///< secondary index build
		PHASE_UNKNOWN,
	};

	Phase_e m_ePhase;		///< current indexing phase

	union {
		int64_t m_iDocuments;	///< PHASE_COLLECT: documents collected so far
		int64_t m_iDocids;		///< PHASE_LOOKUP: docids added to lookup so far
		int64_t m_iHits;		///< PHASE_SORT: hits sorted so far
		int64_t m_iWords;		///< PHASE_MERGE: words merged so far
	};

	union {
		int64_t m_iBytes;		///< PHASE_COLLECT: bytes collected so far;
		int64_t m_iDocidsTotal;	///< PHASE_LOOKUP: total docids
		int64_t m_iHitsTotal;	///< PHASE_SORT: hits total
	};

public:
	/// without an explicit hook, the object's own MergeCb_c base serves as the merge callback
	explicit CSphIndexProgress( MergeCb_c * pMergeHook = nullptr )
		: m_pMergeHook ( pMergeHook ? pMergeHook : static_cast<MergeCb_c *>(this) )
	{
		PhaseBegin ( PHASE_UNKNOWN );
	}

	/// switch to the given phase and zero both counters
	inline void PhaseBegin ( Phase_e ePhase )
	{
		m_ePhase = ePhase;
		m_iBytes = 0;
		m_iDocuments = 0;
	}

	/// report the end of the phase, unless the phase is unknown
	inline void PhaseEnd() const
	{
		if ( m_ePhase==PHASE_UNKNOWN )
			return;

		ShowImpl ( true );
	}

	/// report the progress in the middle of the phase
	inline void Show() const
	{
		ShowImpl ( false );
	}

	// cb
	MergeCb_c& GetMergeCb() const
	{
		return *m_pMergeHook;
	}
};
/// JSON key lookup stuff
struct JsonKey_t
{
	CSphString	m_sKey;			///< name string
	DWORD		m_uMask { 0 };	///< Bloom mask for this key
	int			m_iLen { 0 };	///< name length, in bytes

	JsonKey_t () = default;
	explicit JsonKey_t ( const char * sKey, int iLen );
};
/// forward refs to internal searcher classes
class ISphQword;
class ISphQwordSetup;
class CSphQueryContext;
class ISphFilter;
struct GetKeywordsSettings_t;
struct SuggestArgs_t;
struct SuggestResult_t;

/// Interface of anything able to fill statistics for a list of keywords.
struct ISphKeywordsStat
{
	virtual ~ISphKeywordsStat() = default;

	/// works on the passed keywords in place
	virtual bool FillKeywords ( CSphVector <CSphKeywordInfo> & dKeywords ) const = 0;
};
/// Index status counters (memory, mapping, disk usage, transaction ids).
/// Two pairs of members are anonymous unions, i.e. they share storage: the
/// same 64-bit slot is either a mapping size or a stack size "for pq".
struct CSphIndexStatus
{
int64_t m_iRamUse = 0;
int64_t m_iRamRetired = 0;
int64_t m_iMapped = 0; // total size of mmapped files
union {
int64_t m_iMappedResident = 0; // size of mmapped files which are in core
int64_t m_iStackNeed; // for pq - max size of stack for eval node over all index on *this* node
};
union {
int64_t m_iMappedDocs = 0; // size of mmapped doclists
int64_t m_iStackBase; // for pq - base size over necessary
};
int64_t m_iMappedResidentDocs = 0; // size of mmapped resident doclists
int64_t m_iMappedHits = 0; // size of mmapped hitlists
int64_t m_iMappedResidentHits = 0; // size of mmapped resident hitlists (the original comment said "doclists"; the member name suggests hitlists)
int64_t m_iDiskUse = 0; // place occupied by index on disk (despite if it fetched into mem or not)
int64_t m_iRamChunkSize = 0; // not used for plain
int m_iNumRamChunks = 0; // not used for plain
int m_iNumChunks = 0; // not used for plain
int64_t m_iMemLimit = 0; // not used for plain
int64_t m_iTID = 0;
int64_t m_iSavedTID = 0;
int64_t m_iDead = 0;
double m_fSaveRateLimit {0.0}; // not used for plain. Part of m_iMemLimit to be achieved before flushing
};
/// Arguments of a multi-query run. Non-copyable; the index weight is fixed
/// at construction, everything else has a default and may be adjusted later.
struct CSphMultiQueryArgs : public ISphNoncopyable
{
	const int							m_iIndexWeight;
	int									m_iTag { 0 };
	DWORD								m_uPackedFactorFlags = SPH_FACTOR_DISABLE;
	bool								m_bLocalDF { false };
	const SmallStringHash_T<int64_t> *	m_pLocalDocs { nullptr };
	int64_t								m_iTotalDocs { 0 };
	bool								m_bModifySorterSchemas { true };
	bool								m_bFinalizeSorters { true };
	int									m_iThreads { 1 };
	int									m_iTotalThreads { 1 };

	CSphMultiQueryArgs ( int iIndexWeight );
};
/// Links a row of the index with an entry of the update set.
/// Both members have no default initializers, so they must be assigned before use.
struct RowToUpdateData_t
{
RowID_t m_tRow; /// row in the index
int m_iIdx; /// idx in updateset
};
/// owning vector of rows to update
using RowsToUpdateData_t = CSphVector<RowToUpdateData_t>;
/// VecTraits_T flavour of the same list, as taken by the Update_* functions
using RowsToUpdate_t = VecTraits_T<RowToUpdateData_t>;
/// An update put aside to be applied later: the shared update plus the rows it has to touch.
struct PostponedUpdate_t
{
AttrUpdateSharedPtr_t m_pUpdate;
RowsToUpdateData_t m_dRowsToUpdate;
};
struct UpdateContext_t;
/// callback returning bool, taken by IndexSegment_c::CheckThenKillMulti()
using BlockerFn = std::function<bool()>;
// common attribute update code for both RT and plain indexes
// an index or a part of an index that has its own row ids
//
// Besides the update helpers, the class provides the document kill interface
// (Kill/KillMulti/CheckThenKillMulti, all no-op by default), an optional
// "kill hook" (another segment which gets notified via KillHook()), and the
// storage for postponed updates.
class IndexSegment_c
{
mutable IndexSegment_c* m_pKillHook = nullptr; // if set, killed docids will be emerged also here.
protected:
// bit flags describing what was changed by an update
enum
{
ATTRS_UPDATED = ( 1UL<<0 ),
ATTRS_BLOB_UPDATED = ( 1UL<<1 ),
ATTRS_ROWMAP_UPDATED = ( 1UL<<2 )
};
private:
// blob row writer; the default implementation writes nothing and returns false
virtual bool Update_WriteBlobRow ( UpdateContext_t & tCtx, RowID_t tRowID, ByteBlob_t tBlob,
int nBlobAttrs, const CSphAttrLocator & tBlobRowLoc, bool & bCritical, CSphString & sError ) {return false;};
static bool Update_InplaceJson ( const RowsToUpdate_t& dRows, UpdateContext_t & tCtx, CSphString & sError, bool bDryRun );
bool Update_Blobs ( const RowsToUpdate_t& dRows, UpdateContext_t & tCtx, bool & bCritical, CSphString & sError );
static void Update_Plain ( const RowsToUpdate_t& dRows, UpdateContext_t & tCtx );
public:
// kill a single document; the default implementation does nothing and returns 0
virtual int Kill ( DocID_t /*tDocID*/ ) { return 0; }
// kill a list of documents; the default implementation does nothing and returns 0
virtual int KillMulti ( const VecTraits_T<DocID_t> & /*dKlist*/ ) { return 0; };
// by default the watcher is ignored and the call is the same as KillMulti()
virtual int CheckThenKillMulti ( const VecTraits_T<DocID_t>& dKlist, BlockerFn&& /*fnWatcher*/ ) { return KillMulti ( dKlist ); };
virtual ~IndexSegment_c() = default;
// install (or, with nullptr, remove) the kill hook; allowed on const objects since the pointer is mutable
inline void SetKillHook ( IndexSegment_c * pKillHook ) const noexcept
{
m_pKillHook = pKillHook;
}
inline bool HasKillHook () const noexcept
{
return m_pKillHook!=nullptr;
}
// forward the docid to the hook's Kill(), if a hook is installed
inline void KillHook ( DocID_t tDocID ) const noexcept
{
if ( HasKillHook() )
m_pKillHook->Kill ( tDocID );
}
public:
bool Update_UpdateAttributes ( const RowsToUpdate_t& dRows, UpdateContext_t& tCtx, bool& bCritical, CSphString& sError );
/// apply a series of updates, assuming they are prepared (no need to full-scan attributes), and the index is offline, i.e. no concurrency
/// the default implementation does nothing
virtual void UpdateAttributesOffline ( VecTraits_T<PostponedUpdate_t>& dPostUpdates ) {}
// clear the busy flag and drop all postponed updates
inline void ResetPostponedUpdates()
{
m_bAttrsBusy = false;
m_dPostponedUpdates.Reset();
}
public:
// stuff for dispatch races between changes and updates
mutable std::atomic<bool> m_bAttrsBusy { false };
CSphVector<PostponedUpdate_t> m_dPostponedUpdates;
};
/// Check the attributes of an update against a schema.
/// @param tUpd		the update to check
/// @param tSchema	the schema to check against
/// @param sError	error message output
/// NOTE(review): presumably returns false and fills sError when the check fails - confirm in the implementation
bool Update_CheckAttributes ( const CSphAttrUpdate& tUpd, const ISphSchema& tSchema, CSphString& sError );
// helper - collects killed documents
struct KillAccum_t final : public IndexSegment_c
{
CSphVector<DocID_t> m_dDocids;
int Kill ( DocID_t tDocID ) final
{
m_dDocids.Add ( tDocID );
return 1;
}